In [55]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from scipy import stats
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.pipeline import Pipeline
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from scipy.stats import pointbiserialr
from scipy.stats import chi2_contingency
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

import pickle

In [56]:
def hourPreProcessing(df, columnName):
    """Convert a duration column to decimal hours, in place.

    Each cell of ``df[columnName]`` may be either an ``"H:M"`` string (hours
    and minutes separated by one colon) or any value ``float()`` accepts.
    ``"1:30"`` becomes ``1.5``; ``"2"`` and ``2`` both become ``2.0``.

    Args:
        df: DataFrame to modify in place.
        columnName: Name of the column holding the durations.

    Returns:
        None. ``df[columnName]`` is replaced by the converted float values.

    Raises:
        ValueError: If a string contains more than one ``':'`` or a part of
            it cannot be parsed as a number.
    """
    def _to_hours(value):
        # Only strings can carry the "H:M" form; numeric cells go straight to
        # float() instead of failing on the `in` membership test.
        if isinstance(value, str) and ':' in value:
            hour, minute = value.split(':')
            return (float(hour) * 60 + float(minute)) / 60
        return float(value)

    # Map over the values rather than looping over range(len(df)) with .loc,
    # so the conversion does not require a 0..n-1 index on the frame.
    df[columnName] = df[columnName].map(_to_hours)


In [57]:
def columnDropper(df, columnName):
    """Remove the column(s) ``columnName`` from ``df`` in place; returns None."""
    df.drop(columns=columnName, inplace=True)


In [58]:
def ColumnSeparator(df, columnName):
    """Expand a column of dict-like strings into one column per key, in place.

    Each cell of ``df[columnName]`` is expected to look like
    ``"{'KeyA': 'x', 'KeyB': 'y'}"``. The text is split on ``', '`` into
    items and each item on ``': '`` into key and value; braces and single
    quotes are stripped from both. Every key becomes a column of ``df``
    holding the (string) values, and ``columnName`` itself is dropped.

    Args:
        df: DataFrame to modify in place.
        columnName: Name of the column holding the dict-like strings.

    Returns:
        None.

    Raises:
        ValueError: If an item does not split into exactly one key and one
            value on ``': '``.
    """
    def _parse(cell):
        fields = {}
        for item in cell.split(', '):
            key, val = item.split(': ')
            fields[key.strip("{'")] = val.strip("'}")
        return fields

    # Parse every row first and build the new columns in one go, aligned on
    # the frame's own index. This avoids both the 0..n-1 index assumption of
    # `.loc[i, ...]` and cell-by-cell enlargement of the frame.
    parsed = pd.DataFrame([_parse(cell) for cell in df[columnName]], index=df.index)
    for key in parsed.columns:
        df[key] = parsed[key]
    df.drop(columns=[columnName], inplace=True)

In [59]:
def NumPreprocessing(data, df):
    """Select the numeric features most associated with the target and scale them.

    Steps, in order:
      1. Replace 'DietQuality', 'SleepQuality' and
         'WeeklyPhysicalActivity (hr)' by their mean, 'HealthyLifestyleScore'.
      2. Compute the point-biserial correlation of every remaining column
         against ``data['Diagnosis']`` and keep the 10 columns with the
         smallest p-values.
      3. Pickle the column means of the kept features to 'meanClass.pkl'.
      4. Min-max scale the kept features and pickle the fitted scaler to
         'scalerClass.pkl'.

    Args:
        data: DataFrame that contains the 'Diagnosis' column.
        df: DataFrame with the numeric feature columns (not modified; a copy
            is used).

    Returns:
        DataFrame of the scaled, selected features, with the same row index
        as ``df``.

    NOTE(review): the feature ranking, the means and the scaler are all
    computed from every row passed in. The model functions in this notebook
    split train/test afterwards, so held-out rows influence these statistics.
    """
    df = df.copy()
    print(df.columns)
    df['HealthyLifestyleScore'] = (
        df['DietQuality'] + df['SleepQuality'] + pd.to_numeric(df['WeeklyPhysicalActivity (hr)'])
    ) / 3
    df.drop(['WeeklyPhysicalActivity (hr)', 'DietQuality', 'SleepQuality'], axis=1, inplace=True)

    # Rank every numeric feature by the significance of its association with
    # the binary target.
    results = []
    for feature in df.columns:
        correlation, p_value = pointbiserialr(data['Diagnosis'], df[feature])
        results.append({
            'Feature': feature,
            'Correlation': correlation,
            'p_value': p_value
        })
    results_df = pd.DataFrame(results)
    top_10_significant = results_df.sort_values(by='p_value').head(10)
    selected_columns = top_10_significant['Feature'].tolist()
    df = df[selected_columns]

    # Persist the means of the selected features for later use.
    train_mean = df.mean()
    with open('meanClass.pkl', 'wb') as f:
        pickle.dump(train_mean, f)

    # Scale numeric data. The scaler returns a bare array, so the row index is
    # re-attached explicitly; otherwise the result would get a fresh 0..n-1
    # index and a later axis=1 concat could misalign rows.
    scaler = MinMaxScaler()
    df_scaled = pd.DataFrame(
        scaler.fit_transform(df), columns=df.columns, index=df.index
    )

    # Save the scaler for future use
    with open('scalerClass.pkl', 'wb') as f:
        pickle.dump(scaler, f)

    return df_scaled



In [60]:

def encodingCategorical(data, df):
    """Label-encode every column of ``df`` and persist the fitted encoders.

    One ``LabelEncoder`` is fitted per column on that column's unique values
    plus the extra placeholder class ``'Z'``. The encoded values are computed
    from ``data[column]`` and stored in a copy of ``df``. All encoders are
    pickled together, keyed by column name, to 'label_encodersClass.pkl'.

    Args:
        data: DataFrame providing the values to encode; must contain every
            column of ``df``.
        df: DataFrame whose columns define what gets encoded (not modified).

    Returns:
        A copy of ``df`` whose columns hold the integer codes.
    """
    label_encoders = {}

    # Work on a copy so the caller's frame (possibly a slice) is untouched.
    df = df.copy()

    for column in df.columns:
        le = LabelEncoder()

        # 'Z' is appended as an additional class on top of the observed
        # values — presumably a slot for categories unseen here; nothing in
        # this function maps values to it.
        all_values = list(df[column].unique()) + ['Z']

        # Fit the encoder once with known values
        le.fit(all_values)

        # Replace the whole column so it takes the integer dtype of the codes.
        # `df.loc[:, column] = ...` writes into the existing object column on
        # recent pandas and leaves the codes stored as Python objects.
        df[column] = le.transform(data[column])

        # Save the encoder for future use (e.g., test time)
        label_encoders[column] = le

    # Save all encoders to a file
    with open('label_encodersClass.pkl', 'wb') as f:
        pickle.dump(label_encoders, f)

    return df



In [61]:
def CategoricalPreprocessing(data, df):
    """Keep the categorical features most associated with the target and encode them.

    A chi-square test of independence is run between each column of ``df``
    and ``data['Diagnosis']``; the 10 columns with the smallest p-values are
    kept. Their modes are pickled to 'modeClass.pkl', and the kept columns
    are passed to ``encodingCategorical``, whose result is returned.

    A commented-out experiment used to average some encoded symptom and
    chronic-disease columns into composite scores; it is not applied.
    """
    def _chi2_row(feature):
        # Contingency table of this feature's categories against the target.
        table = pd.crosstab(df[feature], data['Diagnosis'])
        statistic, p_val, _dof, _expected = chi2_contingency(table)
        return {
            'Feature': feature,
            'Chi2_statistic': statistic,
            'p_value': p_val
        }

    ranking = pd.DataFrame([_chi2_row(name) for name in df])
    best = ranking.sort_values(by='p_value').head(10)
    kept = df[best['Feature'].tolist()]

    # Persist the per-column modes of the kept features.
    with open('modeClass.pkl', 'wb') as handle:
        pickle.dump(kept.mode(), handle)

    return encodingCategorical(data, kept)


In [62]:
def PreProcessing(csv_path="parkinsons_disease_data_cls.csv"):
    """Load the raw CSV and build the modelling table.

    Pipeline, in order:
      1. Read the CSV and fill missing 'EducationLevel' with the column mode.
      2. Convert 'WeeklyPhysicalActivity (hr)' to decimal hours and drop the
         'PatientID' and 'DoctorInCharge' columns.
      3. Run ``NumPreprocessing`` on the numeric columns.
      4. Expand 'MedicalHistory' and 'Symptoms' into one column per key and
         run ``CategoricalPreprocessing`` on the categorical columns.
      5. Concatenate both results column-wise and append 'Diagnosis'.

    The ordering matters: the numeric subset is taken before the dict-like
    columns are expanded, so the expanded columns never reach the numeric
    branch.

    Args:
        csv_path: Location of the dataset CSV. Defaults to the file name the
            notebook has always used, so existing calls are unaffected.

    Returns:
        DataFrame with the scaled numeric features, the encoded categorical
        features and the 'Diagnosis' target column.
    """
    data = pd.read_csv(csv_path)

    # Impute the only column handled for missing values with its mode.
    mode = data['EducationLevel'].mode()[0]
    data['EducationLevel'] = data['EducationLevel'].fillna(mode)

    hourPreProcessing(data, 'WeeklyPhysicalActivity (hr)')
    columnDropper(data, 'PatientID')
    columnDropper(data, 'DoctorInCharge')

    # Everything that is neither categorical, dict-like text, nor the target.
    NumData = data.drop(columns=['Gender', 'Smoking', 'EducationLevel', 'Ethnicity', 'Symptoms', 'MedicalHistory', 'Diagnosis'])
    target = data['Diagnosis']
    finalNumDF = NumPreprocessing(data, NumData)

    # Expanding these adds the medical-history and symptom columns listed in
    # `categorical_columns` below.
    ColumnSeparator(data, 'MedicalHistory')
    ColumnSeparator(data, 'Symptoms')
    categorical_columns = ['Gender', 'Ethnicity', 'EducationLevel', 'Smoking',
                       'FamilyHistoryParkinsons', 'TraumaticBrainInjury', 'Hypertension',
                       'Diabetes', 'Depression', 'Stroke', 'Tremor', 'Rigidity',
                       'Bradykinesia', 'PosturalInstability', 'SpeechProblems',
                       'SleepDisorders', 'Constipation']
    category_Data = data[categorical_columns]
    encoded_category_data = CategoricalPreprocessing(data, category_Data)
    data = pd.concat([finalNumDF, encoded_category_data], axis=1)

    data['Diagnosis'] = target
    return data
    

In [63]:
def SVM(data):
    """Fit an SVC for each C in (0.1, 1, 10) and score it on a held-out 20% split.

    Args:
        data: DataFrame with the feature columns plus 'Diagnosis' (target).

    Returns:
        List of dicts, one per C value, with keys 'C', 'accuracy' and
        'confusion_matrix', all measured on the test split.
    """
    features = data.drop('Diagnosis', axis=1)
    labels = data['Diagnosis']
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=42
    )

    def _evaluate(C_val):
        # Train one model for this regularisation strength and score it.
        model = SVC(C=C_val)
        model.fit(X_train, y_train)
        predicted = model.predict(X_test)
        return {
            'C': C_val,
            'accuracy': accuracy_score(y_test, predicted),
            'confusion_matrix': confusion_matrix(y_test, predicted)
        }

    return [_evaluate(C_val) for C_val in (0.1, 1, 10)]

In [64]:
def DT(data):
    """Fit a decision tree for each max_depth in (3, 5, 10) and score it.

    The data is split 80/20 with a fixed seed; every tree is trained on the
    training part and evaluated on the test part.

    Args:
        data: DataFrame with the feature columns plus 'Diagnosis' (target).

    Returns:
        List of dicts, one per depth, with keys 'max_depth', 'accuracy' and
        'confusion_matrix', all measured on the test split.
    """
    X = data.drop('Diagnosis', axis=1)

    y = data['Diagnosis']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    dt_results = []
    max_depth_values = [3, 5, 10]
    for max_depth_val in max_depth_values:
        # Seed the tree as well as the split: the estimator permutes features
        # at each split, so without a seed equally good splits can be chosen
        # differently from run to run and the reported numbers can drift.
        dt = DecisionTreeClassifier(max_depth=max_depth_val, random_state=42)
        dt.fit(X_train, y_train)
        y_pred = dt.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        dt_results.append({
            'max_depth': max_depth_val,
            'accuracy': accuracy,
            'confusion_matrix': confusion_matrix(y_test, y_pred)
        })

    return dt_results


In [65]:
def GB(data):
    """Fit gradient boosting for each learning rate in (0.01, 0.1, 0.3) and score it.

    The data is split 80/20 with a fixed seed; every model is trained on the
    training part and evaluated on the test part.

    Args:
        data: DataFrame with the feature columns plus 'Diagnosis' (target).

    Returns:
        List of dicts, one per learning rate, with keys 'learning_rate',
        'accuracy' and 'confusion_matrix', all measured on the test split.
    """
    X = data.drop('Diagnosis', axis=1)

    y = data['Diagnosis']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    gb_results = []
    learning_rate_values = [0.01, 0.1, 0.3]  # three different values for learning_rate
    for lr in learning_rate_values:
        # Seed the ensemble's internal trees so repeated runs of the notebook
        # report the same metrics.
        gb = GradientBoostingClassifier(learning_rate=lr, random_state=42)
        gb.fit(X_train, y_train)
        y_pred = gb.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        gb_results.append({
            'learning_rate': lr,
            'accuracy': accuracy,
            'confusion_matrix': confusion_matrix(y_test, y_pred)
        })

    return gb_results

In [66]:
# Build the modelling table and preview its first ten rows.
data = PreProcessing()
print(data.head(10))


Index(['Age', 'BMI', 'AlcoholConsumption', 'DietQuality', 'SleepQuality',
       'SystolicBP', 'DiastolicBP', 'CholesterolTotal', 'CholesterolLDL',
       'CholesterolHDL', 'CholesterolTriglycerides', 'UPDRS', 'MoCA',
       'FunctionalAssessment', 'WeeklyPhysicalActivity (hr)'],
      dtype='object')
      UPDRS  FunctionalAssessment      MoCA       Age  AlcoholConsumption  \
0  0.020778              0.535827  0.955136  0.358974            0.307946   
1  0.885720              0.993524  0.677473  0.743590            0.259706   
2  0.669861              0.570783  0.687600  0.897436            0.496248   
3  0.783830              0.725532  0.140790  0.871795            0.944279   
4  0.248810              0.612302  0.716372  0.230769            0.142690   
5  0.415747              0.974350  0.847779  0.487179            0.377717   
6  0.181951              0.876663  0.948996  0.102564            0.889189   
7  0.604146              0.086266  0.229540  0.564103            0.786715   
8  0

In [67]:
# Evaluate the SVM over its C grid and report each run.
svm_results = SVM(data)
print("SVM Results:")
for run in svm_results:
    c_value, acc, matrix = run['C'], run['accuracy'], run['confusion_matrix']
    print(f"C={c_value}, Accuracy={acc}")
    print(f"Confusion Matrix:\n{matrix}")

SVM Results:
C=0.1, Accuracy=0.7555555555555555
Confusion Matrix:
[[ 83  86]
 [ 13 223]]
C=1, Accuracy=0.8345679012345679
Confusion Matrix:
[[126  43]
 [ 24 212]]
C=10, Accuracy=0.8271604938271605
Confusion Matrix:
[[126  43]
 [ 27 209]]


In [68]:
# Evaluate gradient boosting over its learning-rate grid and report each run.
gb_results = GB(data)
print("\nGradient Boosting Results:")
for run in gb_results:
    rate, acc, matrix = run['learning_rate'], run['accuracy'], run['confusion_matrix']
    print(f"learning_rate={rate}, Accuracy={acc}")
    print(f"Confusion Matrix:\n{matrix}")



Gradient Boosting Results:
learning_rate=0.01, Accuracy=0.782716049382716
Confusion Matrix:
[[ 94  75]
 [ 13 223]]
learning_rate=0.1, Accuracy=0.9234567901234568
Confusion Matrix:
[[147  22]
 [  9 227]]
learning_rate=0.3, Accuracy=0.9308641975308642
Confusion Matrix:
[[151  18]
 [ 10 226]]


In [69]:
# Evaluate the decision tree over its max_depth grid and report each run.
dt_results = DT(data)
print("\nDecision Tree Results:")
for run in dt_results:
    depth, acc, matrix = run['max_depth'], run['accuracy'], run['confusion_matrix']
    print(f"max_depth={depth}, Accuracy={acc}")
    print(f"Confusion Matrix:\n{matrix}")


Decision Tree Results:
max_depth=3, Accuracy=0.7876543209876543
Confusion Matrix:
[[ 97  72]
 [ 14 222]]
max_depth=5, Accuracy=0.8765432098765432
Confusion Matrix:
[[151  18]
 [ 32 204]]
max_depth=10, Accuracy=0.9135802469135802
Confusion Matrix:
[[149  20]
 [ 15 221]]
