In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import ast
import time
import joblib
import warnings

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas / scikit-learn deprecation warnings — confirm that is intended.
warnings.filterwarnings('ignore')

In [None]:
# Read the CSV file into a DataFrame and display it.
test_data=pd.read_csv('/content/test_10percent.csv')
test_data

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,DietQuality,SleepQuality,...,CholesterolHDL,CholesterolTriglycerides,UPDRS,MoCA,FunctionalAssessment,DoctorInCharge,WeeklyPhysicalActivity (hr),MedicalHistory,Symptoms,Diagnosis
0,3643,77,Male,African American,Higher,37.952303,Yes,12.285164,9.522574,8.907549,...,73.402967,367.964942,152.646562,16.806433,6.673464,DrXXXConfid,08:08,"{'FamilyHistoryParkinsons': 'No', 'TraumaticBr...","{'Tremor': 'Yes', 'Rigidity': 'No', 'Bradykine...",1
1,4371,60,Male,African American,Bachelor's,36.819964,No,17.381470,1.579192,7.187810,...,74.542203,71.422351,103.674400,15.776162,3.809965,DrXXXConfid,07:03,"{'FamilyHistoryParkinsons': 'No', 'TraumaticBr...","{'Tremor': 'No', 'Rigidity': 'Yes', 'Bradykine...",1
2,4039,57,Female,Caucasian,Bachelor's,28.813766,No,18.567038,3.793122,7.417207,...,47.364596,92.910793,96.463332,12.899094,5.437471,DrXXXConfid,01:08,"{'FamilyHistoryParkinsons': 'No', 'TraumaticBr...","{'Tremor': 'Yes', 'Rigidity': 'No', 'Bradykine...",1
3,4961,57,Female,African American,High School,28.035376,No,17.562795,6.572070,5.458202,...,59.593550,334.471724,153.451078,8.800971,0.205773,DrXXXConfid,05:38,"{'FamilyHistoryParkinsons': 'No', 'TraumaticBr...","{'Tremor': 'Yes', 'Rigidity': 'No', 'Bradykine...",1
4,3980,89,Female,Caucasian,Bachelor's,36.711445,No,5.587417,9.764075,7.621980,...,22.407366,59.788133,106.502196,27.735194,8.694715,DrXXXConfid,01:23,"{'FamilyHistoryParkinsons': 'No', 'TraumaticBr...","{'Tremor': 'No', 'Rigidity': 'Yes', 'Bradykine...",0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,3902,79,Female,Caucasian,High School,20.950594,No,12.046259,8.979096,8.436992,...,27.762064,62.017761,128.270419,16.756778,2.933690,DrXXXConfid,02:21,"{'FamilyHistoryParkinsons': 'No', 'TraumaticBr...","{'Tremor': 'Yes', 'Rigidity': 'No', 'Bradykine...",1
199,4985,86,Male,Asian,High School,24.623583,No,11.836260,2.446423,7.624552,...,63.235489,301.631854,11.734058,8.774813,1.975635,DrXXXConfid,03:09,"{'FamilyHistoryParkinsons': 'No', 'TraumaticBr...","{'Tremor': 'No', 'Rigidity': 'No', 'Bradykines...",0
200,4737,62,Female,African American,High School,29.477867,No,4.816119,7.049134,6.131714,...,79.276161,203.097139,4.624267,22.876812,7.767030,DrXXXConfid,05:34,"{'FamilyHistoryParkinsons': 'No', 'TraumaticBr...","{'Tremor': 'No', 'Rigidity': 'No', 'Bradykines...",0
201,3386,51,Male,Asian,High School,28.635074,Yes,12.809181,5.786899,4.042542,...,25.548603,195.025563,197.379951,23.013678,2.030421,DrXXXConfid,01:53,"{'FamilyHistoryParkinsons': 'No', 'TraumaticBr...","{'Tremor': 'Yes', 'Rigidity': 'No', 'Bradykine...",1


In [None]:
# List the column names of the freshly loaded frame.
test_data.columns

Index(['PatientID', 'Age', 'Gender', 'Ethnicity', 'EducationLevel', 'BMI',
       'Smoking', 'AlcoholConsumption', 'DietQuality', 'SleepQuality',
       'SystolicBP', 'DiastolicBP', 'CholesterolTotal', 'CholesterolLDL',
       'CholesterolHDL', 'CholesterolTriglycerides', 'UPDRS', 'MoCA',
       'FunctionalAssessment', 'DoctorInCharge', 'WeeklyPhysicalActivity (hr)',
       'MedicalHistory', 'Symptoms', 'Diagnosis'],
      dtype='object')

In [None]:
# Keep the 'Diagnosis' column aside; it is compared with the model predictions at the end of the notebook.
label=test_data['Diagnosis']

In [None]:
# Display the extracted 'Diagnosis' values.
label

Unnamed: 0,Diagnosis
0,1
1,1
2,1
3,1
4,0
...,...
198,1
199,0
200,0
201,1


In [None]:
# Remove the 'Diagnosis' column from the feature frame (it was saved in `label` beforehand).
test_data.drop(columns=['Diagnosis'], inplace=True)

In [None]:
# Count missing values per column.
test_data.isnull().sum()

Unnamed: 0,0
PatientID,0
Age,0
Gender,0
Ethnicity,0
EducationLevel,32
BMI,0
Smoking,0
AlcoholConsumption,0
DietQuality,0
SleepQuality,0


In [None]:
# Partition the columns by dtype so each group gets its own imputation strategy.
categorical_cols = test_data.select_dtypes(include=['object', 'category']).columns
numerical_cols = test_data.select_dtypes(include=['int64', 'float64', 'int32', 'float32']).columns

print(f"Categorical columns: {list(categorical_cols)}")
print(f"Numerical columns: {list(numerical_cols)}")

# Text-like columns: fill gaps with the most frequent value of each column.
if not categorical_cols.empty:
    cat_imputer = SimpleImputer(strategy='most_frequent')
    cat_imputer.fit(test_data[categorical_cols])
    test_data[categorical_cols] = cat_imputer.transform(test_data[categorical_cols])

# Numeric columns: fill gaps with the column mean.
if not numerical_cols.empty:
    num_imputer = SimpleImputer(strategy='mean')
    num_imputer.fit(test_data[numerical_cols])
    test_data[numerical_cols] = num_imputer.transform(test_data[numerical_cols])

Categorical columns: ['Gender', 'Ethnicity', 'EducationLevel', 'Smoking', 'DoctorInCharge', 'WeeklyPhysicalActivity (hr)', 'MedicalHistory', 'Symptoms']
Numerical columns: ['PatientID', 'Age', 'BMI', 'AlcoholConsumption', 'DietQuality', 'SleepQuality', 'SystolicBP', 'DiastolicBP', 'CholesterolTotal', 'CholesterolLDL', 'CholesterolHDL', 'CholesterolTriglycerides', 'UPDRS', 'MoCA', 'FunctionalAssessment']


In [None]:
# Re-count missing values per column after imputation.
test_data.isnull().sum()

Unnamed: 0,0
PatientID,0
Age,0
Gender,0
Ethnicity,0
EducationLevel,0
BMI,0
Smoking,0
AlcoholConsumption,0
DietQuality,0
SleepQuality,0


In [None]:
# Number of rows that are exact duplicates of an earlier row.
test_data.duplicated().sum()

np.int64(0)

In [None]:
# Remove fully duplicated feature rows.
# The same rows are removed from `label`, and both indexes are reset, so that
# features and labels stay aligned row-for-row; the accuracy check at the end
# of the notebook looks labels up by position 0..n-1.
duplicate_rows = test_data.duplicated()
if duplicate_rows.any():
    test_data = test_data.loc[~duplicate_rows].reset_index(drop=True)
    label = label.loc[~duplicate_rows].reset_index(drop=True)

In [None]:
# Print the remaining column names.
print(test_data.columns)

Index(['PatientID', 'Age', 'Gender', 'Ethnicity', 'EducationLevel', 'BMI',
       'Smoking', 'AlcoholConsumption', 'DietQuality', 'SleepQuality',
       'SystolicBP', 'DiastolicBP', 'CholesterolTotal', 'CholesterolLDL',
       'CholesterolHDL', 'CholesterolTriglycerides', 'UPDRS', 'MoCA',
       'FunctionalAssessment', 'DoctorInCharge', 'WeeklyPhysicalActivity (hr)',
       'MedicalHistory', 'Symptoms'],
      dtype='object')


In [None]:
# 'MedicalHistory' and 'Symptoms' hold dict literals stored as text.
# Parse each cell with ast.literal_eval and build one DataFrame per column in a
# single constructor call (one column per dict key). This avoids the per-row
# Series construction of `.apply(pd.Series)` and leaves the raw columns
# untouched until they are replaced below.
medical_df = pd.DataFrame(
    test_data['MedicalHistory'].map(ast.literal_eval).tolist(),
    index=test_data.index,
)
symptoms_df = pd.DataFrame(
    test_data['Symptoms'].map(ast.literal_eval).tolist(),
    index=test_data.index,
)

# Swap the two text columns for their expanded key columns.
test_data = pd.concat(
    [test_data.drop(columns=['MedicalHistory', 'Symptoms']), medical_df, symptoms_df],
    axis=1,
)
test_data

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,DietQuality,SleepQuality,...,Diabetes,Depression,Stroke,Tremor,Rigidity,Bradykinesia,PosturalInstability,SpeechProblems,SleepDisorders,Constipation
0,3643.0,77.0,Male,African American,Higher,37.952303,Yes,12.285164,9.522574,8.907549,...,No,No,No,Yes,No,Yes,No,Yes,Yes,Yes
1,4371.0,60.0,Male,African American,Bachelor's,36.819964,No,17.381470,1.579192,7.187810,...,No,No,No,No,Yes,No,No,No,Yes,No
2,4039.0,57.0,Female,Caucasian,Bachelor's,28.813766,No,18.567038,3.793122,7.417207,...,No,No,No,Yes,No,No,No,No,Yes,Yes
3,4961.0,57.0,Female,African American,High School,28.035376,No,17.562795,6.572070,5.458202,...,No,No,No,Yes,No,Yes,No,No,No,Yes
4,3980.0,89.0,Female,Caucasian,Bachelor's,36.711445,No,5.587417,9.764075,7.621980,...,No,No,No,No,Yes,No,No,Yes,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,3902.0,79.0,Female,Caucasian,High School,20.950594,No,12.046259,8.979096,8.436992,...,No,No,No,Yes,No,No,No,No,No,No
199,4985.0,86.0,Male,Asian,High School,24.623583,No,11.836260,2.446423,7.624552,...,No,No,No,No,No,No,No,Yes,No,No
200,4737.0,62.0,Female,African American,High School,29.477867,No,4.816119,7.049134,6.131714,...,Yes,No,No,No,No,No,No,No,Yes,No
201,3386.0,51.0,Male,Asian,High School,28.635074,Yes,12.809181,5.786899,4.042542,...,No,No,No,Yes,No,No,No,No,No,No


In [None]:
# Columns whose values are the strings 'Yes' / 'No'.
yes_no_columns = [
    'FamilyHistoryParkinsons', 'TraumaticBrainInjury', 'Smoking',
    'Hypertension', 'Diabetes', 'Depression', 'Stroke',
    'Tremor', 'Rigidity', 'Bradykinesia', 'PosturalInstability',
    'SpeechProblems', 'SleepDisorders', 'Constipation'
]

# Encode 'Yes' as 1 and every other value (including missing ones) as 0.
# A vectorised comparison replaces the deprecated DataFrame.applymap.
# No fillna follows: the comparison can never produce NaN, so there is
# nothing left to fill.
test_data[yes_no_columns] = test_data[yes_no_columns].eq('Yes').astype('int64')

In [None]:
# Encode gender as 0/1; values outside the mapping become NaN.
gender_codes = test_data['Gender'].map({'Female': 0, 'Male': 1})
# Replace unmapped values with the most frequent code and assign the result
# back explicitly: fillna(..., inplace=True) on a column selection acts on a
# temporary object under pandas Copy-on-Write and would not update the frame.
test_data['Gender'] = gender_codes.fillna(gender_codes.mode()[0])


In [None]:
# Integer codes for the education levels.
education_map = {
    'High School': 0,
    "Bachelor's": 1,
    'Higher': 2
}
# Values outside the mapping become NaN.
education_codes = test_data['EducationLevel'].map(education_map)
# Replace unmapped values with the most frequent code and assign the result
# back explicitly: fillna(..., inplace=True) on a column selection acts on a
# temporary object under pandas Copy-on-Write and would not update the frame.
test_data['EducationLevel'] = education_codes.fillna(education_codes.mode()[0])

In [None]:
# Integer codes for the ethnicity categories.
ethnicity_map = {
    'Caucasian': 0,
    'Asian': 1,
    'African American': 2,
    'Other': 3
}
# Values outside the mapping become NaN.
ethnicity_codes = test_data['Ethnicity'].map(ethnicity_map)
# Replace unmapped values with the most frequent code and assign the result
# back explicitly: fillna(..., inplace=True) on a column selection acts on a
# temporary object under pandas Copy-on-Write and would not update the frame.
test_data['Ethnicity'] = ethnicity_codes.fillna(ethnicity_codes.mode()[0])

In [None]:
# Overwrite the doctor identifier with the constant 1 for every row.
# A column assigned a scalar has no missing values, so no imputation follows.
test_data['DoctorInCharge'] = 1

In [None]:
def convert_time_to_hours(time_str):
    """Convert an ``"HH:MM"`` string into a fractional number of hours.

    Parameters
    ----------
    time_str : str
        Text of the form ``"<hours>:<minutes>"``, e.g. ``"08:08"``.

    Returns
    -------
    float
        ``hours + minutes / 60``, or ``np.nan`` when the value is not a
        string or does not consist of exactly two integer fields.
    """
    try:
        h, m = map(int, time_str.split(':'))
    except (AttributeError, TypeError, ValueError):
        # AttributeError: value has no .split (None, NaN, numbers).
        # TypeError: .split rejects a str separator (e.g. bytes input).
        # ValueError: wrong number of fields or non-integer text.
        # Anything else (KeyboardInterrupt, SystemExit, ...) propagates.
        return np.nan
    return h + m / 60

# Turn the "HH:MM" activity strings into fractional hours and show the result.
activity_col = 'WeeklyPhysicalActivity (hr)'
test_data[activity_col] = test_data[activity_col].map(convert_time_to_hours)
test_data[activity_col]

Unnamed: 0,WeeklyPhysicalActivity (hr)
0,8.133333
1,7.050000
2,1.133333
3,5.633333
4,1.383333
...,...
198,2.350000
199,3.150000
200,5.566667
201,1.883333


In [None]:
# Count missing values per column after the encoding steps.
test_data.isna().sum()

Unnamed: 0,0
PatientID,0
Age,0
Gender,0
Ethnicity,0
EducationLevel,0
BMI,0
Smoking,0
AlcoholConsumption,0
DietQuality,0
SleepQuality,0


In [None]:
# List the column names after the dict columns were expanded.
test_data.columns

Index(['PatientID', 'Age', 'Gender', 'Ethnicity', 'EducationLevel', 'BMI',
       'Smoking', 'AlcoholConsumption', 'DietQuality', 'SleepQuality',
       'SystolicBP', 'DiastolicBP', 'CholesterolTotal', 'CholesterolLDL',
       'CholesterolHDL', 'CholesterolTriglycerides', 'UPDRS', 'MoCA',
       'FunctionalAssessment', 'DoctorInCharge', 'WeeklyPhysicalActivity (hr)',
       'FamilyHistoryParkinsons', 'TraumaticBrainInjury', 'Hypertension',
       'Diabetes', 'Depression', 'Stroke', 'Tremor', 'Rigidity',
       'Bradykinesia', 'PosturalInstability', 'SpeechProblems',
       'SleepDisorders', 'Constipation'],
      dtype='object')

In [None]:
# Columns kept as model input, in the order they are passed on.
# NOTE(review): presumably this must match the feature order used at training time — confirm.
targetcolumns = [
    'UPDRS',
    'Tremor', 'Rigidity', 'Bradykinesia', 'PosturalInstability',
    'Age',
    'Depression', 'Diabetes', 'Stroke',
    'SleepQuality', 'DiastolicBP', 'MoCA', 'FunctionalAssessment',
]

In [None]:
# Keep only the selected feature columns, in the order given by `targetcolumns`.
test_data = test_data.loc[:, targetcolumns]
test_data

Unnamed: 0,UPDRS,Tremor,Rigidity,Bradykinesia,PosturalInstability,Age,Depression,Diabetes,Stroke,SleepQuality,DiastolicBP,MoCA,FunctionalAssessment
0,152.646562,1,0,1,0,77.0,0,0,0,8.907549,69.0,16.806433,6.673464
1,103.674400,0,1,0,0,60.0,0,0,0,7.187810,83.0,15.776162,3.809965
2,96.463332,1,0,0,0,57.0,0,0,0,7.417207,62.0,12.899094,5.437471
3,153.451078,1,0,1,0,57.0,0,0,0,5.458202,95.0,8.800971,0.205773
4,106.502196,0,1,0,0,89.0,0,0,0,7.621980,109.0,27.735194,8.694715
...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,128.270419,1,0,0,0,79.0,0,0,0,8.436992,70.0,16.756778,2.933690
199,11.734058,0,0,0,0,86.0,0,0,0,7.624552,65.0,8.774813,1.975635
200,4.624267,0,0,0,0,62.0,0,1,0,6.131714,61.0,22.876812,7.767030
201,197.379951,1,0,0,0,51.0,0,0,0,4.042542,99.0,23.013678,2.030421


In [None]:
# Standardise each feature column (zero mean, unit variance); the result is a NumPy array.
# NOTE(review): the scaler is fitted on this data set itself. If the loaded model was
# trained on features scaled by a different fitted scaler, that scaler should
# presumably be reused here instead — confirm against the training notebook.
scaler = StandardScaler()
test_data = scaler.fit_transform(test_data)

In [None]:
# Display the scaled feature array.
test_data

array([[ 0.89069084,  1.14316149, -0.56407607, ..., -1.18591784,
         0.28800249,  0.62317271],
       [ 0.04586319, -0.87476705,  1.77281052, ..., -0.37728576,
         0.16877937, -0.4024556 ],
       [-0.07853625,  1.14316149, -0.56407607, ..., -1.59023388,
        -0.16415565,  0.18047331],
       ...,
       [-1.66286858, -0.87476705, -0.56407607, ..., -1.64799332,
         0.99046816,  1.01485885],
       [ 1.66239462,  1.14316149, -0.56407607, ...,  0.54686519,
         1.00630631, -1.03984039],
       [ 0.78909081, -0.87476705, -0.56407607, ...,  0.66238405,
         1.50617392, -1.6428697 ]])

In [None]:
# Load the persisted classifier and predict on the scaled feature matrix.
# NOTE(security): joblib.load unpickles the file and can execute arbitrary code;
# only load model files that come from a trusted source.
model = joblib.load('/content/RandomforestClassifier.pkl')
prediction = model.predict(test_data)

# Save the predictions as a one-column CSV. The absolute path matches the
# location the predictions are read back from later in the notebook, so the
# round trip no longer depends on the current working directory.
prediction_series = pd.Series(prediction, name='prediction')
prediction_series.to_csv('/content/prediction.csv', index=False)


In [None]:
# Read the saved predictions back from disk and display them.
# Note: this rebinds `prediction` from the array returned by model.predict to a DataFrame
# with a single 'prediction' column.
prediction=pd.read_csv('/content/prediction.csv')
prediction

Unnamed: 0,prediction
0,1
1,1
2,1
3,1
4,0
...,...
198,1
199,0
200,0
201,1


In [None]:
# Count missing values in the reloaded predictions.
prediction.isnull().sum()

Unnamed: 0,0
prediction,0


In [None]:
# Count the rows where the predicted class equals the true label.
# The underlying arrays are compared so rows are matched by position rather
# than by index label, and the Python-level loop is avoided.
count = int((prediction['prediction'].to_numpy() == label.to_numpy()).sum())

In [None]:
# Share of matching predictions, as a percentage.
print(count/len(prediction)*100)

95.07389162561576


In [None]:
# Cross-check the manual accuracy with scikit-learn, passing the
# 'prediction' column explicitly as the predicted labels.
acc = accuracy_score(label, prediction['prediction'])
print(acc)

0.9507389162561576
