install requirements

In [None]:
pip install -r ../requirements.txt

Imports libraries 

In [None]:
import pandas as pd
from IPython.display import display

cleaning data
1. Load Data
2. convert PatientID to Index
    We dont want this number to interrupt later analysis. so, instead of dropping it, we set it as the Index.
3. Remove Irrelevant Columns
    drop DoctorInCharge (all the same)
4. Remove Duplicates
5. Save Cleaned Data
    NEW FILE CALLED: parkinsons_cleaned.csv

In [None]:
import pandas as pd
import os

# --- 1. Load Data ---
# --- 2. PatientID to Index ---
df = pd.read_csv('../data/parkinsons_disease_data.csv', index_col='PatientID')

# --- 3. Remove Irrelevant Columns ---
cols_to_drop = ['DoctorInCharge']
df_clean = df.drop(columns=cols_to_drop, errors='ignore')

# --- 4. Remove Duplicates ---
df_clean = df_clean.drop_duplicates()

# --- 5. Save Cleaned Data ---
# index=True to save the IDs back to the CSV
output_path = '../data/parkinsons_cleaned.csv'
df_clean.to_csv(output_path, index=True) 

print("Cleaned data saved.")
display(df.head())


Cleaned data saved.


Unnamed: 0_level_0,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,SleepQuality,...,FunctionalAssessment,Tremor,Rigidity,Bradykinesia,PosturalInstability,SpeechProblems,SleepDisorders,Constipation,Diagnosis,DoctorInCharge
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3058,85,0,3,1,19.619878,0,5.108241,1.38066,3.893969,9.283194,...,1.572427,1,0,0,0,0,0,0,0,DrXXXConfid
3059,75,0,0,2,16.247339,1,6.027648,8.409804,8.513428,5.60247,...,4.787551,0,1,0,1,0,1,0,1,DrXXXConfid
3060,70,1,0,0,15.368239,0,2.242135,0.213275,6.498805,9.929824,...,2.130686,1,0,0,0,1,0,1,1,DrXXXConfid
3061,52,0,0,0,15.454557,0,5.997788,1.375045,6.715033,4.196189,...,3.391288,1,1,1,0,0,0,1,1,DrXXXConfid
3062,87,0,0,1,18.616042,0,9.775243,1.188607,4.657572,9.363925,...,3.200969,0,0,0,1,0,1,0,0,DrXXXConfid


Outlier:
Since this is synthetic data, we expect clean ranges but we still want to cheak it. We print this just to verify that data loaded correctly.
searching for (there are more options):
1. bmi out of 5-40
2. negetive age or over 120 :)
3. SystolicBP - under 80-90 or over 220-250

In [17]:
#--- Outlier Detection (Sanity Check) ---
print("Data Statistics Check:")
display(df_clean[['Age', 'BMI', 'SystolicBP']].describe())


Data Statistics Check:


Unnamed: 0,Age,BMI,SystolicBP
count,2105.0,2105.0,2105.0
mean,69.6019,27.209493,133.719715
std,11.594511,7.208099,26.502355
min,50.0,15.008333,90.0
25%,60.0,20.782176,110.0
50%,70.0,27.184571,133.0
75%,80.0,33.462452,157.0
max,89.0,39.999887,179.0


Create a copy for human analysis
1. make a copy - This file will be the "Input" for the next notebook (analysis).
2. change Etnicity for nominal value (not numbers)
3. save it to NEW CSV CALLED: parkinsons_for_analysis.csv

In [None]:
# copy for analysis
df_analysis = df_clean.copy()

# Map labels for ethnicity
ethnicity_map = {0: 'Caucasian', 1: 'African_American', 2: 'Asian', 3: 'Other'} 
df_analysis['Ethnicity'] = df_analysis['Ethnicity'].map(ethnicity_map)

print("Data ready for visualization:")
display(df_analysis.head())

# --- Save the analysis-Ready Data ---
# save dataframe to a NEW file.
output_path = '../data/parkinsons_for_analysis.csv'

# index=True to keep the PatientID accessible.
df_analysis.to_csv(output_path, index=True)

print(f"File saved successfully: {output_path}")
print("You can now open the next notebook and load this file.")

Data ready for visualization:


Unnamed: 0_level_0,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,SleepQuality,...,MoCA,FunctionalAssessment,Tremor,Rigidity,Bradykinesia,PosturalInstability,SpeechProblems,SleepDisorders,Constipation,Diagnosis
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3058,85,0,Other,1,19.619878,0,5.108241,1.38066,3.893969,9.283194,...,29.181289,1.572427,1,0,0,0,0,0,0,0
3059,75,0,Caucasian,2,16.247339,1,6.027648,8.409804,8.513428,5.60247,...,12.332639,4.787551,0,1,0,1,0,1,0,1
3060,70,1,Caucasian,0,15.368239,0,2.242135,0.213275,6.498805,9.929824,...,29.927783,2.130686,1,0,0,0,1,0,1,1
3061,52,0,Caucasian,0,15.454557,0,5.997788,1.375045,6.715033,4.196189,...,21.304268,3.391288,1,1,1,0,0,0,1,1
3062,87,0,Caucasian,1,18.616042,0,9.775243,1.188607,4.657572,9.363925,...,8.336364,3.200969,0,0,0,1,0,1,0,0


File saved successfully: ../data/parkinsons_for_analysis.csv
You can now open the next notebook and load this file.


Create a copy for human analysis
1. make a copy
2. change Etnicity for 3 different coloum of true or false
3. save it to NEW CSV CALLED: parkinsons_for_model.csv

In [30]:
# One-Hot Encoding for the model
df_model = pd.get_dummies(df_clean, columns=['Ethnicity'], drop_first=False, dtype=int) #copy already happen in grt_dummies

print("Data ready for Machine Learning:")
display(df_model.head())

# Save logic can go here or in a separate cell
df_model.to_csv('../data/parkinsons_for_model.csv', index=True)

# --- Save the Model-Ready Data ---
# save dataframe to a NEW file.
# This file will be the "Input" for the next notebook (Machine Learning).
output_path = '../data/parkinsons_for_model.csv'

# index=True is important here! We want to keep the PatientID accessible.
df_model.to_csv(output_path, index=True)

print(f"File saved successfully: {output_path}")
print("You can now open the next notebook and load this file.")

Data ready for Machine Learning:


Unnamed: 0_level_0,Age,Gender,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,SleepQuality,FamilyHistoryParkinsons,...,Bradykinesia,PosturalInstability,SpeechProblems,SleepDisorders,Constipation,Diagnosis,Ethnicity_0,Ethnicity_1,Ethnicity_2,Ethnicity_3
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3058,85,0,1,19.619878,0,5.108241,1.38066,3.893969,9.283194,0,...,0,0,0,0,0,0,0,0,0,1
3059,75,0,2,16.247339,1,6.027648,8.409804,8.513428,5.60247,0,...,0,1,0,1,0,1,1,0,0,0
3060,70,1,0,15.368239,0,2.242135,0.213275,6.498805,9.929824,0,...,0,0,1,0,1,1,1,0,0,0
3061,52,0,0,15.454557,0,5.997788,1.375045,6.715033,4.196189,0,...,1,0,0,0,1,1,1,0,0,0
3062,87,0,1,18.616042,0,9.775243,1.188607,4.657572,9.363925,0,...,0,1,0,1,0,0,1,0,0,0


File saved successfully: ../data/parkinsons_for_model.csv
You can now open the next notebook and load this file.
