In [1]:
print("Hello World!")

Hello World!


In [9]:
import pandas as pd  # For loading and manipulating the dataset
import numpy as np   # For numerical operations
from sklearn.model_selection import train_test_split  # To split the data into training and testing sets
from sklearn.preprocessing import StandardScaler  # To standardize the features
from sklearn.ensemble import RandomForestClassifier  # The machine learning model we'll use
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score, recall_score  # For evaluating the model
from imblearn.under_sampling import TomekLinks  # To handle imbalanced data

In [10]:
parkinsons_dataset=pd.read_csv("../DATASETS/parkinsons.csv")

In [11]:
parkinsons_dataset.head(9)

Unnamed: 0,name,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
0,phon_R01_S01_1,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,...,0.06545,0.02211,21.033,1,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654
1,phon_R01_S01_2,122.4,148.65,113.819,0.00968,8e-05,0.00465,0.00696,0.01394,0.06134,...,0.09403,0.01929,19.085,1,0.458359,0.819521,-4.075192,0.33559,2.486855,0.368674
2,phon_R01_S01_3,116.682,131.111,111.555,0.0105,9e-05,0.00544,0.00781,0.01633,0.05233,...,0.0827,0.01309,20.651,1,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634
3,phon_R01_S01_4,116.676,137.871,111.366,0.00997,9e-05,0.00502,0.00698,0.01505,0.05492,...,0.08771,0.01353,20.644,1,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975
4,phon_R01_S01_5,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,...,0.1047,0.01767,19.649,1,0.417356,0.823484,-3.747787,0.234513,2.33218,0.410335
5,phon_R01_S01_6,120.552,131.162,113.787,0.00968,8e-05,0.00463,0.0075,0.01388,0.04701,...,0.06985,0.01222,21.378,1,0.415564,0.825069,-4.242867,0.299111,2.18756,0.357775
6,phon_R01_S02_1,120.267,137.244,114.82,0.00333,3e-05,0.00155,0.00202,0.00466,0.01608,...,0.02337,0.00607,24.886,1,0.59604,0.764112,-5.634322,0.257682,1.854785,0.211756
7,phon_R01_S02_2,107.332,113.84,104.315,0.0029,3e-05,0.00144,0.00182,0.00431,0.01567,...,0.02487,0.00344,26.892,1,0.63742,0.763262,-6.167603,0.183721,2.064693,0.163755
8,phon_R01_S02_3,95.73,132.068,91.754,0.00551,6e-05,0.00293,0.00332,0.0088,0.02093,...,0.03218,0.0107,21.812,1,0.615551,0.773587,-5.498678,0.327769,2.322511,0.231571


In [12]:
parkinsons_dataset.shape

(195, 24)

In [13]:
parkinsons_dataset["status"].value_counts()

status
1    147
0     48
Name: count, dtype: int64

In [16]:

X = parkinsons_dataset.drop(columns=["name", "status"])


y = parkinsons_dataset["status"]


print("Features shape:", X.shape)
print("Target shape:", y.shape)

Features shape: (195, 22)
Target shape: (195,)


In [20]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [21]:
print("Training data shape:", X_train.shape)
print("Testing data shape:", X_test.shape)

Training data shape: (156, 22)
Testing data shape: (39, 22)


We standardize the features to have a mean of 0 and a variance of 1. This helps the model perform better because all features are on the same scale.

In [22]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

We use the Random Forest Classifier because:

It is a powerful and versatile algorithm.

It handles both numerical and categorical data well.

It reduces overfitting by averaging multiple decision trees.

In [23]:

model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)


y_pred = model.predict(X_test)


print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1-Score:", f1_score(y_test, y_pred))

Confusion Matrix:
 [[ 5  2]
 [ 0 32]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.71      0.83         7
           1       0.94      1.00      0.97        32

    accuracy                           0.95        39
   macro avg       0.97      0.86      0.90        39
weighted avg       0.95      0.95      0.95        39

Accuracy: 0.9487179487179487
Recall: 1.0
F1-Score: 0.9696969696969697


In [24]:
# Apply Tomek Links to undersample the majority class
tomek = TomekLinks()
X_resampled, y_resampled = tomek.fit_resample(X_train, y_train)

# Check the new distribution of the target variable
print("Resampled target distribution:\n", y_resampled.value_counts())

Resampled target distribution:
 status
1    113
0     41
Name: count, dtype: int64


In [25]:
# Train the model on the resampled data
model_resampled = RandomForestClassifier(random_state=42)
model_resampled.fit(X_resampled, y_resampled)

# Make predictions on the test set
y_pred_resampled = model_resampled.predict(X_test)

# Evaluate the model
print("Confusion Matrix (After Tomek Links):\n", confusion_matrix(y_test, y_pred_resampled))
print("\nClassification Report (After Tomek Links):\n", classification_report(y_test, y_pred_resampled))
print("Accuracy (After Tomek Links):", accuracy_score(y_test, y_pred_resampled))
print("Recall (After Tomek Links):", recall_score(y_test, y_pred_resampled))
print("F1-Score (After Tomek Links):", f1_score(y_test, y_pred_resampled))

Confusion Matrix (After Tomek Links):
 [[ 5  2]
 [ 0 32]]

Classification Report (After Tomek Links):
               precision    recall  f1-score   support

           0       1.00      0.71      0.83         7
           1       0.94      1.00      0.97        32

    accuracy                           0.95        39
   macro avg       0.97      0.86      0.90        39
weighted avg       0.95      0.95      0.95        39

Accuracy (After Tomek Links): 0.9487179487179487
Recall (After Tomek Links): 1.0
F1-Score (After Tomek Links): 0.9696969696969697


In [26]:
import joblib

# Save the model
joblib.dump(model_resampled, "parkinsons_model.pkl")

# Load the model (when needed)
# loaded_model = joblib.load("parkinsons_model.pkl")

['parkinsons_model.pkl']