In [3]:
import pandas as pd

df = pd.read_csv('Parkinsson_disease.csv')

In [13]:
print("Number of rows:", df.shape[0])
print("Number of columns:", df.shape[1])

print(df.head())

print("\n Summary statistics for numeric columns:\n")
print(df.describe())

print("\nCheck for missing values:\n")
print(df.isnull().sum())

print("\nGet information on the DataFrame's columns, types, non-null values, etc:\n")print(df.info())

Number of rows: 195
Number of columns: 24
             name  MDVP:Fo(Hz)  MDVP:Fhi(Hz)  MDVP:Flo(Hz)  MDVP:Jitter(%)  \
0  phon_R01_S01_1      119.992       157.302        74.997         0.00784   
1  phon_R01_S01_2      122.400       148.650       113.819         0.00968   
2  phon_R01_S01_3      116.682       131.111       111.555         0.01050   
3  phon_R01_S01_4      116.676       137.871       111.366         0.00997   
4  phon_R01_S01_5      116.014       141.781       110.655         0.01284   

   MDVP:Jitter(Abs)  MDVP:RAP  MDVP:PPQ  Jitter:DDP  MDVP:Shimmer  ...  \
0           0.00007   0.00370   0.00554     0.01109       0.04374  ...   
1           0.00008   0.00465   0.00696     0.01394       0.06134  ...   
2           0.00009   0.00544   0.00781     0.01633       0.05233  ...   
3           0.00009   0.00502   0.00698     0.01505       0.05492  ...   
4           0.00011   0.00655   0.00908     0.01966       0.06425  ...   

   Shimmer:DDA      NHR     HNR  status     

In [18]:
import numpy as np
numeric_df = df.select_dtypes(include=[np.number])

Q1 = numeric_df.quantile(0.25)
Q3 = numeric_df.quantile(0.75)
IQR = Q3 - Q1

# Define the outlier step
outlier_step = 1.5 * IQR

# Identify and remove outliers in the numeric columns
filtered_df = df[~((numeric_df < (Q1 - outlier_step)) | (numeric_df > (Q3 + outlier_step))).any(axis=1)]

# Alternatively, cap the outliers in the numeric columns
capped_df = numeric_df.apply(lambda x: x.clip(lower=x.quantile(0.25) - 1.5 * (x.quantile(0.75) - x.quantile(0.25)),
                                                upper=x.quantile(0.75) + 1.5 * (x.quantile(0.75) - x.quantile(0.25))))

# Replace the original numeric columns in df with the capped values
df.update(capped_df)

# Display the shapes of the original and cleaned dataframes
print("Original Data shape:", df.shape)
print("Data with no outliers shape:", filtered_df.shape)
print("Data with capped outliers shape:", capped_df.shape)

Original Data shape: (195, 24)
Data with no outliers shape: (114, 24)
Data with capped outliers shape: (195, 23)


In [24]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_values = scaler.fit_transform(numeric_df)  # Returns an array

# Create a new DataFrame for the scaled values using the columns from numeric_df
df_scaled_numeric = pd.DataFrame(scaled_values, columns=numeric_df.columns)

# Update the original df with the scaled numeric values
# df.update(df_scaled_numeric)

print(df.head())


             name  MDVP:Fo(Hz)  MDVP:Fhi(Hz)  MDVP:Flo(Hz)  MDVP:Jitter(%)  \
0  phon_R01_S01_1    -0.829300     -0.436165     -0.952037        0.334914   
1  phon_R01_S01_2    -0.770972     -0.530974     -0.057721        0.715418   
2  phon_R01_S01_3    -0.909476     -0.723168     -0.109875        0.884991   
3  phon_R01_S01_4    -0.909622     -0.649092     -0.114229        0.775389   
4  phon_R01_S01_5    -0.925657     -0.606245     -0.130608        1.368893   

   MDVP:Jitter(Abs)  MDVP:RAP  MDVP:PPQ  Jitter:DDP  MDVP:Shimmer  ...  \
0          0.749759  0.132963  0.760800    0.131755      0.745985  ...   
1          1.037674  0.453892  1.276809    0.452684      1.681731  ...   
2          1.325589  0.720770  1.585687    0.721813      1.202693  ...   
3          1.325589  0.578885  1.284076    0.577677      1.340396  ...   
4          1.901418  1.095750  2.047187    1.096793      1.836448  ...   

   Shimmer:DDA       NHR       HNR    status      RPDE       DFA   spread1  \
0     0.

In [26]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

# Ensure 'status' is an integer type if it's not already
df_scaled_numeric['status'] = df_scaled_numeric['status'].astype(int)

# Separate the features (X) and the target label (y)
X = df_scaled_numeric.drop('status', axis=1)
y = df_scaled_numeric['status']

# Create a RandomForestClassifier model
model = RandomForestClassifier()

# RFE model to select top 10 features, adjust the number as needed
rfe = RFE(estimator=model, n_features_to_select=10)
rfe.fit(X, y)

# Summarize all selected features
selected_features = X.columns[rfe.support_]
print("Selected features by RFE:", selected_features)


Selected features by RFE: Index(['MDVP:Fo(Hz)', 'MDVP:Fhi(Hz)', 'MDVP:Flo(Hz)', 'Jitter:DDP', 'MDVP:APQ',
       'Shimmer:DDA', 'spread1', 'spread2', 'D2', 'PPE'],
      dtype='object')


In [27]:
from sklearn.ensemble import RandomForestClassifier

# Fit the Random Forest model
rf = RandomForestClassifier()
rf.fit(X, y)

# Get feature importances
importances = rf.feature_importances_

# Create a DataFrame for easier visualization
feature_importance = pd.DataFrame({'Feature': X.columns, 'Importance': importances})
feature_importance_sorted = feature_importance.sort_values(by='Importance', ascending=False)

# Select the top 10 most important features, adjust the number as needed
print("Top 10 important features:\n", feature_importance_sorted.head(10))


Top 10 important features:
          Feature  Importance
21           PPE    0.125544
18       spread1    0.112966
0    MDVP:Fo(Hz)    0.093059
2   MDVP:Flo(Hz)    0.060841
19       spread2    0.058013
1   MDVP:Fhi(Hz)    0.046030
12      MDVP:APQ    0.045509
7     Jitter:DDP    0.040051
14           NHR    0.038418
5       MDVP:RAP    0.038368


In [28]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler

# Prepare the data
X = df_scaled_numeric[selected_features]
y = df_scaled_numeric['status']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Logistic Regression model
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

# Predict on the testing set
y_pred = log_reg.predict(X_test)

# Evaluate the model
print("Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))


Logistic Regression Classification Report:
              precision    recall  f1-score   support

          -1       1.00      0.43      0.60         7
           0       0.89      1.00      0.94        32

    accuracy                           0.90        39
   macro avg       0.94      0.71      0.77        39
weighted avg       0.91      0.90      0.88        39

Accuracy: 0.8974358974358975


In [29]:
from sklearn.tree import DecisionTreeClassifier

# Initialize and train the Decision Tree model
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

# Predict and evaluate
dt_pred = dt.predict(X_test)
print("Decision Tree Classification Report:")
print(classification_report(y_test, dt_pred))
print("Accuracy:", accuracy_score(y_test, dt_pred))

Decision Tree Classification Report:
              precision    recall  f1-score   support

          -1       0.80      0.57      0.67         7
           0       0.91      0.97      0.94        32

    accuracy                           0.90        39
   macro avg       0.86      0.77      0.80        39
weighted avg       0.89      0.90      0.89        39

Accuracy: 0.8974358974358975


In [30]:
from sklearn.neighbors import KNeighborsClassifier

# Initialize and train the KNN model
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

# Predict and evaluate
knn_pred = knn.predict(X_test)
print("K-Nearest Neighbors Classification Report:")
print(classification_report(y_test, knn_pred))
print("Accuracy:", accuracy_score(y_test, knn_pred))

K-Nearest Neighbors Classification Report:
              precision    recall  f1-score   support

          -1       0.83      0.71      0.77         7
           0       0.94      0.97      0.95        32

    accuracy                           0.92        39
   macro avg       0.89      0.84      0.86        39
weighted avg       0.92      0.92      0.92        39

Accuracy: 0.9230769230769231


In [31]:
from sklearn.naive_bayes import GaussianNB

# Initialize and train the Naive Bayes model
nb = GaussianNB()
nb.fit(X_train, y_train)

# Predict and evaluate
nb_pred = nb.predict(X_test)
print("Naive Bayes Classification Report:")
print(classification_report(y_test, nb_pred))
print("Accuracy:", accuracy_score(y_test, nb_pred))


Naive Bayes Classification Report:
              precision    recall  f1-score   support

          -1       0.50      0.57      0.53         7
           0       0.90      0.88      0.89        32

    accuracy                           0.82        39
   macro avg       0.70      0.72      0.71        39
weighted avg       0.83      0.82      0.83        39

Accuracy: 0.8205128205128205


In [32]:
from sklearn.model_selection import GridSearchCV

# Parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Grid search
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42), param_grid=param_grid, cv=3, verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best parameters and score
print("Best parameters found: ", grid_search.best_params_)
print("Best score from grid search: ", grid_search.best_score_)


Fitting 3 folds for each of 36 candidates, totalling 108 fits
Best parameters found:  {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}
Best score from grid search:  0.9166666666666666
[CV] END max_depth=None, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END max_depth=None, min_samples_split=5, n_estimators=50; total time=   0.2s
[CV] END max_depth=None, min_samples_split=5, n_estimators=100; total time=   0.5s
[CV] END max_depth=None, min_samples_split=10, n_estimators=100; total time=   0.5s
[CV] END max_depth=None, min_samples_split=10, n_estimators=200; total time=   0.9s
[CV] END max_depth=10, min_samples_split=5, n_estimators=100; total time=   0.5s
[CV] END max_depth=10, min_samples_split=10, n_estimators=100; total time=   0.5s
[CV] END max_depth=10, min_samples_split=10, n_estimators=200; total time=   0.9s
[CV] END max_depth=20, min_samples_split=5, n_estimators=100; total time=   0.5s
[CV] END max_depth=20, min_samples_split=10, n_estimators=50