In [1]:
# NumPy
import numpy as np # linear algebra

# Dataframe operations
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Scalers
from sklearn.preprocessing import StandardScaler

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.regularizers import l2
from sklearn.ensemble import RandomForestClassifier

# Model Helper Functions
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns



import os
# Walk the Kaggle input directory and print the full path of every file found
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

## Attribute Definitions ##

Matrix column entries (attributes):

+ name - ASCII subject name and recording number
+ MDVP:Fo(Hz) - Average vocal fundamental frequency
+ MDVP:Fhi(Hz) - Maximum vocal fundamental frequency
+ MDVP:Flo(Hz) - Minimum vocal fundamental frequency
+ MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP - Several measures of variation in fundamental frequency
+ MDVP:Shimmer,MDVP:Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,MDVP:APQ,Shimmer:DDA - Several measures of variation in amplitude
+ NHR,HNR - Two measures of ratio of noise to tonal components in the voice
+ status - Health status of the subject (one) - Parkinson's, (zero) - healthy
+ RPDE,D2 - Two nonlinear dynamical complexity measures
+ DFA - Signal fractal scaling exponent
+ spread1,spread2,PPE - Three nonlinear measures of fundamental frequency variation

# Load Dataset

In [2]:
# Read the voice-measurement CSV from the Kaggle input directory
data = pd.read_csv(
    '/kaggle/input/parkinson-disease-detection/Parkinsson disease.csv'
)

# Preview the first rows; as the cell's last expression it is rendered as output
data.head()

# Check datatypes

In [3]:
# Print a summary of the DataFrame (per-column dtype and non-null count)
data.info()

# Plot up features

In [4]:
# One violin plot per voice feature, split by health status.
# Column 0 is skipped (as in the original positional loop) and `status` is the
# grouping variable, so neither is drawn as a y-variable. Every remaining
# column is included -- the previous range() stopped one short and silently
# dropped the last column.
feature_cols = [col for col in data.columns[1:] if col != 'status']

fig, axes = plt.subplots(5, 5, figsize=(20, 20))
axes = axes.flatten()

# Fill the grid from the first panel onwards
for ax, col in zip(axes, feature_cols):
    sns.violinplot(x='status', y=col, data=data, orient='v', ax=ax)

# Hide grid cells that have no feature to show
for ax in axes[len(feature_cols):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()
plt.close()

# Define features and target arrays

In [5]:
# Feature matrix, selected by column position. Position 0 and position 17 are
# left out of the list.
# NOTE(review): position 11 is also absent from this list and no reason is
# given -- confirm whether that column is meant to be excluded.
X = data.iloc[:, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                  12, 13, 14, 15, 16,
                  18, 19, 20, 21, 22, 23]]

# Target selected by name, matching how `status` is referenced everywhere else
# in the notebook (assumes `status` is the column at position 17, which the
# feature list above skips).
y = data['status']

# Share of rows labelled 1, as a percentage
percent_has_parkinsons = round(data['status'].mean() * 100, 2)
print(f"{percent_has_parkinsons} percent of examples in dataset have Parkinsons.")

# Class counts, displayed as the cell output
data['status'].value_counts()


# Split dataset into training and validation arrays

In [6]:
# Hold out 33% of the rows for validation.
# stratify=y keeps the class proportions the same in both splits, which matters
# for a small, imbalanced dataset; random_state fixes the shuffle so the split
# is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y
)

# Scale Features

In [7]:
# Standardise the features.
# The scaler is fitted on the training rows only; the same fitted transform is
# then applied to the validation rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# Models

## 1. Logistic Regression

In [8]:
model_cm_dict = dict()

# Fit a logistic-regression classifier on the scaled training data
model_lr = LogisticRegression(max_iter=500)
model_lr.fit(X_train, y_train)

# Predict y for validation set
y_pred_lr = model_lr.predict(X_val)

# Evaluate model: confusion matrix (unpacked as tn, fp, fn, tp) and accuracy
cm_lr = metrics.confusion_matrix(y_val, y_pred_lr)
tn_lr, fp_lr, fn_lr, tp_lr = cm_lr.ravel()
accuracy = round(metrics.accuracy_score(y_val, y_pred_lr), 2)
# Explicit format spec: a bare `accuracy*100` can print float artifacts
# such as 56.99999999999999
print(f'Model accuracy: {accuracy * 100:.1f}')
print(f'Confusion Matrix: \n{cm_lr}')

## 2. K Nearest Neighbors

In [9]:
# Fit a 5-nearest-neighbours classifier on the scaled training data
model_knn = KNeighborsClassifier(n_neighbors=5)
model_knn.fit(X_train, y_train)

# Predict y for validation set
y_pred_knn = model_knn.predict(X_val)

# Evaluate model: confusion matrix (unpacked as tn, fp, fn, tp) and accuracy
cm_knn = metrics.confusion_matrix(y_val, y_pred_knn)
tn_knn, fp_knn, fn_knn, tp_knn = cm_knn.ravel()
accuracy = round(metrics.accuracy_score(y_val, y_pred_knn), 2)
# Explicit format spec: a bare `accuracy*100` can print float artifacts
# such as 56.99999999999999
print(f'Model accuracy: {accuracy * 100:.1f}')
print(f'Confusion Matrix: \n{cm_knn}')

## 3. Neural Network

In [10]:
# Build neural network

# Seed the Python, NumPy and TensorFlow random generators before the layers are
# created so the network's random initialisation repeats across runs.
# (tf.keras.utils.set_random_seed requires TensorFlow >= 2.7.)
tf.keras.utils.set_random_seed(42)

# Two L2-regularised ReLU hidden layers followed by a single sigmoid unit
model_nn = Sequential([
    Dense(units=25, kernel_regularizer=l2(0.001), activation='relu'),
    Dense(units=15, kernel_regularizer=l2(0.001), activation='relu'),
    Dense(units=1, activation='sigmoid')
])

In [11]:
# Compile the network with the Adam optimizer and binary cross-entropy loss,
# then train it on the scaled training data for 100 epochs
model_nn.compile(
    loss=BinaryCrossentropy(),
    optimizer='adam',
)
model_nn.fit(x=X_train, y=y_train, epochs=100)

In [13]:
# Make predictions using model

# The sigmoid output is thresholded at 0.5 to get a class label (1 when the
# output is >= 0.5, else 0). Done in one vectorised step, giving a 1-D integer
# array instead of overwriting the float outputs element by element.
y_pred_nn = (model_nn.predict(X_val) >= 0.5).astype(int).ravel()

In [14]:
# Evaluate model: confusion matrix (unpacked as tn, fp, fn, tp) and accuracy
cm_nn = metrics.confusion_matrix(y_val, y_pred_nn)
tn_nn, fp_nn, fn_nn, tp_nn = cm_nn.ravel()
accuracy = round(metrics.accuracy_score(y_val, y_pred_nn), 2)
# Explicit format spec: a bare `accuracy*100` can print float artifacts
# such as 56.99999999999999
print(f'Model accuracy: {accuracy * 100:.1f}')
print(f'Confusion Matrix: \n{cm_nn}')

## 4. Random Forest

In [16]:
# Fit a random forest (100 trees, depth capped at 5) on the scaled training data
model_rf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model_rf.fit(X_train, y_train)

# predict() already returns class labels, so no thresholding step is needed
y_pred_rf = model_rf.predict(X_val)

# Evaluate model: confusion matrix (unpacked as tn, fp, fn, tp) and accuracy
cm_rf = metrics.confusion_matrix(y_val, y_pred_rf)
tn_rf, fp_rf, fn_rf, tp_rf = cm_rf.ravel()
accuracy = round(metrics.accuracy_score(y_val, y_pred_rf), 2)
# Explicit format spec: a bare `accuracy*100` can print float artifacts
# such as 56.99999999999999
print(f'Model accuracy: {accuracy * 100:.1f}')
print(f'Confusion Matrix: \n{cm_rf}')

# mae = mean_absolute_error(y_val, y_pred_rf)
# print(f"mean absolute error: {mae}")

# Compare Model Results

In [17]:
# Plot Confusion Matrix

# One heatmap per model, side by side, sharing the y-axis
fig, axes = plt.subplots(1, 4, figsize=(35, 6), sharey=True)

# Quadrant names in the same order as cm.ravel(): tn, fp, fn, tp
group_names = ['True Neg', 'False Pos', 'False Neg', 'True Pos']

confusion_matrices = {
    'Logistic Regression': cm_lr,
    'K Nearest Neighbor': cm_knn,
    'Neural Network': cm_nn,
    'Random Forest': cm_rf,
}

for ax, (title, cm) in zip(axes, confusion_matrices.items()):
    # Annotate each cell with its quadrant name and integer count. Building the
    # text ourselves (fmt='') avoids the default '.2g' format, which would show
    # counts of 100 or more in scientific notation.
    cell_labels = np.array(
        [f'{name}\n{count}' for name, count in zip(group_names, cm.ravel())]
    ).reshape(cm.shape)
    sns.heatmap(cm, ax=ax, annot=cell_labels, fmt='', cmap='Blues')
    ax.set_title(title)
    ax.set(ylabel='True Label', xlabel='Predicted Label')

# Render the figure explicitly so the cell does not echo the repr of the last call
plt.show()