In [1]:
#imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import IFrame
import seaborn as sns
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import learning_curve
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, ShuffleSplit
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, VotingClassifier, StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import plot_confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import joblib
from sklearn.metrics import ConfusionMatrixDisplay

import warnings
warnings.filterwarnings('ignore')
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

In [None]:
#Reading the dataset
"""
For every individual there are 6 records and 22 factors measured against it.
"""
parkinsons = pd.read_table('parkinsons.data',sep=',')

In [None]:
# Exploring Dataset Content

parkinsons.head(10)

In [None]:
parkinsons.tail()

In [None]:
parkinsons.shape

In [None]:
parkinsons.info()

In [None]:
parkinsons.isna().sum()

In [None]:
print('Number of Features In Dataset :', parkinsons.shape[1])
print('Number of Instances In Dataset : ', parkinsons.shape[0])

In [None]:
#first column of data contains name and recording number, I thought it is a good idea to split the name and recording
#number, just in case it might be beneficial for future groupings

parkinsons.insert(1, 'Recording',0)
#splitting the data with last underscore
parkinsons[['name','Recording']] = parkinsons.name.str.rsplit('_', 1, expand=True)

In [None]:
#copying status column to new Status column at 3rd location for better visibility and understanding of the dataset
parkinsons.insert(2, 'Status',parkinsons['status'])

In [None]:
#dropping the redundant old status column
parkinsons = parkinsons.drop(['status'], axis=1)

In [None]:
parkinsons.columns

In [None]:
parkinsons.info()

In [None]:
# Checking For Duplicate Rows In Dataset
print('Number of Duplicated Rows :',parkinsons.duplicated().sum())

In [None]:
#Final dataframe
parkinsons.head()

In [None]:
# The data descriptions from the parkinsons.names file. You can find a table in next code block with all feature details
description = pd.read_table('parkinsons.names', error_bad_lines=False, warn_bad_lines=False)
description = description.iloc[34:47].reset_index().drop(['index'], axis=1)

In [None]:
description

In [None]:
#Feature Details from ncbi website
from IPython.display import IFrame
IFrame(src='https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5434464/table/tab2/?report=objectonly', width=700, height=600)

In [None]:
parkinsons.info()

## Data Exploration

In [None]:
#Basic data exploration, checking for NAN, shape and size

#Size of data
print(f'Size of data : {parkinsons.size}')
#Shape of data
print(f'Shape of data : {parkinsons.shape}')
#Null values
print(f'Number of null values: {parkinsons.isnull().sum().sum()}')


#Exploring the dataset to check how many pateint records are there with parkinsons
healthy = (parkinsons['Status'] == 0).sum()
print(f'Number of pateint records without parkinsons: {healthy}')
unhealthy = (parkinsons['Status'] == 1).sum()
print(f'Number of pateint records with parkinsons: {unhealthy}')

# first 3 columns are name, recording and status, we check for features after 3rd column
print(f'Number of features: {len(parkinsons.columns[3:])}')

In [None]:
""" 
Using describe to check the spread of values over the dataset.Though the data seems to be highly dispersed over a wide range of
numbers. I choose not to normalise the dataset becuase of 2 reasons. First, before taking the measurements for individuals the
amplitude of each signal has already been already been digitally normalized in order to suppress the effects of 
individual difference. Second, not all features are measured over same scale, Amplitude is 
measured in Hz(hertz), Jitter in percentage and Shimmer in dB(decibel.) and so on. Normalising it will scale it down to a 
similar scale and as the dataset is itself very small, there might be a situation where after normalisation we can lose 
some important data.
"""
parkinsons.describe()

In [None]:
# To check the distribution on status over dataset, a boxplot is shown.. It clearly shows that we have very less data
# for individuals with parkinsons and individuals not with parkinson. 0 implies indivduals not having parkinsons and
# Status of 1 implies individuals with parkinsons


count = sns.countplot(x=parkinsons['Status'], label= 'Count',  palette=['#2BD957', '#54DEE4'])

In [None]:
#Defining columns for heatmap
cols = ['Status','MDVP:Fo(Hz)','MDVP:Fhi(Hz)', 'MDVP:Flo(Hz)', 'MDVP:Jitter(%)', 'MDVP:Jitter(Abs)', 'MDVP:RAP', 'MDVP:PPQ', 
        'Jitter:DDP','MDVP:Shimmer', 'MDVP:Shimmer(dB)', 'Shimmer:APQ3','Shimmer:APQ5','MDVP:APQ', 'Shimmer:DDA', 'NHR', 
        'HNR', 'RPDE', 'DFA', 'spread1', 'spread2', 'D2','PPE']

## I wanted to check how the data is co-related so first option was to create a pairplot for all the features with respect to Status column of parkinsons dataframe
Orange color in the distribution refers to pateinst with status 1 i.e. having Parkinsons and blue with status 0 i.e. Healthy.
As, the data is not evenly 


In [None]:
# setting figsize
plt.figure(figsize=(20, 20))
sns.pairplot(data=parkinsons[cols],hue = 'Status')
plt.savefig('parkinsonspairplot.png',format='png')

#For greater visibility the image is stored so that it can be zoomed in

# Second option to check the corelation between featrues was to plot a heatmap.

Values which are closer to -1 shows that there is no linear corelation between the 2 factors. Whereas, the values which are
closer to one shows that the features are directly proportional to the each other (If there is an incrase in value of one
feature the other value also increases). In this plot below, the darker the image the greater is the co-relation.

We can see 2 dark blue squares in the heatmap below they are because of the fact that they are plotted against the similar
measures. The first blue block at left can be seen for values of Jitter whicg are highly co-related between each other.
Similarly, the second blue block is for measures of shimmer.

A corelation score of greater than 0.6 is what we are looking for in the heatmap below, but as can be seen we have a huge 
number of relations to describe for this heatmap. I have selected two of them to give a probable description of the relations.

- NHR - MDVP Jitter% : (0.91)
  NHR is the ratio of Noise to Harmonic Ratio. And Jitter is the modulation of periodicity of voice signals.
  A high degree of jitter results in hoarseness and this can explains well that NHR to be higher and ratio of HNR to be lower 
  which is the exact opposite of NHR. A lower NHR and higher HNR implies greater noise quality i.e with less jitter. Hence,
  jitter is directly proportional to NHR and inversely proportional to HNR.
 
  
- PPE - spread1 : (0.96)
  PPE represents the inefficiency of voice frequency control.There is a high co-relation score between PPE(pitch period entropy)
  and spread 1 which is one of the measures of non-linear fundamental frequency variation. I cannot find why and how they are
  co-related but few more reads and digging into details might give some insight.


In [None]:
plt.figure(figsize=(20,20))

#Calculating the corelations for the heatmap
c = parkinsons[cols].corr()
sns.heatmap(c,annot=True,cmap="YlGnBu")
plt.show()

In [None]:
# count number of observations in each class
normalp, parkinsonsp = parkinsons['Status'].value_counts()
print('Number of normal individual: ', normalp)
print('Number of individuals with parkinsons : ', parkinsonsp)
print('')
print('% of normal individual ', round(normalp / len(parkinsons) * 100, 2), '%')
print('% of of individuals with parkinsons', round(parkinsonsp / len(parkinsons) * 100, 2), '%')

# Trying different clasifier
### Evaluation function for models

The code for model evaluation has been used from: https://github.com/fenna/student_BFVM19DATASC3/blob/main/W03_Ensemble_solution.ipynb

In [None]:
# function to evaluate
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import learning_curve

def evaluate(y_test, y_pred, X_test, clf):
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    
      
def plot_learning_curves(model, X_train, y_train, X_val, y_val):
    """
    input:
        model:pipeline object
        X_train, y_train: training data
        X_val, y_val: test data
    """
    train_errors, val_errors = [], []
    for m in range(30, len(X_train)): #(1, len(X_train)):
        model.fit(X_train[:m], y_train[:m])
        y_train_predict = model.predict(X_train[:m])
        y_val_predict = model.predict(X_val)
        train_errors.append(mean_squared_error(y_train_predict, y_train[:m]))
        val_errors.append(mean_squared_error(y_val_predict, y_val))
    

    plt.plot(np.sqrt(train_errors), "r-+", linewidth=1, label="training data")
    plt.plot(np.sqrt(val_errors), "b-", linewidth=1, label="validation data")
    plt.legend(loc="upper right", fontsize=10)   
    plt.xlabel("Training set size", fontsize=10) 
    plt.ylabel("RMSE", fontsize=10)     
    # compare accuracy train versus test to access overfit 
    print(f'test  acc: {model.score(X_val, y_val)}')
    print(f'train acc: {model.score(X_train, y_train)}')

# Classifier Models

In this notebook I have tried almost all classification models avaialble in sklearn library. I wanted to explore which classifier works best and to play around with the data.

First the data has been split into train, test and validation sets. No normalisation has been done. Justification has already been given. The validation and test scores have been summed up in a summary table at the end of this notebook. However, for each of the scores the plots should always be considered to come to a conclusion as to which method performs best.


1. Logistic regression
2. Decision tree
3. SVM
4. Naive Bayes


#### Ensemble Learning
1. Random forest
2. Bagging with Decicion Tree classifier
3. Bagging with KNeighborsClassifier classifier
4. Boosting
5. Stacking
6. Gradient Boosting
7. Voting classifier

#### Neural network model
1. MLP Classifier

## Initialization

#### Select Features

In [None]:
#selecting features after 3rd column
Xs = parkinsons.columns[3:]
ys = parkinsons.columns[2:3]

# set X and y with the above defined columns
X = parkinsons[Xs]
y = parkinsons[ys]

In [None]:
# checking shaped of X and Y if they are same
print(f'X shape: {X.shape}')
print(f'y shape: {y.shape}')

#### Split the data

In [None]:
# Using the train_test_split function to split the data into train,test and validation. As my dataset
# is too small. I have decided to keep the test size to 25% and then 10% validation dataset.

from sklearn.model_selection import train_test_split, ShuffleSplit
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# also create a validation set from the train set
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.10, random_state=0)
print(f'X train shape: {X_train.shape}')
print(f'X test shape: {X_test.shape}')
print(f'X validation shape: {X_val.shape}\n')
print(f'y train shape: {y_train.shape}')
print(f'y test shape: {y_test.shape}')
print(f'y validation shape: {y_val.shape}')

#### Check the distribution for all splits

In [None]:
#Checking distribution of train, test and valdiation data
from collections import Counter
plt.bar(Counter(y_train).keys(), Counter(y_train).values(), color='#2BE957')
plt.title('Train')
plt.show()

plt.bar(Counter(y_test).keys(), Counter(y_test).values(), color='#54DEE8')
plt.title('Test')
plt.show()


plt.bar(Counter(y_val).keys(), Counter(y_val).values())
plt.title('Validation')
plt.show()

In [None]:
# Plotting boxplots to check the distribution of Status over all features
for column in parkinsons.columns[3:]:   
    sns.set()    
    fig, ax = plt.subplots()
    sns.set(style="ticks")
    sns.boxplot(x = 'Status', y = column , data = parkinsons )


#### Defining a summary table to store Training and Testing accuracies of all models

In [None]:
summary = pd.DataFrame(columns = ['Name','Training accuracy', 'Testing accuracy'])

# 1. Logistic Regression

In [None]:
# initialize the model
lg = LogisticRegression(random_state=0) #max_iter=10000

# fit the model on the train data
lg.fit(X_train, y_train.values.ravel()) #np.ravel(y_train)

# calculate accuracy score on train data
lg_score_train = lg.score(X_train, y_train)
print(f'Accuracy of logistic regression on train set {lg_score_train:.2f}')

# calculate accuracy score on test data
lg_score_test = lg.score(X_test, y_test)
print(f'Accuracy of logistic regression on test set {lg_score_test:.2f}')
print("Confusion Matrix:")


# predict the values and evaluate with precision, recall and f1 
y_pred = lg.predict(X_test)
evaluate(y_test, y_pred, X_test, lg)

#Appending scores in summary table
lgsummary = {'Name':'Logistic Regression','Training accuracy':lg_score_train,'Testing accuracy':lg_score_test}
summary = summary.append(lgsummary, ignore_index=True)


In [None]:
# cm = confusion_matrix(y_test, y_pred)
# disp = ConfusionMatrixDisplay(confusion_matrix=cm)

# disp.plot(cmap=plt.cm.Blues)
# plt.title('Confusion matrix for Logistic Regression', y=1.1)
# plt.show()

Logistic Regression is used to examine the association of (categorical or continuous) independent variable(s) with one dependent variable. It is used for binary classification problem which has only two classes to predict. In our dataset, the 2 classes are having parkinsons or not. I suspected that this model should work but it is not the perfect model to be used.

Results:
Logistic regression model has a precision of 0.93 
Accuracy of 0.94
Recall: 1
f1-score: 0.96 

It has a recall value of 1 which will be perfect in an ideal scenario where every result retrieved by the model is correct.
But in my opinion a score of perfect 1 is not a good thing to acheive. It might be possible if we have a larger dataset, then 
our training and test data might acheieve different results. 

In [None]:
y_pred_proba = lg.predict_proba(X_test)[::,1]
fpr, tpr, _ = metrics.roc_curve(y_test,  y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
plt.legend(loc=4)
plt.show()

In [None]:
#plotting the learning curve
plot_learning_curves(lg, X_train, y_train, X_test, y_test)

An RMSE score between 0.2 and 0.5 shows that the model can predict the data correctly. In our case both validation and traning
data does seem to acheive a good RMSE score but there is a lot of overfitting.
As can be seen in the plot the model starts to train well after about a size of 5. After around 25-40 the model does start to
converge but after 70 it starts to overfit the data.

In [None]:
# Dumping Logistic Regression Model
# joblib.dump(lg, 'lg_clf.pkl')

# 2.Decision Tree

In [None]:
# initialize the model
dt = DecisionTreeClassifier()
# fit the model on the train data
dt.fit(X_train, y_train)

In [None]:
# calculate accuracy score on train data
dt_score_train = dt.score(X_train, y_train)
print(f'Accuracy of DecisionTree on train set {dt_score_train:.2f}')

# calculate accuracy score on test data
dt_score_test = dt.score(X_test, y_test)
print(f'Accuracy of DecisionTree on test set {dt_score_test:.2f}')
print("Confusion Matrix:")
y_pred = dt.predict(X_test)
evaluate(y_test, y_pred, X_test, dt)
#Appending scores in summary table
dtsummary = {'Name':'Decision Tree','Training accuracy':dt_score_train,'Testing accuracy':dt_score_test}
summary = summary.append(dtsummary, ignore_index=True)

In [None]:
# cm = confusion_matrix(y_test, y_pred)
# disp = ConfusionMatrixDisplay(confusion_matrix=cm)

# disp.plot(cmap=plt.cm.Blues)
# plt.title('Confusion matrix for Decision Tree', y=1.1)
# plt.show()

Decision trees are used for handling non-linear data sets effectively.Decision trees can be divided into two types;
categorical variable and continuous variable decision trees. It might be a good model to predict house prices based on certain
defined conditions or in civil engineering aspects. But in our case decision tree might not be a good idea as we are 
talking about disease prediction. The major drawback for decision trees is that if there is a slight change in the features of
data the results can change abruptly, and considering that it will be a really bad idea to predict any kind of disease/ diagnosis
based on decision tree. Decision trees are best suited for models where interpretation of data is lesss important than achieving
accuracy.

Our model achieves an accuracy of 0.9, precision of 0.97, a recall of 0.89 and f1-score of 0.93 which are all good in terms of 
statistical terms but as mentioned above a slight change and the model can converge on a negative side.


In [None]:
y_pred_proba = dt.predict_proba(X_test)[::,1]
fpr, tpr, _ = metrics.roc_curve(y_test,  y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
plt.legend(loc=4)
plt.show()

In [None]:
#plotting the learning curve
plot_learning_curves(dt, X_train, y_train, X_test, y_test)

As can be seen in above learning curve there is a huge difference between training and validation data. It starts to overfit
from the beginning and the RMSE score on training data is zero which is a bad indication that this model is not good for 
predicting parkinson's disease.

In [None]:
#This is an interesting visualization of the decision tree where the algorithm can draw a tree based on the algorithm
# I thought of giving it a try
from sklearn import tree
plt.figure(figsize=(20, 20))
tree.plot_tree(dt,filled=True,
              feature_names=parkinsons.columns[2:],
              class_names=['healthy', 'parkinsons'],
              fontsize=8)  
plt.savefig('parkinsons.png',format='png')

In the decision tree above it can be clearly seen that spread2 is once defined in class healthy and once in parkinsons which
in my opinion is wrong and can raise confusion

In [None]:
# Dumping Decision Tree Classifier
#joblib.dump(dt, 'dt_clf.pkl')

# SVM

In [None]:
#initialize the model
svm = SVC(kernel='rbf')
# fit the model on the train data
svm.fit(X_train, y_train)

In [None]:
# calculate accuracy score on train data
svm_score_train = svm.score(X_train, y_train)
print(f'Accuracy of SVM on train set {svm_score_train:.3f}')

# calculate accuracy score on test data
svm_score_test = svm.score(X_test, y_test)
print(f'Accuracy of SVM on test set {svm_score_test:.3f}')
print("confusion Matrix:")
y_pred = svm.predict(X_test)
evaluate(y_test, y_pred, X_test, svm)
svmsummary = {'Name':'SVM','Training accuracy':svm_score_train,'Testing accuracy':svm_score_test}
summary = summary.append(svmsummary, ignore_index=True)

In [None]:
# cm = confusion_matrix(y_test, y_pred)
# disp = ConfusionMatrixDisplay(confusion_matrix=cm)

# disp.plot(cmap=plt.cm.Blues)
# plt.title('Confusion matrix for SVM', y=1.1)
# plt.show()

In [None]:
plot_learning_curves(svm, X_train, y_train, X_test, y_test)

Support Vector machines should be used when we have defined bouadaries between features. It does not perform well when the dataset
features are overlapping as in our case. Jitter, aplitude and shimmer are overlapping features and co-related. It works best 
for multiclass classification problems.

This model has a recall of 1.0 which is again not good and accuracy lower than Linear regression and decision tree model.
For our dataset this is not a good choice.

The larning curve also shows overfitting from the beginning

In [None]:
# Dumping SVM Classifier
#joblib.dump(svm, 'svm_clf.pkl')

# Naive Bayes

In [None]:
#initialize the model
gnb = GaussianNB()
# fit the model on the train data
gnb.fit(X_train, y_train)

In [None]:
# calculate accuracy score on train data
gnb_score_train = gnb.score(X_train, y_train)
print(f'Accuracy of Naive Bayes on train set {gnb_score_train:.3f}')

# calculate accuracy score on test data
gnb_score_test = gnb.score(X_test, y_test)
print(f'Accuracy of Naive Bayes on test set {gnb_score_test:.3f}')
print("confusion Matrix:")
y_pred = gnb.predict(X_test)
evaluate(y_test, y_pred, X_test, gnb)
gnbsummary = {'Name':'Naive Bayes','Training accuracy':gnb_score_train,'Testing accuracy':gnb_score_test}
summary = summary.append(gnbsummary, ignore_index=True)

In [None]:
plot_learning_curves(gnb, X_train, y_train, X_test, y_test)

In [None]:
# cm = confusion_matrix(y_test, y_pred)
# disp = ConfusionMatrixDisplay(confusion_matrix=cm)

# disp.plot(cmap=plt.cm.Blues)
# plt.title('Confusion matrix for Decision Tree', y=1.1)
# plt.show()

In [None]:
y_pred_proba = gnb.predict_proba(X_test)[::,1]
fpr, tpr, _ = metrics.roc_curve(y_test,  y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
plt.legend(loc=4)
plt.show()

Naive Bayes works best with text classification. It is a probability calssifier and predics based on th epredictibilty
of an object. Initially the model starts to learn well but then it starts overfitting after just 6-7 training data. But it does
start to converge after sbout 60, so it might be possible that it will work on a dataset greater in size.

On the other hand it has a precision of 1.0 which is not good, and when we  see the accuracy it is 0.63 which is the worst 
performing model up until now.

In [None]:
# Dumping Naive Bayes Classifier
#joblib.dump(gnb, 'nb_clf.pkl')

# Ensemble learning

# Random Forest

In [None]:
#initialize the model
rf = RandomForestClassifier(n_estimators = 10)
#fit the model
rf.fit(X_train, y_train)

In [None]:
# calculate accuracy score on train data
rf_score_train = rf.score(X_train, y_train)
print(f'Accuracy of Random Forest on train set {rf_score_train:.3f}')

# calculate accuracy score on test data
rf_score_test = rf.score(X_test, y_test)
print(f'Accuracy of Random Forest on test set {rf_score_test:.3f}')
print("confusion Matrix:")
y_pred = rf.predict(X_test)
evaluate(y_test, y_pred, X_test, rf)

rfsummary = {'Name':'Random Forest','Training accuracy':rf_score_train,'Testing accuracy':rf_score_test}
summary = summary.append(rfsummary, ignore_index=True)

In [None]:
plot_learning_curves(rf, X_train, y_train, X_test, y_test)

In [None]:
y_pred_proba = rf.predict_proba(X_test)[::,1]
fpr, tpr, _ = metrics.roc_curve(y_test,  y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
plt.legend(loc=4)
plt.show()

Random forest algorithm can be used for both classifications and regression task. It provides higher accuracy through
cross validation. But one most important detail is Random Forest is a data hungry algorith so using it in a small dataset
is a wrong choice. It creates multiple decision tree and based on that provides a result. As I have already explained
why decision tree is a bad idea, same goes for random forest.

This model has a precision of 1.0 which means it has got a lot of false negative. The accuracy of 0.93 is good but as can be
seen from the plot there is a lot of overfitting and it is not converging at all. So, for parkinsons prediction it is not at
all a good choice of model.

# Bagging with Decicion Tree classifier

In [None]:
#initialize the model
bg = BaggingClassifier(DecisionTreeClassifier(), max_features = 1.0, max_samples = 0.5) 
#fit the model
bg.fit(X_train, y_train)

In [None]:
# calculate accuracy score on train data
bg_score_train = bg.score(X_train, y_train)
print(f'Accuracy of Bagging with DecisionTree on train set {bg_score_train:.3f}')

# calculate accuracy score on test data
bg_score_test = bg.score(X_test, y_test)
print(f'Accuracy of Bagging with DecisionTree on test set {bg_score_test:.3f}')
print("confusion Matrix:")

y_pred = bg.predict(X_test)
evaluate(y_test, y_pred, X_test, bg)
bgsummary = {'Name':'Bagging with Decicion Tree','Training accuracy':bg_score_train,'Testing accuracy':bg_score_test}
summary = summary.append(bgsummary, ignore_index=True)

In [None]:
plot_learning_curves(bg, X_train, y_train, X_test, y_test)

Bagging is a way to decrease the variance in the prediction by generating additional data for training from 
dataset using combinations with repetitions to produce multi-sets of the original data. In bagging we are using all features
in comparison with random forest where we are using a subset of features.

This model has an accuracy of 0.88 and a precision of 0.92, recall of 0.92 and an f1-score of 0.92 which is a good score overall.
The learning curve also shows better results where training data and validation data might converge but there is a lot 
of overfitting. But after around 10 records it might perform better.

# Bagging with KNeighbors Classifier

In [None]:
#initialize the model
bagging = BaggingClassifier(KNeighborsClassifier(),
                             max_samples=0.5, max_features=0.5)

In [None]:
#fit the model
bagging.fit(X_train, y_train)

In [None]:
# calculate accuracy score on train data
bagging_score_train = bagging.score(X_train, y_train)
print(f'Accuracy of Bagging with KNeighbors Classifier on train set {bagging_score_train:.3f}')

# calculate accuracy score on test data
bagging_score_test = bagging.score(X_test, y_test)
print(f'Accuracy of Bagging with KNeighbors Classifier on test set {bagging_score_test:.3f}')
print('')
y_pred = bagging.predict(X_test)
evaluate(y_test, y_pred, X_test, bagging)

bgksummary = {'Name':'Bagging with K KNeighbors','Training accuracy':bagging_score_train,'Testing accuracy':bagging_score_test}
summary = summary.append(bgksummary, ignore_index=True)

In [None]:
plot_learning_curves(bagging, X_train, y_train, X_test, y_test)

Bagging with knn has better accuracy as compared to previous model i.e. 0.93 and a precision of 0.93. But yet again a recall of
1.0 which is not good. But in terms of the learning curve we can see overlap which is good and if we use a larger dataset
or use a smoothing algorith for the plots it might give better results.

# Boosting

In [None]:
#initialise the model
adb = AdaBoostClassifier(LogisticRegression(), n_estimators = 22,random_state=0, learning_rate = 1)
#fit the model
adb.fit(X_train, y_train)

In [None]:
# calculate accuracy score on train data
adb_score_train = adb.score(X_train, y_train)
print(f'Accuracy of Ada Boosting on train set {adb_score_train:.2f}')

# calculate accuracy score on test data
adb_score_test = adb.score(X_test, y_test)
print(f'Accuracy of Ada Boosting on test set {adb_score_test:.2f}')
print('')
y_pred = adb.predict(X_test)
evaluate(y_test, y_pred, X_test, adb)
plot_learning_curves(adb, X_train, y_train, X_test, y_test)
adbsummary = {'Name':'Ada Boosting','Training accuracy':adb_score_train,'Testing accuracy':adb_score_test}
summary = summary.append(adbsummary, ignore_index=True)

For our dataset even if the model has reached an accuracy of 0.92 it does not perform well if we see the learning curve. 
After around 5 it starts to overfit. I have used a Logistic Regression classifier for boosting. It might be possible that 
if any other classifier is used it might give better results.

# Stacking

In [None]:
#initialise the model
estimators = [('dt', dt), ('lg',lg), ('svm', svm), ('gnb', gnb)]
#fit the model
sclf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
sclf.fit(X_train, y_train)

In [None]:
# calculate accuracy score on train data
sclf_score_train = sclf.score(X_train, y_train)
print(f'Accuracy of Stacking on train set {sclf_score_train:.3f}')

# calculate accuracy score on test data
sclf_score_test = sclf.score(X_test, y_test)
print(f'Accuracy of Stacking on test set {sclf_score_test:.3f}')
print('')
y_pred = sclf.predict(X_test)
evaluate(y_test, y_pred, X_test, sclf)
plot_learning_curves(sclf, X_train, y_train, X_test, y_test)
sclfsummary = {'Name':'Stacking','Training accuracy':sclf_score_train,'Testing accuracy':sclf_score_test}
summary = summary.append(sclfsummary, ignore_index=True)

Stacking refers to a method of joining the machine learning models, similar to arranging a stack of plates.
Output of many models are combined and it is implemented to create a model. It is said that th eperformance is sometimes
best as compared to individual models. In our case we have an accuracy of 0.89 with a precision of 0.97, recall of 0.89 and 
f1-score of 0.93. All these scores point to a good model.

The learning curve plot also seem to converge and less overfit as compared to other models. After about 60 the training data
starts to overfit.

It does take a little bit longer time to execute.

# Gradient Boosting

In [None]:
#initialise the model
grad = GradientBoostingClassifier(random_state=0)
#fit the model
grad.fit(X_train, y_train)

In [None]:
# calculate accuracy score on train data
grad_score_train = grad.score(X_train, y_train)
print(f'Accuracy of Gradient Boosting on train set {grad_score_train:.2f}')

# calculate accuracy score on test data
grad_score_test = grad.score(X_test, y_test)
print(f'Accuracy of Gradient Boosting on test set {grad_score_test:.2f}')
print('')
y_pred = grad.predict(X_test)
evaluate(y_test, y_pred, X_test, grad)
plot_learning_curves(grad, X_train, y_train, X_test, y_test)
gradsummary = {'Name':'Gradient Boosting','Training accuracy':grad_score_train,'Testing accuracy':grad_score_test}
summary = summary.append(gradsummary, ignore_index=True)

Gradient Boosting works well along with Decision trees. Although the model has an accuracy of 0.94 it has a precision of 1.0 
which means it has lot of false positives. Also, as mentioned earlier as Decision tree did not work well I assumed that gradient
boosting will give certainly bad results which is evident from the learning curve.


# Voting classifier

In [None]:
#initilize the model
evc = VotingClassifier(estimators = [('dt', dt), ('lg',lg), ('svm', svm)], voting = 'hard')
#fit the model
evc.fit(X_train, y_train)

In [None]:
# calculate accuracy score on train data
evc_score_train = evc.score(X_train, y_train)
print(f'Accuracy of Voting classifier on train set {evc_score_train:.2f}')

# calculate accuracy score on test data
evc_score_test = evc.score(X_test, y_test)
print(f'Accuracy of Voting classifier on test set {evc_score_test:.2f}')
print('')
y_pred = evc.predict(X_test)
evaluate(y_test, y_pred, X_test, evc)
plot_learning_curves(evc, X_train, y_train, X_test, y_test)
evcsummary = {'Name':'Voting Classifier','Training accuracy':evc_score_train,'Testing accuracy':evc_score_test}
summary = summary.append(evcsummary, ignore_index=True)

 Voting classifier trains on an ensemble of multiple models and then predicts the values based on the highest votes. Our model has an accuracy of 0.94 with precision of 0.93 which is good but again a recall of 1. As can be seen it starts to overfit form the beginning and might not be a good idea to use for the parkinsons dataset.

# MLP classifier

In [None]:
#initialise the model
mlp =  MLPClassifier(alpha=1, max_iter=1000)

In [None]:
#fit the model
mlp.fit(X_train, y_train)

In [None]:
# calculate accuracy score on train data
mlp_score_train = mlp.score(X_train, y_train)
print(f'Accuracy of MLP classifier on train set {mlp_score_train:.2f}')

# calculate accuracy score on test data
mlp_score_test = mlp.score(X_test, y_test)
print(f'Accuracy of MLP classifier  on test set {mlp_score_test:.2f}')
print('')
y_pred = mlp.predict(X_test)
evaluate(y_test, y_pred, X_test, mlp)
plot_learning_curves(mlp, X_train, y_train, X_test, y_test)
mlpsummary = {'Name':'MLP Classifier','Training accuracy':mlp_score_train,'Testing accuracy':mlp_score_test}
summary = summary.append(mlpsummary, ignore_index=True)

MLP classifier solves problem stochiatically. It is suitable for regression prediction problems where a value needs to be
predicted. I assumed that it will not work well and the plot shows why. It has an accuracy of 0.88 with a preciison of 0.92
, on the other hand the model is highly overfit.

# QuadraticDiscriminantAnalysis

In [None]:
#initilize the model
qda = QuadraticDiscriminantAnalysis()
#fit the model
qda.fit(X_train, y_train)

In [None]:
# calculate accuracy score on train data
qda_score_train = qda.score(X_train, y_train)
print(f'Accuracy of QuadraticDiscriminantAnalysis on train set {qda_score_train:.3f}')

# calculate accuracy score on test data
qda_score_test = qda.score(X_test, y_test)
print(f'Accuracy of QuadraticDiscriminantAnalysis on test set {qda_score_test:.3f}')
print('')
y_pred = qda.predict(X_test)
evaluate(y_test, y_pred, X_test, qda)
plot_learning_curves(qda, X_train, y_train, X_test, y_test)
qdasummary = {'Name':'QuadraticDiscriminantAnalysis','Training accuracy':qda_score_train,'Testing accuracy':qda_score_test}
summary = summary.append(qdasummary, ignore_index=True)

I tried this method just to check how it performs. The resources says it is good to find a non-linear boundary between classifiers
The parkinsons dataset does not have the same so I assume it is not a good model

# Summary of the methods

In [None]:
summary