In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split , cross_val_score , KFold
from sklearn.metrics import accuracy_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

import warnings
# Installs a blanket "ignore" filter: from here on no warning of any category
# is shown, presumably to keep the cell output tidy.
# NOTE(review): this also hides any warnings emitted by the estimators fitted
# in later cells (several are created with a max_iter cap) -- consider
# narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [2]:
# Read the CSV straight from its URL into a DataFrame.
df = pd.read_csv(
    "https://raw.githubusercontent.com/mpHarm88/datasets/master/diabetes.csv"
)

# 'Outcome' is the target column; every remaining column is used as a feature.
y = df["Outcome"]
X = df.drop(columns=["Outcome"])




# Candidate classifiers as (short label, unfitted estimator) pairs. The same
# list is consumed by both the hold-out evaluation and the cross-validation
# loop, so the labels double as row names in the score tables.
#
# The tree-based estimators draw random numbers while fitting; they are seeded
# with the same value the notebook uses for its data split so that the score
# tables (and the reported "best model") are the same on every run.
models = [
    ('KNN', KNeighborsClassifier()),
    ('SVC', SVC(max_iter=1000)),                 # solver iterations capped at 1000
    ('LR', LogisticRegression(max_iter=1000)),   # solver iterations capped at 1000
    ('DT', DecisionTreeClassifier(random_state=10)),
    ('GNB', GaussianNB()),
    ('RF', RandomForestClassifier(random_state=10)),
    ('GB', GradientBoostingClassifier(random_state=10)),
]




# Single stratified hold-out split (default test size), seeded for repeatability.
# `y` is the Outcome column, so stratifying on it keeps the class ratio equal
# in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=10
)

# Fit each candidate on the training partition and record its accuracy on the
# held-out partition. Note that this fits the estimators in `models` in place.
names, scores = [], []
for name, model in models:
    y_pred = model.fit(X_train, y_train).predict(X_test)
    names.append(name)
    scores.append(accuracy_score(y_test, y_pred))

# One row per classifier: its label and its hold-out accuracy.
tr_split = pd.DataFrame(dict(Name=names, Score=scores))




# Show the hold-out score table while it still has its default integer index.
print(tr_split)

# Re-key the table by classifier label so the winner can be read off the index.
tr_split = tr_split.set_index("Name")

# idxmax yields the label of the first row holding the highest score.
best_holdout = tr_split["Score"].idxmax()
print("\nBest Model to Use:", best_holdout)

  Name     Score
0  KNN  0.708333
1  SVC  0.765625
2   LR  0.812500
3   DT  0.713542
4  GNB  0.770833
5   RF  0.796875
6   GB  0.791667

Best Model to Use: LR


In [3]:
# using cross_val
#
# Score every candidate with 10-fold cross-validation on the full dataset and
# collect the mean accuracy per classifier.
#
# The splitter is built once and shared by all models so each one is scored on
# exactly the same folds. It is deliberately created WITHOUT random_state:
# with the default shuffle=False the seed has no effect, and recent
# scikit-learn releases raise a ValueError when random_state is passed without
# shuffle=True. The folds are therefore the same contiguous, unshuffled blocks
# as before. (Use KFold(n_splits=10, shuffle=True, random_state=10) instead if
# shuffled folds are wanted.)
kfold = KFold(n_splits=10)

names = []
scores = []
for name, model in models:
    # cross_val_score returns one accuracy per fold; keep only their mean.
    score = cross_val_score(model, X, y, cv=kfold, scoring='accuracy').mean()

    names.append(name)
    scores.append(score)

# One row per classifier: its label and its mean cross-validated accuracy.
kf_cross_val = pd.DataFrame({'Name': names, 'Score': scores})



# Show the cross-validation score table while it still has its default integer index.
print(kf_cross_val)

# Re-key the table by classifier label so the winner can be read off the index.
kf_cross_val = kf_cross_val.set_index("Name")

# idxmax yields the label of the first row holding the highest mean score.
best_cv = kf_cross_val["Score"].idxmax()
print("\nBest Model to Use:", best_cv)

  Name     Score
0  KNN  0.726555
1  SVC  0.760424
2   LR  0.776042
3   DT  0.704306
4  GNB  0.755178
5   RF  0.768182
6   GB  0.768199

Best Model to Use: LR


In [4]:
# Plotting the scores (hold-out split, without cross-validation)




# tr_split.reset_index(inplace=True)

# axis = sns.barplot(x = 'Name', y = 'Score', data = tr_split)
# axis.set(xlabel='Classifier', ylabel='Accuracy')
# for p in axis.patches:
#     height = p.get_height()
#     axis.text(p.get_x() + p.get_width()/2, height + 0.005, '{:1.4f}'.format(height), ha="center") 
    
# plt.show()

In [5]:
# Plotting the scores (cross-validation)




# kf_cross_val.reset_index(inplace=True)

# axis = sns.barplot(x = 'Name', y = 'Score', data = kf_cross_val)
# axis.set(xlabel='Classifier', ylabel='Accuracy')
# for p in axis.patches:
#     height = p.get_height()
#     axis.text(p.get_x() + p.get_width()/2, height + 0.005, '{:1.4f}'.format(height), ha="center") 
    
# plt.show()