In [51]:
# Importing libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score, confusion_matrix

In [52]:
# read data file
data=pd.read_csv('/config/workspace/Dataset/diabetes.csv')

In [53]:
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [54]:
# checking null values
data.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [55]:
# replacing zero values with mean
data['BMI']=data['BMI'].replace(0, data['BMI'].mean())
data['BloodPressure']=data['BloodPressure'].replace(0, data['BloodPressure'].mean())
data['Glucose']=data['Glucose'].replace(0, data['Glucose'].mean())
data['Insulin']=data['Insulin'].replace(0, data['Insulin'].mean())
data['SkinThickness']=data['SkinThickness'].replace(0, data['SkinThickness'].mean())

In [56]:
# segregating dapendent and independent variables
X=data.drop(columns=['Outcome'])
y=data['Outcome']

In [57]:
# seperating train and test data
X_train, X_test, y_train, y_test= train_test_split(X,y, test_size=20, random_state=0)
X_test.shape, X_train.shape

((20, 8), (748, 8))

In [58]:
# Desicion Tree Model Training with Hyperparameter tuining
import warnings
warnings.filterwarnings('ignore')

In [59]:
parameters={
    'criterion':['gini', 'entropy', 'log_loss'],
    'splitter':['best', 'random'],
    'max_depth':[1,2,3,4,5],
    'max_features':['auto', 'sqrt', 'log2']
}

In [60]:
from sklearn.model_selection import GridSearchCV
classifier=DecisionTreeClassifier()

In [61]:
clf=GridSearchCV(classifier, param_grid=parameters, cv=5, scoring='accuracy', verbose=3)
clf.fit(X_train, y_train)

Fitting 5 folds for each of 90 candidates, totalling 450 fits
[CV 1/5] END criterion=gini, max_depth=1, max_features=auto, splitter=best;, score=nan total time=   0.0s
[CV 2/5] END criterion=gini, max_depth=1, max_features=auto, splitter=best;, score=nan total time=   0.0s
[CV 3/5] END criterion=gini, max_depth=1, max_features=auto, splitter=best;, score=nan total time=   0.0s
[CV 4/5] END criterion=gini, max_depth=1, max_features=auto, splitter=best;, score=nan total time=   0.0s
[CV 5/5] END criterion=gini, max_depth=1, max_features=auto, splitter=best;, score=nan total time=   0.0s
[CV 1/5] END criterion=gini, max_depth=1, max_features=auto, splitter=random;, score=nan total time=   0.0s
[CV 2/5] END criterion=gini, max_depth=1, max_features=auto, splitter=random;, score=nan total time=   0.0s
[CV 3/5] END criterion=gini, max_depth=1, max_features=auto, splitter=random;, score=nan total time=   0.0s
[CV 4/5] END criterion=gini, max_depth=1, max_features=auto, splitter=random;, score

In [62]:
clf.best_params_

{'criterion': 'gini',
 'max_depth': 5,
 'max_features': 'log2',
 'splitter': 'best'}

In [63]:
classifier2=DecisionTreeClassifier(criterion='entropy', max_depth=5, max_features='sqrt', splitter='random')

In [64]:
classifier2.fit(X_train, y_train)

lets see how model performs on test data

In [65]:
y_pred=classifier2.predict(X_test)

In [66]:
conf_mat=confusion_matrix(y_test, y_pred)
conf_mat

array([[11,  2],
       [ 5,  2]])

In [67]:
accuracy_score(y_test,y_pred)

0.65

In [68]:
import pickle
file=open('/config/workspace/Model/modelForPrediction.pkl', 'wb')
pickle.dump(classifier2, file)
file.close()

# Support Vector Classifier with Hyperparameter Tuning

In [44]:
# Defining parameter range
parm_grid={
    'C':[0.1, 1, 10, 100, 1000],
    'gamma':[1,0.1,0.01, 0.001, 0.0001],
    'kernel':['linear', 'rbf', 'polynomial']
}

In [45]:
grid=GridSearchCV(SVC(), param_grid=parm_grid, refit=True, cv=3, verbose=3, scoring='accuracy')

In [49]:
# EXECUTION TAKES MARE THAN 45 MINS