In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df = pd.read_csv("Excels\\diabetes.csv")
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [7]:
df = df.rename(columns={
    'Outcome' : 'Diabetes'
})

In [13]:
df['Diabetes'].value_counts()

Diabetes
0    500
1    268
Name: count, dtype: int64

In [14]:
df.groupby('Diabetes').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Diabetes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


In [9]:
df.shape

(768, 9)

In [10]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Diabetes
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [22]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

X = df.drop(['Diabetes'],axis='columns')
y = df['Diabetes']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled

array([[ 0.63994726,  0.84832379,  0.14964075, ...,  0.20401277,
         0.46849198,  1.4259954 ],
       [-0.84488505, -1.12339636, -0.16054575, ..., -0.68442195,
        -0.36506078, -0.19067191],
       [ 1.23388019,  1.94372388, -0.26394125, ..., -1.10325546,
         0.60439732, -0.10558415],
       ...,
       [ 0.3429808 ,  0.00330087,  0.14964075, ..., -0.73518964,
        -0.68519336, -0.27575966],
       [-0.84488505,  0.1597866 , -0.47073225, ..., -0.24020459,
        -0.37110101,  1.17073215],
       [-0.84488505, -0.8730192 ,  0.04624525, ..., -0.20212881,
        -0.47378505, -0.87137393]])

In [23]:
X_train , X_test , y_train , y_test = train_test_split(X_scaled,y,test_size=0.2,random_state=20)

In [40]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

In [41]:
scores = cross_val_score(SVC(),X_scaled,y,cv=5)
scores.mean()

0.7708938120702827

In [42]:
scores = cross_val_score(DecisionTreeClassifier(),X_scaled,y,cv=5)
scores.mean()

0.7071725659960955

In [43]:
scores = cross_val_score(RandomForestClassifier(),X_scaled,y,cv=5)
scores.mean()

0.7644003055767762

In [44]:
model_params = {
    'SVC' :{
        'model' : SVC(),
        'params' : {
            'C' : [1,10,20,50],
            'kernel': ['rbf','linear'],
            'gamma' : ['auto','scale']
        }
    },
    'dec_tree' :{
        'model' : DecisionTreeClassifier(),
        'params' : {
        }
    },
    'rf_clf' :{
        'model' : RandomForestClassifier(),
        'params' : {
            'n_estimators' : [100,200,500]
        }
}
}

In [45]:
scores = []

for model_name,mp in model_params.items():
    clf = GridSearchCV(mp['model'],mp['params'],cv=5)
    clf.fit(X_train,y_train)
    scores.append({
        'model' : model_name,
        'best_score' : clf.best_score_,
        'best_params' : clf.best_params_
    })

df = pd.DataFrame(scores)
df

In [53]:
model = SVC(C=10,gamma='auto',kernel='linear').fit(X_train,y_train)
model.score(X_test,y_test)

0.7597402597402597

In [58]:
from sklearn.metrics import accuracy_score
y_pred = model.predict(X_test)
accuracy_score(y_pred,y_test)

0.7597402597402597

In [66]:
y_pred

array([0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0],
      dtype=int64)

In [65]:
for i in range(len(y_pred)):
    if y_pred[i] == 1:  # Assuming 1 represents 'Diabetes' class
        print('Have Diabetes')
    else:
        print("Don't have Diabetes")


Don't have Diabetes
Have Diabetes
Don't have Diabetes
Don't have Diabetes
Don't have Diabetes
Don't have Diabetes
Don't have Diabetes
Don't have Diabetes
Have Diabetes
Don't have Diabetes
Have Diabetes
Have Diabetes
Don't have Diabetes
Don't have Diabetes
Don't have Diabetes
Don't have Diabetes
Don't have Diabetes
Don't have Diabetes
Don't have Diabetes
Have Diabetes
Don't have Diabetes
Don't have Diabetes
Don't have Diabetes
Don't have Diabetes
Have Diabetes
Don't have Diabetes
Don't have Diabetes
Don't have Diabetes
Have Diabetes
Have Diabetes
Have Diabetes
Don't have Diabetes
Have Diabetes
Don't have Diabetes
Have Diabetes
Have Diabetes
Don't have Diabetes
Don't have Diabetes
Don't have Diabetes
Have Diabetes
Have Diabetes
Don't have Diabetes
Have Diabetes
Don't have Diabetes
Don't have Diabetes
Don't have Diabetes
Don't have Diabetes
Don't have Diabetes
Have Diabetes
Don't have Diabetes
Don't have Diabetes
Don't have Diabetes
Have Diabetes
Don't have Diabetes
Have Diabetes
Don't ha