In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
import numpy as np

In [2]:
df = pd.read_csv("/Users/mamadouourydiallo/Downloads/diabetes.csv")
# Check Data Shape
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
df.shape

(768, 9)

In [5]:
# Find the Correlation
corr = df.corr()
corr.style.background_gradient(cmap='coolwarm').set_precision(2)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
Pregnancies,1.0,0.13,0.14,-0.08,-0.07,0.02,-0.03,0.54,0.22
Glucose,0.13,1.0,0.15,0.06,0.33,0.22,0.14,0.26,0.47
BloodPressure,0.14,0.15,1.0,0.21,0.09,0.28,0.04,0.24,0.07
SkinThickness,-0.08,0.06,0.21,1.0,0.44,0.39,0.18,-0.11,0.07
Insulin,-0.07,0.33,0.09,0.44,1.0,0.2,0.19,-0.04,0.13
BMI,0.02,0.22,0.28,0.39,0.2,1.0,0.14,0.04,0.29
DiabetesPedigreeFunction,-0.03,0.14,0.04,0.18,0.19,0.14,1.0,0.03,0.17
Age,0.54,0.26,0.24,-0.11,-0.04,0.04,0.03,1.0,0.24
Outcome,0.22,0.47,0.07,0.07,0.13,0.29,0.17,0.24,1.0


In [6]:
# Define the Target(DV) and Features(IV)
X = df[df.columns[0:8]]
Y = df['Outcome']

In [7]:
# Spliting the Data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.20, random_state = 5)

In [8]:
#Normalisation
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [9]:
#Logistic Model
model = LogisticRegression(solver = 'liblinear', random_state = 0)
model.fit(X_train,Y_train)

LogisticRegression(random_state=0, solver='liblinear')

In [10]:
model.classes_

array([0, 1])

In [11]:
#model evaluation
print("Y_intercept", model.intercept_, "\n")
print("Slope", model.coef_)

Y_intercept [-0.85460645] 

Slope [[ 0.41825007  1.04068431 -0.32443059 -0.06443894 -0.06143852  0.69763117
   0.34231573  0.20161024]]


In [12]:
# model accuracy
model.score(X_test,Y_test)

0.7987012987012987

In [13]:
# classification report
print(classification_report(Y_test, model.predict(X_test)))

              precision    recall  f1-score   support

           0       0.82      0.88      0.85       100
           1       0.74      0.65      0.69        54

    accuracy                           0.80       154
   macro avg       0.78      0.76      0.77       154
weighted avg       0.80      0.80      0.80       154



In [14]:
# SDG
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(loss = 'log', max_iter=1000, alpha= 0.0001)
clf.fit(X_train,Y_train)
Y_pred = clf.predict(X_test)

print("Y_intercept", clf.intercept_, "\n")
print("Slope", clf.coef_)
print("score", clf.score(X_test,Y_test))

Y_intercept [-1.07574511] 

Slope [[ 1.38410619  0.64778572 -0.54419045 -0.71320595 -0.2681374   1.47076827
   1.20834654  0.3826014 ]]
score 0.7142857142857143


In [15]:
# Grid Search
from sklearn.model_selection import GridSearchCV

parameters = {'loss':('hinge','log','modified_huber'),
              'penalty':('l2','l1','elasticnet'),
              'alpha':[1e-6,1e-5,1e-4,1e-3,1e-2,1e-1,1],
              'max_iter':[5000,10000,20000]}


model3 = SGDClassifier(random_state=0)

clf = GridSearchCV(model3, parameters)
clf.fit(X_train,Y_train)

GridSearchCV(estimator=SGDClassifier(random_state=0),
             param_grid={'alpha': [1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1, 1],
                         'loss': ('hinge', 'log', 'modified_huber'),
                         'max_iter': [5000, 10000, 20000],
                         'penalty': ('l2', 'l1', 'elasticnet')})

In [16]:
clf.best_params_

{'alpha': 0.1,
 'loss': 'modified_huber',
 'max_iter': 5000,
 'penalty': 'elasticnet'}

In [17]:
clf.best_score_

0.7703718512594963

In [18]:
clf.best_estimator_.intercept_

array([-0.3031429])