In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import statsmodels.api as sm

from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
# Lift pandas' display truncation so every column and every row is rendered.
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, None)

In [3]:
# Load the dataset from the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='diabetes.csv')
# Trailing expression: the cell displays the (rows, columns) tuple.
data.shape

(768, 9)

In [4]:
# Preview the first five rows (five is head()'s default, spelled out here).
data.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
# Print the frame summary: index range, per-column non-null counts and dtypes, memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
# NOTE(review): several measurement columns report a minimum of 0 (Glucose,
# BloodPressure, SkinThickness, Insulin, BMI) - presumably zeros stand in for
# missing values; confirm before interpreting the model coefficients.
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [7]:
# Split predictors from the target by column NAME rather than by position, so a
# reordered or extended CSV cannot silently turn a different column into the label.
# For the current file ('Outcome' is the last column) this yields the same X and y
# as taking "all but the last column" / "the last column".
X = data.drop(columns='Outcome')
y = data['Outcome']

In [8]:
# Sanity check: feature-matrix shape on the first line, target shape on the second.
print(X.shape, y.shape, sep='\n')

(768, 8)
(768,)


In [9]:
# Preview the first five rows of the feature matrix (five is head()'s default).
X.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [10]:
# statsmodels does not add an intercept on its own, so prepend a 'const' column of ones.
# The arguments below are add_constant's documented defaults, written out explicitly;
# has_constant='skip' leaves X unchanged if a constant column is already present,
# which keeps this cell safe to re-run.
X = sm.add_constant(X, prepend=True, has_constant='skip')
X.head(n=5)

Unnamed: 0,const,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,1.0,6,148,72,35,0,33.6,0.627,50
1,1.0,1,85,66,29,0,26.6,0.351,31
2,1.0,8,183,64,0,0,23.3,0.672,32
3,1.0,1,89,66,23,94,28.1,0.167,21
4,1.0,0,137,40,35,168,43.1,2.288,33


In [11]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=51,
)
print(f"Train Data shape X: {X_train.shape}, y : {y_train.shape}")
print(f"Test Data shape X: {X_test.shape}, y : {y_test.shape}")

Train Data shape X: (614, 9), y : (614,)
Test Data shape X: (154, 9), y : (154,)


In [13]:
# Specify the logistic regression (target = endog, design matrix = exog), fit it on
# the training rows, and print the coefficient table.
logistic_model = sm.Logit(endog=y_train, exog=X_train)
trained_logistic_model = logistic_model.fit()
print(trained_logistic_model.summary())

Optimization terminated successfully.
         Current function value: 0.472861
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                Outcome   No. Observations:                  614
Model:                          Logit   Df Residuals:                      605
Method:                           MLE   Df Model:                            8
Date:                Mon, 15 Apr 2024   Pseudo R-squ.:                  0.2651
Time:                        11:39:47   Log-Likelihood:                -290.34
converged:                       True   LL-Null:                       -395.06
Covariance Type:            nonrobust   LLR p-value:                 6.490e-41
                               coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                       -8.1883      0.789    -10.374      0.000      -9.735

In [14]:
# Score the held-out rows; the fitted model returns one probability per test row.
predicted_probabilities = trained_logistic_model.predict(exog=X_test)
print(predicted_probabilities)

737    0.124923
505    0.133688
296    0.212052
711    0.319903
329    0.159495
705    0.147763
23     0.312224
316    0.038056
446    0.083506
124    0.132264
72     0.816088
30     0.388341
481    0.182700
256    0.254276
395    0.473059
550    0.108067
590    0.835138
58     0.869551
688    0.246047
551    0.097965
681    0.802381
5      0.143238
464    0.379136
85     0.203360
74     0.056812
399    0.823076
317    0.696221
429    0.075549
111    0.621439
644    0.134616
611    0.732035
508    0.147362
653    0.199084
606    0.876850
564    0.089433
203    0.039795
255    0.237716
576    0.255451
568    0.489422
525    0.051590
66     0.212944
184    0.318752
169    0.126774
412    0.612590
183    0.050004
337    0.250397
29     0.263311
126    0.431793
532    0.264520
545    0.829675
548    0.537734
510    0.214256
571    0.100380
199    0.309004
313    0.239298
302    0.083678
331    0.072878
518    0.214291
584    0.237265
524    0.240476
281    0.540265
473    0.374253
439    0

In [17]:
# Decision cut-off: a probability strictly above this value is labelled 1, otherwise 0.
threshold = 0.5

# Series.gt is the method form of the ">" comparison; the boolean mask is cast to 0/1.
predictions = predicted_probabilities.gt(threshold).astype(int)
print(predictions)

737    0
505    0
296    0
711    0
329    0
705    0
23     0
316    0
446    0
124    0
72     1
30     0
481    0
256    0
395    0
550    0
590    1
58     1
688    0
551    0
681    1
5      0
464    0
85     0
74     0
399    1
317    1
429    0
111    1
644    0
611    1
508    0
653    0
606    1
564    0
203    0
255    0
576    0
568    0
525    0
66     0
184    0
169    0
412    1
183    0
337    0
29     0
126    0
532    0
545    1
548    1
510    0
571    0
199    0
313    0
302    0
331    0
518    0
584    0
524    0
281    1
473    0
439    0
254    0
503    0
312    0
749    1
325    0
418    0
687    0
744    1
224    0
629    0
595    1
444    0
215    1
554    0
746    1
699    1
492    0
596    0
328    0
266    1
406    0
363    1
613    0
354    0
630    0
238    1
609    0
534    0
367    0
165    0
250    0
118    0
182    0
479    0
267    1
284    0
585    0
541    0
16     0
677    0
553    0
147    0
445    1
427    1
627    0
14     1
557    0
32     0
1

In [18]:
# Report the share of test rows classified correctly, then the confusion matrix of
# true labels against predicted labels.
print(f"Accuracy of the model: {accuracy_score(y_test,predictions)}")
print(confusion_matrix(y_true=y_test, y_pred=predictions))

Accuracy of the model: 0.7922077922077922
[[90  7]
 [25 32]]


In [20]:
# Second experiment: the same 80/20 split and seed, with SkinThickness and Age removed
# from the design matrix. This rebinds X_train / X_test / y_train / y_test.
X_train, X_test, y_train, y_test = train_test_split(
    X.drop(columns=['SkinThickness', 'Age']),
    y,
    test_size=0.2,
    random_state=51,
)
print(f"Train Data shape X: {X_train.shape}, y : {y_train.shape}")
print(f"Test Data shape X: {X_test.shape}, y : {y_test.shape}")

Train Data shape X: (614, 7), y : (614,)
Test Data shape X: (154, 7), y : (154,)


In [21]:
# Fit a logistic regression on whatever X_train / y_train currently hold and print
# its coefficient table. This rebinds logistic_model and trained_logistic_model.
logistic_model = sm.Logit(endog=y_train, exog=X_train)
trained_logistic_model = logistic_model.fit()
print(trained_logistic_model.summary())

Optimization terminated successfully.
         Current function value: 0.473272
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                Outcome   No. Observations:                  614
Model:                          Logit   Df Residuals:                      607
Method:                           MLE   Df Model:                            6
Date:                Mon, 15 Apr 2024   Pseudo R-squ.:                  0.2644
Time:                        11:55:52   Log-Likelihood:                -290.59
converged:                       True   LL-Null:                       -395.06
Covariance Type:            nonrobust   LLR p-value:                 2.359e-42
                               coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                       -8.0864      0.771    -10.495      0.000      -9.597

In [22]:
# Predicted probabilities for the held-out rows from the most recently fitted model.
predicted_probabilities = trained_logistic_model.predict(exog=X_test)
print(predicted_probabilities)

737    0.121153
505    0.142805
296    0.211768
711    0.311441
329    0.154669
705    0.149769
23     0.325216
316    0.039193
446    0.085262
124    0.140833
72     0.833782
30     0.345971
481    0.176512
256    0.245788
395    0.488425
550    0.110512
590    0.832636
58     0.867211
688    0.256119
551    0.099685
681    0.801711
5      0.153066
464    0.414261
85     0.205847
74     0.057298
399    0.830038
317    0.715436
429    0.068671
111    0.619532
644    0.137070
611    0.731070
508    0.152615
653    0.207698
606    0.883875
564    0.093251
203    0.040459
255    0.239909
576    0.255499
568    0.483684
525    0.054332
66     0.199658
184    0.320482
169    0.133339
412    0.635913
183    0.053502
337    0.247938
29     0.272055
126    0.432808
532    0.249851
545    0.836517
548    0.494619
510    0.208618
571    0.112759
199    0.316314
313    0.250409
302    0.079547
331    0.074242
518    0.226382
584    0.230615
524    0.256719
281    0.547845
473    0.370989
439    0

In [23]:
# Decision cut-off: a probability strictly above this value is labelled 1, otherwise 0.
threshold = 0.5

# Method-form comparison (same as ">"), then cast the boolean mask to 0/1 labels.
predictions = predicted_probabilities.gt(threshold).astype(int)
print(predictions)

737    0
505    0
296    0
711    0
329    0
705    0
23     0
316    0
446    0
124    0
72     1
30     0
481    0
256    0
395    0
550    0
590    1
58     1
688    0
551    0
681    1
5      0
464    0
85     0
74     0
399    1
317    1
429    0
111    1
644    0
611    1
508    0
653    0
606    1
564    0
203    0
255    0
576    0
568    0
525    0
66     0
184    0
169    0
412    1
183    0
337    0
29     0
126    0
532    0
545    1
548    0
510    0
571    0
199    0
313    0
302    0
331    0
518    0
584    0
524    0
281    1
473    0
439    0
254    0
503    0
312    0
749    1
325    0
418    0
687    0
744    1
224    0
629    0
595    1
444    0
215    1
554    0
746    1
699    1
492    0
596    0
328    0
266    1
406    0
363    1
613    0
354    0
630    0
238    1
609    0
534    0
367    0
165    0
250    0
118    0
182    0
479    0
267    1
284    0
585    0
541    0
16     0
677    0
553    0
147    0
445    1
427    1
627    0
14     1
557    0
32     0
1

In [24]:
# Accuracy and confusion matrix (true labels vs. predicted labels) for the current
# predictions on the test split.
print(f"Accuracy of the model: {accuracy_score(y_test,predictions)}")
print(confusion_matrix(y_true=y_test, y_pred=predictions))

Accuracy of the model: 0.7922077922077922
[[91  6]
 [26 31]]
