In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# NOTE(review): this silences *every* warning for the rest of the notebook,
# including deprecation and convergence warnings raised by the pandas and
# scikit-learn calls below. Consider narrowing the filter to specific
# categories once the notebook is stable.
warnings.filterwarnings("ignore")

In [2]:
# Load the raw data file. It has no header row, so pandas assigns integer
# column labels (0..8); readable names are attached in a later cell.
# Fix: the file name was missing its opening quote, which made this cell a
# SyntaxError.
diabetes_data = pd.read_csv('diabetes.data', header=None)

In [3]:
# Preview the first rows; columns are still labelled 0..8 at this point.
diabetes_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
# Column labels for the nine positional columns of the raw file, in file
# order; the last column is the classification target.
feature_names = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigree',
    'Age',
    'label',
]

In [5]:
# Replace the integer column labels with the descriptive names defined above.
# pandas raises if the number of names does not match the number of columns.
diabetes_data.columns = feature_names

In [7]:
# Preview again to confirm the descriptive column names are in place.
diabetes_data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigree,Age,label
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [8]:
# (rows, columns) of the loaded data.
diabetes_data.shape

(768, 9)

#### Data Manipulation

In [9]:
# Count explicit NaNs per column. This only detects real NaN values; zeros
# used as placeholders for missing measurements are handled in later cells.
diabetes_data.isna().sum()

Pregnancies         0
Glucose             0
BloodPressure       0
SkinThickness       0
Insulin             0
BMI                 0
DiabetesPedigree    0
Age                 0
label               0
dtype: int64

In [10]:
# Summary statistics per column. Check the `min` row: a minimum of 0 in
# Glucose, BloodPressure, SkinThickness, Insulin or BMI is presumably a
# placeholder for a missing measurement rather than a real reading.
diabetes_data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigree,Age,label
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [11]:
# Work on a copy without 'Pregnancies' and 'label': those two columns are set
# aside here and re-attached unchanged after imputation, so their zeros are
# not touched by the zero-to-NaN replacement that follows.
diabetes = diabetes_data.drop(columns=['Pregnancies', 'label'])

In [12]:
# Turn every 0 in the remaining columns into NaN so it can be imputed below.
# This mutates `diabetes` in place; re-running the cell is harmless because
# no zeros are left after the first run.
diabetes.replace(0, np.nan, inplace=True)

In [13]:
# Missing values per column after the zero-to-NaN replacement, worst first.
missing_per_column = diabetes.isna().sum()
missing_per_column.sort_values(ascending=False)

Insulin             374
SkinThickness       227
BloodPressure        35
BMI                  11
Glucose               5
Age                   0
DiabetesPedigree      0
dtype: int64

In [14]:
# Fill missing Insulin readings with the rounded mean of the observed values.
#
# The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on `diabetes['Insulin']`. That chained in-place
# call operates on an intermediate Series: pandas deprecates the pattern, and
# with Copy-on-Write enabled it leaves `diabetes` unchanged, in which case the
# median imputer below would silently fill Insulin with a different value.
insulin_mean = np.round(diabetes['Insulin'].mean())
diabetes['Insulin'] = diabetes['Insulin'].fillna(insulin_mean)

In [15]:
from sklearn.impute import SimpleImputer

# Imputer that fills each column's remaining NaNs with that column's median.
imputer = SimpleImputer(strategy="median")

In [16]:
# Learn per-column medians and fill the remaining NaNs in one step.
# NOTE(review): the imputer is fitted on the full dataset, before the
# train/test split performed further down, so test rows contribute to the
# medians. Consider splitting first and fitting the imputer on the training
# rows only.
X_data = imputer.fit_transform(diabetes)

In [17]:
# Wrap the imputed values back into a DataFrame with the original column
# names. The original row index is carried over as well: the next cell assigns
# columns from `diabetes_data`, and pandas aligns those assignments on index
# labels, so a mismatched index would silently produce NaNs.
diabetes_df = pd.DataFrame(X_data, columns=diabetes.columns, index=diabetes.index)

In [18]:
# Re-attach the two columns that were set aside before imputation; they are
# copied unchanged from the original frame.
diabetes_df['Pregnancies'] = diabetes_data['Pregnancies']
diabetes_df['label'] = diabetes_data['label']

In [19]:
# Preview the imputed frame: the former zeros are now filled values, and
# 'Pregnancies' / 'label' sit at the end of the column list.
diabetes_df.head()

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigree,Age,Pregnancies,label
0,148.0,72.0,35.0,156.0,33.6,0.627,50.0,6,1
1,85.0,66.0,29.0,156.0,26.6,0.351,31.0,1,0
2,183.0,64.0,29.0,156.0,23.3,0.672,32.0,8,1
3,89.0,66.0,23.0,94.0,28.1,0.167,21.0,1,0
4,137.0,40.0,35.0,168.0,43.1,2.288,33.0,0,1


#### Features & Target

In [20]:
# Split the frame into the model inputs (every column except 'label') and the
# target Series ('label').
Y_target = diabetes_df['label']
X_feature = diabetes_df.drop(columns=['label'])

In [21]:
# (rows, feature columns) of the model inputs.
X_feature.shape

(768, 8)

In [22]:
# Length of the target vector; should match the row count of X_feature.
Y_target.shape

(768,)

In [23]:
from sklearn.model_selection import train_test_split

# Hold out a test set. No test_size is given, so scikit-learn's default split
# ratio applies; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X_feature,
    Y_target,
    random_state=1,
)

In [24]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with scikit-learn's default hyperparameters, fitted on
# the training split.
# NOTE(review): the features are not scaled and all warnings are silenced at
# the top of the notebook, so a convergence warning from the solver would not
# be visible here — TODO confirm the fit converges.
logReg = LogisticRegression()
logReg.fit(x_train,y_train)

In [25]:
# Predicted class labels for the held-out test rows.
y_pred = logReg.predict(x_test)

In [26]:
from sklearn import metrics
# Fraction of test rows whose predicted label equals the true label.
# NOTE(review): the classes are imbalanced (the mean of 'label' in the
# describe() output above is about 0.35), so always predicting 0 would already
# score roughly 0.65 on the full data; read accuracy with that in mind.
print(metrics.accuracy_score(y_test, y_pred))

0.7760416666666666


In [27]:
from sklearn.metrics import confusion_matrix
# scikit-learn convention: rows are the true classes and columns are the
# predicted classes, so entry [1, 0] counts positives predicted as negative.
confusion_matrix(y_test, y_pred)

array([[108,  15],
       [ 28,  41]], dtype=int64)

In [28]:
# Eyeball the first 30 test labels against the corresponding predictions.
print('actual:   ', y_test.to_numpy()[:30])
print('predicted:', y_pred[:30])

actual:    [0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 1 0 0 0 1 1 1 1 0 0 0 1 0 1]
predicted: [0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0]
