In [99]:
import numpy as np

In [100]:
import pandas as pd

In [101]:
import seaborn as sns

In [102]:
from scipy.stats import zscore

In [103]:
import matplotlib.pyplot as plt
%matplotlib inline

In [104]:
# Load the diabetes dataset from the CSV file in the working directory.
mydata = pd.read_csv("diabetes.csv")

In [105]:
# Preview the first 10 rows to sanity-check the loaded columns and values.
mydata.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [106]:
# Summarize the row count, column dtypes and non-null counts.
mydata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [107]:
# The dataset has 768 rows and 9 columns: 7 integer columns and 2 float columns.

In [108]:
# Let us check whether the dataset contains any non-numeric values.

In [109]:
# Show the rows that contain at least one non-real-number entry.
# The element-wise check goes through Series.map on each column because
# DataFrame.applymap is deprecated (pandas >= 2.1) and emits a FutureWarning;
# Series.map behaves the same on older and newer pandas versions.
is_real_mask = mydata.apply(lambda col: col.map(np.isreal))
mydata[~is_real_mask.all(axis=1)]

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome


In [110]:
# Since no rows are returned, we can conclude that there are no non-numeric values or junk characters.

In [111]:
# After analysing the dataset, we can see '0' values in the columns 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin' and 'BMI',
# which is not physiologically possible for a living person, so these zeros must represent missing values.
# Let's count the missing (zero) values in these columns.

In [112]:
# Count, per column, how many entries are exactly 0 in the measurement
# columns where a zero reading is treated as a missing value.
suspect_cols = ['Glucose','BloodPressure','SkinThickness','Insulin','BMI']
zero_counts = mydata[suspect_cols].eq(0).sum()
print(zero_counts)

Glucose            5
BloodPressure     35
SkinThickness    227
Insulin          374
BMI               11
dtype: int64


In [113]:
# Dropping every row with a missing value would shrink the dataset drastically, from 768 rows to 392.
# So, let's first convert those '0' values to NaN.


In [114]:
# Zeros in these measurement columns stand for missing readings, so convert
# them to NaN. np.nan is used because the np.NaN alias was removed in
# NumPy 2.0 and raises AttributeError there.
zero_as_missing_cols = ['Glucose','BloodPressure','SkinThickness','Insulin','BMI']
mydata[zero_as_missing_cols] = mydata[zero_as_missing_cols].replace(0, np.nan)

In [115]:
# Count the NaN entries in each column.
mydata.isnull().sum()

Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [116]:
# Peek at the first rows to confirm the zero placeholders now appear as NaN.
mydata.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,,33.6,0.627,50,1
1,1,85.0,66.0,29.0,,26.6,0.351,31,0
2,8,183.0,64.0,,,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


In [117]:
# Next, let's replace the missing values (NaN) in each column with that column's mean.

In [118]:
# Per-column means; the same statistic is used to fill the NaN values in the imputation step.
mydata.mean()

Pregnancies                   3.845052
Glucose                     121.686763
BloodPressure                72.405184
SkinThickness                29.153420
Insulin                     155.548223
BMI                          32.457464
DiabetesPedigreeFunction      0.471876
Age                          33.240885
Outcome                       0.348958
dtype: float64

In [119]:
# Impute every remaining NaN with its column mean, modifying mydata in place.
# NOTE(review): the means are computed over the full dataset, i.e. before the
# train/test split performed later in the notebook, so test rows influence
# the imputed values (data leakage). Consider imputing after the split.
mydata.fillna(mydata.mean(), inplace = True)

In [120]:
# Re-count the NaN entries per column to verify the imputation left none behind.
mydata.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [121]:
# Now we can see that there are no missing values left in the dataset.

In [122]:
# Preview the first 10 rows after imputation.
mydata.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,155.548223,33.6,0.627,50,1
1,1,85.0,66.0,29.0,155.548223,26.6,0.351,31,0
2,8,183.0,64.0,29.15342,155.548223,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
5,5,116.0,74.0,29.15342,155.548223,25.6,0.201,30,0
6,3,78.0,50.0,32.0,88.0,31.0,0.248,26,1
7,10,115.0,72.405184,29.15342,155.548223,35.3,0.134,29,0
8,2,197.0,70.0,45.0,543.0,30.5,0.158,53,1
9,8,125.0,96.0,29.15342,155.548223,32.457464,0.232,54,1


In [123]:
# Class balance: number of rows for each Outcome value.
mydata.groupby('Outcome').size()

Outcome
0    500
1    268
dtype: int64

In [124]:
# Summary statistics, transposed so that each attribute occupies one row.
mydata.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Pregnancies,768.0,3.845052,3.369578,0.0,1.0,3.0,6.0,17.0
Glucose,768.0,121.686763,30.435949,44.0,99.75,117.0,140.25,199.0
BloodPressure,768.0,72.405184,12.096346,24.0,64.0,72.202592,80.0,122.0
SkinThickness,768.0,29.15342,8.790942,7.0,25.0,29.15342,32.0,99.0
Insulin,768.0,155.548223,85.021108,14.0,121.5,155.548223,155.548223,846.0
BMI,768.0,32.457464,6.875151,18.2,27.5,32.4,36.6,67.1
DiabetesPedigreeFunction,768.0,0.471876,0.331329,0.078,0.24375,0.3725,0.62625,2.42
Age,768.0,33.240885,11.760232,21.0,24.0,29.0,41.0,81.0
Outcome,768.0,0.348958,0.476951,0.0,0.0,0.0,1.0,1.0


In [125]:
# Pima Diabetes Project - Insight Summary:

# From the exploratory data analysis, BloodPressure, BMI and Glucose are the attributes that
# most strongly influence the Outcome, i.e. whether a woman is diabetic or non-diabetic.
# The dataset contained missing values, which were identified and replaced with the mean of the respective attribute.
# There are many outliers in the dataset, as shown by the box plots.
# The mean and median of BloodPressure, BMI and Glucose are very close, which suggests approximately normal (symmetric) distributions.
# The histogram and pairplot show that BloodPressure follows a normal distribution particularly well.
# Apart from BloodPressure, BMI and Glucose, the remaining attributes are mostly right-skewed or asymmetrical.
# Good correlation exists between Age and Pregnancies, Glucose and Outcome, BMI and SkinThickness, and Glucose and Insulin.
# There is a reasonably good linear relationship between BloodPressure and Glucose.

In [126]:
## Model Building

In [127]:
# Quick look at the cleaned data that feeds the model.
mydata.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,155.548223,33.6,0.627,50,1
1,1,85.0,66.0,29.0,155.548223,26.6,0.351,31,0
2,8,183.0,64.0,29.15342,155.548223,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


In [128]:
from sklearn.model_selection import train_test_split
# Metrics used to evaluate the classifiers.
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

# Label vector: the Outcome column.
target = mydata["Outcome"]

# Feature frame: every column except Outcome.
# (An earlier experiment kept only Glucose, BloodPressure and BMI as features.)
features = mydata.drop(columns=["Outcome"])

In [129]:
# Convert every feature to z-scores, since the original units/scales are unknown;
# scaling numeric attributes is advisable for models that compute distances.
# NOTE(review): the z-scores use statistics from the full dataset, i.e. before the
# train/test split performed later, so test rows influence the scaling.
features_df_z = features.apply(zscore)  # standardize each column independently
# describe() reports a std of about 1.000652 rather than exactly 1 because zscore
# divides by the population std (ddof=0) while describe() reports the sample std (ddof=1).
features_df_z.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,2.5442610000000002e-17,-3.301757e-16,6.966722e-16,6.866252e-16,-2.352033e-16,3.090699e-16,2.462585e-16,1.8576e-16
std,1.000652,1.000652,1.000652,1.000652,1.000652,1.000652,1.000652,1.000652
min,-1.141852,-2.554131,-4.004245,-2.52167,-1.665945,-2.075119,-1.189553,-1.041549
25%,-0.8448851,-0.7212214,-0.695306,-0.4727737,-0.4007289,-0.7215397,-0.6889685,-0.7862862
50%,-0.2509521,-0.1540881,-0.01675912,8.087936e-16,-3.345079e-16,-0.008363615,-0.3001282,-0.3608474
75%,0.6399473,0.610309,0.6282695,0.3240194,-3.345079e-16,0.6029301,0.4662269,0.6602056
max,3.906578,2.54185,4.102655,7.950467,8.126238,5.042087,5.883565,4.063716


In [130]:
# Convert the standardized feature frame to a NumPy array and confirm its shape.
X = np.array(features_df_z)
X.shape

(768, 8)

In [131]:
# Store the Outcome labels in a separate NumPy array and confirm its shape.
y = np.array(target)
y.shape

(768,)

In [132]:
# Split X and y into training and test sets in a 75:25 ratio; random_state is
# fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

In [133]:
# Baseline model: support vector classifier with a linear kernel.
# The features were already z-scored earlier in the notebook, so no further
# scaling is applied here.
from sklearn.svm import SVC

# `degree` is deliberately not passed: scikit-learn only uses it for
# kernel='poly' and ignores it for every other kernel, so the previous
# degree=5 had no effect on this linear model.
svc_model = SVC(C=0.5, kernel='linear')
svc_model.fit(X_train, y_train)
prediction = svc_model.predict(X_test)

# Mean accuracy on the training split, then on the held-out test split.
print(svc_model.score(X_train, y_train))
print(svc_model.score(X_test, y_test))

# Test-set metrics computed from the predictions.
print("Accuracy:",accuracy_score(y_test, prediction))
print("Precision:",precision_score(y_test, prediction))
print("Recall:",recall_score(y_test, prediction))

0.7673611111111112
0.7760416666666666
Accuracy: 0.7760416666666666
Precision: 0.7407407407407407
Recall: 0.5797101449275363


In [134]:
# Confusion matrix on the test set: rows are the actual classes, columns the predicted classes.
print("Confusion Matrix:\n",confusion_matrix(y_test,prediction))

Confusion Matrix:
 [[109  14]
 [ 29  40]]


In [135]:
# Refit the support vector classifier with the RBF kernel; every other
# hyperparameter stays at its default.
svc_model = SVC(kernel='rbf').fit(X_train, y_train)
prediction = svc_model.predict(X_test)

# Mean accuracy on the training split, then on the test split.
for split_X, split_y in ((X_train, y_train), (X_test, y_test)):
    print(svc_model.score(split_X, split_y))

# Test-set metrics computed from the predictions.
for label, metric in (("Accuracy:", accuracy_score),
                      ("Precision:", precision_score),
                      ("Recall:", recall_score)):
    print(label, metric(y_test, prediction))

0.8194444444444444
0.7864583333333334
Accuracy: 0.7864583333333334
Precision: 0.7916666666666666
Recall: 0.5507246376811594


In [136]:
# Refit the support vector classifier with the polynomial kernel; every other
# hyperparameter stays at its default.
svc_model = SVC(kernel='poly').fit(X_train, y_train)
prediction = svc_model.predict(X_test)

# Mean accuracy on the training split, then on the test split.
for split_X, split_y in ((X_train, y_train), (X_test, y_test)):
    print(svc_model.score(split_X, split_y))

# Test-set metrics computed from the predictions.
for label, metric in (("Accuracy:", accuracy_score),
                      ("Precision:", precision_score),
                      ("Recall:", recall_score)):
    print(label, metric(y_test, prediction))

0.78125
0.7239583333333334
Accuracy: 0.7239583333333334
Precision: 0.7352941176470589
Recall: 0.36231884057971014


In [137]:
# Refit the support vector classifier with the sigmoid kernel; every other
# hyperparameter stays at its default.
svc_model = SVC(kernel='sigmoid').fit(X_train, y_train)
prediction = svc_model.predict(X_test)

# Mean accuracy on the training split, then on the test split.
for split_X, split_y in ((X_train, y_train), (X_test, y_test)):
    print(svc_model.score(split_X, split_y))

# Test-set metrics computed from the predictions.
for label, metric in (("Accuracy:", accuracy_score),
                      ("Precision:", precision_score),
                      ("Recall:", recall_score)):
    print(label, metric(y_test, prediction))

0.6666666666666666
0.7239583333333334
Accuracy: 0.7239583333333334
Precision: 0.6052631578947368
Recall: 0.6666666666666666


# The RBF kernel gives the highest test accuracy (0.786), with the linear kernel close behind (0.776);
# the poly and sigmoid kernels both reach 0.724.