**Importing Dependencies**

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,r2_score
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials

**Data Collection And Analysis** 

In [3]:
## Loading the dataset
## The CSV path is relative, so the file must sit in the notebook's working directory.
## The bare df.head() at the end renders the first five rows as the cell output.
df = pd.read_csv("PIMA_Indians_Diabetes.csv")
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
## Number of rows and columns in the dataset, returned as a (rows, columns) tuple
df.shape

(768, 9)

In [5]:
## Summary statistics (count, mean, std, min, quartiles, max) for every column
## NOTE(review): the output below shows a minimum of 0 for Glucose, BloodPressure,
## SkinThickness, Insulin and BMI -- presumably missing values stored as 0; confirm
## against the data source and decide whether they need imputing before modelling.
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [6]:
## Distribution of Target Variable Categories
## Absolute number of rows per Outcome class; the bare value_counts() expression
## is rendered as the cell output after the printed heading.
print("\nDistribution of Target Variable Categories")
df['Outcome'].value_counts()


Distribution of Target Variable Categories


Outcome
0    500
1    268
Name: count, dtype: int64

In [7]:
## Percentage Of Distribution of Target Variable Categories
## value_counts(normalize=True) yields class fractions; multiplying by 100 gives percentages.
print("\nPercentage of Distribution of Target Variable Categories\n")
class_counts = df['Outcome'].value_counts(normalize=True) * 100
# Round instead of casting to int64: the cast truncates toward zero, so the
# two shares were reported as 65 and 34 and no longer summed to 100.
print(class_counts.round(2))


Percentage of Distribution of Target Variable Categories

Outcome
0    65
1    34
Name: proportion, dtype: int64


- 0 --> Non-diabetic people
- 1 --> Diabetic people

In [9]:
## Mean of every feature within each Outcome class, to compare the two groups side by side
df.groupby('Outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


In [10]:
## Split the frame into the label vector (Outcome) and the feature matrix (all other columns)
y = df['Outcome']
X = df.drop(columns=['Outcome'])

**Data Standardization**

In [12]:
# Create the (still unfitted) StandardScaler used to standardize the feature columns.
scaler = StandardScaler()

In [13]:
# Learn the scaling parameters from the feature matrix.
# NOTE(review): X holds every row here -- the train/test split only happens in a
# later cell -- so test-set rows contribute to the fitted statistics (data leakage).
# Consider splitting first and fitting the scaler on the training rows only.
scaler.fit(X)

In [14]:
# Apply the fitted scaler to all feature rows; the result is kept for the model cells below.
Standardized_Data = scaler.transform(X)

In [15]:
# Display only: wrap the standardized values in a DataFrame with the original
# feature names so they are readable; the frame itself is not stored.
pd.DataFrame(Standardized_Data,columns=X.columns)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.639947,0.848324,0.149641,0.907270,-0.692891,0.204013,0.468492,1.425995
1,-0.844885,-1.123396,-0.160546,0.530902,-0.692891,-0.684422,-0.365061,-0.190672
2,1.233880,1.943724,-0.263941,-1.288212,-0.692891,-1.103255,0.604397,-0.105584
3,-0.844885,-0.998208,-0.160546,0.154533,0.123302,-0.494043,-0.920763,-1.041549
4,-1.141852,0.504055,-1.504687,0.907270,0.765836,1.409746,5.484909,-0.020496
...,...,...,...,...,...,...,...,...
763,1.827813,-0.622642,0.356432,1.722735,0.870031,0.115169,-0.908682,2.532136
764,-0.547919,0.034598,0.046245,0.405445,-0.692891,0.610154,-0.398282,-0.531023
765,0.342981,0.003301,0.149641,0.154533,0.279594,-0.735190,-0.685193,-0.275760
766,-0.844885,0.159787,-0.470732,-1.288212,-0.692891,-0.240205,-0.371101,1.170732


In [16]:
# Rebind X to the standardized values for the modelling cells.
# NOTE(review): X changes here from the feature DataFrame to the scaler's output,
# so re-running the earlier scaler.fit(X) / X.columns cells after this one would
# operate on the already standardized data. A separate name (e.g. X_scaled)
# would keep the cells idempotent.
X = Standardized_Data
# Same expression as the earlier label assignment; y is unchanged by this line.
y = df['Outcome']

**Train Test Split**

In [18]:
# Hold out 20% of the rows for testing. stratify=y keeps the Outcome class ratio
# the same in both partitions, and random_state=2 makes the split reproducible.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,stratify=y,random_state=2)

In [19]:
# Sanity check: shapes of the full, training and test feature matrices.
print(X.shape,X_train.shape,X_test.shape)

(768, 8) (614, 8) (154, 8)


**Training The Model**

In [21]:
# Support-vector classifier with a linear kernel; all other hyperparameters are left at their defaults.
classifier = svm.SVC(kernel='linear')

In [22]:
# Training The SVM Classifier
# Fits on the training partition only; the test partition is reserved for evaluation.
classifier.fit(X_train,y_train)

**Model Evaluation**

In [24]:
## Accuracy Score on Training Data
# Predict on the rows the model was trained on and compare with their true labels.
X_train_Prediction = classifier.predict(X_train)
# accuracy_score's signature is (y_true, y_pred): ground-truth labels go first.
# The value is the same either way for accuracy, but the documented order keeps
# the call correct if it is later swapped for an asymmetric metric.
training_data_accuracy = accuracy_score(y_train, X_train_Prediction)

In [25]:
# Render the training accuracy as the cell's output string.
f"Accuracy Score of the training data : {training_data_accuracy}"

'Accuracy Score of the training data : 0.7866449511400652'

In [26]:
## Accuracy Score on Test Data
# Predict on the held-out rows and compare with their true labels.
X_test_Prediction = classifier.predict(X_test)
# accuracy_score's signature is (y_true, y_pred): ground-truth labels go first.
# The value is the same either way for accuracy, but the documented order keeps
# the call correct if it is later swapped for an asymmetric metric.
test_data_accuracy = accuracy_score(y_test, X_test_Prediction)

In [27]:
# Render the test accuracy as the cell's output string.
f"Accuracy Score of the test data : {test_data_accuracy}"

'Accuracy Score of the test data : 0.7727272727272727'

<p style='font-size:xx-large'><b>Making a Predictive System</b></p>

In [72]:
# One patient's raw (unscaled) feature values, in the same column order as the training features.
input_data = (6,148,72,35,0,33.6,0.627,50)

# Convert the tuple to a NumPy array ...
input_data_as_numpy_array = np.array(input_data)

# ... and add a leading axis: the scaler and the model expect one row per sample.
input_data_reshaped = input_data_as_numpy_array[np.newaxis, :]

# Scale with the already fitted scaler so the values match what the model was trained on.
std_data = scaler.transform(input_data_reshaped)
print(std_data)

# Predict the class for this single row.
prediction = classifier.predict(std_data)
print(prediction)

# Class 0 is the non-diabetic outcome; any other prediction is reported as diabetic.
print("This person doesn't have diabetic" if prediction[0] == 0 else "This person has diabetic")

[[ 0.63994726  0.84832379  0.14964075  0.90726993 -0.69289057  0.20401277
   0.46849198  1.4259954 ]]
[1]
This person has diabetic




**Saving the trained model**

In [64]:
import pickle

In [66]:
# Persist the trained classifier to disk.
# NOTE(review): only the classifier is saved. The prediction cells also rely on the
# fitted scaler, so a consumer of this file needs that scaler persisted as well.
filename = "trained_model.sav"
# The context manager closes the handle (flushing the pickle to disk) even if
# dump() raises, instead of leaving an open file object behind.
with open(filename, 'wb') as model_file:
    pickle.dump(classifier, model_file)

In [68]:
# Loading the saved model
# NOTE: pickle.load can execute arbitrary code while unpickling -- only load
# model files that you created yourself or otherwise trust.
# The context manager closes the file handle once the model has been read.
with open("trained_model.sav", "rb") as model_file:
    loaded_model = pickle.load(model_file)

In [74]:
# Same sample as before, this time scored by the model restored from disk.
input_data = (6,148,72,35,0,33.6,0.627,50)

# Tuple -> NumPy array
input_data_as_numpy_array = np.asarray(input_data)

# A single sample must be presented as a 2-D array with one row
input_data_reshaped = np.reshape(input_data_as_numpy_array, (1, -1))

# Scale with the fitted scaler from the notebook session (it is not part of the pickle)
std_data = scaler.transform(input_data_reshaped)
print(std_data)

prediction = loaded_model.predict(std_data)
print(prediction)

# Any non-zero class is reported as diabetic; class 0 as non-diabetic.
if prediction[0] != 0:
    print("This person has diabetic")
else:
    print("This person doesn't have diabetic")

[[ 0.63994726  0.84832379  0.14964075  0.90726993 -0.69289057  0.20401277
   0.46849198  1.4259954 ]]
[1]
This person has diabetic


