In [101]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
import warnings
warnings.filterwarnings('ignore')

In [102]:
# Data collection and analysis

In [103]:
# Load the diabetes dataset from a CSV file located in the current working directory.
diabetes_dataset = pd.read_csv('diabetes.csv')

In [104]:
# Display the loaded frame (the rendered output elides the middle rows).
diabetes_dataset

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [105]:
# Number of rows and columns in this dataset
diabetes_dataset.shape

(768, 9)

In [106]:
# Summary statistics for every column.
# NOTE(review): Glucose, BloodPressure, SkinThickness, Insulin and BMI all have a
# minimum of 0 in the output — these zeros possibly encode missing measurements; confirm.
diabetes_dataset.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [107]:
# Count missing (NaN) values per column.
diabetes_dataset.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [108]:
# Number of rows per class in the 'Outcome' label column.
diabetes_dataset['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [109]:
# Feature matrix: every column except the 'Outcome' label.
features = diabetes_dataset.drop('Outcome',axis=1)

In [110]:
# Label column. The double brackets select a single-column DataFrame,
# so `target` is 2-D (n_rows, 1) rather than a 1-D Series.
target = diabetes_dataset[['Outcome']]

In [111]:
# Preview the first rows of the feature matrix.
features.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [112]:
# Preview the first rows of the label frame.
target.head()

Unnamed: 0,Outcome
0,1
1,0
2,1
3,0
4,1


## Data Standardization

In [113]:
# Scaler used to standardize the features; the same fitted instance is reused
# later to transform new input before prediction.
std_scaler = StandardScaler()

In [114]:
# Fit the scaler and standardize the features. This rebinds `features` from the
# DataFrame built earlier to the NumPy array returned by fit_transform.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test rows contribute to the scaling statistics — consider fitting on
# the training split only.
features = std_scaler.fit_transform(features)

In [115]:
# Inspect the standardized feature array.
features

array([[ 0.63994726,  0.84832379,  0.14964075, ...,  0.20401277,
         0.46849198,  1.4259954 ],
       [-0.84488505, -1.12339636, -0.16054575, ..., -0.68442195,
        -0.36506078, -0.19067191],
       [ 1.23388019,  1.94372388, -0.26394125, ..., -1.10325546,
         0.60439732, -0.10558415],
       ...,
       [ 0.3429808 ,  0.00330087,  0.14964075, ..., -0.73518964,
        -0.68519336, -0.27575966],
       [-0.84488505,  0.1597866 , -0.47073225, ..., -0.24020459,
        -0.37110101,  1.17073215],
       [-0.84488505, -0.8730192 ,  0.04624525, ..., -0.20212881,
        -0.47378505, -0.87137393]])

## Train Test Split

In [116]:
# Use 75% of the rows for training and hold out the rest for testing;
# random_state is fixed so the split is the same on every run.
# NOTE(review): the split is not stratified although the classes are imbalanced
# (500 vs 268) — consider passing stratify=target.
xtrain,xtest,ytrain,ytest = train_test_split(features,target,train_size=0.75,random_state=100)

In [117]:
# Sanity-check the sizes of the four objects produced by the split.
for split in (xtrain, xtest, ytrain, ytest):
    print(split.shape)

(576, 8)
(192, 8)
(576, 1)
(192, 1)


## Training the model

In [118]:
# Initial SVM classifier (RBF kernel) with fixed C and gamma, trained before the grid search.
svc = SVC(C=1.0,kernel='rbf',gamma=0.2)

In [119]:
# Train the classifier. `ytrain` is a single-column DataFrame of shape (n, 1);
# flatten it to the 1-D label array that fit() expects instead of relying on an
# implicit conversion whose warning is silenced by the global warnings filter.
svc.fit(xtrain, np.ravel(ytrain))

## Model Evaluation

In [120]:
# Predict labels for the held-out test features.
ypred = svc.predict(xtest)

In [121]:
# Fraction of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=ytest, y_pred=ypred)

In [122]:
# Report the accuracy of the initial model on the test split.
print('accuracy score of the testing data',accuracy)

accuracy score of the testing data 0.7135416666666666


## Grid Search

In [123]:
# Hyper-parameter grid for the search: every combination of the listed
# C, kernel and gamma values is a candidate.
Params = {
    'C': list(range(1, 9)),
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': [0.1, 0.2],
}

In [124]:
# Cross-validated search over the `Params` grid, using the SVC defined above as the base estimator.
grid_search = GridSearchCV(estimator=svc, param_grid=Params)

In [125]:
# Run the search on the training split. `ytrain` is a single-column DataFrame of
# shape (n, 1); flatten it to a 1-D label array so every underlying fit receives
# labels in the expected shape rather than triggering a silenced conversion warning.
grid_search.fit(xtrain, np.ravel(ytrain))

In [126]:
# Parameter combination selected by the grid search.
grid_search.best_params_

{'C': 6, 'gamma': 0.1, 'kernel': 'linear'}

### Model Training on the Basis of Grid Search parameters

In [127]:
# Rebuild the classifier from the parameters the grid search selected, instead of
# hand-copying them from the printed output, so this cell stays in sync if the
# data, the grid or the search result changes.
svc = SVC(**grid_search.best_params_)

In [128]:
# Train the tuned classifier. `ytrain` is a single-column DataFrame of shape (n, 1);
# flatten it to the 1-D label array that fit() expects instead of relying on an
# implicit conversion whose warning is silenced by the global warnings filter.
svc.fit(xtrain, np.ravel(ytrain))

In [129]:
# Predict test labels with the re-trained classifier (overwrites the earlier `ypred`).
ypred = svc.predict(xtest)

In [130]:
# Fraction of test rows the re-trained classifier labels correctly.
accuracy = accuracy_score(y_true=ytest, y_pred=ypred)

In [131]:
# Report the accuracy of the re-trained model on the test split.
print('accuracy score of the testing data',accuracy)

accuracy score of the testing data 0.7291666666666666


## Making a Predictive system

In [132]:
# One sample to classify, with values in the same order as the feature columns.
input_data = (6,148,72,35,0,33.6,0.627,50)

# Convert the input data into a NumPy array
input_data_as_numpy_array = np.asarray(input_data)

# Reshape to a single row, since we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

# The scaler was fitted on a DataFrame, so attach the same column names to the
# new row. This makes the value-to-feature mapping explicit and avoids the
# feature-name mismatch warning raised when a bare array is passed instead.
feature_names = diabetes_dataset.columns.drop('Outcome')
input_frame = pd.DataFrame(input_data_reshaped, columns=feature_names)

# Standardize the input data with the already-fitted scaler
std_data = std_scaler.transform(input_frame)
std_data

array([[ 0.63994726,  0.84832379,  0.14964075,  0.90726993, -0.69289057,
         0.20401277,  0.46849198,  1.4259954 ]])

In [133]:
# Classify the standardized sample and show the raw prediction array.
prediction = svc.predict(std_data)
print(prediction)

# A single row was passed in, so pull out its one label as a plain scalar
# before mapping it to a human-readable message.
predicted_label = prediction.item()
print('Non Diabitic' if predicted_label == 0 else 'Diabatic')

[1]
Diabatic
