In [21]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

In [22]:
data = pd.read_csv('diabetes.csv')
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [23]:
print(data.shape)
print('-'*10)
print(data.info())
print('-'*10)
print(data.isnull().sum())

(768, 9)
----------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
None
----------
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction

In [24]:
data.Outcome.value_counts()
#0--->Non-Diabetic
#1---->Diabetic

0    500
1    268
Name: Outcome, dtype: int64

In [25]:
data.groupby('Outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


In [26]:
X = data.drop('Outcome',axis=1)
y = data['Outcome']

In [27]:
scaler = StandardScaler()

In [28]:
X_scaled = scaler.fit_transform(X)

In [29]:
X_scaled

array([[ 0.63994726,  0.84832379,  0.14964075, ...,  0.20401277,
         0.46849198,  1.4259954 ],
       [-0.84488505, -1.12339636, -0.16054575, ..., -0.68442195,
        -0.36506078, -0.19067191],
       [ 1.23388019,  1.94372388, -0.26394125, ..., -1.10325546,
         0.60439732, -0.10558415],
       ...,
       [ 0.3429808 ,  0.00330087,  0.14964075, ..., -0.73518964,
        -0.68519336, -0.27575966],
       [-0.84488505,  0.1597866 , -0.47073225, ..., -0.24020459,
        -0.37110101,  1.17073215],
       [-0.84488505, -0.8730192 ,  0.04624525, ..., -0.20212881,
        -0.47378505, -0.87137393]])

In [30]:
X_tr,X_te,y_tr,y_te = train_test_split(X_scaled,y,test_size=0.2, stratify=y,random_state=20)

In [31]:
print("Original Shape :",data.shape)
print(X_tr.shape,'\n',y_tr.shape,'\n',X_te.shape,'\n',y_te.shape)

Original Shape : (768, 9)
(614, 8) 
 (614,) 
 (154, 8) 
 (154,)


In [32]:
classifier = svm.SVC(kernel="linear")

In [33]:
kernels = ['linear', 'poly', 'rbf', 'sigmoid']

for kernel in kernels:
    # Create an SVM model with the specified kernel
    model = svm.SVC(kernel=kernel)

    # Train the model
    model.fit(X_tr, y_tr)

    # Make predictions
    y_pred = model.predict(X_te)

In [34]:
classifier.fit(X_tr,y_tr)

SVC(kernel='linear')

In [35]:
train_prediction = model.predict(X_tr)
train_data_accuracy = accuracy_score(train_prediction,y_tr)
print("Accuracy on training data : ",train_data_accuracy)

Accuracy on training data :  0.6921824104234527


In [36]:
test_prediction = model.predict(X_te)
test_data_accuracy = accuracy_score(test_prediction,y_te)
print("Accuracy on testing data : ",test_data_accuracy)

Accuracy on testing data :  0.7272727272727273


In [37]:
def diabetic_prediction(inp):
    inp_array = np.asarray(inp)
    input_data_reshaped = inp_array.reshape(1,-1)
    prediction = classifier.predict(input_data_reshaped)
    if (prediction[0]== 0):
        print('The Person is Non-Diabetic')
    else:
        print('The Person is Diabetic')

In [38]:
data.sample(2)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
583,8,100,76,0,0,38.7,0.19,42,0
766,1,126,60,0,0,30.1,0.349,47,1


In [39]:
D =(6,117,96,0,0,28.7,0.157,30)
print(diabetic_prediction(D))

The Person is Diabetic
None


In [41]:
import pickle 
file_name = 'trained_diabetic_model.sav'
pickle.dump(classifier,open(file_name,'wb'))

In [42]:
loaded_model = pickle.load(open('trained_diabetic_model.sav','rb'))

In [43]:
data.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')