In [32]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

In [35]:
# Read the CSV into a DataFrame; the relative path is resolved against the
# kernel's current working directory.
diabetes_dataset = pd.read_csv(r'diabetes.csv')

In [36]:
# Preview the first rows to sanity-check the column layout.
diabetes_dataset.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [37]:
# (rows, columns) of the loaded frame.
diabetes_dataset.shape

(768, 9)

In [38]:
# Per-column dtype and non-null count summary.
diabetes_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [39]:
# Count missing values in each column.
diabetes_dataset.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [40]:
# Number of rows that exactly repeat an earlier row.
diabetes_dataset.duplicated().sum()

0

In [41]:
# Class balance of the target column.
diabetes_dataset['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [42]:
# Mean of every feature within each Outcome class, to compare the two groups.
diabetes_dataset.groupby('Outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


In [48]:
# Class counts again, shown next to the label legend that follows
# (0 = non-diabetic, 1 = diabetic).
diabetes_dataset['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

- 0 --> Non-Diabetic

- 1 --> Diabetic

In [43]:
# Separate the target column from the predictor columns.
y = diabetes_dataset['Outcome']
X = diabetes_dataset.drop(columns='Outcome')

In [44]:
# Display the feature frame (every column except Outcome).
X

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63
764,2,122,70,27,0,36.8,0.340,27
765,5,121,72,23,112,26.2,0.245,30
766,1,126,60,0,0,30.1,0.349,47


In [45]:
# Display the target labels (the Outcome column).
y

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64

In [61]:
# Standardize every feature column (zero mean, unit variance).
# The scaler is created and fitted in this cell so that it works on a fresh
# kernel; the same fitted `scaler` is reused later to transform new inputs.
# NOTE(review): the scaler is fitted on the full dataset before the
# train/test split, so test-set statistics leak into preprocessing -- consider
# fitting on the training rows only.
scaler = StandardScaler()
standardized_data = scaler.fit_transform(X)
print(standardized_data)

[[ 0.64067858  0.83421941  0.12781322 ...  0.17832154  0.46721679
   1.42611182]
 [-0.84152431 -1.15121561 -0.17913848 ... -0.69709361 -0.3555664
  -0.19227539]
 [ 1.23355973  1.93723887 -0.28145571 ... -1.10978932  0.60136622
  -0.10709711]
 ...
 [ 0.344238   -0.01668131  0.12781322 ... -0.74711733 -0.67156284
  -0.27745366]
 [-0.84152431  0.1408929  -0.48609018 ... -0.25938604 -0.36152859
   1.170577  ]
 [-0.84152431 -0.89909687  0.02549599 ... -0.22186825 -0.46288594
  -0.87370158]]


In [62]:
# From here on X is the standardized feature array and Y the label Series.
# Because X is rebound, re-running the standardization cell after this one
# would feed it already-standardized data.
X, Y = standardized_data, diabetes_dataset['Outcome']

In [63]:
# Confirm the rebinding: X is the standardized array, Y the Outcome labels.
print(X)
print(Y)

[[ 0.64067858  0.83421941  0.12781322 ...  0.17832154  0.46721679
   1.42611182]
 [-0.84152431 -1.15121561 -0.17913848 ... -0.69709361 -0.3555664
  -0.19227539]
 [ 1.23355973  1.93723887 -0.28145571 ... -1.10978932  0.60136622
  -0.10709711]
 ...
 [ 0.344238   -0.01668131  0.12781322 ... -0.74711733 -0.67156284
  -0.27745366]
 [-0.84152431  0.1408929  -0.48609018 ... -0.25938604 -0.36152859
   1.170577  ]
 [-0.84152431 -0.89909687  0.02549599 ... -0.22186825 -0.46288594
  -0.87370158]]
0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64


In [64]:
# Hold out 20% of the rows for testing. Stratifying on the labels keeps the
# class ratio the same in both splits, and the fixed random_state makes the
# split reproducible. Uses the (X, Y) pair defined in the preceding cell.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, stratify=Y, random_state=2
)

In [58]:
# Support vector classifier with a linear kernel; all other settings default.
classifier = svm.SVC(kernel='linear')

In [65]:
# Check the split sizes: full set, training portion, test portion.
print(X.shape, X_train.shape, X_test.shape)

(768, 8) (614, 8) (154, 8)


In [66]:
# Train the support vector machine classifier on the standardized training
# split -- the same X_train that the accuracy cells predict on.
classifier.fit(X_train, Y_train)

In [68]:
# Accuracy score on the training data.
# accuracy_score takes (y_true, y_pred): the ground-truth labels go first.
X_train_prediction = classifier.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.7866449511400652


In [69]:
# Accuracy score on the held-out test data.
# accuracy_score takes (y_true, y_pred): the ground-truth labels go first.
X_test_prediction = classifier.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.7727272727272727


In [70]:
# Score one hand-entered record. The values must follow the feature column
# order: Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI,
# DiabetesPedigreeFunction, Age.
input_data = (5,166,72,19,175,25.8,0.587,51)

# Convert the tuple to an array and make it a single row (1 sample x 8 features).
input_data_as_numpy_array = np.asarray(input_data)
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

# Apply the same fitted scaler that produced the training features.
std_data = scaler.transform(input_data_reshaped)
print(std_data)

# The classifier returns an array with one label for the single row.
prediction = classifier.predict(std_data)
print(prediction)

# Label 0 means non-diabetic; anything else is reported as diabetic.
print('The person is diabetic' if prediction[0] != 0 else 'The person is not diabetic')

[[ 0.344238    1.40148656  0.12781322 -0.10834817  0.77530224 -0.79714106
   0.34797285  1.51129009]]
[1]
The person is diabetic




In [71]:
import pickle

# Persist the trained SVM so it can be reloaded without retraining.
# NOTE: the file name says "LR" although the object saved is the SVC
# `classifier`; the name is left unchanged in case other code loads it by
# this name.
# NOTE(review): the fitted `scaler` is not saved here, yet the classifier was
# trained on standardized features -- whoever loads this file needs the same
# scaler to preprocess inputs.
filename = 'LR_trained_model.sav'
with open(filename, 'wb') as model_file:  # context manager closes the handle
    pickle.dump(classifier, model_file)

In [73]:
# Second hand-entered record; the original author tagged it "no", presumably
# meaning the expected answer is non-diabetic.
input_data = (3,126,88,41,235,39.3,0.704,27)

# One sample -> a single-row 2-D array, as required by transform/predict.
input_data_as_numpy_array = np.asarray(input_data)
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

# Standardize with the already-fitted scaler, then classify.
std_data = scaler.transform(input_data_reshaped)
print(std_data)
prediction = classifier.predict(std_data)
print(prediction)

# Any label other than 0 is reported as diabetic.
if prediction[0] != 0:
  print('The person is diabetic')
else:
  print('The person is not diabetic')

[[-0.24864315  0.1408929   0.94635109  1.2607511   1.28176796  0.89115958
   0.69676137 -0.53298848]]
[0]
The person is not diabetic


