# Importing the dependancies

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler #(Will be used to standardize the data to a common range)
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

# Data Collection and Analysis

The dataset can also be found on UCI Machine Learning repository on Kaggle as **PMA Diabetes Dataset**. The data is collected from women only

In [3]:
#Loading the dataset
diabetes_data = pd.read_csv("/content/diabetes.csv")

In [7]:
#Printing the first five rows of the DataFrame
diabetes_data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [8]:
diabetes_data.shape

(768, 9)

In [9]:
#Getting the statistical measures of the data
diabetes_data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [11]:
diabetes_data["Outcome"].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

**0 --> Non-Diabetic**

**1 --> Diabetic**

In [12]:
diabetes_data.groupby("Outcome").mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


In [13]:
#Separating X and Y

X = diabetes_data.drop(columns = "Outcome", axis = 1)
Y = diabetes_data["Outcome"]

#Data Standardization

Data standardization is used because the range of different features vary a lot which makes it difficult for the model to make predictions. We standardize the data so the readings are about the same range which helps the model to efficiently make predictions

In [14]:
scaler = StandardScaler()

In [15]:
scaler.fit(X)

In [16]:
standardized_data = scaler.transform(X)

In [17]:
print(standardized_data)

[[ 0.63994726  0.84832379  0.14964075 ...  0.20401277  0.46849198
   1.4259954 ]
 [-0.84488505 -1.12339636 -0.16054575 ... -0.68442195 -0.36506078
  -0.19067191]
 [ 1.23388019  1.94372388 -0.26394125 ... -1.10325546  0.60439732
  -0.10558415]
 ...
 [ 0.3429808   0.00330087  0.14964075 ... -0.73518964 -0.68519336
  -0.27575966]
 [-0.84488505  0.1597866  -0.47073225 ... -0.24020459 -0.37110101
   1.17073215]
 [-0.84488505 -0.8730192   0.04624525 ... -0.20212881 -0.47378505
  -0.87137393]]


In [18]:
standardized_data.shape

(768, 8)

In [19]:
X_standardized = standardized_data

#Model Training

In [20]:
#Splitting training and testing data
X_train, X_test, Y_train, Y_test = train_test_split(X_standardized, Y, test_size = 0.2,
  stratify = Y, random_state = 2)

In [22]:
#Model Selection
model = svm.SVC(kernel= 'linear') #SVC stands for "Support-Vector Classifier"

In [23]:
model.fit(X_train, Y_train)

#Model Evaluation

In [30]:
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)
print("Accuracy Score on training Data: ", training_data_accuracy*100)

Accuracy Score on training Data:  78.66449511400651


In [31]:
X_test_prediction = model.predict(X_test)
testing_data_accuracy = accuracy_score(X_test_prediction, Y_test)
print("Accuracy Score on testing Data: ",testing_data_accuracy*100)

Accuracy Score on testing Data:  77.27272727272727


#Making a Predictive System

In [35]:
input_data = (4,110,92,0,0,37.6,0.191,30)

#Coverting input_data into a numpy array
input_data_npArray = np.asarray(input_data)

#reshape the array as we are predicting for one instance
input_data_reshaped = input_data_npArray.reshape(1, -1)

#standardize the input data
std_data = scaler.transform(input_data_reshaped)
print(std_data)

prediction = model.predict(std_data)

if prediction== 0:
  print("Good News! You don't have Diabetes")
else:
  print("Bad News! You are diagnosed as Diabetic")

[[ 0.04601433 -0.34096773  1.18359575 -1.28821221 -0.69289057  0.71168975
  -0.84827977 -0.27575966]]
Good News! You don't have Diabetes


