In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
#read the diabetes records from the csv file (path is relative to the working directory)
DataSet = pd.read_csv(filepath_or_buffer='diabetes.csv')

In [3]:
#preview the first five rows (five is the default for head) to see the columns and typical values
DataSet.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
#size of the loaded table as (number of rows, number of columns)
DataSet.shape

(768, 9)

In [5]:
#summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
#NOTE(review): the recorded summary shows a minimum of 0 for several measurement columns;
#presumably 0 stands for a missing measurement there - TODO confirm before relying on these columns
DataSet.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [6]:
#count how many records fall in each class: 1 = diabetes positive, 0 = negative
#value_counts tallies the occurrences of every distinct value in the column
outcome_column = DataSet['Outcome']
outcome_column.value_counts()

Outcome
0    500
1    268
Name: count, dtype: int64

In [7]:
#average of every feature within each Outcome group, so the two classes can be compared
outcome_groups = DataSet.groupby(by='Outcome')
outcome_groups.mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


In [8]:
#separate the labels (Z) from the features (A)
Z = DataSet['Outcome']
A = DataSet.drop(columns='Outcome')  #columns= already targets the column axis, no axis argument needed


In [9]:
#Lets STANDARDIZE the Data

In [10]:
scaler = StandardScaler()

In [11]:
scaler.fit(A)

In [12]:
standardized_data= scaler.transform(A)

In [13]:
A = standardized_data
Z = DataSet['Outcome']

In [14]:
#LETS SPLIT OUR DATA INTO TRAINING DATA AND TEST DATA


In [15]:
A_train, A_test, Z_train, Z_test = train_test_split(A,Z, test_size = 0.2, stratify=Z,random_state=2)




In [16]:
#NOW LETS TRAIN OUR MODEL
#we are using SUPPORT VECTOR MACHINE methodology


In [17]:
#support vector classifier with a linear kernel
analyser = svm.SVC(kernel='linear')

In [18]:
#fit the classifier on the training features and their labels
analyser.fit(X=A_train, y=Z_train)

In [19]:
#LETS EVALUATE OUR MODEL
#ACCURACY SCORES


In [20]:
#labels the trained model assigns to the rows it was trained on
A_train_Predictions = analyser.predict(X=A_train)

In [21]:
#fraction of training rows whose predicted label equals the true label
#accuracy_score expects the true labels first and the predictions second
training_data_accuracy = accuracy_score(Z_train, A_train_Predictions)

In [22]:
#express the training accuracy fraction as a percentage
final_score_percentage = 100 * training_data_accuracy

In [23]:
#report the training accuracy; print joins the three pieces with single spaces
training_report = ('The final accuracy score of our traing data is : ', final_score_percentage, '%')
print(*training_report)

The final accuracy score of our traing data is :  78.66449511400651 %


In [24]:
#now lets get the accuracy score on our test data
# =================================================== 

In [25]:
#predict labels for the held-out test rows, then measure how many match the true labels
#accuracy_score expects the true labels first and the predictions second
A_test_Predictions = analyser.predict(A_test)
test_data_accuracy = accuracy_score(Z_test, A_test_Predictions)

In [26]:
#express the test accuracy fraction as a percentage
test_data_final_score_percentage = 100 * test_data_accuracy

In [27]:
#report the test accuracy; print joins the three pieces with single spaces
test_report = ('The final accuracy score of our test data is : ', test_data_final_score_percentage, '%')
print(*test_report)



The final accuracy score of our test data is :  77.27272727272727 %


In [28]:
#FINALLY WE MAKE A PREDICTIVE SYSTEM TO GET DATA AND PREDICT THE OUTCOME
# =============================================================================

In [29]:
#we pick up any random data from our original data file and feed that to our model
#since we know the label(either 0 or 1) we can judge the accuracy of the prediction

In [34]:
#one record to classify; presumably the eight values follow the order of the feature columns - TODO confirm
input_data = (5,166,72,19,175,25.8,0.587,51)
#NOTE(review): the original note here expected label 0 for this entry, yet the prediction
#recorded for this notebook is 1 - verify the entry's true Outcome in the original data

#turn the tuple into a numpy array
input_array = np.array(input_data)

#add a leading axis so the single record becomes a batch of one row
input_data_reshaped = input_array[np.newaxis, :]

#apply the already-fitted scaler so the record is on the same scale as the training data
std_data = scaler.transform(X=input_data_reshaped)

#label the trained model assigns to this record
prediction = analyser.predict(X=std_data)






In [35]:
#show the predicted label for the single record
print(prediction)

[1]


In [None]:
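#the prediction printed above is 1, not 0
#this does not match the label 0 assumed for this entry, so either the model misclassified it
#or the assumed label is wrong - re-check the entry's Outcome in the original data file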