In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv('diabetes.csv')

In [3]:
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Unnamed: 9
0,6,148,72,35,0,33.6,0.627,50,1,
1,1,85,66,29,0,26.6,0.351,31,0,
2,8,183,64,0,0,23.3,0.672,32,1,
3,1,89,66,23,94,28.1,0.167,21,0,
4,0,137,40,35,168,43.1,2.288,33,1,


Outcome is Label
Other 8 are Data
Unnamed: 9 is unwanted Column

In [4]:
data = data.drop('Unnamed: 9', axis = 1)

# **To Predict whether diabetes have or not**

In [5]:
data.shape

(768, 9)

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [7]:
data.isnull()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...
763,False,False,False,False,False,False,False,False,False
764,False,False,False,False,False,False,False,False,False
765,False,False,False,False,False,False,False,False,False
766,False,False,False,False,False,False,False,False,False


In [8]:
data.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

Null Values in every column is zero, which means no null values

# Train and Test Split

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
x = data.iloc[:, data.columns != 'Outcome'] #data
y = data.iloc[:, data.columns == 'Outcome'] #label

In [11]:
print(x)
print(y)

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10      101             76             48      180  32.9   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  
0                       0.627   50  
1                       0.351   31  


In [12]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size = 0.2) #test data is 20%

#xtrain = train data
#ytrain = train label
#xtest = test data
#ytest = test label

In [13]:
xtrain.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
287,1,119,86,39,220,45.6,0.808,29
505,10,75,82,0,0,33.3,0.263,38
514,3,99,54,19,86,25.6,0.154,24
652,5,123,74,40,77,34.1,0.269,28
613,6,105,80,28,0,32.5,0.878,26


Index value is random, because train_test_split is splitting dataset randomly

In [14]:
ytrain.head()

Unnamed: 0,Outcome
287,1
505,0
514,0
652,0
613,0


But Index of xtrain and ytrain is same so no mismatch occurs

In [17]:
print("xtrain:", xtrain.shape)
print("ytrain:", ytrain.shape)
print("xtest:", xtest.shape)
print("ytest:", ytest.shape)

xtrain: (614, 8)
ytrain: (614, 1)
xtest: (154, 8)
ytest: (154, 1)


## Algorithm
**Random Forest**

In [18]:
from sklearn.ensemble import RandomForestClassifier

In [19]:
model = RandomForestClassifier()

In [20]:
model.fit(xtrain, ytrain.values.ravel()) #fit() is a function to train the algorithm

In [21]:
predicted_output = model.predict(xtest) #predict() is a function to test the algorithm
print(predicted_output)

[0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 1 0 0 0 0 0 0
 0 1 0 1 0 0 0 0 1 0 1 0 0 1 0 0 0 1 0 1 1 0 0 0 0 0 0 1 0 1 1 0 0 0 0 1 1
 1 0 1 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 1 1 0 0 0 0 1 0 0 0 0 0 0 0
 1 1 1 0 0 0 1 0 0 0 1 1 0 0 1 1 1 0 1 0 0 1 0 1 0 0 0 0 1 0 1 1 0 0 0 0 0
 0 0 1 0 0 1]


**Check Accuracy:**

In [22]:
from sklearn.metrics import accuracy_score

In [23]:
accuracy = accuracy_score(predicted_output, ytest)
print("The accuracy score of the model: ", accuracy)

The accuracy score of the model:  0.7012987012987013


# **Thank You**