In [1]:
# Importing libraries: data handling (pandas, numpy) and plotting (matplotlib, seaborn)
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation and data-quality
# warnings — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Read the Pima diabetes CSV (path is relative to the notebook's working
# directory) into a DataFrame, then display it as the cell output.
diabetes_df = pd.read_csv(
    filepath_or_buffer='diabetes_datasets/pima_diabetes.csv',
)
diabetes_df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [3]:
# Separate the features (X) from the target variable (y).
# X is every column except 'Outcome'; y is the 'Outcome' column on its own.
X = diabetes_df.drop('Outcome', axis=1)
y = diabetes_df.loc[:, 'Outcome']

In [4]:
# Show the first five rows of the feature frame
X.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [5]:
# Preview the first five entries of the target variable
y.iloc[:5]

0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64

In [6]:
# One-hot encode any categorical (object/category) columns with get_dummies.
# NOTE(review): the previews suggest every feature is already numeric, in which
# case this call leaves the frame unchanged — confirm whether it is needed.
X = pd.get_dummies(data=X)
X.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [7]:
from sklearn.model_selection import train_test_split

# Split the dataset into train and test partitions.
# test_size=0.25 is scikit-learn's default when neither size is given; it is
# spelled out here so the 75/25 split is visible. The seed fixes the shuffle.
# No feature scaling happens in this cell.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

In [11]:
# Descriptive statistics per feature, transposed so each feature is one row
X.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Pregnancies,768.0,3.845052,3.369578,0.0,1.0,3.0,6.0,17.0
Glucose,768.0,120.894531,31.972618,0.0,99.0,117.0,140.25,199.0
BloodPressure,768.0,69.105469,19.355807,0.0,62.0,72.0,80.0,122.0
SkinThickness,768.0,20.536458,15.952218,0.0,0.0,23.0,32.0,99.0
Insulin,768.0,79.799479,115.244002,0.0,0.0,30.5,127.25,846.0
BMI,768.0,31.992578,7.88416,0.0,27.3,32.0,36.6,67.1
DiabetesPedigreeFunction,768.0,0.471876,0.331329,0.078,0.24375,0.3725,0.62625,2.42
Age,768.0,33.240885,11.760232,21.0,24.0,29.0,41.0,81.0


In [13]:
# Dimensions of the training feature matrix as a (rows, columns) tuple
X_train.shape

(576, 8)

In [None]:
# Feature scaling using MinMaxScaler
from sklearn.preprocessing import MinMaxScaler

# Rescale every column of the loaded dataset into the [0, 1] range.
# The input is `diabetes_df`, the frame created when the CSV is read; the name
# used here before (`outcome_df`) is never defined in this notebook, so the
# cell raised NameError on a fresh kernel.
# The 'Outcome' column is scaled together with the features; later cells read
# the target back from the scaled data by position.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test rows influence the min/max used for scaling. Consider fitting on the
# training rows only.
sc = MinMaxScaler(feature_range=(0, 1))
# By default fit_transform returns a NumPy array, despite the `_df` suffix.
outcome_scaled_df = sc.fit_transform(diabetes_df)

In [None]:
# Wrap the scaled values in a DataFrame so they can be sliced with .iloc.
# No column names are supplied, so columns get default integer labels.
outcome_scaled_df = pd.DataFrame(data=outcome_scaled_df)

In [None]:
# Selecting features - [Glucose, Insulin, BMI, Age]
# Columns are addressed by position in the scaled frame: 1, 4, 5 and 7 are the
# chosen features, and position 8 is the target column.
X = outcome_scaled_df.iloc[:, [1, 4, 5, 7]].to_numpy()
Y = outcome_scaled_df.iloc[:, 8].to_numpy()

In [None]:
# Splitting X and Y
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing, with a fixed seed for repeatability.
# Stratify on Y, the target array actually being split, so both partitions
# keep the same class proportions. This replaces a reference to `outcome_df`,
# which is not defined in this notebook and made the cell fail with NameError.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=42,
    stratify=Y,
)


In [None]:
# Checking dimensions: print the shape of each partition, one per line
for caption, dims in (
    ("X_train shape:", X_train.shape),
    ("X_test shape:", X_test.shape),
    ("Y_train shape:", Y_train.shape),
    ("Y_test shape:", Y_test.shape),
):
    print(caption, dims)

In [None]:
# Random forest Algorithm
from sklearn.ensemble import RandomForestClassifier

# Forest of 11 trees using the entropy split criterion; the seed is fixed so
# repeated fits on the same data give the same model.
random_forest = RandomForestClassifier(
    criterion='entropy',
    n_estimators=11,
    random_state=42,
)
# Train on the training partition; the fitted estimator is the cell output.
random_forest.fit(X_train, Y_train)

In [None]:
# Predict class labels for the held-out test features and display them
predict_random_forest = random_forest.predict(X=X_test)
predict_random_forest

In [None]:
# Evaluating using accuracy_score metric
from sklearn.metrics import accuracy_score

# Fraction of test rows whose predicted label matches the true label
random_forest_accuracy = accuracy_score(
    y_true=Y_test,
    y_pred=predict_random_forest,
)

In [None]:
# Report the accuracy as a percentage
print("Random Forest: ", str(random_forest_accuracy * 100), sep="")

In [None]:
# Confusion matrix
from sklearn.metrics import confusion_matrix

# Counts of true labels against predicted labels on the test partition
cm = confusion_matrix(y_true=Y_test, y_pred=predict_random_forest)
cm

In [None]:
# Heatmap of Confusion matrix
# fmt="d" prints each cell as a plain integer; seaborn's default format
# ('.2g') would render counts of 100 or more in scientific notation.
ax = sns.heatmap(pd.DataFrame(cm), annot=True, fmt="d")
# confusion_matrix puts true labels on rows and predicted labels on columns.
ax.set(
    xlabel="Predicted label",
    ylabel="True label",
    title="Random forest confusion matrix",
);

In [None]:
# Classification report
from sklearn.metrics import classification_report

# Per-class precision, recall, F1 and support on the test partition
print(
    classification_report(
        y_true=Y_test,
        y_pred=predict_random_forest,
    )
)