In [96]:
# importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import Log_Reg

In [70]:
# loading the dataset
data = pd.read_csv("diabetes.csv")

In [71]:
# printing the first 10 rows
data.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [72]:
# number of rows and columns in the dataset
data.shape

(769, 9)

In [73]:
# getting the statistical measures of the data
data["Outcome"].value_counts()

Outcome
0    500
1    269
Name: count, dtype: int64

In [74]:
data.groupby("Outcome").mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.847584,141.189591,70.847584,22.081784,99.966543,35.14684,0.54939,37.133829


In [75]:
# separating the data and labels
features = data.drop(columns = "Outcome", axis=1)
target = data["Outcome"]

In [76]:
print(features)

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   
768            0      123             77              0        1  36.3   

     DiabetesPedigreeFunction  Age  
0                       0.627   50  
1                       0.351   31  


In [77]:
print(target)

0      1
1      0
2      1
3      0
4      1
      ..
764    0
765    0
766    1
767    0
768    1
Name: Outcome, Length: 769, dtype: int64


In [78]:
# Scaling the data
scaler = StandardScaler()

scaler.fit(features)

standardized_data = scaler.transform(features)

print(standardized_data)

[[ 0.64130614  0.84878777  0.14919091 ...  0.20339465  0.46952637
   1.42135003]
 [-0.84323457 -1.12421007 -0.16116385 ... -0.68544558 -0.36432984
  -0.19277598]
 [ 1.23512243  1.94489768 -0.26461544 ... -1.10447026  0.60548119
  -0.10782198]
 ...
 [-0.84323457  0.15980439 -0.47151862 ... -0.24102547 -0.37037227
   1.16648803]
 [-0.84323457 -0.87367066  0.04573932 ... -0.20293231 -0.47309369
  -0.87240799]
 [-1.14014271  0.06585212  0.40781989 ...  0.54623302 -0.66343044
   1.84612003]]


In [79]:
features = standardized_data
target = data["Outcome"]

In [80]:
print(f"Features : {features}")
print("\n")
print(f"Target   : {target}")

Features : [[ 0.64130614  0.84878777  0.14919091 ...  0.20339465  0.46952637
   1.42135003]
 [-0.84323457 -1.12421007 -0.16116385 ... -0.68544558 -0.36432984
  -0.19277598]
 [ 1.23512243  1.94489768 -0.26461544 ... -1.10447026  0.60548119
  -0.10782198]
 ...
 [-0.84323457  0.15980439 -0.47151862 ... -0.24102547 -0.37037227
   1.16648803]
 [-0.84323457 -0.87367066  0.04573932 ... -0.20293231 -0.47309369
  -0.87240799]
 [-1.14014271  0.06585212  0.40781989 ...  0.54623302 -0.66343044
   1.84612003]]


Target   : 0      1
1      0
2      1
3      0
4      1
      ..
764    0
765    0
766    1
767    0
768    1
Name: Outcome, Length: 769, dtype: int64


In [81]:
# train test split
X_train, X_test, Y_train, Y_test = train_test_split(features, target, test_size = 0.2, random_state=2)

In [82]:
print(f"Features Shape : {features.shape}")
print("\n")
print(f"X_train Shape  : {X_train.shape}")
print("\n")
print(f"X_test Shape   : {X_test.shape}")

Features Shape : (769, 8)


X_train Shape  : (615, 8)


X_test Shape   : (154, 8)


In [90]:
# training the model
classifier = Log_Reg.Logistic_Regression(learning_rate=0.01, no_of_iterations=1000)

In [91]:
# fitting the data
classifier.fit(X_train, Y_train)

In [92]:
# model evaluation
# accuracy score of the training data
X_train_prediction = classifier.predict(X_train)

training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [93]:
print(f"Accuracy score of the train data : {training_data_accuracy}")

Accuracy score of the train data : 0.7626016260162601


In [94]:
# model evaluation
# accuracy score of the test data
X_test_prediction = classifier.predict(X_test)

testing_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [95]:
print(f"Accuracy score of the test data : {testing_data_accuracy}")

Accuracy score of the test data : 0.7922077922077922
