In [1]:
!ls

dia_training.ipynb  diabetes.csv


In [2]:
import pandas as pd
import numpy as np

In [4]:
df = pd.read_csv('diabetes.csv')

In [5]:
df.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [11]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [6]:
y = df['Outcome']

In [8]:
target = 'Outcome'
o_target = df.columns[ df.columns != target ]
o_target

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age'],
      dtype='object')

In [9]:
X  = df[ o_target ]
X.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, test_size=0.3, random_state=123, shuffle=True)

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier

In [18]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, log_loss

models =  {
    "Logistic Regression": LogisticRegression(C=100.0, penalty='l2', solver='liblinear', multi_class='ovr'),
    "Decision Tree Classifier": DecisionTreeClassifier(),
    "Random Forest Classifier": RandomForestClassifier(criterion='entropy', n_estimators=100),
    "Ada Boost Classifier": AdaBoostClassifier(),
    "XGB BOOST CLASSIFIER": XGBClassifier()
}


for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"Name: {name}")
    print(f"Score for Trainig {model.score(X_train, y_train)}")
    print(f"Score for Testing {model.score(X_test, y_test)}")
    print(f"Accuracy score {accuracy_score(y_test, y_pred)}")
    print(f"F1 score {f1_score(y_test, y_pred)}")
    print(f"Precision score {precision_score(y_test, y_pred)}")
    print(f"Log loss score {log_loss(y_test, y_pred)}")
    print("#######\n")

Name: Logistic Regression
Score for Trainig 0.7746741154562383
Score for Testing 0.8008658008658008
Accuracy score 0.8008658008658008
F1 score 0.7124999999999999
Precision score 0.7916666666666666
Log loss score 7.177524051512507
#######

Name: Decision Tree Classifier
Score for Trainig 1.0
Score for Testing 0.7012987012987013
Accuracy score 0.7012987012987013
F1 score 0.6145251396648044
Precision score 0.6043956043956044
Log loss score 10.766286077268761
#######

Name: Random Forest Classifier
Score for Trainig 1.0
Score for Testing 0.7705627705627706
Accuracy score 0.7705627705627706
F1 score 0.7005649717514125
Precision score 0.6966292134831461
Log loss score 8.269755972394846
#######

Name: Ada Boost Classifier
Score for Trainig 0.8640595903165735
Score for Testing 0.7402597402597403
Accuracy score 0.7402597402597403
F1 score 0.6470588235294118
Precision score 0.6707317073170732
Log loss score 9.361987893277183
#######

Name: XGB BOOST CLASSIFIER
Score for Trainig 1.0
Score for Tes

In [20]:
lr = models["Logistic Regression"].fit(X_train, y_train)

te = np.array([[6, 148, 72, 35, 0, 33.6, 0.627, 50]])
p_te = lr.predict(te)
p_te

array([1])

In [21]:
import pickle
data = {"model": lr}
with open('dia_training.pkl', 'wb') as file:
    pickle.dump(data, file)

In [22]:
!ls

dia_training.ipynb  dia_training.pkl  diabetes.csv


In [23]:
with open('dia_training.pkl', 'rb') as file:
    n_model = pickle.load(file)
loaded = n_model["model"]

In [24]:
te = np.array([[6, 148, 72, 35, 0, 33.6, 0.627, 50]])
p_t = loaded.predict(te)
p_t

array([1])