In [1]:
from typing import List, Tuple

from hydra import compose, initialize
import pandas as pd
from sklearn.metrics import classification_report
import xgboost as xgb

from mlops_team_project.src.preprocess.preprocess import train_test_split_and_write, min_max_scale_and_write
from mlops_team_project.src.model.model import model

In [2]:
# Load the raw diabetes dataset into a DataFrame.
# The path is relative, so it resolves against the kernel's working directory.
df = pd.read_csv("../data/raw/diabetes_data.csv")

In [3]:
# Preview the first five rows to sanity-check the columns and value formats.
df.head()

Unnamed: 0,Age,Sex,HighChol,CholCheck,BMI,Smoker,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,GenHlth,MentHlth,PhysHlth,DiffWalk,Stroke,HighBP,Diabetes
0,4.0,1.0,0.0,1.0,26.0,0.0,0.0,1.0,0.0,1.0,0.0,3.0,5.0,30.0,0.0,0.0,1.0,0.0
1,12.0,1.0,1.0,1.0,26.0,1.0,0.0,0.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,1.0,1.0,0.0
2,13.0,1.0,0.0,1.0,26.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,10.0,0.0,0.0,0.0,0.0
3,11.0,1.0,1.0,1.0,28.0,1.0,0.0,1.0,1.0,1.0,0.0,3.0,0.0,3.0,0.0,0.0,1.0,0.0
4,8.0,0.0,0.0,1.0,29.0,1.0,0.0,1.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Show per-column dtypes and non-null counts to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70692 entries, 0 to 70691
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Age                   70692 non-null  float64
 1   Sex                   70692 non-null  float64
 2   HighChol              70692 non-null  float64
 3   CholCheck             70692 non-null  float64
 4   BMI                   70692 non-null  float64
 5   Smoker                70692 non-null  float64
 6   HeartDiseaseorAttack  70692 non-null  float64
 7   PhysActivity          70692 non-null  float64
 8   Fruits                70692 non-null  float64
 9   Veggies               70692 non-null  float64
 10  HvyAlcoholConsump     70692 non-null  float64
 11  GenHlth               70692 non-null  float64
 12  MentHlth              70692 non-null  float64
 13  PhysHlth              70692 non-null  float64
 14  DiffWalk              70692 non-null  float64
 15  Stroke             

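The dataset has 70,692 entries and every value is non-null. All columns are stored as floats, although the summary statistics below show that most of them are binary (0/1) indicators or small integer codes rather than truly continuous measurements.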

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
df.describe()

Unnamed: 0,Age,Sex,HighChol,CholCheck,BMI,Smoker,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,GenHlth,MentHlth,PhysHlth,DiffWalk,Stroke,HighBP,Diabetes
count,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0,70692.0
mean,8.584055,0.456997,0.525703,0.975259,29.856985,0.475273,0.14781,0.703036,0.611795,0.788774,0.042721,2.837082,3.752037,5.810417,0.25273,0.062171,0.563458,0.5
std,2.852153,0.498151,0.499342,0.155336,7.113954,0.499392,0.354914,0.456924,0.487345,0.408181,0.202228,1.113565,8.155627,10.062261,0.434581,0.241468,0.49596,0.500004
min,1.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7.0,0.0,0.0,1.0,25.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,9.0,0.0,1.0,1.0,29.0,0.0,0.0,1.0,1.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,1.0,0.5
75%,11.0,1.0,1.0,1.0,33.0,1.0,0.0,1.0,1.0,1.0,0.0,4.0,2.0,6.0,1.0,0.0,1.0,1.0
max,13.0,1.0,1.0,1.0,98.0,1.0,1.0,1.0,1.0,1.0,1.0,5.0,30.0,30.0,1.0,1.0,1.0,1.0


`train_test_split_and_write` separates the class label from the features and splits the data into training and testing sets.

In [6]:
# Split the frame into train/test features and labels using the project helper.
# NOTE(review): the helper is also given a write_path; presumably it persists the
# splits under ../data/processed — confirm against the preprocess module.
X_train, X_test, y_train, y_test = train_test_split_and_write(df=df, write_path="../data/processed")

`min_max_scale_and_write` normalizes the training and testing features.

In [7]:
# Produce normalized versions of the train/test feature sets using the project helper.
# NOTE(review): presumably the scaler is fit on X_train only and the scaled frames are
# written under write_path — confirm against the preprocess module.
X_train_normalized, X_test_normalized = min_max_scale_and_write(X_train=X_train, X_test=X_test, write_path="../data/processed")

In [8]:
# Baseline experiment: run XGBoost with the "baseline" Hydra experiment config.
baseline_overrides = ["+experiment=baseline"]
with initialize(config_path="../mlops_team_project/models/config", version_base=None):
    hydra_params = compose(overrides=baseline_overrides)
    print(hydra_params)

    # Pass the composed experiment section to the project's model helper.
    baseline_hyperparameters = hydra_params.experiment
    model(
        hyperparameters=baseline_hyperparameters,
        X_train=X_train_normalized,
        y_train=y_train,
        X_test=X_test_normalized,
        y_test=y_test,
    )

{'experiment': {'seed': 17, 'n_estimators': 100}}
cv scores = [0.74078331 0.75077358 0.74838653 0.74341291 0.74270557]
cv scores avg = 0.7452123794267391
Training: 0.7814970028115219, Testing: 0.746658179503501

              precision    recall  f1-score   support

non-diabetic       0.78      0.70      0.74      7198
    diabetic       0.72      0.79      0.75      6941

    accuracy                           0.75     14139
   macro avg       0.75      0.75      0.75     14139
weighted avg       0.75      0.75      0.75     14139



The fit is reasonably good here: about 78% on training and about 75% on testing, in line with the cross-validation average of roughly 74.5%.

In [9]:
# Experiment 1: run XGBoost with the "exp1" Hydra experiment config.
exp1_overrides = ["+experiment=exp1"]
with initialize(config_path="../mlops_team_project/models/config", version_base=None):
    hydra_params = compose(overrides=exp1_overrides)
    print(hydra_params)

    # Pass the composed experiment section to the project's model helper.
    exp1_hyperparameters = hydra_params.experiment
    model(
        hyperparameters=exp1_hyperparameters,
        X_train=X_train_normalized,
        y_train=y_train,
        X_test=X_test_normalized,
        y_test=y_test,
    )

{'experiment': {'seed': 17, 'n_estimators': 500}}
cv scores = [0.7314119  0.7351251  0.73256122 0.73218391 0.72758621]
cv scores avg = 0.731773667582264
Training: 0.836878680176118, Testing: 0.7343517929132187

              precision    recall  f1-score   support

non-diabetic       0.76      0.70      0.73      7198
    diabetic       0.71      0.77      0.74      6941

    accuracy                           0.73     14139
   macro avg       0.74      0.73      0.73     14139
weighted avg       0.74      0.73      0.73     14139



There is some overfitting occurring here: the training score rises to about 84% while the testing score drops to about 73%, so our baseline model is actually better.