In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

import xgboost as xgb

In [2]:
train = pd.read_parquet("../data/processed/train.parquet")
valid = pd.read_parquet("../data/processed/valid.parquet")

In [3]:
len(train), len(valid)

(75000, 12500)

In [4]:
train.columns

Index(['gender', 'age', 'hypertension', 'heart_disease', 'smoking_history',
       'bmi', 'HbA1c_level', 'blood_glucose_level', 'diabetes'],
      dtype='object')

## prepare features

In [5]:
target_var = "diabetes"

In [6]:
cat_vars = [col for col in train.columns.values if train[col].dtype == "O" and col != target_var]

In [7]:
cat_vars

['gender', 'smoking_history']

In [8]:
num_vars = [col for col in train.columns.values if train[col].dtype in ["int64", "float64"] and col != target_var]

In [9]:
num_vars

['age',
 'hypertension',
 'heart_disease',
 'bmi',
 'HbA1c_level',
 'blood_glucose_level']

## prepare data

In [10]:
x_train = train.drop(target_var, axis=1)
y_train = train[target_var]

In [11]:
x_valid = valid.drop(target_var, axis=1)
y_valid = valid[target_var]

## Preprocessing 

### Preprocessing for tree-based models

In [12]:
dv = DictVectorizer()

In [13]:
train_dict = x_train.to_dict(orient="records")
val_dict = x_valid.to_dict(orient="records")

In [14]:
X_train = dv.fit_transform(train_dict)
X_valid = dv.transform(val_dict)

### Preprocessing for logistic regression

In [15]:
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_vars),
    ('cat', OneHotEncoder(), cat_vars)
])

In [16]:
pp_pipe = Pipeline(
    [
        ("preprocessor", preprocessor)
    ]

)

In [17]:
x_train_logreg = pp_pipe.fit_transform(x_train)
x_valid_logreg = pp_pipe.transform(x_valid)

## Modeling

### Logistic Regression

In [18]:
def model_eval(y_true, y_pred, y_pred_prob):
    auc = roc_auc_score(y_true, y_pred_prob)
    print(f"AUC  is: {auc:.3f}")
    f1 = f1_score(y_true, y_pred)
    print(f"f1 score is: {f1:.3f}")
    precision = precision_score(y_true, y_pred)
    print(f"Precision score is: {precision:.3f}")
    recall = recall_score(y_true, y_pred)
    print(f"Recall score is: {recall:.3f}")
    

In [19]:
log_reg = LogisticRegression()

In [20]:
log_reg.fit(x_train_logreg, y_train)

In [21]:
y_pred_prob = log_reg.predict_proba(x_valid_logreg)
y_pred_prob = y_pred_prob[:, 1]

In [22]:
y_pred = log_reg.predict(x_valid_logreg)

In [23]:
model_eval(y_valid, y_pred, y_pred_prob)

AUC  is: 0.963
f1 score is: 0.725
Precision score is: 0.863
Recall score is: 0.626


### Random Forest regressor

In [24]:
rf_cls = RandomForestClassifier()
rf_cls.fit(X_train, y_train)

In [25]:
y_pred = rf_cls.predict(X_valid)
y_pred_prob = rf_cls.predict_proba(X_valid)[:, 1]

In [26]:
model_eval(y_valid, y_pred, y_pred_prob)

AUC  is: 0.966
f1 score is: 0.796
Precision score is: 0.958
Recall score is: 0.682


### XGBoost

In [28]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_valid, label=y_valid)

In [29]:
def f1_eval(y_pred, dtrain):
    y_true = dtrain.get_label()
    y_pred = (y_pred > 0.5).astype(int)
    return 'f1', f1_score(y_true, y_pred)

In [30]:
params = {
    "objective": "binary:logistic",
}

In [31]:
num_rounds = 1000

In [32]:
xgb_cls = xgb.train(params, dtrain, num_boost_round=num_rounds,  evals=[(dvalid, "validation")], maximize=True, feval=f1_eval, early_stopping_rounds=50, verbose_eval=1000)

[0]	validation-logloss:0.46274	validation-f1:0.79936




[96]	validation-logloss:0.08334	validation-f1:0.80147


In [33]:
y_pred_prob = xgb_cls.predict(dvalid)
y_pred = (y_pred_prob > 0.5).astype(int)

In [34]:
model_eval(y_valid, y_pred, y_pred_prob)

AUC  is: 0.980
f1 score is: 0.799
Precision score is: 0.962
Recall score is: 0.684
