In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
import xgboost as xgb

In [2]:
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score

In [3]:
# Load the pre-split training and validation sets.
train, valid = map(
    pd.read_csv,
    ("../data/processed/train.csv", "../data/processed/valid.csv"),
)

In [4]:
# Row counts of the two splits (train, valid).
train.shape[0], valid.shape[0]

(72109, 12018)

In [5]:
# Show the column names available in the training frame.
train.columns

Index(['gender', 'age', 'hypertension', 'heart_disease', 'smoking_history',
       'bmi', 'HbA1c_level', 'blood_glucose_level', 'diabetes'],
      dtype='object')

## prepare features

In [6]:
# Name of the label column; it is excluded from the feature lists and split off as y below.
target_var = "diabetes"

In [7]:
# Categorical features: every object-dtype column except the target.
cat_vars = [
    name
    for name, dtype in train.dtypes.items()
    if dtype == "O" and name != target_var
]

In [8]:
# Display the detected categorical feature names.
cat_vars

['gender', 'smoking_history']

In [9]:
# Numeric features: every int64/float64 column except the target.
num_vars = [
    name
    for name, dtype in train.dtypes.items()
    if dtype in ["int64", "float64"] and name != target_var
]

In [10]:
# Display the detected numeric feature names.
num_vars

['age',
 'hypertension',
 'heart_disease',
 'bmi',
 'HbA1c_level',
 'blood_glucose_level']

## prepare data

In [11]:
# Separate the label from the features for the training split.
y_train = train[target_var]
x_train = train.drop(columns=[target_var])

In [12]:
# Separate the label from the features for the validation split.
y_valid = valid[target_var]
x_valid = valid.drop(columns=[target_var])

## Preprocessing 

### Preprocessing for tree-based models

In [13]:
# Vectorizer that turns per-row feature dicts into the matrix used by the
# random forest and XGBoost models below.
dv = DictVectorizer()

In [14]:
# One {column: value} dict per row, the input format DictVectorizer expects.
train_dict, val_dict = (
    frame.to_dict(orient="records") for frame in (x_train, x_valid)
)

In [15]:
# Fit the vectorizer on the training rows only, then reuse the fitted
# mapping for validation (tuple elements are evaluated left to right).
X_train, X_valid = dv.fit_transform(train_dict), dv.transform(val_dict)

### Preprocessing for logistic regression

In [16]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [17]:
# Column-wise preprocessing for logistic regression: standardize the numeric
# features and one-hot encode the categorical ones.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_vars),
    # handle_unknown="ignore": a category that appears only in validation (or
    # later) data is encoded as all zeros instead of raising at transform time.
    ('cat', OneHotEncoder(handle_unknown="ignore"), cat_vars)
])

In [18]:
# Single-step pipeline wrapping the column preprocessor.
pp_pipe = Pipeline(steps=[("preprocessor", preprocessor)])

In [19]:
# Fit the preprocessing on the training features only, then apply the fitted
# transform to validation (tuple elements are evaluated left to right).
x_train_logreg, x_valid_logreg = (
    pp_pipe.fit_transform(x_train),
    pp_pipe.transform(x_valid),
)

## Modeling

### Logistic Regression

In [20]:
def model_eval(y_true, y_pred, y_pred_prob, model_name=None):
    """Print ROC AUC, F1, precision and recall for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        True labels.
    y_pred : array-like
        Hard class predictions; used for F1, precision and recall.
    y_pred_prob : array-like
        Positive-class scores; used for ROC AUC.
    model_name : str, optional
        Label inserted into the AUC line ("AUC for <model_name> is: ...").
        When omitted the line reads "AUC is: ...", so the output is not
        mislabelled when the helper is reused for different models.

    Returns
    -------
    None
        The metrics are only printed.
    """
    # The helper is shared by every model in the notebook, so the AUC label
    # must not hard-code a particular model name.
    auc_label = f"AUC for {model_name}" if model_name else "AUC"
    auc = roc_auc_score(y_true, y_pred_prob)
    print(f"{auc_label} is: {auc:.3f}")
    f1 = f1_score(y_true, y_pred)
    print(f"f1 score is: {f1:.3f}")
    precision = precision_score(y_true, y_pred)
    print(f"Precision score is: {precision:.3f}")
    recall = recall_score(y_true, y_pred)
    print(f"Recall score is: {recall:.3f}")
    

In [21]:
# Baseline model: logistic regression with the library's default settings.
log_reg = LogisticRegression()

In [22]:
# Train on the scaled / one-hot-encoded training matrix.
log_reg.fit(x_train_logreg, y_train)

In [23]:
# Keep only column 1 of predict_proba (the second class) for AUC.
y_pred_prob = log_reg.predict_proba(x_valid_logreg)[:, 1]

In [24]:
# Hard class predictions on the validation set, used for F1 / precision / recall.
y_pred = log_reg.predict(x_valid_logreg)

In [25]:
# Validation metrics for logistic regression.
model_eval(y_valid, y_pred, y_pred_prob)

AUC for logisitc regrssion is: 0.963
f1 score is: 0.734
Precision score is: 0.873
Recall score is: 0.633


### Random Forest classifier

In [26]:
# Random forest on the DictVectorizer features. The seed is fixed so the
# fitted forest, and therefore the metrics reported below, are reproducible
# across kernel restarts.
rf_cls = RandomForestClassifier(random_state=42)
rf_cls.fit(X_train, y_train)

In [27]:
# Scores from column 1 of predict_proba (for AUC) and hard labels (for
# F1 / precision / recall) on the validation set.
y_pred_prob = rf_cls.predict_proba(X_valid)[:, 1]
y_pred = rf_cls.predict(X_valid)

In [28]:
# Validation metrics for the random forest.
model_eval(y_valid, y_pred, y_pred_prob)

AUC for logisitc regrssion is: 0.957
f1 score is: 0.800
Precision score is: 0.954
Recall score is: 0.689


### XGBoost

In [37]:
# Wrap each split's features and labels in an XGBoost DMatrix.
dtrain, dvalid = (
    xgb.DMatrix(features, label=labels)
    for features, labels in ((X_train, y_train), (X_valid, y_valid))
)

In [38]:
def f1_eval(y_pred, dtrain):
    """Custom XGBoost evaluation metric: F1 score at a 0.5 cut-off.

    Parameters
    ----------
    y_pred : array-like
        Predictions handed over by XGBoost; values above 0.5 count as class 1.
    dtrain : DMatrix
        The matrix being evaluated (despite the name, whichever matrix
        XGBoost passes in); its labels are read via ``get_label()``.

    Returns
    -------
    tuple
        ``('f1', score)``, the metric name and its value.
    """
    labels = dtrain.get_label()
    hard_preds = (y_pred > 0.5).astype(int)
    score = f1_score(labels, hard_preds)
    return 'f1', score

In [39]:
# Booster parameters: binary classification with a logistic objective;
# everything else is left at the library defaults.
params = {"objective": "binary:logistic"}

In [40]:
# Upper bound on boosting rounds passed to xgb.train below, where early
# stopping may end training sooner.
num_rounds = 1000

In [41]:
# Train with early stopping on the validation F1 score.
# - `custom_metric` replaces the deprecated `feval` argument; with a built-in
#   objective the metric function receives the transformed predictions, so
#   the 0.5 threshold inside f1_eval still applies.
# - maximize=True because a higher F1 is better.
# - Training stops once the monitored validation metric has not improved for
#   50 consecutive rounds.
xgb_cls = xgb.train(
    params,
    dtrain,
    num_boost_round=num_rounds,
    evals=[(dvalid, "validation")],
    custom_metric=f1_eval,
    maximize=True,
    early_stopping_rounds=50,
)

[0]	validation-logloss:0.46354	validation-f1:0.79844
[1]	validation-logloss:0.33762	validation-f1:0.79844
[2]	validation-logloss:0.25930	validation-f1:0.79844
[3]	validation-logloss:0.20780	validation-f1:0.79844
[4]	validation-logloss:0.17286	validation-f1:0.79844
[5]	validation-logloss:0.14867	validation-f1:0.79844
[6]	validation-logloss:0.13157	validation-f1:0.79844
[7]	validation-logloss:0.11919	validation-f1:0.79844




[8]	validation-logloss:0.11038	validation-f1:0.79844
[9]	validation-logloss:0.10406	validation-f1:0.79844
[10]	validation-logloss:0.09932	validation-f1:0.79844
[11]	validation-logloss:0.09579	validation-f1:0.79844
[12]	validation-logloss:0.09285	validation-f1:0.79844
[13]	validation-logloss:0.09090	validation-f1:0.79844
[14]	validation-logloss:0.08897	validation-f1:0.79844
[15]	validation-logloss:0.08780	validation-f1:0.79844
[16]	validation-logloss:0.08691	validation-f1:0.79911
[17]	validation-logloss:0.08580	validation-f1:0.79978
[18]	validation-logloss:0.08527	validation-f1:0.79978
[19]	validation-logloss:0.08493	validation-f1:0.80044
[20]	validation-logloss:0.08428	validation-f1:0.80044
[21]	validation-logloss:0.08392	validation-f1:0.80111
[22]	validation-logloss:0.08369	validation-f1:0.80111
[23]	validation-logloss:0.08323	validation-f1:0.80111
[24]	validation-logloss:0.08306	validation-f1:0.80111
[25]	validation-logloss:0.08275	validation-f1:0.80066
[26]	validation-logloss:0.0827

In [42]:
# Booster predictions on the validation matrix, then hard labels using the
# same 0.5 cut-off as f1_eval.
# NOTE(review): training used early stopping; depending on the XGBoost
# version, predict() may use every trained tree rather than only those up to
# best_iteration. Confirm, and pass iteration_range explicitly if needed.
y_pred_prob = xgb_cls.predict(dvalid)
y_pred = (y_pred_prob > 0.5).astype(int)

In [43]:
# Validation metrics for the XGBoost model.
model_eval(y_valid, y_pred, y_pred_prob)

AUC for logisitc regrssion is: 0.978
f1 score is: 0.808
Precision score is: 0.957
Recall score is: 0.699
