In [1]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from pathlib import Path
import xgboost as xgb
import lightgbm as lgbm
import catboost
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score
from IPython.display import display
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import optuna
from optuna.samplers import TPESampler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import kerastuner as kt



In [2]:
import warnings
warnings.filterwarnings('ignore')

# Utils

In [3]:
def plot_feature_importances(cols, feat_imps):
    fig = plt.figure(figsize = (15, 0.35*len(feat_imps)))
    feature_imp_df = pd.DataFrame(data=zip(cols, feat_imps), columns=["feature", "importance"]).sort_values(by="importance", ascending=False)
    plt.title('Feature importances', size=25, y=1.05)
    sns.barplot(data=feature_imp_df, x='importance', y='feature')
    plt.show()

# Loading Data

In [4]:
BASE_DIR = Path("/kaggle/input/playground-series-s3e2/")

train = pd.read_csv(BASE_DIR / "train.csv").drop(columns="id")
test = pd.read_csv(BASE_DIR / "test.csv").drop(columns="id")

# Preprocessing

In [5]:
df = pd.concat([train.drop(columns=["stroke"]), test], axis=0).reset_index(drop=True)
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,Male,28.0,0,0,Yes,Private,Urban,79.53,31.1,never smoked
1,Male,33.0,0,0,Yes,Private,Rural,78.44,23.9,formerly smoked
2,Female,42.0,0,0,Yes,Private,Rural,103.0,40.3,Unknown
3,Male,56.0,0,0,Yes,Private,Urban,64.87,28.8,never smoked
4,Female,24.0,0,0,No,Private,Rural,73.36,28.8,never smoked


In [6]:
df = pd.get_dummies(df)
df.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,gender_Female,gender_Male,gender_Other,ever_married_No,ever_married_Yes,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,28.0,0,0,79.53,31.1,0,1,0,0,1,...,0,1,0,0,0,1,0,0,1,0
1,33.0,0,0,78.44,23.9,0,1,0,0,1,...,0,1,0,0,1,0,0,1,0,0
2,42.0,0,0,103.0,40.3,1,0,0,0,1,...,0,1,0,0,1,0,1,0,0,0
3,56.0,0,0,64.87,28.8,0,1,0,0,1,...,0,1,0,0,0,1,0,0,1,0
4,24.0,0,0,73.36,28.8,1,0,0,1,0,...,0,1,0,0,1,0,0,0,1,0


In [7]:
X = df.iloc[:-len(test), :]
test_new = df.iloc[-len(test):, :]
y = train.stroke

In [8]:
X.shape

(15304, 21)

# Tuning Keras - without feature engineering to set a baseline

In [9]:
def model_builder(hp):
    inputs = layers.Input(shape=(21,))
    
    hp_units_1 = hp.Int("units_1", min_value=256, max_value=1024, step=128)
    x = layers.Dense(hp_units_1, activation="relu")(inputs)
    x = layers.BatchNormalization()(x)
    hp_dropout_1 = hp.Float("dropout_1", min_value=0, max_value=0.8, step=0.1)
    x = layers.Dropout(hp_dropout_1)(x)
    
    hp_units_2 = hp.Int("units_2", min_value=128, max_value=512, step=64)
    x = layers.Dense(hp_units_2, activation="relu")(x)
    x = layers.BatchNormalization()(x)
    hp_dropout_2 = hp.Float("dropout_2", min_value=0, max_value=0.8, step=0.1)
    x = layers.Dropout(hp_dropout_2)(x)
    
    hp_units_3 = hp.Int("units_3", min_value=64, max_value=256, step=32)
    x = layers.Dense(hp_units_3, activation="relu")(x)
    x = layers.BatchNormalization()(x)
    hp_dropout_3 = hp.Float("dropout_3", min_value=0, max_value=0.8, step=0.1)
    x = layers.Dropout(hp_dropout_3)(x)
    
    hp_units_4 = hp.Int("units_4", min_value=16, max_value=128, step=16)
    x = layers.Dense(hp_units_4, activation="relu")(x)
    x = layers.BatchNormalization()(x)
    hp_dropout_4 = hp.Float("dropout_4", min_value=0, max_value=0.8, step=0.1)
    x = layers.Dropout(hp_dropout_4)(x)

    outputs = layers.Dense(1, activation="sigmoid")(x)

    keras_model = keras.Model(inputs=inputs, outputs=outputs)
    
    hp_learning_rate = hp.Float("learning_rate", min_value=1e-05, max_value=1e-1, sampling="log")
    hp_optimizer = hp.Choice("optimizer", ["rmsprop", "adam"])
    
    if hp_optimizer == "adam":
        optim = keras.optimizers.RMSprop(learning_rate=hp_learning_rate)
    else:
        optim = keras.optimizers.Adam(learning_rate=hp_learning_rate)
        
    keras_model.compile(optimizer=optim,
                       loss=keras.losses.binary_crossentropy,
                       metrics=[keras.metrics.AUC()])
    
    return keras_model

In [10]:
early_stopping = keras.callbacks.EarlyStopping(
                patience=5,
                min_delta=0.001,
                monitor="val_auc",
                restore_best_weights=True,
                )

# model_checkpoint = keras.callbacks.ModelCheckpoint(
#                         "./best_keras_model.hdf5",
#                         monitor= "val_loss",
#                         save_best_only = True
#                     )

In [11]:
sc = StandardScaler()
X_scaled = sc.fit_transform(X)

In [14]:
tuner = kt.Hyperband(model_builder,
                    objective=kt.Objective("val_auc", direction="max"),
                    max_epochs=50,
                    directory="./",
                    project_name="tuning_keras",
                    overwrite=True)

In [15]:
tuner.search(X, y, epochs=50, validation_split=0.2, callbacks=[early_stopping])

Trial 90 Complete [00h 00m 34s]
val_auc: 0.790589451789856

Best val_auc So Far: 0.8956817984580994
Total elapsed time: 00h 28m 59s


In [18]:
best_hps_list = tuner.get_best_hyperparameters(num_trials=5)

In [20]:
for i, best_hps in enumerate(best_hps_list):
    print(f"{'-'*15} {i} {'-'*15}")
    print(best_hps.values)

--------------- 0 ---------------
{'units_1': 512, 'dropout_1': 0.30000000000000004, 'units_2': 448, 'dropout_2': 0.2, 'units_3': 128, 'dropout_3': 0.1, 'units_4': 32, 'dropout_4': 0.7000000000000001, 'learning_rate': 0.0034964601509452167, 'optimizer': 'rmsprop', 'tuner/epochs': 17, 'tuner/initial_epoch': 6, 'tuner/bracket': 2, 'tuner/round': 1, 'tuner/trial_id': '0055'}
--------------- 1 ---------------
{'units_1': 768, 'dropout_1': 0.1, 'units_2': 384, 'dropout_2': 0.30000000000000004, 'units_3': 128, 'dropout_3': 0.6000000000000001, 'units_4': 48, 'dropout_4': 0.5, 'learning_rate': 0.041466968969291085, 'optimizer': 'adam', 'tuner/epochs': 50, 'tuner/initial_epoch': 17, 'tuner/bracket': 1, 'tuner/round': 1, 'tuner/trial_id': '0074'}
--------------- 2 ---------------
{'units_1': 896, 'dropout_1': 0.6000000000000001, 'units_2': 320, 'dropout_2': 0.2, 'units_3': 96, 'dropout_3': 0.6000000000000001, 'units_4': 80, 'dropout_4': 0.1, 'learning_rate': 0.0032006659651856403, 'optimizer': '

In [None]:
# def keras_cv(X, y, model):
#     kf = StratifiedKFold(n_splits=8, shuffle=True, random_state=1337) # thumbs up if you're 1337 gang :D jk
    
#     cv_scores = []
    
#     for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
#         X_train, X_val = X[train_idx], X[val_idx]
#         y_train, y_val = y[train_idx], y[val_idx]
        
#         history = model.fit(
#             X_train, y_train,
#             validation_data=(X_val, y_val),
#             batch_size=512,
#             epochs=50,
#             callbacks=[early_stopping, model_checkpoint],
#             )

#         y_pred = model.predict(X_val)[:, 0]
        
#         auc = roc_auc_score(y_val, y_pred)
        
#         print(f"Fold: {fold} \t auc: {auc}")
        
#         cv_scores.append(auc)
    
#     avg_auc = np.mean(cv_scores)
#     print(f"Avg AUC: {avg_auc}")

In [None]:
# keras_cv(X_scaled, y, keras_model)

In [None]:
def objective(trial, X, y):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_loguniform('gamma', 0.00001, 0.5),
        'subsample': trial.suggest_loguniform('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_loguniform('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.00001, 1.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.00001, 1.0)
    }
    

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1337)

    cv_scores = np.empty(5)
    for fold, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        model = xgb.XGBClassifier(**params)
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_test, y_test)],
            eval_metric="auc",
            early_stopping_rounds=50,
            verbose=0,
        )
        y_preds = model.predict_proba(X_test)[:, 1]
        cv_scores[fold] = roc_auc_score(y_test, y_preds)
    
    auc = np.mean(cv_scores)
    print(f"AVG CV AUC: \t {auc}")

    return auc

In [None]:
study = optuna.create_study(study_name="xgboost_tuning", direction="maximize")
func = lambda trial: objective(trial, X, y)
study.optimize(func, n_trials=100)

In [None]:
study.best_value

In [None]:
study.best_params

# Feature Engineering
Copying from my other notebook at https://www.kaggle.com/khawajaabaidullah/ps-s3e2-feature-engineering/

## BMI Features

In [None]:
def bmi_level(bmi):
    if bmi < 18.5:
        return "underweight"
    elif bmi >= 18.5 and bmi < 25.0:
        return "healthy"
    elif bmi >= 25.0 and bmi < 30.0:
        return "overweight"
    elif bmi >= 30.0 and bmi < 35.0:
        return "obese_class1"
    elif bmi >= 35.0 and bmi < 40.0:
        return "obese_class2"
    elif bmi >= 40.0:
        return "obese_class3"

In [None]:
df_bmi = df.copy()
df_bmi["bmi_level"] = df.bmi.map(bmi_level)
df_bmi.head(3)

In [None]:
df_bmi = pd.get_dummies(df_bmi)
df_bmi.head()

In [None]:
X_bmi = df_bmi.iloc[:-len(test)]

In [None]:
study_bmi = optuna.create_study(study_name="xgboost_tuning_with_bmi_feats", direction="maximize")
func = lambda trial: objective(trial, X_bmi, y)
study_bmi.optimize(func, n_trials=100)

In [None]:
study_bmi.best_value

In [None]:
study_bmi.best_params

# AVG GLUCOSE FEATS

In [None]:
def diabetes_indicator(avg_glucose_level):
    if avg_glucose_level <= 99:
        return "normal"
    elif avg_glucose_level >= 100 and avg_glucose_level <= 125:
        return "prediabetic"
    elif avg_glucose_level <= 200:
        return "type1"
    else:
        return "type2"

df.avg_glucose_level.map(diabetes_indicator).value_counts()

In [None]:
df_diab = df.copy()
df_diab["diabetes"] = df.avg_glucose_level.map(diabetes_indicator)
df_diab.head(3)

In [None]:
df_diab = pd.get_dummies(df_diab)
df_diab.head(3)

In [None]:
X_diab = df.iloc[:-len(test), :]

In [None]:
study_diab = optuna.create_study(study_name="xgboost_tuning_with_diabetes_feats", direction="maximize")
func = lambda trial: objective(trial, X_diab, y)
study_diab.optimize(func, n_trials=200)

In [None]:
print("best_value: ", study_diab.best_value)
print("\nbest_trial: ", study_diab.best_trial)
print("\nbest_params: \n", study_diab.best_params)

In [None]:
# so the best trial came out to be 99 or the 100th (index starts from 0), i wonder if it'll get better if we set trials to 200?
# let's see

In [None]:
study_diab_2 = optuna.create_study(study_name="xgboost_tuning_with_diabetes_feats_200trials", direction="maximize")
func = lambda trial: objective(trial, X_diab, y)
study_diab_2.optimize(func, n_trials=200)

In [None]:
study_diab_2.best_value

In [None]:
study_diab_2.best_params

## Let's try combining all new featres i.e. Diabetes + BMI

In [None]:
df["bmi_level"] = df.bmi.map(bmi_level)
df["diabetes"] = df.avg_glucose_level.map(diabetes_indicator)
df.head(2)

In [None]:
df = pd.get_dummies(df)

In [None]:
X_final = df.iloc[:-len(test), :]

In [None]:
study_final = optuna.create_study(study_name="xgboost_tuning_with_diab_and_bmi", direction="maximize")
func = lambda trial: objective(trial, X_final, y)
study_final.optimize(func, n_trials=300)

In [None]:
study_final.best_value

In [None]:
study_final.best_params