# TODO:
* Implement stacking (ensembling technique)
* Use xgboost, catboost, lightgbm and keras and logistic regression as base models
* Use linear regression with regularization (fine tune the regularization parameter with optuna) and tackle the task as regression with target values either being 0 or 1 and feature values of meta data being the probabilities predicted by our base models

In [1]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from pathlib import Path
import xgboost as xgb
import lightgbm as lgbm
import catboost
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from IPython.display import display
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import optuna
from optuna.samplers import TPESampler
from optuna.integration import XGBoostPruningCallback
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

In [2]:
from warnings import filterwarnings
filterwarnings('ignore')

# Loading Data

In [3]:
BASE_DIR = Path("/kaggle/input/playground-series-s3e2/")

train = pd.read_csv(BASE_DIR / "train.csv").drop(columns="id")
test = pd.read_csv(BASE_DIR / "test.csv").drop(columns="id")

# Preprocessing

In [4]:
df = pd.concat([train.drop(columns=["stroke"]), test], axis=0).reset_index(drop=True)
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,Male,28.0,0,0,Yes,Private,Urban,79.53,31.1,never smoked
1,Male,33.0,0,0,Yes,Private,Rural,78.44,23.9,formerly smoked
2,Female,42.0,0,0,Yes,Private,Rural,103.0,40.3,Unknown
3,Male,56.0,0,0,Yes,Private,Urban,64.87,28.8,never smoked
4,Female,24.0,0,0,No,Private,Rural,73.36,28.8,never smoked


# Some feature engineering

In [5]:
def diabetes_indicator(avg_glucose_level):
    if avg_glucose_level <= 99:
        return "normal"
    elif avg_glucose_level >= 100 and avg_glucose_level <= 125:
        return "prediabetic"
    elif avg_glucose_level <= 200:
        return "type1"
    else:
        return "type2"

df.avg_glucose_level.map(diabetes_indicator).value_counts()

normal         20098
prediabetic     4002
type1            916
type2            492
Name: avg_glucose_level, dtype: int64

In [6]:
df["diabetes"] = df.avg_glucose_level.map(diabetes_indicator)
df.head(3)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,diabetes
0,Male,28.0,0,0,Yes,Private,Urban,79.53,31.1,never smoked,normal
1,Male,33.0,0,0,Yes,Private,Rural,78.44,23.9,formerly smoked,normal
2,Female,42.0,0,0,Yes,Private,Rural,103.0,40.3,Unknown,prediabetic


# Encoding - One hot

In [7]:
df = pd.get_dummies(df)
df.head(3)

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,gender_Female,gender_Male,gender_Other,ever_married_No,ever_married_Yes,...,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,diabetes_normal,diabetes_prediabetic,diabetes_type1,diabetes_type2
0,28.0,0,0,79.53,31.1,0,1,0,0,1,...,0,1,0,0,1,0,1,0,0,0
1,33.0,0,0,78.44,23.9,0,1,0,0,1,...,1,0,0,1,0,0,1,0,0,0
2,42.0,0,0,103.0,40.3,1,0,0,0,1,...,1,0,1,0,0,0,0,1,0,0


# Preprocessing v2

In [8]:
X = df.iloc[:-len(test), :]
test_new = df.iloc[-len(test):, :]
y = train.stroke

In [32]:
type(X)

pandas.core.frame.DataFrame

# Before we ensebmle, let's first finetune individual models and to get the optimized parameters for them

In [9]:
def cross_validate(X, y, model):
    kf = StratifiedKFold(n_splits=8, shuffle=True, random_state=1337)
    
    cv_scores = []
    
    for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
                
        # training
        model.fit(X_train, y_train, verbose=0)

        # predicting
        y_pred = model.predict_proba(X_val)[:, 1]
        
        auc = roc_auc_score(y_val, y_pred)
        
        print(f"Fold: {fold} \t auc: {auc}")
        
        cv_scores.append(auc)
    
    avg_auc = np.mean(cv_scores)
    print(f"Avg AUC: {avg_auc}")

## Finetuning Logsitic Regression

In [48]:
def objective(trial, X, y):
    params = {
        "solver": trial.suggest_categorical("solver", ["liblinear", "newton-cg", "saga", "lbfgs"]),
        "C": trial.suggest_loguniform("C", 0.01, 100.0),
        "tol": trial.suggest_loguniform("tol", 1e-6, 1e-2),
        "max_iter": trial.suggest_int("max_iter", 100, 1000),
    }
        
    if params["solver"] in ["newton-cg", "lbfgs"]:
        params["penalty"] = "l2"
        
    elif params["solver"] == "liblinear":
        params["penalty"] =  trial.suggest_categorical("penalty_liblinear", ["l1","l2"])
    
    elif params["solver"] == "saga":
        params["penalty"] =  trial.suggest_categorical("penalty_saga", ["l1","l2","elasticnet"])
        
        # saga also needs anothe parameter l1_ratio which ranges in 0-1
        params["l1_ratio"] = trial.suggest_loguniform("l1_ratio", 0.00001, 1)
        
        # saga also likes it's features to be scaled to converge so let's scale our data
        sc = StandardScaler()
        X = sc.fit_transform(X)
        
    # lets cross validate
    cv = StratifiedKFold(n_splits=8, shuffle=True, random_state=1337)
    
    cv_scores = []
    for fold, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        # since we scale our dataset also, that converts it into a numpy ndarray
        # and hence it won't have iloc func that pandas dataframe has.
        # so let's check for the datatype of X first and then slice
        if type(X) == pd.core.frame.DataFrame:
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]
        else:
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]
            
        
        model = LogisticRegression(**params)       
        model.fit(X_train, y_train)
        
        y_pred = model.predict_proba(X_test)[:, 1]
        cv_scores.append(roc_auc_score(y_test, y_pred))
    
    auc = np.mean(cv_scores)
    return auc

In [None]:
lr_study = optuna.create_study(direction="maximize", study_name="linear regression tuning")
func = lambda trial: objective(trial, X, y)
lr_study.optimize(func, n_trials=100, show_progress_bar=True)

In [51]:
lr_study.best_value

0.8842857487956048

In [53]:
lr_study.best_params

{'solver': 'saga',
 'C': 0.05993945364272753,
 'tol': 3.696312820565688e-05,
 'max_iter': 694,
 'penalty_saga': 'l1',
 'l1_ratio': 0.20997642813431616}

## Finetuning XGBoost

In [16]:
# we already have fintuned params for xgboost as following
xgb_params = {'n_estimators': 150,
                 'max_depth': 4,
                 'learning_rate': 0.10513333456836436,
                 'min_child_weight': 8,
                 'gamma': 0.002126415095687836,
                 'subsample': 0.5470092690254375,
                 'colsample_bytree': 0.6296509957360737,
                 'reg_alpha': 0.0003558325520757654,
                 'reg_lambda': 0.6330475930599759}

## Finetuning LightGBM

In [57]:
from optuna.integration import LightGBMPruningCallback

def objective(trial, X, y):
    param_grid = {
#         "device_type": trial.suggest_categorical("device_type", ['gpu']),
        "is_unbalance": True,
        "n_estimators": trial.suggest_int("n_estimators", 100, 2000),
        "num_rounds": trial.suggest_int("num_rounds", 100, 500),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "num_leaves": trial.suggest_int("num_leaves", 20, 300),
        "max_depth": trial.suggest_int("max_depth", 2, 12),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 20, 1000),
        "lambda_l1": trial.suggest_loguniform('lambda_l1', 0.00001, 1.0),
        "lambda_l2": trial.suggest_loguniform('lambda_l2', 0.00001, 1.0),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
        "bagging_fraction":  trial.suggest_loguniform('bagging_fraction', 0.2, 1.0),
        "feature_fraction": trial.suggest_loguniform('feature_fraction', 0.2, 1.0),
    }

    cv = StratifiedKFold(n_splits=8, shuffle=True, random_state=1337)

    cv_scores = np.empty(8)
    for fold, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        model = lgbm.LGBMClassifier(objective="binary", **param_grid)
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_test, y_test)],
            eval_metric="auc",
            early_stopping_rounds=100,
            verbose=0,
            callbacks=[
                LightGBMPruningCallback(trial, "auc")
            ],  # Add a pruning callback
        )
        y_preds = model.predict_proba(X_test)[:, 1]
        cv_scores[fold] = roc_auc_score(y_test, y_preds)
    
    auc = np.mean(cv_scores)
    print(f"AVG CV AUC: \t {auc}")

    return auc

In [None]:
study = optuna.create_study(direction="maximize", study_name="LGBM Tuning")
func = lambda trial: objective(trial, X, y)
study.optimize(func, n_trials=100, show_progress_bar=True)

In [59]:
study.best_value

0.8846192006128957

In [60]:
study.best_params

{'n_estimators': 1449,
 'num_rounds': 150,
 'learning_rate': 0.2487851433195791,
 'num_leaves': 229,
 'max_depth': 4,
 'min_data_in_leaf': 903,
 'lambda_l1': 1.6076796916985906e-05,
 'lambda_l2': 0.0006193243147464635,
 'min_gain_to_split': 5.754065628334817,
 'bagging_fraction': 0.8222441421542225,
 'feature_fraction': 0.9652642961816826}

In [61]:
lgbm_params = study.best_params

# CatBoost

In [71]:
def objective(trial, X, y):
    param = {
        "loss_function": trial.suggest_categorical("loss_function", ["CrossEntropy"]),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-5, 1e0),
        "l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 1e-2, 1e0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "depth": trial.suggest_int("depth", 1, 10),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 2, 20),
        "one_hot_max_size": trial.suggest_int("one_hot_max_size", 2, 20),  
    }
    # Conditional Hyper-Parameters
    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)
    
    cv = StratifiedKFold(n_splits=8, shuffle=True, random_state=1337)

    cv_scores = np.empty(8)
    for fold, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        cat_model = catboost.CatBoostClassifier(**param)
        cat_model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=0, early_stopping_rounds=100)
        
        y_preds = cat_model.predict_proba(X_test)[:, 1]
        cv_scores[fold] = roc_auc_score(y_test, y_preds)
    
    auc = np.mean(cv_scores)
    print(f"AVG CV AUC: \t {auc}")
    return auc

In [72]:
cat_study = optuna.create_study(direction="maximize", study_name="catboost tuning")
func = lambda trial: objective(trial, X, y)
cat_study.optimize(func, n_trials=100, show_progress_bar=True)

[32m[I 2023-01-14 18:25:51,007][0m A new study created in memory with name: catboost tuning[0m


  0%|          | 0/100 [00:00<?, ?it/s]

AVG CV AUC: 	 0.8348826836271275
[32m[I 2023-01-14 18:26:08,550][0m Trial 0 finished with value: 0.8348826836271275 and parameters: {'loss_function': 'CrossEntropy', 'learning_rate': 0.0001660976380013114, 'l2_leaf_reg': 0.015245735436019528, 'colsample_bylevel': 0.018810974634979487, 'depth': 7, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'min_data_in_leaf': 15, 'one_hot_max_size': 15, 'subsample': 0.8109421023773793}. Best is trial 0 with value: 0.8348826836271275.[0m
AVG CV AUC: 	 0.8755961583589857
[32m[I 2023-01-14 18:26:12,805][0m Trial 1 finished with value: 0.8755961583589857 and parameters: {'loss_function': 'CrossEntropy', 'learning_rate': 0.3990474388394784, 'l2_leaf_reg': 0.29847285009274865, 'colsample_bylevel': 0.04570271539215538, 'depth': 7, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS', 'min_data_in_leaf': 15, 'one_hot_max_size': 4}. Best is trial 1 with value: 0.8755961583589857.[0m
AVG CV AUC: 	 0.8544372817249424
[32m[I 2023-01-14 18:26:36,95

In [73]:
cat_study.best_value

0.8894850951782781

In [74]:
cat_study.best_params

{'loss_function': 'CrossEntropy',
 'learning_rate': 0.6917811416780869,
 'l2_leaf_reg': 0.09057687056905092,
 'colsample_bylevel': 0.04522494561718612,
 'depth': 1,
 'boosting_type': 'Plain',
 'bootstrap_type': 'MVS',
 'min_data_in_leaf': 18,
 'one_hot_max_size': 8}

### Before we implement stacking, let's reserve a validation set for validation/testing the meta leaners's predictions before we make the submission because the test set that comes with compeition has no y.