# TODO:
* Implement stacking (ensembling technique)
* Use xgboost, catboost, lightgbm and keras and logistic regression as base models
* Use regularized linear regression as the meta-learner (fine-tune the regularization parameter with Optuna): treat the task as regression on the 0/1 target, with the probabilities predicted by our base models as the meta-features

# REMOVE ALL THE NEW ADDED FEATURES
## The keras tuned model achieved a score of 87.7 with no feature engineering

In [2]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from pathlib import Path
import xgboost as xgb
import lightgbm as lgbm
import catboost
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from IPython.display import display
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import optuna
from optuna.samplers import TPESampler
from optuna.integration import XGBoostPruningCallback
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

In [3]:
# Install a global "ignore" filter so library warnings do not clutter the
# cell outputs below.
from warnings import filterwarnings

filterwarnings(action='ignore')

# Loading Data

In [4]:
# Directory holding the competition CSV files.
BASE_DIR = Path("/kaggle/input/playground-series-s3e2/")

# Read the train and test splits (in that order) and drop the "id" column
# from each of them.
train, test = (
    pd.read_csv(BASE_DIR / csv_name).drop(columns="id")
    for csv_name in ("train.csv", "test.csv")
)

# Preprocessing

In [5]:
# Stack the training features (target column removed) on top of the test rows
# so that the preprocessing below is applied to both splits at once. The index
# is renumbered 0..n-1 across the combined frame.
df = pd.concat(
    objs=[train.drop(columns="stroke"), test],
    axis=0,
    ignore_index=True,
)
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,Male,28.0,0,0,Yes,Private,Urban,79.53,31.1,never smoked
1,Male,33.0,0,0,Yes,Private,Rural,78.44,23.9,formerly smoked
2,Female,42.0,0,0,Yes,Private,Rural,103.0,40.3,Unknown
3,Male,56.0,0,0,Yes,Private,Urban,64.87,28.8,never smoked
4,Female,24.0,0,0,No,Private,Rural,73.36,28.8,never smoked


# Encoding - One hot

In [6]:
# One-hot encode the combined train+test frame, then preview the first three
# encoded rows.
df = pd.get_dummies(data=df)
df.head(n=3)

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,gender_Female,gender_Male,gender_Other,ever_married_No,ever_married_Yes,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,28.0,0,0,79.53,31.1,0,1,0,0,1,...,0,1,0,0,0,1,0,0,1,0
1,33.0,0,0,78.44,23.9,0,1,0,0,1,...,0,1,0,0,1,0,0,1,0,0
2,42.0,0,0,103.0,40.3,1,0,0,0,1,...,0,1,0,0,1,0,1,0,0,0


# Preprocessing v2

In [7]:
# Undo the train/test concatenation now that encoding is done. `df` was built
# with the training rows first, so the first len(train) rows are the training
# features and everything after them belongs to the test set.
# Slicing from the front (rather than with -len(test)) keeps X aligned
# row-for-row with y and still behaves correctly if the test frame is empty,
# where `-0` would select the wrong rows.
X = df.iloc[:len(train)]
test_new = df.iloc[len(train):]

# Target column of the training split.
y = train.stroke

In [None]:
# Quick look at which container type holds the training features.
type(X)

# Before we ensemble, let's first fine-tune the individual models to get optimized parameters for each of them

In [8]:
def cross_validate(X, y, model):
    """Score ``model`` with stratified 8-fold cross-validation on ROC AUC.

    ``fit`` is called on the same ``model`` object for the training part of
    every fold, and the held-out part is scored with the positive-class
    probability (column 1 of ``predict_proba``). The per-fold AUC and the
    average AUC are printed.

    Args:
        X: Feature table; rows are selected with ``.iloc``.
        y: Target labels aligned row-for-row with ``X``; also selected with
            ``.iloc``.
        model: Estimator exposing ``fit(X, y)`` and ``predict_proba(X)``.

    Returns:
        The mean ROC AUC over the 8 folds.
    """
    import inspect

    kf = StratifiedKFold(n_splits=8, shuffle=True, random_state=1337)

    # Only forward ``verbose=0`` to estimators whose ``fit`` explicitly names
    # that parameter. Passing it unconditionally raises TypeError for
    # estimators such as scikit-learn's LogisticRegression, whose ``fit`` has
    # no ``verbose`` argument.
    try:
        accepts_verbose = "verbose" in inspect.signature(model.fit).parameters
    except (TypeError, ValueError):
        # Signature cannot be introspected: keep the original call form.
        accepts_verbose = True
    fit_kwargs = {"verbose": 0} if accepts_verbose else {}

    cv_scores = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # training
        model.fit(X_train, y_train, **fit_kwargs)

        # predicting: probability of the positive class
        y_pred = model.predict_proba(X_val)[:, 1]

        auc = roc_auc_score(y_val, y_pred)

        print(f"Fold: {fold} \t auc: {auc}")

        cv_scores.append(auc)

    avg_auc = np.mean(cv_scores)
    print(f"Avg AUC: {avg_auc}")

    # Hand the score back so callers can compare models programmatically
    # instead of reading it off the printed output.
    return avg_auc

## Finetuning Logistic Regression

In [9]:
# def objective(trial, X, y):
#     params = {
#         "solver": trial.suggest_categorical("solver", ["liblinear", "newton-cg", "saga", "lbfgs"]),
#         "C": trial.suggest_loguniform("C", 0.01, 100.0),
#         "tol": trial.suggest_loguniform("tol", 1e-6, 1e-2),
#         "max_iter": trial.suggest_int("max_iter", 100, 1000),
#     }
        
#     if params["solver"] in ["newton-cg", "lbfgs"]:
#         params["penalty"] = "l2"
        
#     elif params["solver"] == "liblinear":
#         params["penalty"] =  trial.suggest_categorical("penalty_liblinear", ["l1","l2"])
    
#     elif params["solver"] == "saga":
#         params["penalty"] =  trial.suggest_categorical("penalty_saga", ["l1","l2","elasticnet"])
        
#         # saga also needs another parameter, l1_ratio, which ranges from 0 to 1
#         params["l1_ratio"] = trial.suggest_loguniform("l1_ratio", 0.00001, 1)
        
#         # saga also likes its features to be scaled in order to converge, so let's scale our data
#         sc = StandardScaler()
#         X = sc.fit_transform(X)
        
#     # lets cross validate
#     cv = StratifiedKFold(n_splits=8, shuffle=True, random_state=1337)
    
#     cv_scores = []
#     for fold, (train_idx, test_idx) in enumerate(cv.split(X, y)):
#         # since we scale our dataset also, that converts it into a numpy ndarray
#         # and hence it won't have iloc func that pandas dataframe has.
#         # so let's check for the datatype of X first and then slice
#         if type(X) == pd.core.frame.DataFrame:
#             X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
#             y_train, y_test = y[train_idx], y[test_idx]
#         else:
#             X_train, X_test = X[train_idx], X[test_idx]
#             y_train, y_test = y[train_idx], y[test_idx]
            
        
#         model = LogisticRegression(**params)       
#         model.fit(X_train, y_train)
        
#         y_pred = model.predict_proba(X_test)[:, 1]
#         cv_scores.append(roc_auc_score(y_test, y_pred))
    
#     auc = np.mean(cv_scores)
#     return auc

In [10]:
# lr_study = optuna.create_study(direction="maximize", study_name="linear regression tuning")
# func = lambda trial: objective(trial, X, y)
# lr_study.optimize(func, n_trials=100, show_progress_bar=True)

[32m[I 2023-01-15 14:03:59,120][0m A new study created in memory with name: linear regression tuning[0m


  0%|          | 0/100 [00:00<?, ?it/s]

[32m[I 2023-01-15 14:04:03,540][0m Trial 0 finished with value: 0.8832357853760888 and parameters: {'solver': 'newton-cg', 'C': 3.2028242020646847, 'tol': 0.0013055534435784677, 'max_iter': 442}. Best is trial 0 with value: 0.8832357853760888.[0m
[32m[I 2023-01-15 14:04:08,621][0m Trial 1 finished with value: 0.883240961859669 and parameters: {'solver': 'newton-cg', 'C': 5.820092204893791, 'tol': 0.004606852242096995, 'max_iter': 911}. Best is trial 1 with value: 0.883240961859669.[0m
[32m[I 2023-01-15 14:04:10,530][0m Trial 2 finished with value: 0.858588821556258 and parameters: {'solver': 'liblinear', 'C': 0.020367227539703017, 'tol': 2.5334999744522436e-05, 'max_iter': 734, 'penalty_liblinear': 'l1'}. Best is trial 1 with value: 0.883240961859669.[0m
[32m[I 2023-01-15 14:04:28,469][0m Trial 3 finished with value: 0.8833229228496887 and parameters: {'solver': 'saga', 'C': 0.737476710003354, 'tol': 0.00023418637012104457, 'max_iter': 718, 'penalty_saga': 'l2', 'l1_ratio': 

In [11]:
# lr_study.best_value

0.8837284140634706

In [12]:
# lr_study.best_params

{'solver': 'saga',
 'C': 0.07655860626453208,
 'tol': 0.00011268918749381429,
 'max_iter': 484,
 'penalty_saga': 'l1',
 'l1_ratio': 0.012007897837476486}

## Finetuning XGBoost

In [None]:
# # we already have fine-tuned params for xgboost, as follows
# xgb_params = {'n_estimators': 272,
#                  'max_depth': 4,
#                  'learning_rate': 0.07360332417334109,
#                  'min_child_weight': 1,
#                  'gamma': 0.05180472121817407,
#                  'subsample': 0.41981696676590474,
#                  'colsample_bytree': 0.730818203141452,
#                  'reg_alpha': 0.0005268315833160329,
#                  'reg_lambda': 0.0007997627184403383}

## Finetuning LightGBM

In [None]:
# from optuna.integration import LightGBMPruningCallback

# def objective(trial, X, y):
#     param_grid = {
# #         "device_type": trial.suggest_categorical("device_type", ['gpu']),
#         "is_unbalance": True,
#         "n_estimators": trial.suggest_int("n_estimators", 100, 2000),
#         "num_rounds": trial.suggest_int("num_rounds", 100, 500),
#         "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
#         "num_leaves": trial.suggest_int("num_leaves", 20, 300),
#         "max_depth": trial.suggest_int("max_depth", 2, 12),
#         "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 20, 1000),
#         "lambda_l1": trial.suggest_loguniform('lambda_l1', 0.00001, 1.0),
#         "lambda_l2": trial.suggest_loguniform('lambda_l2', 0.00001, 1.0),
#         "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
#         "bagging_fraction":  trial.suggest_loguniform('bagging_fraction', 0.2, 1.0),
#         "feature_fraction": trial.suggest_loguniform('feature_fraction', 0.2, 1.0),
#     }

#     cv = StratifiedKFold(n_splits=8, shuffle=True, random_state=1337)

#     cv_scores = np.empty(8)
#     for fold, (train_idx, test_idx) in enumerate(cv.split(X, y)):
#         X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
#         y_train, y_test = y[train_idx], y[test_idx]

#         model = lgbm.LGBMClassifier(objective="binary", **param_grid)
#         model.fit(
#             X_train,
#             y_train,
#             eval_set=[(X_test, y_test)],
#             eval_metric="auc",
#             early_stopping_rounds=100,
#             verbose=0,
#             callbacks=[
#                 LightGBMPruningCallback(trial, "auc")
#             ],  # Add a pruning callback
#         )
#         y_preds = model.predict_proba(X_test)[:, 1]
#         cv_scores[fold] = roc_auc_score(y_test, y_preds)
    
#     auc = np.mean(cv_scores)
#     print(f"AVG CV AUC: \t {auc}")

#     return auc

In [None]:
# study = optuna.create_study(direction="maximize", study_name="LGBM Tuning")
# func = lambda trial: objective(trial, X, y)
# study.optimize(func, n_trials=100, show_progress_bar=True)

In [None]:
# study.best_value

In [None]:
# study.best_params

In [None]:
# NOTE: the LightGBM tuning study above is commented out, so `study` does not
# exist when the notebook is run top to bottom and this assignment would raise
# NameError. The tuned values are hard-coded in the "best parameters" cell of
# the stacking section instead. Re-enable this line together with the study.
# lgbm_params = study.best_params

# CatBoost

In [14]:
# def objective(trial, X, y):
#     param = {
#         "loss_function": trial.suggest_categorical("loss_function", ["CrossEntropy"]),
#         "learning_rate": trial.suggest_loguniform("learning_rate", 1e-5, 1e0),
#         "l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 1e-2, 1e0),
#         "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
#         "depth": trial.suggest_int("depth", 1, 10),
#         "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
#         "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]),
#         "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 2, 20),
#         "one_hot_max_size": trial.suggest_int("one_hot_max_size", 2, 20),  
#     }
#     # Conditional Hyper-Parameters
#     if param["bootstrap_type"] == "Bayesian":
#         param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
#     elif param["bootstrap_type"] == "Bernoulli":
#         param["subsample"] = trial.suggest_float("subsample", 0.1, 1)
    
#     cv = StratifiedKFold(n_splits=8, shuffle=True, random_state=1337)

#     cv_scores = np.empty(8)
#     for fold, (train_idx, test_idx) in enumerate(cv.split(X, y)):
#         X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
#         y_train, y_test = y[train_idx], y[test_idx]

#         cat_model = catboost.CatBoostClassifier(**param)
#         cat_model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=0, early_stopping_rounds=100)
        
#         y_preds = cat_model.predict_proba(X_test)[:, 1]
#         cv_scores[fold] = roc_auc_score(y_test, y_preds)
    
#     auc = np.mean(cv_scores)
#     print(f"AVG CV AUC: \t {auc}")
#     return auc

In [15]:
# cat_study = optuna.create_study(direction="maximize", study_name="catboost tuning")
# func = lambda trial: objective(trial, X, y)
# cat_study.optimize(func, n_trials=100, show_progress_bar=True)

[32m[I 2023-01-15 14:32:46,376][0m A new study created in memory with name: catboost tuning[0m


  0%|          | 0/100 [00:00<?, ?it/s]

AVG CV AUC: 	 0.8433121557638419
[32m[I 2023-01-15 14:34:00,968][0m Trial 0 finished with value: 0.8433121557638419 and parameters: {'loss_function': 'CrossEntropy', 'learning_rate': 0.00038667366894423077, 'l2_leaf_reg': 0.28344531162113895, 'colsample_bylevel': 0.035192082523750806, 'depth': 8, 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS', 'min_data_in_leaf': 16, 'one_hot_max_size': 19}. Best is trial 0 with value: 0.8433121557638419.[0m
AVG CV AUC: 	 0.8427159974048563
[32m[I 2023-01-15 14:35:06,903][0m Trial 1 finished with value: 0.8427159974048563 and parameters: {'loss_function': 'CrossEntropy', 'learning_rate': 0.004295360948863174, 'l2_leaf_reg': 0.38536540688936655, 'colsample_bylevel': 0.02657439645438508, 'depth': 6, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'min_data_in_leaf': 5, 'one_hot_max_size': 20, 'bagging_temperature': 7.189931196909804}. Best is trial 0 with value: 0.8433121557638419.[0m
AVG CV AUC: 	 0.8868731278384384
[32m[I 2023-01-

In [16]:
# cat_study.best_value

0.8886163086840688

In [17]:
# cat_study.best_params

{'loss_function': 'CrossEntropy',
 'learning_rate': 0.7611165319863433,
 'l2_leaf_reg': 0.014250721124770624,
 'colsample_bylevel': 0.061218122768953775,
 'depth': 1,
 'boosting_type': 'Plain',
 'bootstrap_type': 'Bernoulli',
 'min_data_in_leaf': 18,
 'one_hot_max_size': 14,
 'subsample': 0.9972814280760031}

### Before we implement stacking, let's reserve a hold-out set for validating/testing the meta-learner's predictions before we make a submission, because the test set that comes with the competition has no y.

In [19]:
# Hold out a stratified 20% of the labelled data for checking the stacked
# model. The split is seeded with the same value used for the CV folds so the
# hold-out set, and every score computed on it, is reproducible across runs.
X_train, X_valtest, y_train, y_valtest = train_test_split(
    X,
    y,
    shuffle=True,
    test_size=0.2,
    stratify=y,
    random_state=1337,
)

In [20]:
# sanity check: training features and labels must have the same number of rows
X_train.shape[0] == y_train.shape[0]

True

# Stacking (finally!)

### Let's set the best parameters for each model here

In [23]:
# Hard-coded hyper-parameters for every base learner, so that the stacking
# code below does not require re-running the tuning studies.

# XGBoost
xgb_params = dict(
    n_estimators=272,
    max_depth=4,
    learning_rate=0.07360332417334109,
    min_child_weight=1,
    gamma=0.05180472121817407,
    subsample=0.41981696676590474,
    colsample_bytree=0.730818203141452,
    reg_alpha=0.0005268315833160329,
    reg_lambda=0.0007997627184403383,
)

# LightGBM
lgbm_params = dict(
    n_estimators=1942,
    num_rounds=477,
    learning_rate=0.2870761124159734,
    num_leaves=37,
    max_depth=6,
    min_data_in_leaf=368,
    lambda_l1=0.00017451291663562305,
    lambda_l2=0.5090553595978456,
    min_gain_to_split=0.02547126174774228,
    bagging_fraction=0.20531850278394478,
    feature_fraction=0.37437811030015083,
)

# Logistic regression. These are the best parameters reported by the tuning
# study, with its `penalty_saga` entry stored under the `penalty` key.
lr_params = dict(
    solver='saga',
    C=0.07655860626453208,
    tol=0.00011268918749381429,
    max_iter=484,
    penalty='l1',
    l1_ratio=0.012007897837476486,
)

# CatBoost (identical to the best parameters reported by the tuning study)
cat_params = dict(
    loss_function='CrossEntropy',
    learning_rate=0.7611165319863433,
    l2_leaf_reg=0.014250721124770624,
    colsample_bylevel=0.061218122768953775,
    depth=1,
    boosting_type='Plain',
    bootstrap_type='Bernoulli',
    min_data_in_leaf=18,
    one_hot_max_size=14,
    subsample=0.9972814280760031,
)

In [24]:
# Build the base (level-0) learners of the stack from the tuned parameters.
xgb_model = xgb.XGBClassifier(**xgb_params)
lgbm_model = lgbm.LGBMClassifier(objective="binary", is_unbalance=True, **lgbm_params)
cat_model = catboost.CatBoostClassifier(**cat_params)

# The tuned logistic-regression parameters use the saga solver, and the tuning
# objective standardised the features before fitting saga models. Wrapping the
# estimator with a StandardScaler reproduces that preprocessing. The pipeline
# still exposes fit / predict / predict_proba like the bare estimator, and the
# scaler is fitted only on the data passed to `fit`.
from sklearn.pipeline import make_pipeline

lr_model = make_pipeline(StandardScaler(), LogisticRegression(**lr_params))

# Order: XGBoost, LightGBM, CatBoost, logistic regression.
base_learners = [xgb_model, lgbm_model, cat_model, lr_model]