# TODO:
* #### Tune XGBoost wihtout any features

In [15]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from pathlib import Path
import xgboost as xgb
import lightgbm as lgbm
import catboost
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score
from IPython.display import display
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import optuna
from optuna.samplers import TPESampler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

In [16]:
import warnings
warnings.filterwarnings('ignore')

# Utils

In [17]:
def plot_feature_importances(cols, feat_imps):
    fig = plt.figure(figsize = (15, 0.35*len(feat_imps)))
    feature_imp_df = pd.DataFrame(data=zip(cols, feat_imps), columns=["feature", "importance"]).sort_values(by="importance", ascending=False)
    plt.title('Feature importances', size=25, y=1.05)
    sns.barplot(data=feature_imp_df, x='importance', y='feature')
    plt.show()

# Loading Data

In [18]:
BASE_DIR = Path("/kaggle/input/playground-series-s3e2/")

train = pd.read_csv(BASE_DIR / "train.csv").drop(columns="id")
test = pd.read_csv(BASE_DIR / "test.csv").drop(columns="id")

# Preprocessing

In [19]:
df = pd.concat([train.drop(columns=["stroke"]), test], axis=0).reset_index(drop=True)
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,Male,28.0,0,0,Yes,Private,Urban,79.53,31.1,never smoked
1,Male,33.0,0,0,Yes,Private,Rural,78.44,23.9,formerly smoked
2,Female,42.0,0,0,Yes,Private,Rural,103.0,40.3,Unknown
3,Male,56.0,0,0,Yes,Private,Urban,64.87,28.8,never smoked
4,Female,24.0,0,0,No,Private,Rural,73.36,28.8,never smoked


In [20]:
df = pd.get_dummies(df)
df.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,gender_Female,gender_Male,gender_Other,ever_married_No,ever_married_Yes,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,28.0,0,0,79.53,31.1,0,1,0,0,1,...,0,1,0,0,0,1,0,0,1,0
1,33.0,0,0,78.44,23.9,0,1,0,0,1,...,0,1,0,0,1,0,0,1,0,0
2,42.0,0,0,103.0,40.3,1,0,0,0,1,...,0,1,0,0,1,0,1,0,0,0
3,56.0,0,0,64.87,28.8,0,1,0,0,1,...,0,1,0,0,0,1,0,0,1,0
4,24.0,0,0,73.36,28.8,1,0,0,1,0,...,0,1,0,0,1,0,0,0,1,0


In [21]:
X = df.iloc[:-len(test), :]
test_new = df.iloc[-len(test):, :]
y = train.stroke

# Tuning XGBoost - without feature engineering

In [22]:
def objective(trial, X, y):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_loguniform('gamma', 0.00001, 0.5),
        'subsample': trial.suggest_loguniform('subsample', 0.2, 1.0),
        'colsample_bytree': trial.suggest_loguniform('colsample_bytree', 0.2, 1.0),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 0.00001, 1.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 0.00001, 1.0)
    }
    

    cv = StratifiedKFold(n_splits=8, shuffle=True, random_state=1337)

    cv_scores = np.empty(8)
    for fold, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        model = xgb.XGBClassifier(**params)
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_test, y_test)],
            eval_metric="auc",
            early_stopping_rounds=50,
            verbose=0,
        )
        y_preds = model.predict_proba(X_test)[:, 1]
        cv_scores[fold] = roc_auc_score(y_test, y_preds)
    
    auc = np.mean(cv_scores)
    print(f"AVG CV AUC: \t {auc}")

    return auc

In [None]:
study = optuna.create_study(study_name="xgboost_tuning", direction="maximize")
func = lambda trial: objective(trial, X, y)
study.optimize(func, n_trials=100)

In [None]:
study.best_value

In [None]:
study.best_params

# Tuning LightGBM

In [38]:
from optuna.integration import LightGBMPruningCallback

def objective_lgbm(trial, X, y):
    param_grid = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 2000),
        "num_rounds": trial.suggest_int("num_rounds", 100, 500),
        "learning_rate": trial.suggest_float("learning_rate", 0.0001, 0.3),
        "num_leaves": trial.suggest_int("num_leaves", 20, 300),
        "max_depth": trial.suggest_int("max_depth", 2, 12),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 20, 1000),
        "lambda_l1": trial.suggest_loguniform('lambda_l1', 0.00001, 1.0),
        "lambda_l2": trial.suggest_loguniform('lambda_l2', 0.00001, 1.0),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
        "bagging_fraction":  trial.suggest_loguniform('bagging_fraction', 0.2, 1.0),
        "feature_fraction": trial.suggest_loguniform('feature_fraction', 0.2, 1.0),
    }

    cv = StratifiedKFold(n_splits=8, shuffle=True, random_state=1337)

    cv_scores = np.empty(8)
    for fold, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        model = lgbm.LGBMClassifier(objective="binary", is_unbalance=True, **param_grid)
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_test, y_test)],
            eval_metric="auc",
            early_stopping_rounds=100,
            verbose=0,
            callbacks=[
                LightGBMPruningCallback(trial, "auc")
            ],  # Add a pruning callback
        )
        y_preds = model.predict_proba(X_test)[:, 1]
        cv_scores[fold] = roc_auc_score(y_test, y_preds)
    
    auc = np.mean(cv_scores)
    print(f"AVG CV AUC: \t {auc}")
    return auc

In [43]:
study_lgbm = optuna.create_study(direction="maximize", study_name="LGBM Tuning")
func = lambda trial: objective_lgbm(trial, X, y)
study_lgbm.optimize(func, n_trials=100, show_progress_bar=True)

[32m[I 2023-01-15 14:53:59,322][0m A new study created in memory with name: LGBM Tuning[0m


  0%|          | 0/100 [00:00<?, ?it/s]

AVG CV AUC: 	 0.8729958381072016
[32m[I 2023-01-15 14:54:00,427][0m Trial 0 finished with value: 0.8729958381072016 and parameters: {'n_estimators': 1305, 'num_rounds': 353, 'learning_rate': 0.11771950798040598, 'num_leaves': 271, 'max_depth': 5, 'min_data_in_leaf': 591, 'lambda_l1': 0.06125275318636127, 'lambda_l2': 0.0010430465970611143, 'min_gain_to_split': 8.878183575866514, 'bagging_fraction': 0.9076821854605938, 'feature_fraction': 0.941217376909169}. Best is trial 0 with value: 0.8729958381072016.[0m
AVG CV AUC: 	 0.8780735371257402
[32m[I 2023-01-15 14:54:01,949][0m Trial 1 finished with value: 0.8780735371257402 and parameters: {'n_estimators': 1580, 'num_rounds': 391, 'learning_rate': 0.2907929538916645, 'num_leaves': 202, 'max_depth': 10, 'min_data_in_leaf': 811, 'lambda_l1': 6.719891433308238e-05, 'lambda_l2': 0.1143808516670079, 'min_gain_to_split': 5.141876950602242, 'bagging_fraction': 0.24723772673381383, 'feature_fraction': 0.4999613423952636}. Best is trial 1 wit

In [44]:
study_lgbm.best_value

0.8850717115525308

In [45]:
study_lgbm.best_params

{'n_estimators': 1942,
 'num_rounds': 477,
 'learning_rate': 0.2870761124159734,
 'num_leaves': 37,
 'max_depth': 6,
 'min_data_in_leaf': 368,
 'lambda_l1': 0.00017451291663562305,
 'lambda_l2': 0.5090553595978456,
 'min_gain_to_split': 0.02547126174774228,
 'bagging_fraction': 0.20531850278394478,
 'feature_fraction': 0.37437811030015083}

# Tuning RandomForestClassifier

In [51]:
def objective_rf(trial, X, y):
    param_grid = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "max_features": trial.suggest_int("max_features", 5, 15),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 5),
        "max_depth": trial.suggest_int("max_depth", 2, 20),
    }

    cv = StratifiedKFold(n_splits=8, shuffle=True, random_state=1337)

    cv_scores = np.empty(8)
    for fold, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        model = RandomForestClassifier(**param_grid)
        model.fit(
            X_train,
            y_train
        )
        y_preds = model.predict_proba(X_test)[:, 1]
        cv_scores[fold] = roc_auc_score(y_test, y_preds)
    
    auc = np.mean(cv_scores)
    print(f"AVG CV AUC: \t {auc}")
    return auc

In [52]:
study_rf = optuna.create_study(direction="maximize", study_name="Random Forest Tuning")
func = lambda trial: objective_rf(trial, X, y)
study_rf.optimize(func, n_trials=100, show_progress_bar=True)

[32m[I 2023-01-15 15:07:22,574][0m A new study created in memory with name: Random Forest Tuning[0m


  0%|          | 0/100 [00:00<?, ?it/s]

AVG CV AUC: 	 0.8856109285921345
[32m[I 2023-01-15 15:07:37,554][0m Trial 0 finished with value: 0.8856109285921345 and parameters: {'n_estimators': 107, 'max_features': 11, 'learning_rate': 16, 'min_samples_leaf': 2, 'max_depth': 8}. Best is trial 0 with value: 0.8856109285921345.[0m
AVG CV AUC: 	 0.8771840446972102
[32m[I 2023-01-15 15:08:09,276][0m Trial 1 finished with value: 0.8771840446972102 and parameters: {'n_estimators': 216, 'max_features': 9, 'learning_rate': 16, 'min_samples_leaf': 5, 'max_depth': 20}. Best is trial 0 with value: 0.8856109285921345.[0m
AVG CV AUC: 	 0.8816742128293971
[32m[I 2023-01-15 15:08:36,639][0m Trial 2 finished with value: 0.8816742128293971 and parameters: {'n_estimators': 230, 'max_features': 7, 'learning_rate': 16, 'min_samples_leaf': 2, 'max_depth': 13}. Best is trial 0 with value: 0.8856109285921345.[0m
AVG CV AUC: 	 0.8780614586640532
[32m[I 2023-01-15 15:09:35,632][0m Trial 3 finished with value: 0.8780614586640532 and parameters:

In [53]:
study_rf.best_value

0.8883320334607898

In [54]:
study_rf.best_params

{'n_estimators': 390,
 'max_features': 14,
 'learning_rate': 19,
 'min_samples_leaf': 2,
 'max_depth': 5}