# 1. Imports

In [42]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import seaborn as sns
from tqdm.notebook import tqdm
from typing import Optional

import category_encoders
from category_encoders import WOEEncoder
from optuna import Trial
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from utils.eda import summary

# 2. Notebook settings

In [2]:
import warnings
warnings.filterwarnings("ignore")
pd.options.display.float_format = '{:,.2f}'.format

# 3. Data Load

In [3]:
test_df = pd.read_csv('test.csv')
train_df = pd.read_csv('train.csv')
submission_df = pd.read_csv('sample_submission.csv')

# 4. Data Cleaning 

In [4]:
cat_cols = [column for column in train_df.columns if train_df[column].dtype == 'object']
num_cols = train_df.select_dtypes(include=[np.number]).columns.tolist()

num_cols.remove('id')
num_cols.remove('CustomerId')
num_cols.remove('IsActiveMember')
num_cols.remove('HasCrCard')
num_cols.remove('Exited')
cat_cols.remove('Surname')
cat_cols.append('IsActiveMember')
cat_cols.append('HasCrCard')

train_df['HasCrCard'] = train_df['HasCrCard'].astype(bool)
train_df['IsActiveMember'] = train_df['IsActiveMember'].astype(bool)
test_df['HasCrCard'] = test_df['HasCrCard'].astype(bool)
test_df['IsActiveMember'] = test_df['IsActiveMember'].astype(bool)
train_df.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,True,False,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,True,True,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,True,False,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,True,True,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,True,True,15068.83,0


# 5. Model Setup

In [55]:
class LogTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        for column in self.columns:
            X[column] = np.log1p(X[column])
        return X

In [65]:
def instantiate_woe_encoder(trial : Trial) -> WOEEncoder:
  params = {
    'sigma': trial.suggest_float('sigma', 0.001, 5),
    'regularization': trial.suggest_float('regularization', 0, 5),
    'randomized': trial.suggest_categorical('randomized', [True, False])
  }
  return WOEEncoder(**params)

def instantiate_robust_scaler(trial : Trial) -> RobustScaler:
  params = {
    'with_centering': trial.suggest_categorical(
      'with_centering', [True, False]
    ),
    'with_scaling': trial.suggest_categorical(
      'with_scaling', [True, False]
    )
  }
  return RobustScaler(**params)

def instantiate_catboost(trial : Trial) -> CatBoostClassifier:
  params = {
        "iterations": 500,
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "depth": trial.suggest_int("depth", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.05, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
    }
  return CatBoostClassifier(**params, silent = True)

def instantiate_logistic(trial : Trial) -> LogisticRegression:
  params = {
        'tol' : trial.suggest_uniform('tol' , 1e-6 , 1e-3),
        'C' : trial.suggest_loguniform("C", 1e-2, 1),
  }
  return LogisticRegression(**params)

def instantiate_xgboost(trial : Trial) -> XGBClassifier:
  params = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "max_depth": trial.suggest_int("max_depth", 2, 10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 20),
  }
  return XGBClassifier(**params)

def instantiate_lgbm(trial : Trial) -> LGBMClassifier:
  params = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "max_depth": trial.suggest_int("max_depth", 2, 10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 20),
  }
  return LGBMClassifier(**params)

def instantiate_numerical_pipeline(trial : Trial, log_columns: list[int]) -> Pipeline:
  pipeline = Pipeline([
    ('scaler', instantiate_robust_scaler(trial)),
    ('log_transformer', LogTransformer(columns=log_columns))
  ])
  return pipeline

def instantiate_categorical_pipeline(trial : Trial) -> Pipeline:
  pipeline = Pipeline([
    ('encoder', instantiate_woe_encoder(trial))
  ])
  return pipeline

In [57]:
from sklearn.compose import ColumnTransformer

def instantiate_processor(trial : Trial, numerical_columns : list[str], categorical_columns : list[str], log_columns : list[int]) -> ColumnTransformer:
  
  numerical_pipeline = instantiate_numerical_pipeline(trial, log_columns)
  categorical_pipeline = instantiate_categorical_pipeline(trial)
  
  processor = ColumnTransformer([
    ('numerical_pipeline', numerical_pipeline, numerical_columns),
    ('categorical_pipeline', categorical_pipeline, categorical_columns)
  ])
  
  return processor

def instantiate_model(trial : Trial, numerical_columns : list[str], categorical_columns : list[str], algo: str, log_columns : list[int]) -> Pipeline:
  
  processor = instantiate_processor(
    trial, numerical_columns, categorical_columns, log_columns
  )
  
  if algo == 'cat':
    model = instantiate_catboost(trial)
  elif algo == 'log':
    model = instantiate_logistic(trial)
  elif algo == 'xgb':
    model = instantiate_xgboost(trial)
  elif algo == 'lgbm':
    model = instantiate_lgbm(trial)
  
  model = Pipeline([
    ('processor', processor),
    ('extra_trees', model)
  ])
  
  return model

In [63]:
def objective(trial : Trial, X : DataFrame, y : np.ndarray | Series, algo: str, log_columns : list[int], 
              numerical_columns : Optional[list[str]]=None, categorical_columns : Optional[list[str]]=None, 
              random_state : int=42) -> float:
  if numerical_columns is None:
    numerical_columns = [
      *X.select_dtypes(exclude=['object', 'category']).columns
    ]
  
  if categorical_columns is None:
    categorical_columns = [
      *X.select_dtypes(include=['object', 'category']).columns
    ]
  
  model = instantiate_model(trial, numerical_columns, categorical_columns, algo, log_columns)
  
  skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
  roc_auc_scorer = make_scorer(roc_auc_score, needs_proba=True)
  scores = cross_val_score(model, X, y, scoring=roc_auc_scorer, cv=skf)
  
  return np.min([np.mean(scores), np.median([scores])])

# 6. Training

In [69]:
from optuna import create_study

study = create_study(study_name='optimization', direction='maximize')

study.optimize(lambda trial: objective(trial, train_df[cat_cols + num_cols], train_df['Exited'], algo='cat', log_columns=[1]), n_trials=500)

[I 2024-01-13 20:05:23,474] A new study created in memory with name: optimization
[W 2024-01-13 20:05:55,874] Trial 0 failed with parameters: {'with_centering': True, 'with_scaling': False, 'sigma': 2.8481371291726214, 'regularization': 0.8330252997003024, 'randomized': False, 'learning_rate': 0.016682779480562366, 'depth': 10, 'subsample': 0.810704828106712, 'colsample_bylevel': 0.9188740254808442, 'min_data_in_leaf': 73} because of the following error: KeyboardInterrupt('').
Traceback (most recent call last):
  File "/Users/krystianpietrzak/Documents/ML/Kaggle/Kaggle-S4E1/.conda/lib/python3.11/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/var/folders/x1/2j32gjvd4v16s6kf306sfdcw0000gn/T/ipykernel_91023/3800227230.py", line 5, in <lambda>
    study.optimize(lambda trial: objective(trial, train_df[cat_cols + num_cols], train_df['Exited'], algo='cat', log_columns=[1]), n_trials=500)
         

KeyboardInterrupt: 

# 7. Model evaluation

In [19]:
study.best_params
best_trial = study.best_trial

In [21]:
model = instantiate_model(best_trial, num_cols, cat_cols, algo='cat')
model.fit(train_df[cat_cols + num_cols], train_df['Exited'])

Learning rate set to 0.489395
0:	learn: 0.4969153	total: 4.08ms	remaining: 648ms
1:	learn: 0.4235073	total: 7.08ms	remaining: 559ms
2:	learn: 0.3848073	total: 10.1ms	remaining: 527ms
3:	learn: 0.3645509	total: 12.8ms	remaining: 500ms
4:	learn: 0.3544694	total: 15.9ms	remaining: 492ms
5:	learn: 0.3412182	total: 18.8ms	remaining: 482ms
6:	learn: 0.3372156	total: 22.2ms	remaining: 485ms
7:	learn: 0.3340531	total: 25ms	remaining: 475ms
8:	learn: 0.3296247	total: 28.1ms	remaining: 472ms
9:	learn: 0.3282180	total: 30.8ms	remaining: 462ms
10:	learn: 0.3272080	total: 33.6ms	remaining: 455ms
11:	learn: 0.3264568	total: 36.4ms	remaining: 449ms
12:	learn: 0.3258838	total: 39.3ms	remaining: 445ms
13:	learn: 0.3248434	total: 42ms	remaining: 438ms
14:	learn: 0.3245347	total: 45ms	remaining: 435ms
15:	learn: 0.3242591	total: 47.6ms	remaining: 429ms
16:	learn: 0.3239048	total: 50.5ms	remaining: 425ms
17:	learn: 0.3236409	total: 53ms	remaining: 418ms
18:	learn: 0.3232960	total: 55.8ms	remaining: 414ms


In [None]:
# Check for underfitting/overfitting by analyzing splits
# folds = range(1, kf.get_n_splits() + 1)
# plt.plot(folds, rmse_train, 'o-', color='green', label='train')
# plt.plot(folds, rmse_test, 'o-', color='red', label='test')
# plt.legend()
# plt.grid()
# plt.xlabel('Number of fold')
# plt.ylabel('mean squared error')
# plt.show()

In [23]:
test_probabilities = model.predict_proba(test_df[cat_cols + num_cols])[:, 1]
test_preds = model.predict(test_df[cat_cols + num_cols])

In [24]:
test_df['predictions'] = test_preds
test_df.head()

cutoff = 0.95 # Probability CutOff...
pseudo_set_1 = test_df[test_df['predictions'] > cutoff]
pseudo_set_1['Exited'] = 1
pseudo_set_1.drop(columns=['predictions'], axis = 1, inplace=True)

pseudo_set_2 = test_df[test_df['predictions'] < 1 - cutoff]
pseudo_set_2['Exited'] = 0
pseudo_set_2.drop(columns=['predictions'], axis = 1, inplace=True)

pseudo_set = pd.concat([pseudo_set_1,pseudo_set_2])
pseudo_set.shape

(110023, 14)

In [25]:
pseudo_train_df = pd.concat([train_df, pseudo_set])
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
roc_auc_scorer = make_scorer(roc_auc_score, needs_proba=True)
scores = cross_val_score(model, pseudo_train_df[cat_cols + num_cols], pseudo_train_df['Exited'], scoring=roc_auc_scorer, cv=skf)
np.min([np.mean(scores), np.median([scores])])

Learning rate set to 0.5
0:	learn: 0.4617753	total: 8.51ms	remaining: 1.35s
1:	learn: 0.3735567	total: 13ms	remaining: 1.02s
2:	learn: 0.3319598	total: 17.3ms	remaining: 908ms
3:	learn: 0.3077432	total: 21.2ms	remaining: 826ms
4:	learn: 0.2859915	total: 25.4ms	remaining: 786ms
5:	learn: 0.2754827	total: 29.7ms	remaining: 762ms
6:	learn: 0.2687603	total: 33.8ms	remaining: 738ms
7:	learn: 0.2646252	total: 37.4ms	remaining: 711ms
8:	learn: 0.2617420	total: 41ms	remaining: 687ms
9:	learn: 0.2597441	total: 45.2ms	remaining: 678ms
10:	learn: 0.2563982	total: 50ms	remaining: 677ms
11:	learn: 0.2543478	total: 53.9ms	remaining: 665ms
12:	learn: 0.2535050	total: 57.5ms	remaining: 650ms
13:	learn: 0.2523054	total: 61.5ms	remaining: 641ms
14:	learn: 0.2516388	total: 65.9ms	remaining: 637ms
15:	learn: 0.2510569	total: 69.7ms	remaining: 628ms
16:	learn: 0.2503919	total: 73.5ms	remaining: 618ms
17:	learn: 0.2500136	total: 77.4ms	remaining: 611ms
18:	learn: 0.2497377	total: 81.3ms	remaining: 603ms
19:

0.9248317176830619

In [37]:
model.fit(pseudo_train_df[cat_cols + num_cols], pseudo_train_df['Exited'])
test_probabilities = model.predict_proba(test_df[cat_cols + num_cols])[:,1]

Learning rate set to 0.5
0:	learn: 0.4616686	total: 6.72ms	remaining: 1.07s
1:	learn: 0.3735463	total: 11.4ms	remaining: 898ms
2:	learn: 0.3319151	total: 21.7ms	remaining: 1.14s
3:	learn: 0.3077329	total: 26.9ms	remaining: 1.05s
4:	learn: 0.2859686	total: 32ms	remaining: 992ms
5:	learn: 0.2754660	total: 37ms	remaining: 949ms
6:	learn: 0.2688911	total: 41.4ms	remaining: 906ms
7:	learn: 0.2639143	total: 45.7ms	remaining: 867ms
8:	learn: 0.2613493	total: 50.2ms	remaining: 842ms
9:	learn: 0.2592178	total: 54.6ms	remaining: 819ms
10:	learn: 0.2578713	total: 58.5ms	remaining: 793ms
11:	learn: 0.2542332	total: 62.8ms	remaining: 775ms
12:	learn: 0.2528705	total: 67.5ms	remaining: 763ms
13:	learn: 0.2519566	total: 72ms	remaining: 751ms
14:	learn: 0.2513697	total: 76.2ms	remaining: 736ms
15:	learn: 0.2505395	total: 80.4ms	remaining: 724ms
16:	learn: 0.2501198	total: 85ms	remaining: 715ms
17:	learn: 0.2497521	total: 89.1ms	remaining: 703ms
18:	learn: 0.2494902	total: 93.3ms	remaining: 693ms
19:	l

In [38]:
sub = pd.DataFrame()
sub['id'] = test_df['id']
sub['Exited'] = test_probabilities
sub = sub.set_index('id')
sub.head()

Unnamed: 0_level_0,Exited
id,Unnamed: 1_level_1
165034,0.02
165035,0.91
165036,0.01
165037,0.13
165038,0.28


In [39]:
sub.to_csv('submission.csv')