# 1. Imports

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import seaborn as sns
from tqdm.notebook import tqdm
from typing import Optional

import category_encoders
from category_encoders import WOEEncoder
from optuna import Trial
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from utils.eda import summary

# 2. Notebook settings

In [2]:
import warnings
warnings.filterwarnings("ignore")
pd.options.display.float_format = '{:,.2f}'.format

# 3. Data Load

In [3]:
test_df = pd.read_csv('test.csv')
train_df = pd.read_csv('train.csv')
submission_df = pd.read_csv('sample_submission.csv')

# 4. Data Cleaning 

In [4]:
cat_cols = [column for column in train_df.columns if train_df[column].dtype == 'object']
num_cols = train_df.select_dtypes(include=[np.number]).columns.tolist()

num_cols.remove('id')
num_cols.remove('CustomerId')
num_cols.remove('IsActiveMember')
num_cols.remove('HasCrCard')
num_cols.remove('Exited')
cat_cols.remove('Surname')
cat_cols.append('IsActiveMember')
cat_cols.append('HasCrCard')

train_df['HasCrCard'] = train_df['HasCrCard'].astype(bool)
train_df['IsActiveMember'] = train_df['IsActiveMember'].astype(bool)
test_df['HasCrCard'] = test_df['HasCrCard'].astype(bool)
test_df['IsActiveMember'] = test_df['IsActiveMember'].astype(bool)
train_df.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,True,False,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,True,True,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,True,False,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,True,True,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,True,True,15068.83,0


# 5. Model Setup

In [5]:
class LogTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        for column in self.columns:
            X[column] = np.log1p(X[column])
        return X

In [11]:
def instantiate_woe_encoder(trial : Trial) -> WOEEncoder:
  params = {
    'sigma': trial.suggest_float('sigma', 0.001, 5),
    'regularization': trial.suggest_float('regularization', 0, 5),
    'randomized': trial.suggest_categorical('randomized', [True, False])
  }
  return WOEEncoder(**params)

def instantiate_robust_scaler(trial : Trial) -> RobustScaler:
  params = {
    'with_centering': trial.suggest_categorical(
      'with_centering', [True, False]
    ),
    'with_scaling': trial.suggest_categorical(
      'with_scaling', [True, False]
    )
  }
  return RobustScaler(**params)

def instantiate_catboost(trial : Trial) -> CatBoostClassifier:
  params = {
        "iterations": 1000,
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "depth": trial.suggest_int("depth", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        'random_strength': trial.suggest_float('random_strength',1e-6,10,log=True),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg',1e-8,100,log=True),
        'model_size_reg': trial.suggest_float('model_size_reg',1e-8,100,log=True),
    }
  return CatBoostClassifier(**params, silent=True)

def instantiate_logistic(trial : Trial) -> LogisticRegression:
  params = {
        'tol' : trial.suggest_uniform('tol' , 1e-6 , 1e-3),
        'C' : trial.suggest_loguniform("C", 1e-2, 1),
  }
  return LogisticRegression(**params)

def instantiate_xgboost(trial : Trial) -> XGBClassifier:
  params = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "max_depth": trial.suggest_int("max_depth", 2, 10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 20),
  }
  return XGBClassifier(**params)

def instantiate_lgbm(trial : Trial) -> LGBMClassifier:
  params = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "max_depth": trial.suggest_int("max_depth", 2, 10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 20),
  }
  return LGBMClassifier(**params)

def instantiate_numerical_pipeline(trial : Trial, log_columns: list[int]) -> Pipeline:
  pipeline = Pipeline([
    ('scaler', instantiate_robust_scaler(trial)),
    ('log_transformer', LogTransformer(columns=log_columns))
  ])
  return pipeline

def instantiate_categorical_pipeline(trial : Trial) -> Pipeline:
  pipeline = Pipeline([
    ('encoder', instantiate_woe_encoder(trial))
  ])
  return pipeline

In [13]:
from sklearn.compose import ColumnTransformer

def instantiate_processor(trial : Trial, numerical_columns : list[str], categorical_columns : list[str], log_columns : list[int]) -> ColumnTransformer:
  
  numerical_pipeline = instantiate_numerical_pipeline(trial, log_columns)
  categorical_pipeline = instantiate_categorical_pipeline(trial)
  
  processor = ColumnTransformer([
    ('numerical_pipeline', numerical_pipeline, numerical_columns),
    ('categorical_pipeline', categorical_pipeline, categorical_columns)
  ])
  
  return processor

def instantiate_model(trial : Trial, numerical_columns : list[str], categorical_columns : list[str], algo: str, log_columns : list[int]) -> Pipeline:
  
  processor = instantiate_processor(
    trial, numerical_columns, categorical_columns, log_columns
  )
  
  if algo == 'cat':
    model = instantiate_catboost(trial)
  elif algo == 'log':
    model = instantiate_logistic(trial)
  elif algo == 'xgb':
    model = instantiate_xgboost(trial)
  elif algo == 'lgbm':
    model = instantiate_lgbm(trial)
  
  model = Pipeline([
    ('processor', processor),
    ('extra_trees', model)
  ])
  
  return model

In [14]:
def objective(trial : Trial, X : DataFrame, y : np.ndarray | Series, algo: str, log_columns : list[int], 
              numerical_columns : Optional[list[str]]=None, categorical_columns : Optional[list[str]]=None, 
              random_state : int=42) -> float:
  if numerical_columns is None:
    numerical_columns = [
      *X.select_dtypes(exclude=['object', 'category']).columns
    ]
  
  if categorical_columns is None:
    categorical_columns = [
      *X.select_dtypes(include=['object', 'category']).columns
    ]
  
  model = instantiate_model(trial, numerical_columns, categorical_columns, algo, log_columns)
  
  skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
  roc_auc_scorer = make_scorer(roc_auc_score, needs_proba=True)
  scores = cross_val_score(model, X, y, scoring=roc_auc_scorer, cv=skf)
  
  return np.min([np.mean(scores), np.median([scores])])

# 6. Training

In [15]:
from optuna import create_study

study = create_study(study_name='optimization', direction='maximize')

study.optimize(lambda trial: objective(trial, train_df[cat_cols + num_cols], train_df['Exited'], algo='cat', log_columns=[1]), n_trials=2)

[I 2024-01-14 22:29:26,693] A new study created in memory with name: optimization
[I 2024-01-14 22:30:09,639] Trial 0 finished with value: 0.8728274874165197 and parameters: {'with_centering': True, 'with_scaling': False, 'sigma': 4.110232524092529, 'regularization': 3.143278540701757, 'randomized': True, 'learning_rate': 0.0011332590256154654, 'depth': 7, 'subsample': 0.6690558191065267, 'random_strength': 0.18849092173420276, 'l2_leaf_reg': 1.615865269643412, 'model_size_reg': 0.001696287441968287}. Best is trial 0 with value: 0.8728274874165197.
[I 2024-01-14 22:30:46,992] Trial 1 finished with value: 0.8720656905013843 and parameters: {'with_centering': True, 'with_scaling': True, 'sigma': 4.889006392960161, 'regularization': 2.987733713138287, 'randomized': True, 'learning_rate': 0.0020314235382989177, 'depth': 5, 'subsample': 0.9609626262304646, 'random_strength': 7.548172263710883e-05, 'l2_leaf_reg': 0.08577379132458979, 'model_size_reg': 0.0010747921920229652}. Best is trial 0 

# 7. Model evaluation

In [16]:
study.best_params
best_trial = study.best_trial

In [17]:
model = instantiate_model(best_trial, num_cols, cat_cols, algo='cat', log_columns=[1])
model.fit(train_df[cat_cols + num_cols], train_df['Exited'])

In [18]:
# Check for underfitting/overfitting by analyzing splits
# folds = range(1, kf.get_n_splits() + 1)
# plt.plot(folds, rmse_train, 'o-', color='green', label='train')
# plt.plot(folds, rmse_test, 'o-', color='red', label='test')
# plt.legend()
# plt.grid()
# plt.xlabel('Number of fold')
# plt.ylabel('mean squared error')
# plt.show()

In [19]:
test_probabilities = model.predict_proba(test_df[cat_cols + num_cols])[:, 1]
test_preds = model.predict(test_df[cat_cols + num_cols])

In [20]:
test_df['predictions'] = test_preds
test_df.head()

cutoff = 0.95 # Probability CutOff...
pseudo_set_1 = test_df[test_df['predictions'] > cutoff]
pseudo_set_1['Exited'] = 1
pseudo_set_1.drop(columns=['predictions'], axis = 1, inplace=True)

pseudo_set_2 = test_df[test_df['predictions'] < 1 - cutoff]
pseudo_set_2['Exited'] = 0
pseudo_set_2.drop(columns=['predictions'], axis = 1, inplace=True)

pseudo_set = pd.concat([pseudo_set_1,pseudo_set_2])
pseudo_set.shape

(110023, 14)

In [21]:
pseudo_train_df = pd.concat([train_df, pseudo_set])
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
roc_auc_scorer = make_scorer(roc_auc_score, needs_proba=True)
scores = cross_val_score(model, pseudo_train_df[cat_cols + num_cols], pseudo_train_df['Exited'], scoring=roc_auc_scorer, cv=skf)
np.min([np.mean(scores), np.median([scores])])

KeyboardInterrupt: 

In [22]:
model.fit(pseudo_train_df[cat_cols + num_cols], pseudo_train_df['Exited'])
test_probabilities = model.predict_proba(test_df[cat_cols + num_cols])[:,1]

In [23]:
sub = pd.DataFrame()
sub['id'] = test_df['id']
sub['Exited'] = test_probabilities
sub = sub.set_index('id')
sub.head()

Unnamed: 0_level_0,Exited
id,Unnamed: 1_level_1
165034,0.03
165035,0.3
165036,0.06
165037,0.28
165038,0.22


In [24]:
sub.to_csv('submission.csv')