In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
from pathlib import Path
import xgboost as xgb
import lightgbm as lgbm
import catboost
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score
from IPython.display import display
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
import optuna
from sklearn.preprocessing import StandardScaler
from scipy.linalg import norm

from category_encoders import LeaveOneOutEncoder

from IPython.display import display
from tqdm.notebook import tqdm

In [2]:
# Suppress all warnings for the rest of the session to keep cell output short.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below.
from warnings import filterwarnings

filterwarnings(action="ignore")

In [4]:
# Location of the competition files inside the Kaggle runtime.
BASE_PATH = Path("/kaggle/input/playground-series-s3e4/")

train = pd.read_csv(BASE_PATH / "train.csv")
test = pd.read_csv(BASE_PATH / "test.csv")

# The mean of the `Class` column is the positive fraction (assumes a 0/1 label).
# `.2f` gives fixed-point output; a bare `.2` means two *significant digits*,
# so e.g. 12.34 would be printed as 12 and 123.4 as 1.2e+02.
print(
    f"Training dataset has {len(train)} rows with "
    f"{train['Class'].mean() * 100:.2f}% fraud rows."
)

Training dataset has 219129 rows with 0.21% fraud rows.


In [5]:
# Hour bucket: whole multiples of 3600 in `Time`, wrapped modulo 24
# (assumes `Time` is measured in seconds - TODO confirm).
# NOTE: the next cell recomputes this column with a categorical dtype.
train["Hour"] = train["Time"].floordiv(3600).mod(24).astype("int")
test["Hour"] = test["Time"].floordiv(3600).mod(24).astype("int")

def time_of_day(hour):
    """Map an hour value to a coarse part-of-day label.

    Buckets (upper bounds inclusive):
        0..6   -> "night"
        ..12   -> "morning"
        ..18   -> "afternoon"
        else   -> "evening"

    Values below 0 fail the first test and therefore land in "morning";
    the function performs no range validation.
    """
    if 0 <= hour <= 6:
        return "night"
    if hour <= 12:
        return "morning"
    if hour <= 18:
        return "afternoon"
    return "evening"

In [6]:
# Same hour bucket as above, stored as a categorical column so that it is
# treated as a discrete feature rather than a number.
train["Hour"] = train["Time"].floordiv(3600).mod(24).astype("category")
test["Hour"] = test["Time"].floordiv(3600).mod(24).astype("category")

In [7]:
# Coarse part-of-day label derived from the hour bucket via time_of_day().
train["TimeOfDay"] = train["Hour"].map(time_of_day)
test["TimeOfDay"] = test["Hour"].map(time_of_day)

In [8]:
# 1-based day index: whole days contained in `Time`, wrapped modulo 7
# (assumes `Time` is measured in seconds - TODO confirm).
train["Day"] = train["Time"].floordiv(24 * 3600).mod(7).add(1).astype("category")
test["Day"] = test["Time"].floordiv(24 * 3600).mod(7).add(1).astype("category")

In [9]:
# Separate the target from the features and drop the `id` column from both
# feature frames.
y = train["Class"]
X = train.drop(columns=["Class", "id"])
X_test = test.drop(columns="id")

In [10]:
# Every feature except the engineered time categoricals, kept in the frame's
# own column order. A set difference would produce an arbitrary order that can
# change from one kernel session to the next.
numerical_feats = [col for col in X.columns if col not in ("TimeOfDay", "Day", "Hour")]

In [11]:
# --- Target encoding of the engineered time features -----------------------
feats_to_loo = ["Hour", "TimeOfDay", "Day"]

# NOTE(review): the encoder is fitted on the complete training frame, before
# any cross-validation split, so rows later used for validation contribute to
# the encoding statistics.
# NOTE(review): transform() is called without `y` for the training rows as
# well; per the category_encoders docs that applies the plain per-category
# statistic rather than leave-one-out values - confirm this is intended.
loo = LeaveOneOutEncoder().fit(X[feats_to_loo], y)
X[feats_to_loo] = loo.transform(X[feats_to_loo])
X_test[feats_to_loo] = loo.transform(X_test[feats_to_loo])

# --- Standardisation of the remaining features ------------------------------
# The scaler is fitted on the training features only and then reused for the
# test features.
feats_to_scale = numerical_feats
sc = StandardScaler()
X[feats_to_scale] = sc.fit_transform(X[feats_to_scale])
X_test[feats_to_scale] = sc.transform(X_test[feats_to_scale])

In [12]:
# Sanity check: (rows, columns) of the feature frame after encoding and scaling.
X.shape

(219129, 33)

# Tuning CatBoost

In [14]:
def objective_cat(trial, X, y, n_folds=8, random_state=1337):
    """Optuna objective: mean cross-validated ROC AUC of a CatBoost classifier.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the CatBoost hyper-parameters.
    X : pandas.DataFrame
        Feature frame; fold rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Target aligned with ``X``; also used to stratify the folds.
    n_folds : int, default 8
        Number of stratified folds.
    random_state : int, default 1337
        Seed for the shuffled fold assignment.

    Returns
    -------
    float
        Mean of the per-fold validation ROC AUC scores.
    """
    # `suggest_float(..., log=True)` replaces the deprecated
    # `suggest_loguniform`; the sampled distribution is the same.
    # NOTE(review): `min_data_in_leaf` and `one_hot_max_size` may have no effect
    # with the default tree growing policy / without categorical features -
    # verify against the CatBoost docs before spending trials on them.
    param = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 2000),
        "loss_function": trial.suggest_categorical("loss_function", ["CrossEntropy"]),
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 1e0, log=True),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-2, 1e0, log=True),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 1.0, log=True),
        "depth": trial.suggest_int("depth", 1, 10),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 2, 20),
        "one_hot_max_size": trial.suggest_int("one_hot_max_size", 2, 20),
        "early_stopping_rounds": trial.suggest_int("early_stopping_rounds", 50, 200),
    }
    # Conditional hyper-parameters: each is only sampled for the bootstrap
    # type it is paired with here.
    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    all_scores = np.zeros(n_folds)
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_state)

    for fold_id, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # NOTE(review): the validation fold serves both as the early-stopping
        # eval_set and as the scoring set, so the reported AUC is likely
        # optimistic.
        cat_model = catboost.CatBoostClassifier(**param)
        cat_model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)

        y_preds = cat_model.predict_proba(X_val)[:, 1]
        all_scores[fold_id] = roc_auc_score(y_val, y_preds)

    auc = np.mean(all_scores)
    print(f"AVG CV AUC: \t {auc}")
    return auc

In [15]:
# Seed the sampler so that re-running the notebook proposes the same sequence
# of hyper-parameter candidates.
study_cat = optuna.create_study(
    direction="maximize",
    study_name="CatBoost Tuning",
    sampler=optuna.samplers.TPESampler(seed=1337),
)


def func(trial):
    """Bind the prepared training data to the CatBoost objective."""
    return objective_cat(trial, X, y)


# Each trial trains one model per CV fold, so the full run is long; trials
# completed before an interruption remain available on `study_cat`.
study_cat.optimize(func, n_trials=100, show_progress_bar=True)

[32m[I 2023-01-30 17:56:38,539][0m A new study created in memory with name: CatBoost Tuning[0m


  0%|          | 0/100 [00:00<?, ?it/s]

AVG CV AUC: 	 0.8075797311669657
[32m[I 2023-01-30 18:00:23,770][0m Trial 0 finished with value: 0.8075797311669657 and parameters: {'n_estimators': 583, 'loss_function': 'CrossEntropy', 'learning_rate': 0.18040222719989238, 'l2_leaf_reg': 0.03186878527793749, 'colsample_bylevel': 0.019481488355564208, 'depth': 10, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'min_data_in_leaf': 15, 'one_hot_max_size': 4, 'early_stopping_rounds': 147, 'bagging_temperature': 5.46248767171551}. Best is trial 0 with value: 0.8075797311669657.[0m
AVG CV AUC: 	 0.8140868249879016
[32m[I 2023-01-30 18:01:25,530][0m Trial 1 finished with value: 0.8140868249879016 and parameters: {'n_estimators': 1412, 'loss_function': 'CrossEntropy', 'learning_rate': 0.11833324669151798, 'l2_leaf_reg': 0.06928566778103469, 'colsample_bylevel': 0.15111376024708675, 'depth': 4, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'min_data_in_leaf': 13, 'one_hot_max_size': 19, 'early_stopping_rounds': 150,

KeyboardInterrupt: 