In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import catboost as cat
import optuna  # pip install optuna
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate

from joblib import dump, load

# Get Data

In [2]:
# Read the training table, using the first CSV column as the row index,
# then preview the first rows.
train = pd.read_csv("Data/train.csv", index_col=0)
train.head()

Unnamed: 0_level_0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f110,f111,f112,f113,f114,f115,f116,f117,f118,claim
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.10859,0.004314,-37.566,0.017364,0.28915,-10.251,135.12,168900.0,399240000000000.0,86.489,...,-12.228,1.7482,1.9096,-7.1157,4378.8,1.2096,861340000000000.0,140.1,1.0177,1
1,0.1009,0.29961,11822.0,0.2765,0.4597,-0.83733,1721.9,119810.0,3874100000000000.0,9953.6,...,-56.758,4.1684,0.34808,4.142,913.23,1.2464,7575100000000000.0,1861.0,0.28359,0
2,0.17803,-0.00698,907.27,0.27214,0.45948,0.17327,2298.0,360650.0,12245000000000.0,15827.0,...,-5.7688,1.2042,0.2629,8.1312,45119.0,1.1764,321810000000000.0,3838.2,0.4069,1
3,0.15236,0.007259,780.1,0.025179,0.51947,7.4914,112.51,259490.0,77814000000000.0,-36.837,...,-34.858,2.0694,0.79631,-16.336,4952.4,1.1784,4533000000000.0,4889.1,0.51486,1
4,0.11623,0.5029,-109.15,0.29791,0.3449,-0.40932,2538.9,65332.0,1907200000000000.0,144.12,...,-13.641,1.5298,1.1464,-0.43124,3856.5,1.483,-8991300000000.0,,0.23049,1


In [3]:
# Separate the predictors from the "claim" target; both are independent copies of `train`.
y = train["claim"].copy()
X = train.drop(columns="claim").copy()

In [4]:
# Share of each target class (fractions rather than raw counts).
y.value_counts(normalize=True)

0    0.501508
1    0.498492
Name: claim, dtype: float64

In [5]:
# Test features and the submission template, both indexed by their first CSV column.
test_data = pd.read_csv("Data/test.csv", index_col=0)
sample_submission = pd.read_csv("Data/sample_solution.csv", index_col=0)

# Train and test default CatBoost model

In [6]:
# Baseline CatBoost classifier (only loss, metric, device and seed are set explicitly).
model = cat.CatBoostClassifier(
    loss_function="Logloss",
    eval_metric="AUC",
    task_type="GPU",
    random_seed=42,
)

# 5-fold cross-validation scored with ROC AUC; train-fold scores are kept as well
# so the train/validation gap can be inspected afterwards.
cross_validation_score = cross_validate(
    model,
    X,
    y,
    scoring="roc_auc",
    cv=5,
    n_jobs=1,
    return_train_score=True,
)

Learning rate set to 0.023395
0:	learn: 0.5451393	total: 76.3ms	remaining: 1m 16s
1:	learn: 0.5831696	total: 132ms	remaining: 1m 6s
2:	learn: 0.6102219	total: 180ms	remaining: 59.8s
3:	learn: 0.6378802	total: 229ms	remaining: 57s
4:	learn: 0.6579940	total: 278ms	remaining: 55.3s
5:	learn: 0.6757026	total: 330ms	remaining: 54.6s
6:	learn: 0.6851927	total: 378ms	remaining: 53.6s
7:	learn: 0.6939461	total: 428ms	remaining: 53s
8:	learn: 0.7075830	total: 479ms	remaining: 52.7s
9:	learn: 0.7171909	total: 530ms	remaining: 52.5s
10:	learn: 0.7276913	total: 577ms	remaining: 51.9s
11:	learn: 0.7333614	total: 630ms	remaining: 51.9s
12:	learn: 0.7386228	total: 675ms	remaining: 51.3s
13:	learn: 0.7468977	total: 725ms	remaining: 51.1s
14:	learn: 0.7526853	total: 770ms	remaining: 50.6s
15:	learn: 0.7591578	total: 820ms	remaining: 50.4s
16:	learn: 0.7627712	total: 873ms	remaining: 50.5s
17:	learn: 0.7680428	total: 921ms	remaining: 50.2s
18:	learn: 0.7719139	total: 969ms	remaining: 50s
19:	learn: 0.77

In [7]:
# Average the per-fold ROC AUC for the train folds and the held-out folds.
for split_label, score_key in (("training", "train_score"), ("test", "test_score")):
    print(f"Mean AUC on {split_label} set:", np.mean(cross_validation_score[score_key]))

Mean AUC on training set: 0.8025785150071292
Mean AUC on test set: 0.7998693541439318


In [8]:
# Refit the baseline configuration on the full training set and persist the
# fitted model to disk with joblib.
model = cat.CatBoostClassifier(
    loss_function="Logloss",
    eval_metric="AUC",
    task_type="GPU",
    random_seed=42,
)
model.fit(X, y)
dump(model, 'Models/default_catboost_model.joblib')

Learning rate set to 0.02311
0:	learn: 0.5450465	total: 55ms	remaining: 55s
1:	learn: 0.5828141	total: 129ms	remaining: 1m 4s
2:	learn: 0.6097153	total: 185ms	remaining: 1m 1s
3:	learn: 0.6376719	total: 244ms	remaining: 1m
4:	learn: 0.6578241	total: 301ms	remaining: 60s
5:	learn: 0.6723561	total: 369ms	remaining: 1m 1s
6:	learn: 0.6882419	total: 427ms	remaining: 1m
7:	learn: 0.6995759	total: 487ms	remaining: 1m
8:	learn: 0.7122173	total: 553ms	remaining: 1m
9:	learn: 0.7234676	total: 634ms	remaining: 1m 2s
10:	learn: 0.7333820	total: 690ms	remaining: 1m 2s
11:	learn: 0.7369136	total: 746ms	remaining: 1m 1s
12:	learn: 0.7450155	total: 805ms	remaining: 1m 1s
13:	learn: 0.7523359	total: 876ms	remaining: 1m 1s
14:	learn: 0.7576532	total: 936ms	remaining: 1m 1s
15:	learn: 0.7613619	total: 1s	remaining: 1m 1s
16:	learn: 0.7648613	total: 1.08s	remaining: 1m 2s
17:	learn: 0.7679999	total: 1.14s	remaining: 1m 2s
18:	learn: 0.7699841	total: 1.2s	remaining: 1m 1s
19:	learn: 0.7737823	total: 1.25s

['Models/default_catboost_model.joblib']

In [9]:
# Column 1 of predict_proba is written into the submission's "claim" column.
y_pred = model.predict_proba(test_data)
positive_class_proba = y_pred[:, 1]
sample_submission.claim = positive_class_proba
sample_submission.to_csv("Submissions/default_catboost_model.csv")

# Add a feature with the number of NaNs per sample

In [None]:
def feature_generation(data, fill_values=None):
    """Add a per-row missing-value count and mean-impute the feature columns.

    The input frame is left untouched: all work happens on a copy, so calling
    this function repeatedly on the same frame always yields the same result
    (an in-place version would count zero NaNs on the second call because the
    frame would already be imputed).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose feature columns are those with an 'f' in their name.
    fill_values : pandas.Series or dict, optional
        Per-column values used to replace NaNs. When omitted, the column means
        of ``data`` itself are used. Pass the training-set means here to impute
        another frame consistently with the training data.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with an added integer ``count_na`` column and no NaNs
        left in the feature columns (given a fill value for each of them).
    """
    result = data.copy()

    # Feature columns are recognised by an 'f' in the column name.
    features = [col for col in result.columns if 'f' in col]

    # Count missing values per row BEFORE imputing, otherwise the count is lost.
    result['count_na'] = result[features].isna().sum(axis=1)

    # Impute with the supplied values, or fall back to this frame's own column means.
    if fill_values is None:
        fill_values = result[features].mean()
    result[features] = result[features].fillna(fill_values)

    return result

In [None]:
# Apply the NaN-count feature and mean imputation to the training features
# first, then to the test features.
X_w_nan, test_data_w_nan = (feature_generation(frame) for frame in (X, test_data))

In [None]:
# Same baseline configuration as before, now fitted on the features that
# include the per-row NaN count; the fitted model is persisted with joblib.
model = cat.CatBoostClassifier(
    loss_function="Logloss",
    eval_metric="AUC",
    task_type="GPU",
    random_seed=42,
)
model.fit(X_w_nan, y)
dump(model, 'Models/default_catboost_model_w_nan_count.joblib')

In [None]:
# Column 1 of predict_proba is written into the submission's "claim" column.
y_pred = model.predict_proba(test_data_w_nan)
positive_class_proba = y_pred[:, 1]
sample_submission.claim = positive_class_proba
sample_submission.to_csv("Submissions/default_catboost_model_w_nan_count.csv")

In [None]:
# Pair every training column with the fitted model's importance score and
# order the table from most to least important.
feature_importance = pd.DataFrame(
    {"Feature": X_w_nan.columns, "Importance": model.feature_importances_}
).sort_values("Importance", ascending=False)

# Horizontal bars for the 20 highest-ranked features only.
top_features = feature_importance.iloc[:20]
_ = sns.barplot(data=top_features, x='Importance', y='Feature')
plt.title("Top 20 most important features")

# Hyperparameter Tuning using Optuna

In [None]:
from optuna.integration import LightGBMPruningCallback
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split


def objective(trial, Xy, target=None, seed=42):
    """Optuna objective: hold-out ROC AUC of a CatBoost classifier.

    Splits the data once into a training part and a 33% hold-out part, trains
    a CatBoost model with the hyperparameters suggested by ``trial`` and
    returns the ROC AUC of the predicted positive-class probabilities on the
    hold-out part.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.
    Xy : pandas.DataFrame
        Feature matrix to tune on.
    target : array-like, optional
        Labels aligned with ``Xy``. When omitted, the notebook-level ``y`` is used.
    seed : int, default 42
        Random state of the train/hold-out split, so every trial is scored on
        the same split.

    Returns
    -------
    float
        ROC AUC on the hold-out split (higher is better).
    """
    labels = y if target is None else target
    train_x, valid_x, train_y, valid_y = train_test_split(
        Xy, labels, test_size=0.33, random_state=int(seed), shuffle=True
    )
    train_pool = cat.Pool(train_x, train_y)
    valid_pool = cat.Pool(valid_x, valid_y)

    # Search space. Each hyperparameter is suggested exactly once.
    params = {
        'iterations': trial.suggest_int('iterations', 50, 300),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'random_strength': trial.suggest_int('random_strength', 0, 100),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.01, 100.00, log=True),
        # NOTE(review): no eval_set is passed to fit() below, so the overfitting
        # detector type presumably has no effect here - confirm before relying on it.
        'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
    }

    # Learning
    model = cat.CatBoostClassifier(
        loss_function="Logloss",
        eval_metric="AUC",
        task_type="GPU",
        l2_leaf_reg=50,
        random_seed=42,
        border_count=64,
        **params
    )
    model.fit(train_pool)

    # Evaluation: AUC needs scores, not hard labels, and must be computed
    # against the labels of the same rows that were predicted (the hold-out split).
    valid_proba = model.predict_proba(valid_pool)[:, 1]
    ROC_AUC_Score = roc_auc_score(valid_y, valid_proba)
    print('ROC AUC Score of CatBoost =', ROC_AUC_Score)
    return ROC_AUC_Score

In [None]:
%%time
study = optuna.create_study(direction = "maximize", sampler = TPESampler(seed=int(SEED)))
func = lambda trial: objective(trial, X_w_nan, y)
study.optimize(func, n_trials = 100, n_jobs = multiprocessing.cpu_count())

In [None]:
# Report the best objective value found by the study and the hyperparameters
# that produced it, one per line.
print(f"\tBest value (AUC): {study.best_value:.5f}")
print("\tBest params:")

for param_name, param_value in study.best_params.items():
    print(f"\t\t{param_name}: {param_value}")

In [None]:
# Fixed hyperparameter set kept for reference.
# NOTE(review): these keys look like LightGBM parameter names and nothing in the
# visible cells reads `param_grid` - confirm whether it is still needed.
param_grid = dict(
    device_type='gpu',
    n_estimators=9590,
    learning_rate=0.18227838793671425,
    num_leaves=760,
    max_depth=5,
    min_data_in_leaf=700,
    lambda_l1=10,
    lambda_l2=25,
    min_gain_to_split=5.782744082905875,
    bagging_fraction=0.6,
    bagging_freq=1,
    feature_fraction=0.9,
)


In [None]:
# Train model with best hyperparameters.
# The study tuned CatBoost parameters (iterations, depth, learning_rate, ...),
# so the final model must be a CatBoost classifier built with the same fixed
# settings the objective used; only the tuned values come from the study.
model = cat.CatBoostClassifier(
    loss_function="Logloss",
    eval_metric="AUC",
    task_type="GPU",
    l2_leaf_reg=50,
    random_seed=42,
    border_count=64,
    **study.best_params
)
model.fit(X_w_nan, y)
# NOTE(review): the file name still says "lgbm" although a CatBoost model is
# stored; path kept unchanged in case something else loads it - consider renaming.
dump(model, 'Models/default_lgbm_model_w_nan_count_tuned2.joblib')

In [None]:
# Column 1 of predict_proba is written into the submission's "claim" column.
y_pred = model.predict_proba(test_data_w_nan)
positive_class_proba = y_pred[:, 1]
sample_submission.claim = positive_class_proba
sample_submission.to_csv("Submissions/default_lgbm_model_w_nan_count_tuned2.csv")

In [None]:
# Pair every training column with the current model's importance score and
# order the table from most to least important.
feature_importance = pd.DataFrame(
    {"Feature": X_w_nan.columns, "Importance": model.feature_importances_}
).sort_values("Importance", ascending=False)

# Horizontal bars for the 20 highest-ranked features only.
top_features = feature_importance.iloc[:20]
_ = sns.barplot(data=top_features, x='Importance', y='Feature')
plt.title("Top 20 most important features")

In [None]:
# Pair every training column with the current model's importance score and
# order the table from most to least important.
feature_importance = pd.DataFrame(
    {"Feature": X_w_nan.columns, "Importance": model.feature_importances_}
).sort_values("Importance", ascending=False)

# Tall figure so that the label of every feature fits; this plot shows ALL
# features, so the title must not claim it is a top-20 view.
plt.figure(figsize=(10, 30))
_ = sns.barplot(data=feature_importance, x='Importance', y='Feature')
plt.title("Importance of all features")