# Optuna optimization

In [4]:
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, log_loss, roc_auc_score
import optuna
import os
import json
import pandas as pd
import scipy as sp
from sklearn.preprocessing import LabelEncoder 
from matplotlib import pyplot as plt

from tqdm.auto import tqdm
import numpy as np
import datetime
import pickle
ASSAF_STORAGE = '/sise/assafzar-group/assafzar/mark/taboola-competition/'

In [6]:
RANDOM_SEED = 203398029
EARLY_STOPPING_ROUND = 100

with open('optuna_5_days_X-train_Y-train_X-Test_dict.pickle', 'rb') as f:
    data = pickle.load(f)
    
cat_features = [0, 1, 4, 5, 9, 10, 11, 12, 13, 14, 15, 18, 19, 24, 25]

X = data['X_train']
y = data['y_train']
X_test = data['X_test']

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)
X_train, X_eval, y_train, y_eval = train_test_split(X_train, y_train, test_size=0.1, random_state=RANDOM_SEED)

def objective(trial):
    param = {'loss_function' : 'Logloss',}
    param['learning_rate'] = trial.suggest_float("learning_rate", 0.007, 0.027,step= 0.002)
    param['depth'] = trial.suggest_int('depth', 4, 12)
    param['l2_leaf_reg'] = trial.suggest_float('l2_leaf_reg', 1.0, 5.5,step= 0.5)
    param['min_child_samples'] = trial.suggest_categorical('min_child_samples', [1, 4, 8, 16, 32])
    param['grow_policy'] = 'Depthwise'
    param['iterations'] = 10000
    param['use_best_model'] = True
    param['loss_function'] ='Logloss'
    param['eval_metric'] ='Logloss'
    param['od_type'] = 'iter'
    param['od_wait'] = 20
    param['random_state'] = RANDOM_SEED
    param['logging_level'] = 'Silent'
    
    clf = CatBoostClassifier(task_type="GPU",
                             devices='0',                   
                             cat_features = cat_features,
                           **param)

    clf.fit(X_train.copy(), y_train.copy(),
                  eval_set=[(X_eval.copy(), y_eval.copy())],
                  early_stopping_rounds=EARLY_STOPPING_ROUND,
)
    loss = log_loss(y_valid, clf.predict(X_valid.copy()))
    return loss

In [7]:
study = optuna.create_study(study_name=f'catboost-freq-seed{RANDOM_SEED}')
study.optimize(objective, n_trials=10000, n_jobs=1, timeout=50000)
study_df  =study.trials_dataframe(attrs=("number", "value", "params", "state"))
study_df.to_csv('optuna_trials_log_freq_features.csv')


[I 2024-02-27 16:11:08,127] A new study created in memory with name: catboost-freq-seed203398029
[I 2024-02-27 16:38:43,827] Trial 0 finished with value: 7.982287443118131 and parameters: {'learning_rate': 0.023, 'depth': 5, 'l2_leaf_reg': 4.5, 'min_child_samples': 32}. Best is trial 0 with value: 7.982287443118131.
[I 2024-02-27 17:19:27,806] Trial 1 finished with value: 7.947583710097663 and parameters: {'learning_rate': 0.011, 'depth': 12, 'l2_leaf_reg': 3.5, 'min_child_samples': 1}. Best is trial 1 with value: 7.947583710097663.
[I 2024-02-27 17:58:54,050] Trial 2 finished with value: 7.9447137245722095 and parameters: {'learning_rate': 0.009000000000000001, 'depth': 12, 'l2_leaf_reg': 3.0, 'min_child_samples': 4}. Best is trial 2 with value: 7.9447137245722095.
[I 2024-02-27 18:26:47,999] Trial 3 finished with value: 7.989313959404591 and parameters: {'learning_rate': 0.015, 'depth': 5, 'l2_leaf_reg': 3.0, 'min_child_samples': 8}. Best is trial 2 with value: 7.9447137245722095.
[I

In [8]:

optimized_clf = CatBoostClassifier(learning_rate=study.best_params['learning_rate'],
                                        depth=study.best_params['depth'],
                                        l2_leaf_reg=study.best_params['l2_leaf_reg'],
                                        min_child_samples=study.best_params['min_child_samples'],
                                        grow_policy='Depthwise',
                                        iterations=10000,
                                        use_best_model=True,
                                        loss_function = 'Logloss',
                                        eval_metric='Logloss',
                                        od_type='iter',
                                        od_wait=20,
                                        random_state=RANDOM_SEED,
                                        task_type="GPU",
                                        devices='0',
                                        cat_features = cat_features)
optimized_clf.fit(X_train.copy(), y_train.copy(),
                        eval_set=[(X_eval.copy(), y_eval.copy())],
                        early_stopping_rounds=EARLY_STOPPING_ROUND)


pred_train = optimized_clf.predict(X_train.copy())
pred_eval = optimized_clf.predict(X_eval.copy())
pred_valid = optimized_clf.predict(X_valid.copy())

print(f'Train AUC : {roc_auc_score(pred_train,y_train)}')
print(f'Eval AUC : {roc_auc_score(pred_eval,y_eval)}')
print(f'Validation AUC : {roc_auc_score(pred_valid,y_valid)}')


0:	learn: 0.6883068	test: 0.6883190	best: 0.6883190 (0)	total: 665ms	remaining: 1h 50m 49s
1:	learn: 0.6835606	test: 0.6835865	best: 0.6835865 (1)	total: 1.32s	remaining: 1h 50m 13s
2:	learn: 0.6789336	test: 0.6789695	best: 0.6789695 (2)	total: 1.99s	remaining: 1h 50m 26s
3:	learn: 0.6744083	test: 0.6744534	best: 0.6744534 (3)	total: 2.65s	remaining: 1h 50m 33s
4:	learn: 0.6700055	test: 0.6700577	best: 0.6700577 (4)	total: 3.32s	remaining: 1h 50m 45s
5:	learn: 0.6656967	test: 0.6657561	best: 0.6657561 (5)	total: 3.99s	remaining: 1h 50m 42s
6:	learn: 0.6614877	test: 0.6615598	best: 0.6615598 (6)	total: 4.65s	remaining: 1h 50m 37s
7:	learn: 0.6573812	test: 0.6574624	best: 0.6574624 (7)	total: 5.31s	remaining: 1h 50m 36s
8:	learn: 0.6533725	test: 0.6534645	best: 0.6534645 (8)	total: 5.97s	remaining: 1h 50m 33s
9:	learn: 0.6494522	test: 0.6495513	best: 0.6495513 (9)	total: 6.64s	remaining: 1h 50m 32s
10:	learn: 0.6456093	test: 0.6457213	best: 0.6457213 (10)	total: 7.3s	remaining: 1h 50m 31

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [9]:
preds_proba = optimized_clf.predict_proba(X_test)
def report_submission():
    pred = preds_proba[:,1]
    pred_df = pd.DataFrame(pred)
    pred_df.reset_index(inplace=True)
    pred_df.columns = ['Id','Predicted']
    pred_df.to_csv('OPTUNA_CATBOOST_5-days-with-freq-features.csv',index=False)
    
report_submission()

In [14]:
optimized_clf.get_params()

{'iterations': 10000,
 'learning_rate': 0.009000000000000001,
 'depth': 12,
 'l2_leaf_reg': 3.0,
 'loss_function': 'Logloss',
 'od_wait': 20,
 'od_type': 'iter',
 'use_best_model': True,
 'eval_metric': 'Logloss',
 'task_type': 'GPU',
 'devices': '0',
 'random_state': 203398029,
 'cat_features': [0, 1, 4, 5, 9, 10, 11, 12, 13, 14, 15, 18, 19, 24, 25],
 'grow_policy': 'Depthwise',
 'min_child_samples': 4}

In [15]:

optimized_clf2 = CatBoostClassifier(learning_rate=study.best_params['learning_rate'],
                                        depth=study.best_params['depth'],
                                        l2_leaf_reg=study.best_params['l2_leaf_reg'],
                                        min_child_samples=study.best_params['min_child_samples'],
                                        grow_policy='Depthwise',
                                        iterations=6000,
                                        loss_function = 'Logloss',
                                        eval_metric='Logloss',
                                        random_state=RANDOM_SEED,
                                        task_type="GPU",
                                        devices='0',
                                        cat_features = cat_features)
optimized_clf2.fit(X.copy(), y.copy())

preds_proba = optimized_clf.predict_proba(X_test)
def report_submission():
    pred = preds_proba[:,1]
    pred_df = pd.DataFrame(pred)
    pred_df.reset_index(inplace=True)
    pred_df.columns = ['Id','Predicted']
    pred_df.to_csv('OPTUNA_CATBOOST_5-days-with-freq-features_all_data_6k.csv',index=False)
    
report_submission()

0:	learn: 0.6882907	total: 840ms	remaining: 1h 23m 58s
1:	learn: 0.6835450	total: 1.64s	remaining: 1h 22m 4s
2:	learn: 0.6789081	total: 2.46s	remaining: 1h 21m 54s
3:	learn: 0.6743846	total: 3.27s	remaining: 1h 21m 49s
4:	learn: 0.6699621	total: 4.09s	remaining: 1h 21m 42s
5:	learn: 0.6656430	total: 4.91s	remaining: 1h 21m 48s
6:	learn: 0.6614215	total: 5.74s	remaining: 1h 21m 58s
7:	learn: 0.6573031	total: 6.57s	remaining: 1h 22m 2s
8:	learn: 0.6532850	total: 7.39s	remaining: 1h 22m 1s
9:	learn: 0.6493573	total: 8.22s	remaining: 1h 22m 3s
10:	learn: 0.6455187	total: 9.04s	remaining: 1h 22m 1s
11:	learn: 0.6417708	total: 9.86s	remaining: 1h 22m 1s
12:	learn: 0.6381072	total: 10.7s	remaining: 1h 21m 57s
13:	learn: 0.6345558	total: 11.5s	remaining: 1h 21m 54s
14:	learn: 0.6310607	total: 12.3s	remaining: 1h 21m 54s
15:	learn: 0.6276539	total: 13.1s	remaining: 1h 21m 52s
16:	learn: 0.6243111	total: 14s	remaining: 1h 21m 51s
17:	learn: 0.6210659	total: 14.8s	remaining: 1h 21m 47s
18:	learn:

In [17]:
optimized_clf2 = CatBoostClassifier(learning_rate=study.best_params['learning_rate'],
                                        depth=study.best_params['depth'],
                                        l2_leaf_reg=study.best_params['l2_leaf_reg'],
                                        min_child_samples=study.best_params['min_child_samples'],
                                        grow_policy='Depthwise',
                                        iterations=7000,
                                        loss_function = 'Logloss',
                                        eval_metric='Logloss',
                                        random_state=RANDOM_SEED,
                                        task_type="GPU",
                                        devices='0',
                                        cat_features = cat_features)
optimized_clf2.fit(X.copy(), y.copy())

preds_proba = optimized_clf.predict_proba(X_test)
def report_submission():
    pred = preds_proba[:,1]
    pred_df = pd.DataFrame(pred)
    pred_df.reset_index(inplace=True)
    pred_df.columns = ['Id','Predicted']
    pred_df.to_csv('OPTUNA_CATBOOST_5-days-with-freq-features_all_data_7k.csv',index=False)
    
report_submission()

0:	learn: 0.6882908	total: 834ms	remaining: 1h 37m 16s
1:	learn: 0.6835450	total: 1.64s	remaining: 1h 35m 37s
2:	learn: 0.6789080	total: 2.46s	remaining: 1h 35m 40s
3:	learn: 0.6743846	total: 3.3s	remaining: 1h 36m 20s
4:	learn: 0.6699620	total: 4.15s	remaining: 1h 36m 46s
5:	learn: 0.6656431	total: 4.97s	remaining: 1h 36m 37s
6:	learn: 0.6614216	total: 5.8s	remaining: 1h 36m 35s
7:	learn: 0.6573032	total: 6.63s	remaining: 1h 36m 34s
8:	learn: 0.6532801	total: 7.45s	remaining: 1h 36m 26s
9:	learn: 0.6493519	total: 8.28s	remaining: 1h 36m 24s
10:	learn: 0.6455220	total: 9.1s	remaining: 1h 36m 21s
11:	learn: 0.6417837	total: 9.92s	remaining: 1h 36m 15s
12:	learn: 0.6381182	total: 10.7s	remaining: 1h 36m 13s
13:	learn: 0.6345640	total: 11.6s	remaining: 1h 36m 10s
14:	learn: 0.6310579	total: 12.4s	remaining: 1h 36m 7s
15:	learn: 0.6276500	total: 13.2s	remaining: 1h 36m 7s
16:	learn: 0.6243251	total: 14s	remaining: 1h 36m 3s
17:	learn: 0.6210708	total: 14.8s	remaining: 1h 35m 59s
18:	learn: