In [1]:
# !pip install catboost
# !pip install optuna

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import catboost
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier,Pool
from sklearn.metrics import f1_score,confusion_matrix,accuracy_score
import optuna
from optuna.samplers import TPESampler
%matplotlib inline

In [3]:
df= pd.read_csv('Top_20_df.csv')
df.shape

(335113, 210)

In [4]:
df= df.sample(50000)

In [5]:
# Splitting the dataset into X and y
X = df.drop('target', axis=1)  # Features: all columns except 'target'
y = df['target']

In [6]:
for column in X.columns:
    # Convert to string, including NaNs which become 'nan'
    X[column] = X[column].astype(str)

    # Optional: Replace 'nan' with a placeholder such as 'missing'
    X[column] = X[column].replace('nan', 'missing')

In [7]:
# Splitting dataset into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
categorical_features = ['ContactMonth', 'ContactHour', 'os_112', 'os_110', 's_247_71',
       's_239_8', 's_247_75', 's_239_7', 's_248_74', 's_247_77', 's_239_9',
       's_241_68', 's_241_69', 's_247_78', 's_239_10', 's_247_73']

In [9]:
pool = Pool(X_train, y_train, cat_features=categorical_features, feature_names=list(X.columns))

**HYPERPARAMETER_TUNING**

In [10]:
def objective(trial):
    params = {
        "iterations":500,
        #"logging_level":'silent',
        #"verbose":0,
        "learning_rate": 0.01,
        "max_depth": trial.suggest_int("max_depth", 3,10),
        # "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 20,100),
        "random_state": 42,
        # "subsample": trial.suggest_uniform('subsample', 0.5, 1.0),
        "colsample_bylevel": trial.suggest_float('colsample_bylevel', 0.3, 1.0),
        "random_strength": trial.suggest_float("random_strength", 1, 10.0),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.2, 5.0),
        "bootstrap_type": 'Bayesian',
        "eval_metric":'Logloss',
        "auto_class_weights":'Balanced'
    }

    model = catboost.CatBoostClassifier(**params, silent=True)
    # model.fit(X_train, y_train)
    model.fit(pool)
    # y_pred_test = model.predict_proba(X_test)
    y_pred_test = model.predict(X_test)
    f1 = f1_score(y_test, y_pred_test)
    return f1

sampler = TPESampler(seed=42)
study = optuna.create_study(direction='maximize',sampler=sampler)
study.optimize(objective, n_trials=200)

print('Best hyperparameters:', study.best_params)
print('Best F1:', study.best_value)

[I 2024-04-09 05:04:18,687] A new study created in memory with name: no-name-074a41bf-d6c6-45e9-9393-9a5b3f450bfb
[I 2024-04-09 05:04:25,091] Trial 0 finished with value: 0.1814003370614371 and parameters: {'max_depth': 5, 'colsample_bylevel': 0.9655000144869412, 'random_strength': 7.587945476302646, 'bagging_temperature': 3.073560724145776}. Best is trial 0 with value: 0.1814003370614371.
[I 2024-04-09 05:04:30,746] Trial 1 finished with value: 0.18120496703970565 and parameters: {'max_depth': 4, 'colsample_bylevel': 0.40919616423534183, 'random_strength': 1.5227525095137953, 'bagging_temperature': 4.3576454997196885}. Best is trial 0 with value: 0.1814003370614371.
[I 2024-04-09 05:04:40,154] Trial 2 finished with value: 0.18339489320700178 and parameters: {'max_depth': 7, 'colsample_bylevel': 0.7956508044572318, 'random_strength': 1.185260448662222, 'bagging_temperature': 4.855567290377572}. Best is trial 2 with value: 0.18339489320700178.
[I 2024-04-09 05:04:50,669] Trial 3 finishe

Best hyperparameters: {'max_depth': 9, 'colsample_bylevel': 0.7270667916313567, 'random_strength': 1.6404702537843523, 'bagging_temperature': 4.538473007790868}
Best F1: 0.1852041688023236


In [11]:
#[I 2024-01-24 13:19:08,146] Trial 23 finished with value: 0.7698620649790703 and parameters: {'max_depth': 10, 'min_data_in_leaf': 85, 'colsample_bylevel': 0.7016150314426027, 'random_strength': 8.542169542677488, 'bagging_temperature': 8.681806510285387}. Best is trial 23 with value: 0.7698620649790703.

In [12]:
# Best hyperparameters: {'max_depth': 5, 'colsample_bylevel': 0.8574269294896714, 'random_strength': 4.887505167779041, 'bagging_temperature': 3.2666768318813983}
# Best F1: 0.9351504102964708
#Trial 0 finished with value: 0.9204353814269528 and parameters: {'max_depth': 5, 'colsample_bylevel': 0.9655000144869412, 'random_strength': 7.587945476302646, 'bagging_temperature': 3.073560724145776}. Best is trial 0 with value: 0.9204353814269528.