## CatBoost best parameters | Optimization

In [4]:
%%time
import warnings
warnings.simplefilter('ignore')

import pandas as pd
import numpy as np

# Load the three competition tables from the Kaggle input directory.
train = pd.read_csv('/kaggle/input/playground-series-s4e7/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s4e7/test.csv')
submission = pd.read_csv('/kaggle/input/playground-series-s4e7/sample_submission.csv')

# Give the test rows a placeholder target so both frames share the same
# columns, then stack train on top of test so later preprocessing is applied
# to both at once.
test['Response'] = 0
full_df = pd.concat([train, test], axis=0)

# Report (rows, columns) for each frame.
for frame in (train, test, full_df):
    print(frame.shape)

(11504798, 12)
(7669866, 12)
(19174664, 12)
CPU times: user 24.2 s, sys: 4.58 s, total: 28.8 s
Wall time: 35.2 s


In [5]:
# Interaction features: combine each column in Categorical_feat1 with each
# column in Categorical_feat2 by joining their string forms with "_".
Categorical_feat1 = ['Previously_Insured']
Categorical_feat2 = ['Annual_Premium','Vehicle_Age','Vehicle_Damage','Vintage']

for left in Categorical_feat1:
    # The left-hand column is the same for every pairing, so stringify it once.
    left_as_text = full_df[left].astype(str)
    for right in Categorical_feat2:
        crossed_name = left + '_' + right
        full_df[crossed_name] = left_as_text + '_' + full_df[right].astype(str)
        print([crossed_name])

['Previously_Insured_Annual_Premium']
['Previously_Insured_Vehicle_Age']
['Previously_Insured_Vehicle_Damage']
['Previously_Insured_Vintage']


In [6]:
full_df.dtypes

id                                     int64
Gender                                object
Age                                    int64
Driving_License                        int64
Region_Code                          float64
Previously_Insured                     int64
Vehicle_Age                           object
Vehicle_Damage                        object
Annual_Premium                       float64
Policy_Sales_Channel                 float64
Vintage                                int64
Response                               int64
Previously_Insured_Annual_Premium     object
Previously_Insured_Vehicle_Age        object
Previously_Insured_Vehicle_Damage     object
Previously_Insured_Vintage            object
dtype: object

In [7]:
#Label Encode object columns
from sklearn.preprocessing import LabelEncoder
# A single encoder instance is refit for each column in turn, so after the
# loop `le` only remembers the classes of the last column it processed.
le = LabelEncoder()

# Only object-dtype columns are replaced by their encoded labels.
object_columns = [name for name in full_df.columns if full_df[name].dtype == 'object']
for name in object_columns:
    full_df[name] = le.fit_transform(full_df[name])

In [8]:
# Cast every float64 column to int64 (note: int64, not int32), so that all
# columns end up as integers.
for column_name in full_df.columns:
    if full_df[column_name].dtype == 'float64':
        full_df[column_name] = full_df[column_name].astype('int64')

In [9]:
full_df.dtypes

id                                   int64
Gender                               int64
Age                                  int64
Driving_License                      int64
Region_Code                          int64
Previously_Insured                   int64
Vehicle_Age                          int64
Vehicle_Damage                       int64
Annual_Premium                       int64
Policy_Sales_Channel                 int64
Vintage                              int64
Response                             int64
Previously_Insured_Annual_Premium    int64
Previously_Insured_Vehicle_Age       int64
Previously_Insured_Vehicle_Damage    int64
Previously_Insured_Vintage           int64
dtype: object

In [10]:
# Remove the `id` column from the combined frame, modifying it in place.
full_df.drop(columns=['id'], inplace=True)

In [18]:
# Split the combined frame back into its train and test parts by position.
# The boundary is captured once, before `train` is rebound, so both slices
# are guaranteed to use the same row count.
n_train_rows = train.shape[0]

# Materialise each part as an independent frame rather than a slice of
# full_df: later cells add a column to `train`, and mutating a slice triggers
# pandas' SettingWithCopy behaviour (hidden here by the global warning filter).
train = full_df.iloc[:n_train_rows].copy()
# The placeholder target added before concatenation is not needed on test.
test = full_df.iloc[n_train_rows:].drop(columns='Response')
print(train.shape)
print(test.shape)

# Free the combined frame; only the two parts are used from here on.
del full_df

## Stratified KFold

In [20]:
from sklearn.model_selection import StratifiedKFold
# Tag every training row with a fold id in the new `kfold` column
# (-1 means "not assigned yet").
train['kfold'] = -1

# Shuffle the rows before splitting. The shuffle is seeded so that fold
# membership -- and therefore every per-fold score below -- is reproducible
# across kernel restarts.
train = train.sample(frac=1, random_state=42).reset_index(drop=True)
y = train.Response.values
kf = StratifiedKFold(n_splits=5)

# kf.split yields (train_indices, validation_indices); each row is written
# with the id of the fold in which it appears in the validation part.
for k, (_, valid_idx) in enumerate(kf.split(train, y=y)):
    train.loc[valid_idx, 'kfold'] = k

## Randomized Search CV

In [29]:
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score
import joblib
from scipy.stats import randint, uniform


# Per-fold results collected by run(): validation probabilities and AUCs.
preds, aucs = [], []

# Where the best model is saved, and the highest validation AUC seen so far.
best_model_filename = 'best_catboost_model.joblib'
best_overall_auc = 0

def run(fold_):
    """Random-search CatBoost hyper-parameters for one fold and record its AUC.

    Rows whose ``kfold`` differs from ``fold_`` are used for the search; the
    rows of fold ``fold_`` are held out. The held-out rows are passed to every
    fit as ``eval_set`` (required by ``use_best_model=True`` and used for early
    stopping) and are also used for the AUC reported here, so that AUC is
    slightly optimistic.

    Parameters
    ----------
    fold_ : int
        Fold id (a value of ``train['kfold']``) to hold out for validation.

    Returns
    -------
    None
        Results are recorded through side effects: the validation
        probabilities are appended to ``preds``, the AUC to ``aucs``, and when
        the AUC beats ``best_overall_auc`` that global is updated and the
        model is written to ``best_model_filename``.
    """
    # Assigned below, so it must be declared global; without this the
    # comparison `auc > best_overall_auc` raises UnboundLocalError.
    global best_overall_auc

    features = [c for c in train.columns if c not in ('Response', 'id', 'kfold')]
    # Hold out fold `fold_` for validation and train on all the other folds
    # (the same convention as the optimization cell's run()).
    df_train = train[train['kfold'] != fold_].reset_index(drop=True)
    df_valid = train[train['kfold'] == fold_].reset_index(drop=True)

    X_train = df_train[features]
    y_train = df_train['Response']
    X_valid = df_valid[features]
    y_valid = df_valid['Response']

    # scipy's uniform(loc, scale) samples from [loc, loc + scale] and
    # randint(low, high) samples integers from low to high - 1.
    params = {'learning_rate': uniform(0.01, 0.3),
        'depth': randint(4, 10),
        'l2_leaf_reg': uniform(1, 10),
        'iterations': randint(1000, 5000)
        }

    model = CatBoostClassifier(
        early_stopping_rounds=200,
        use_best_model=True,
        eval_metric='AUC',
        task_type='GPU',  # Use GPU for training
        # NOTE(review): confirm 'all' is an accepted `devices` value; the
        # formats I know of are a single id, 'id1:id2:...' or 'id1-idN'.
        devices='all',
        verbose=0
    )

    # n_jobs=1: every candidate trains on the GPU, and parallel worker
    # processes would compete for the same device memory.
    random_search = RandomizedSearchCV(model, param_distributions=params, n_iter=100,
        cv=3, verbose=1, n_jobs=1, random_state=42, scoring='roc_auc')

    # Extra keyword arguments are forwarded to CatBoostClassifier.fit for every
    # candidate. eval_set is needed because use_best_model=True cannot be
    # honoured without a validation set.
    random_search.fit(X_train, y_train, cat_features=features,
                      eval_set=(X_valid, y_valid))
    best_model = random_search.best_estimator_

    # Second probability column = probability of class 1.
    valid_preds = best_model.predict_proba(X_valid)[:,1]
    auc = roc_auc_score(y_valid, valid_preds)

    print(f"Fold {fold_}, Best AUC: {random_search.best_score_}, AUC: {auc}")
    print(f"Best parameters: {random_search.best_params_}")

    if auc > best_overall_auc:
        best_overall_auc = auc
        joblib.dump(best_model, best_model_filename)
        print(f"New best model found and saved with AUC: {best_overall_auc}")

    preds.append(valid_preds)
    aucs.append(auc)

# Run the hyper-parameter search once for each of the 5 folds; run() appends
# each fold's validation AUC to `aucs`.
for fold in range(5):
    run(fold)

# Mean and standard deviation of the per-fold validation AUCs.
print(f"\nOverall AUC: {np.mean(aucs):.5f} +/- {np.std(aucs):.5f}")


## Optimization

In [None]:
print("________________________________OPTIMIZATION____________________________")
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import roc_auc_score
import numpy as np
import joblib

import os  # used only here, to locate the starting model on disk

preds = []
aucs = []

best_overall_auc = 0
best_model_filename = 'best_catboost_model.joblib'

# Load the best model from the previous optimization.
# An explicitly provided 'previous_best_catboost_model.joblib' is preferred.
# Nothing in this notebook writes that file, so when it is absent fall back to
# the file the random-search cell saves (best_model_filename). The model is
# read before run() overwrites that file with this round's best model.
# NOTE: joblib.load unpickles the file -- only load model files you produced.
previous_model_path = 'previous_best_catboost_model.joblib'
if not os.path.exists(previous_model_path):
    previous_model_path = best_model_filename
best_model = joblib.load(previous_model_path)

# Prepare the test data once
features = [c for c in train.columns if c not in ('Response', 'id', 'kfold')]
# Keep the test features as a DataFrame (not `.values`): cat_features is given
# by column name, and a bare ndarray carries no names for CatBoost to match.
X_test = test[features]
X_test_pool = Pool(X_test, cat_features=features)

def run(fold):
    """Train one CatBoost model with the previous best parameters and score one fold.

    Trains on every row whose ``kfold`` differs from ``fold``, using the rows
    of fold ``fold`` as the evaluation set, then computes the validation AUC.

    Parameters
    ----------
    fold : int
        Fold id (a value of ``train['kfold']``) to hold out for validation.

    Returns
    -------
    None
        The AUC is appended to ``aucs``; when it beats ``best_overall_auc``
        that global is updated and the model is written to
        ``best_model_filename``.
    """
    global best_overall_auc
    df_train = train[train['kfold'] != fold].reset_index(drop=True)
    df_valid = train[train['kfold'] == fold].reset_index(drop=True)

    X_train = df_train[features]
    y_train = df_train['Response']
    X_valid = df_valid[features]
    y_valid = df_valid['Response']

    # Every feature column is declared categorical. No test Pool is built
    # here: the test set is not used inside this function.
    X_train_pool = Pool(X_train, y_train, cat_features=X_train.columns.values)
    X_valid_pool = Pool(X_valid, y_valid, cat_features=X_valid.columns.values)

    # Start from the previous best model's parameters and override a fixed
    # set of them. They are merged into one dict first because
    # best_model.get_params() can already contain these keys, and passing the
    # same keyword twice to the constructor raises TypeError.
    model_params = {
        **best_model.get_params(),
        'iterations': 5000,  # Set to 5000 iterations
        'early_stopping_rounds': 200,
        'use_best_model': True,
        'eval_metric': 'AUC',
        'task_type': 'GPU',
        'devices': '0:1',  # Use both GPUs. Adjust if needed.
        'verbose': 100,
    }
    model = CatBoostClassifier(**model_params)

    model.fit(X_train_pool, eval_set=X_valid_pool)

    # Second probability column = probability of class 1.
    valid_preds = model.predict_proba(X_valid_pool)[:, 1]

    auc = roc_auc_score(y_valid, valid_preds)

    print(f"Fold {fold}, AUC: {auc:.5f}")

    if auc > best_overall_auc:
        best_overall_auc = auc
        joblib.dump(model, best_model_filename)
        print(f"New best model found and saved with AUC: {best_overall_auc:.5f}")

    aucs.append(auc)

# Train and score one model per fold (fold ids 0-4).
for fold_id in range(5):
    run(fold_id)

print(f"\nOverall AUC: {np.mean(aucs):.5f} +/- {np.std(aucs):.5f}")

# Reload the best-scoring fold model that run() saved during this round.
best_model = joblib.load(best_model_filename)

# Predict on the whole test set with that model, keeping the second
# probability column (class 1). `final_preds` is what the submission uses.
final_preds = best_model.predict_proba(X_test_pool)[:, 1]

# You can now use final_preds for your submission or further processing

## Submission

In [None]:
# The sample submission was loaded into `submission` in the first cell; fill
# its Response column with the predicted probabilities.
submission['Response'] = final_preds
# index=False keeps the pandas row index out of the file, so the CSV holds
# only the columns that were read from the sample submission.
submission.to_csv('ROR-Final_Submission.csv', index=False)