In [None]:
import os
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
import optuna
from optuna_integration.catboost import CatBoostPruningCallback
# from optuna.integration import CatBoostPruningCallback
import plotly.express as px
from scipy.stats import mode
import category_encoders as ce
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import Lasso
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from optuna import Trial
from optuna.samplers import TPESampler
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Load the training and test CSV files into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/data2/kdg_datasets/dacon_data/web/train.csv',
        '/data2/kdg_datasets/dacon_data/web/test.csv',
    )
)

In [None]:
# 'Click' is the prediction target; 'ID' is dropped from both feature frames.
train_y = train['Click']
train_x = train.drop(['ID', 'Click'], axis=1)
test_x = test.drop('ID', axis=1)

In [None]:
# Replace missing values with 0 in every column that has at least one null in
# the training features; the same columns are filled in the test features.
for col in tqdm(train_x.columns):
    if train_x[col].isnull().any():
        # Assign the filled column back instead of calling
        # `df[col].fillna(0, inplace=True)`: that form is a chained in-place
        # operation on a column selection, which pandas warns about and which
        # does not update the parent DataFrame when Copy-on-Write is enabled.
        train_x[col] = train_x[col].fillna(0)
        test_x[col] = test_x[col].fillna(0)

In [None]:
# Partition the feature columns by dtype: object-typed columns go to
# cat_list, every other column goes to num_list.
cat_list = [col for col in train_x.columns if train_x[col].dtypes == 'O']
num_list = [col for col in train_x.columns if train_x[col].dtypes != 'O']

In [None]:
# Fit min-max scaling on the non-object training columns only, then apply the
# same fitted scaler to both the training and the test features.
scaler = MinMaxScaler().fit(train_x[num_list])

train_x[num_list] = scaler.transform(train_x[num_list])
test_x[num_list] = scaler.transform(test_x[num_list])

In [None]:
# Names of the object-typed feature columns, in their original column order.
is_object_col = train_x.dtypes == "object"
encoding_target = is_object_col[is_object_col].index.tolist()

# Fit the count encoder on the training features, then encode train and test
# with that same fitted encoder.
enc = ce.CountEncoder(cols=encoding_target)
enc.fit(train_x, train_y)
X_train_encoded = enc.transform(train_x)
X_test_encoded = enc.transform(test_x)

In [None]:
def objective(trial):
    """Optuna objective: train a GPU CatBoost classifier and score it by ROC AUC.

    The hold-out split is built here from ``X_train_encoded`` / ``train_y``
    instead of being read from the notebook globals ``X_train``, ``y_train``,
    ``X_test`` and ``y_test``: none of those is assigned before the study runs,
    so a fresh-kernel run raised ``NameError``. The split is seeded and
    stratified, and is cached on the function object so every trial is scored
    on the same rows and the split is computed only once.

    NOTE(review): the 80/20 split size is a choice made here; the split the
    notebook originally used is not visible. Adjust ``test_size`` if needed.

    Parameters
    ----------
    trial : optuna.Trial
        Trial used to sample ``iterations``, ``learning_rate``, ``depth`` and
        ``l2_leaf_reg``.

    Returns
    -------
    float
        ROC AUC of the predicted positive-class probabilities on the hold-out.
    """
    if not hasattr(objective, "_holdout"):
        objective._holdout = train_test_split(
            X_train_encoded,
            train_y,
            test_size=0.2,
            stratify=train_y,
            random_state=9608,
        )
    X_fit, X_holdout, y_fit, y_holdout = objective._holdout

    params = {
        'iterations': trial.suggest_int('iterations', 1000, 10000),
        'learning_rate': trial.suggest_float('learning_rate', 0.0001, 0.1),
        'depth': trial.suggest_int('depth', 4, 16),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-3, 1e-1),
        'task_type': 'GPU',
        'used_ram_limit':'20gb',
        'devices': '3'
    }

    cat = CatBoostClassifier(**params)

    # Pruning callback left disabled, as in the original cell.
    # pruning_callback = CatBoostPruningCallback(trial, "AUC")
    cat.fit(
        X_fit,
        y_fit,
        early_stopping_rounds=50,
        eval_set=[(X_holdout, y_holdout)],
        verbose=0,
    )

    # Column 1 of predict_proba is the probability of the positive class.
    y_pred = cat.predict_proba(X_holdout)[:, 1]
    score = roc_auc_score(y_holdout, y_pred)

    return score

In [None]:
# Run 30 trials that maximise the value returned by `objective`, sampling
# hyper-parameters with a seeded TPE sampler.
study = optuna.create_study(
    study_name='CatBoostClassifier_Optimization',
    direction='maximize',
    sampler=TPESampler(seed=9608),
)
study.optimize(objective, n_trials=30, show_progress_bar=True)

# Keep the winning hyper-parameters for the later cross-validated training.
best_params = study.best_trial.params

print()
print("Best Score:", study.best_value)
print("Best trial:", best_params)

In [None]:
from sklearn.model_selection import StratifiedKFold

# Seeded, shuffled 5-fold stratified splitter.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=9608)

# Zero-initialised buffer: one row per test sample, one column per fold.
test_preds = np.zeros((len(X_test_encoded), skf.get_n_splits()))

In [None]:
# Directory that receives one prediction file per fold. It is created up
# front so a missing directory cannot fail the write after a fold has already
# been trained.
fold_pred_dir = '/home/kangdg22/meta_Assignment/dacon/web/cat_skf'
os.makedirs(fold_pred_dir, exist_ok=True)

# Train one CatBoost model per stratified fold with the tuned parameters,
# early-stopping on that fold's validation part, and predict the test set.
for fold, (train_idx, valid_idx) in enumerate(skf.split(X_train_encoded, train_y)):
    X_train, X_valid = X_train_encoded.iloc[train_idx], X_train_encoded.iloc[valid_idx]
    y_train, y_valid = train_y.iloc[train_idx], train_y.iloc[valid_idx]

    cat = CatBoostClassifier(**best_params,
                             task_type="GPU",
                             used_ram_limit='20gb',
                             devices='3')

    cat.fit(X_train, y_train, early_stopping_rounds=50, eval_set=[(X_valid, y_valid)], verbose=1)

    # Predict the test set once and reuse the result for both the fold column
    # and the per-fold CSV (the original ran predict_proba twice).
    pred_value = cat.predict_proba(X_test_encoded)[:, 1]
    test_preds[:, fold] = pred_value

    pred_df = pd.DataFrame({'Click': pred_value})
    pred_df.to_csv(f'{fold_pred_dir}/{fold}.csv')

# Final prediction: mean of the per-fold positive-class probabilities.
final_preds = test_preds.mean(axis=1)

In [None]:
# Put the fold-averaged predictions into the sample submission's 'Click'
# column and write the result without the index.
sample_submission = pd.read_csv(
    '/data2/kdg_datasets/dacon_data/web/sample_submission.csv'
).assign(Click=final_preds)
sample_submission.to_csv(
    '/home/kangdg22/meta_Assignment/dacon/web/cat_skf/cat_BestParam_MinMax_skf.csv',
    index=False,
)