In [None]:
import os
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
import optuna
# from optuna.integration import XGBoostPruningCallback
import plotly.express as px
from scipy.stats import mode
import category_encoders as ce
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import Lasso
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from optuna import Trial
from optuna.samplers import TPESampler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier
import lightgbm as lgb

In [None]:
# Load the training and test CSV files into DataFrames (train first, then test).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/data2/kdg_datasets/dacon_data/web/train.csv',
        '/data2/kdg_datasets/dacon_data/web/test.csv',
    )
)

In [None]:
# Pull the 'Click' target out of the training frame, then build the feature
# frames by removing the 'ID' column (and the target, for train).
train_y = train['Click']
train_x = train.drop(['ID', 'Click'], axis=1)
test_x = test.drop('ID', axis=1)

In [None]:
# Replace missing values with 0, column by column.
#
# The filled column is assigned back instead of calling
# `frame[col].fillna(..., inplace=True)`: that chained in-place form operates on
# a temporary Series, emits a FutureWarning on recent pandas 2.x, and is a
# silent no-op once Copy-on-Write is active.
#
# train and test are checked independently so that a column with missing values
# only in the test set is filled as well.
for col in tqdm(train_x.columns):
    if train_x[col].isnull().any():
        train_x[col] = train_x[col].fillna(0)
    if test_x[col].isnull().any():
        test_x[col] = test_x[col].fillna(0)

In [None]:
# Partition the feature columns by dtype: object-dtype columns are treated as
# categorical, everything else as numeric. Column order is preserved.
is_object_col = [(name, train_x[name].dtypes == 'O') for name in train_x.columns]
cat_list = [name for name, is_obj in is_object_col if is_obj]
num_list = [name for name, is_obj in is_object_col if not is_obj]

In [None]:
# Fit the min-max scaler on the training numeric columns only, then apply the
# same fitted transform to both the training and the test frame.
scaler = MinMaxScaler().fit(train_x[num_list])

train_x[num_list] = scaler.transform(train_x[num_list])
test_x[num_list] = scaler.transform(test_x[num_list])

In [None]:
# Columns that still have object dtype are count-encoded.
encoding_target = [
    col_name
    for col_name, col_dtype in train_x.dtypes.items()
    if col_dtype == "object"
]

# The encoder is fitted on the training frame only and then applied to both
# train and test.
enc = ce.CountEncoder(cols=encoding_target)
enc.fit(train_x, train_y)
X_train_encoded = enc.transform(train_x)
X_test_encoded = enc.transform(test_x)

In [None]:
# Carve a 15% hold-out slice out of the encoded *training* data.
# Note that X_test / y_test here are that hold-out slice (it has labels); the
# unlabeled test features live in X_test_encoded.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_encoded,
    train_y,
    test_size=0.15,
    random_state=9608,
)

In [None]:
def objective(trial):
    """Optuna objective: train one LightGBM classifier and score it by ROC AUC.

    Samples a hyperparameter configuration from ``trial``, fits an
    ``LGBMClassifier`` on the module-level ``X_train`` / ``y_train`` with
    early stopping on ``X_test`` / ``y_test``, and returns the ROC AUC of the
    predicted positive-class probabilities on that same hold-out slice.

    Parameters
    ----------
    trial : optuna.Trial
        Trial used to sample hyperparameters; it is also handed to the
        LightGBM pruning callback so the study's pruner can stop the trial.

    Returns
    -------
    float
        ROC AUC on ``(X_test, y_test)``.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 1000, 10000),
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.0001, 0.1),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 100),
        # NOTE(review): per the LightGBM docs, bagging (subsample) is only
        # applied when subsample_freq > 0, which is not set here - confirm
        # whether this search dimension has any effect.
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
        # Fixed (not tuned). The key must match the estimator argument exactly;
        # a stray leading space would turn it into a different, unknown key.
        'min_split_gain': 1,
        'objective': 'binary',
        'metric': 'auc',
        'device': 'gpu',  # Use GPU for training
    }

    lgb_model = LGBMClassifier(**params)

    # Reports the "auc" evaluation metric to the trial so it can be pruned.
    pruning_callback = optuna.integration.LightGBMPruningCallback(trial, "auc")
    lgb_model.fit(
        X_train,
        y_train,
        eval_set=[(X_test, y_test)],
        callbacks=[
            lgb.early_stopping(stopping_rounds=30),
            pruning_callback,
            lgb.log_evaluation(period=200),
        ],
    )

    preds = lgb_model.predict_proba(X_test)[:, 1]
    score = roc_auc_score(y_test, preds)

    return score

In [None]:
# Build the study: maximise the objective with a seeded TPE sampler and a
# median pruner that waits 10 steps before it may prune a trial.
tpe_sampler = TPESampler(seed=9608)
median_pruner = optuna.pruners.MedianPruner(n_warmup_steps=10)
study = optuna.create_study(
    study_name='LGBMClassifier_Optimization',
    direction='maximize',
    sampler=tpe_sampler,
    pruner=median_pruner,
)
study.optimize(objective, n_trials=5, show_progress_bar=True)

# Keep the best trial's sampled hyperparameters for the final training run.
best_params = study.best_trial.params

print()
print("Best Score:", study.best_value)
print("Best trial:", best_params)

In [None]:
# Final training configuration.
# best_params only contains the values Optuna *sampled*; every setting that the
# objective hard-codes has to be repeated here, otherwise the final models are
# trained under a different configuration than the one that was tuned.
params = {
    **best_params,
    'min_split_gain': 1,  # fixed in the objective, so not part of best_params
    'objective': 'binary',
    'metric': 'auc',
    'device': 'gpu',
}

In [None]:
# 5-fold stratified splitter; shuffling is seeded so the folds are reproducible.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=9608,
)

In [None]:
# One column of test-set probabilities per CV fold. The column count is taken
# from the splitter so it cannot drift out of sync with n_splits.
test_preds = np.zeros((X_test_encoded.shape[0], skf.get_n_splits()))

In [None]:
# Stratified K-fold training: fit one model per fold with early stopping on the
# fold's validation part, predict the test set, and store the probabilities in
# the fold's column of test_preds.
fold_pred_dir = '/home/kangdg22/meta_Assignment/dacon/web/lgbm'
# Create the output directory up front so a missing path is handled before any
# model is trained rather than surfacing as an error after the first fold.
os.makedirs(fold_pred_dir, exist_ok=True)

for fold, (train_idx, valid_idx) in tqdm(
    enumerate(skf.split(X_train_encoded, train_y)),
    total=skf.get_n_splits(),  # enumerate() has no length; tell tqdm the fold count
):
    # NOTE: these rebind the X_train / y_train names from the earlier hold-out split.
    X_train, X_valid = X_train_encoded.iloc[train_idx], X_train_encoded.iloc[valid_idx]
    y_train, y_valid = train_y.iloc[train_idx], train_y.iloc[valid_idx]

    lgb_model = LGBMClassifier(**params)
    lgb_model.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        callbacks=[
            lgb.early_stopping(stopping_rounds=50),
            lgb.log_evaluation(period=200),
        ],
    )

    # Predict the test set once and reuse the result for both the ensemble
    # matrix and the per-fold file.
    pred_value = lgb_model.predict_proba(X_test_encoded)[:, 1]
    test_preds[:, fold] = pred_value

    # Per-fold predictions are also written to disk (with the default index column).
    pred_df = pd.DataFrame({'Click': pred_value})
    pred_df.to_csv(f'{fold_pred_dir}/{fold}.csv')

# Final prediction is the plain average of the per-fold probabilities.
final_preds = test_preds.mean(axis=1)

In [None]:
# Fill the sample submission's 'Click' column with the fold-averaged
# probabilities and write it out without the index column.
sample_submission = pd.read_csv(
    '/data2/kdg_datasets/dacon_data/web/sample_submission.csv'
).assign(Click=final_preds)
sample_submission.to_csv(
    '/home/kangdg22/meta_Assignment/dacon/web/lgbm/lgbm_optuna_MinMax_SKF.csv',
    index=False,
)