In [2]:
import numpy as np
import polars as pl
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
sys.path.append("../..")
from utils import *
import warnings
warnings.filterwarnings("ignore")
import os
import psutil

# Let pandas render up to 1000 columns / rows before truncating a displayed frame.
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 1000)

import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score 
from xgboost import XGBClassifier
import optuna
from optuna import Trial
from optuna.samplers import TPESampler


In [2]:
def objective(trial, t_X, t_y, v_X, v_y):
    """Optuna objective: fit one XGBClassifier candidate and score it on the hold-out set.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        t_X: Training features passed to ``fit``.
        t_y: Training labels passed to ``fit``.
        v_X: Validation features passed to ``predict_proba``.
        v_y: Validation labels the predictions are scored against.

    Returns:
        ROC AUC of the second ``predict_proba`` column against ``v_y``.
    """
    # Every trial parameter is registered under exactly the keyword XGBClassifier
    # receives, so ``study.best_trial.params`` can be unpacked into the estimator
    # as-is. (A stray colon in the "n_estimators" name previously broke that.)
    param = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "max_depth": trial.suggest_int("max_depth", 6, 30),
        "subsample": trial.suggest_float("subsample", 0.3, 1.0),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "lambda": trial.suggest_float("lambda", 1e-3, 0.1),
        "alpha": trial.suggest_float("alpha", 1e-3, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 2, 50),
    }

    # eval_metric is configured on the estimator: passing it to fit() is deprecated
    # in recent XGBoost releases and rejected by the newest ones.
    model = XGBClassifier(random_state=42, tree_method="gpu_hist", eval_metric="auc", **param)
    model.fit(t_X, t_y)

    pred = model.predict_proba(v_X)
    return roc_auc_score(v_y, pred[:, 1])

In [3]:
# Seed
# seed_everything comes from utils (not shown here); presumably it seeds the RNGs in use — confirm.
seed_everything(42)

# Data load
train, test, sample_submission = load_data()

# Separate the "Click" target from the features, then hold out a stratified 10% for validation.
y_train = train["Click"]
X_train = train.drop(columns=["Click"])
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    stratify=y_train,
    random_state=42,
)

# Preprocessing
# NOTE(review): the trailing positional True is an option of utils.preprocessing whose
# meaning is not visible from this notebook.
X_train, X_valid, y_train, y_valid, test = preprocessing(
    X_train, X_valid, y_train, y_valid, test, True
)

In [4]:
# Rebuild the feature/target split from `train` and hold out a stratified 10% for validation.
y_train = train["Click"]
X_train = train.drop(columns=["Click"])
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    stratify=y_train,
    random_state=42,
)

# Preprocessing
# NOTE(review): if an earlier cell already reassigned `test` from preprocessing(), this call
# feeds that processed frame through preprocessing() a second time — confirm that is intended.
X_train, X_valid, y_train, y_valid, test = preprocessing(
    X_train, X_valid, y_train, y_valid, test, True
)

KeyboardInterrupt: 

In [5]:
# Seeded TPE search over `objective`; the study keeps the trial with the highest returned score.
study = optuna.create_study(
    study_name='XGBClassifier',
    direction='maximize',
    sampler=TPESampler(seed=42),
)
study.optimize(
    lambda trial: objective(trial, X_train, y_train, X_valid, y_valid),
    n_trials=15,
)

# Report the best score and the hyper-parameters that produced it.
print("Best Score:", study.best_value)
print("Best trial:", study.best_trial.params)

[I 2024-05-31 12:29:44,122] A new study created in memory with name: XGBClassifier
[I 2024-05-31 12:56:53,150] Trial 0 finished with value: 0.7363605129115147 and parameters: {'n_estimators:': 437, 'max_depth': 29, 'subsample': 0.8123957592679836, 'learning_rate': 0.18361096041714062, 'lambda': 0.016445845403801215, 'alpha': 0.15683852581586644, 'min_child_weight': 4}. Best is trial 0 with value: 0.7363605129115147.
[I 2024-05-31 13:18:08,582] Trial 1 finished with value: 0.7676104952631982 and parameters: {'n_estimators:': 880, 'max_depth': 21, 'subsample': 0.7956508044572318, 'learning_rate': 0.01596950334578271, 'lambda': 0.09702107536403744, 'alpha': 0.8326101981596213, 'min_child_weight': 12}. Best is trial 1 with value: 0.7676104952631982.
[I 2024-05-31 13:18:55,483] Trial 2 finished with value: 0.764533330284878 and parameters: {'n_estimators:': 263, 'max_depth': 10, 'subsample': 0.5129695700716763, 'learning_rate': 0.16217936517334897, 'lambda': 0.04376255684556946, 'alpha': 0.

KeyboardInterrupt: 

### 하이퍼 파라미터 늘려서 최적화

In [3]:
# Seed
# seed_everything comes from utils (not shown here); presumably it seeds the RNGs in use — confirm.
seed_everything(42)

# Read the train_fre table and pass it through reduce_mem_usage (a utils helper).
train_fre = reduce_mem_usage(
    pd.read_csv("/home/workspace/DACON/Click_predict/Model/LGBM_Experiment/train_fre.csv")
)

# Train / valid split
# "Unnamed: 0" is presumably the row index that was written into the CSV; it is dropped
# together with the target so it cannot be used as a feature.
y_train = train_fre["Click"]
X_train = train_fre.drop(columns=["Click", "Unnamed: 0"])
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    stratify=y_train,
    random_state=42,
)

Memory usage of dataframe is 8947.91 MB
Memory usage after optimization is: 3901.07 MB
Decreased by 56.4%


In [4]:
def objective(trial, t_X, t_y, v_X, v_y):
    """Train an XGBClassifier with trial-sampled hyper-parameters and return its validation AUC.

    Args:
        trial: Optuna trial that supplies the sampled hyper-parameter values.
        t_X: Features the model is fitted on.
        t_y: Labels the model is fitted on.
        v_X: Features the fitted model predicts probabilities for.
        v_y: Labels used to score those probabilities.

    Returns:
        ``roc_auc_score`` of ``v_y`` against column 1 of ``predict_proba(v_X)``.
    """
    # Sampled names are kept identical to the estimator keywords; the former
    # "n_estimators:" name (trailing colon) made the reported best parameters
    # unusable as XGBClassifier keyword arguments.
    param = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "max_depth": trial.suggest_int("max_depth", 6, 30),
        "subsample": trial.suggest_float("subsample", 0.3, 1.0),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "lambda": trial.suggest_float("lambda", 1e-3, 0.1),
        "alpha": trial.suggest_float("alpha", 1e-3, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 2, 50),
    }

    # eval_metric is given to the constructor because fit(eval_metric=...) is
    # deprecated in recent XGBoost releases and removed in the newest ones.
    model = XGBClassifier(random_state=42, tree_method="gpu_hist", eval_metric="auc", **param)
    model.fit(t_X, t_y)

    pred = model.predict_proba(v_X)
    score = roc_auc_score(v_y, pred[:, 1])
    return score

In [5]:
# Seeded TPE search over `objective`; the study keeps the trial with the highest returned score.
study = optuna.create_study(
    study_name='XGBClassifier',
    direction='maximize',
    sampler=TPESampler(seed=42),
)
study.optimize(
    lambda trial: objective(trial, X_train, y_train, X_valid, y_valid),
    n_trials=10,
)

# Report the best score and the hyper-parameters that produced it.
print("Best Score:", study.best_value)
print("Best trial:", study.best_trial.params)

[I 2024-05-31 10:25:46,563] A new study created in memory with name: XGBClassifier
[I 2024-05-31 10:56:23,300] Trial 0 finished with value: 0.7511240118351469 and parameters: {'n_estimators:': 437, 'max_depth': 29, 'subsample': 0.8123957592679836, 'learning_rate': 0.18361096041714062, 'lambda': 0.016445845403801215, 'alpha': 0.15683852581586644, 'min_child_weight': 4}. Best is trial 0 with value: 0.7511240118351469.
Exception ignored on calling ctypes callback function: <bound method DataIter._next_wrapper of <xgboost.data.SingleBatchInternalIter object at 0x7f527f568070>>
Traceback (most recent call last):
  File "/root/anaconda3/envs/code_sim/lib/python3.9/site-packages/xgboost/core.py", line 589, in _next_wrapper
    def _next_wrapper(self, this: None) -> int:  # pylint: disable=unused-argument
KeyboardInterrupt: 


: 

## 빈도 인코딩을 정규화한 것

In [6]:
# Same seeded TPE search as before, run against whatever X_train / X_valid are currently bound.
study = optuna.create_study(
    study_name='XGBClassifier',
    direction='maximize',
    sampler=TPESampler(seed=42),
)
study.optimize(
    lambda trial: objective(trial, X_train, y_train, X_valid, y_valid),
    n_trials=10,
)

# Report the best score and the hyper-parameters that produced it.
print("Best Score:", study.best_value)
print("Best trial:", study.best_trial.params)

[I 2024-05-30 07:52:07,639] A new study created in memory with name: XGBClassifier
[I 2024-05-30 08:34:54,226] Trial 0 finished with value: 0.7562244864820428 and parameters: {'n_estimators:': 437, 'max_depth': 29, 'subsample': 0.8123957592679836, 'learning_rate': 0.18361096041714062}. Best is trial 0 with value: 0.7562244864820428.
[I 2024-05-30 08:35:32,360] Trial 1 finished with value: 0.7697606985162159 and parameters: {'n_estimators:': 240, 'max_depth': 9, 'subsample': 0.3406585285177396, 'learning_rate': 0.2611910822747312}. Best is trial 1 with value: 0.7697606985162159.
[I 2024-05-30 09:00:41,498] Trial 2 finished with value: 0.7062666529314808 and parameters: {'n_estimators:': 641, 'max_depth': 23, 'subsample': 0.3144091460070617, 'learning_rate': 0.29127385712697834}. Best is trial 1 with value: 0.7697606985162159.
[I 2024-05-30 09:02:52,521] Trial 3 finished with value: 0.7783820205453039 and parameters: {'n_estimators:': 850, 'max_depth': 11, 'subsample': 0.4272774770449704

KeyboardInterrupt: 

In [8]:
# Hyper-parameters of the best finished trial in the search log (trial 3).
# The key has to be exactly "n_estimators": the value pasted from the log carried a
# trailing colon ("n_estimators:"), which XGBClassifier does not recognise, so the
# tuned tree count was never applied.
param = {
    "n_estimators": 850,
    "max_depth": 11,
    "subsample": 0.42727747704497043,
    "learning_rate": 0.06318730785749581,
}

# Refit on the full labelled table. "Unnamed: 0" (the row index that appears when the
# table is reloaded from CSV) is removed when present so it is never used as a feature.
X_train = train_fre.drop(columns=["Click", "Unnamed: 0"], errors="ignore")
y_train = train_fre["Click"]

# Rebinding instead of an in-place drop keeps this cell re-runnable: a second run no
# longer fails because "ID" is already gone.
test_fre = test_fre.drop(columns=["ID", "Unnamed: 0"], errors="ignore")

# eval_metric is set on the estimator; fit(eval_metric=...) is deprecated in recent
# XGBoost releases and removed in the newest ones.
model_1 = XGBClassifier(random_state=42, tree_method="gpu_hist", eval_metric="auc", **param)
model_1.fit(X_train, y_train)
pred = model_1.predict_proba(test_fre)

In [14]:
# Column 1 of the predict_proba output becomes the submitted "Click" value.
sample_submission["Click"] = pred[:, 1]

In [16]:
# Write the submission to the working directory; index=False keeps the row index out of the file.
sample_submission.to_csv(path_or_buf="xgboos_optuna.csv", index=False)

In [17]:
# Persist both encoded tables into the working directory.
# NOTE(review): the row index is written too, so reading these files back with
# pd.read_csv yields an extra "Unnamed: 0" column.
train_fre.to_csv(path_or_buf="train_fre.csv")
test_fre.to_csv(path_or_buf="test_fre.csv")

In [4]:
''' After Fre '''

---------------------------- Dataframe Size ----------------------------
num: 28605391
----------------------------   Dtype Size   ----------------------------
Type int64 : 29
Type float64 : 11


Unnamed: 0,Feature Name,Data Type,Nunique,NullValue,NullValue Ratio,value_1,value_2,value_3
0,Click,int64,2,0,0.0,1.0,0.0,0.0
1,F01,int64,2003,0,0.0,1.0,1130.0,5878230.0
2,F02,int64,2368,0,0.0,51.0,5894.0,5895750.0
3,F03,int64,63,0,0.0,2133223.0,554803.0,11351861.0
4,F04,float64,9979,0,0.0,114.0,26.0,119.0
5,F05,int64,1953,0,0.0,1.0,1130.0,5878230.0
6,F06,int64,8562,0,0.0,1.0,43.0,0.0
7,F07,int64,3445,0,0.0,5384798.0,8289.0,6867.0
8,F08,int64,68,0,0.0,6545602.0,2009250.0,1309718.0
9,F09,int64,3186,0,0.0,92638.0,207943.0,1114663.0
