In [1]:
import numpy as np
import polars as pl
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
sys.path.append("../..")
from utils import *
import warnings
warnings.filterwarnings("ignore")
import os
import psutil

pd.options.display.max_columns=1000
pd.options.display.max_rows=1000

import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
import optuna
from optuna import Trial
from optuna.samplers import TPESampler


In [5]:
def objective(trial, t_X, t_y, v_X, v_y):
  param = {"n_estimators": trial.suggest_int("n_estimators:", 1000, 3500),
           "max_depth": trial.suggest_int("max_depth", 15, 40),
           "num_leaves": trial.suggest_int("num_leaves", 2, 500),
           "subsample": trial.suggest_float("subsample", 0.3, 1.0),
           "subsample_freq": trial.suggest_int("subsample_freq", 1, 10),
           "min_child_samples": trial.suggest_int("min_child_samples", 5, 200),}

  model = lgb.LGBMClassifier(random_state=42, **param)

  model.fit(t_X, t_y, eval_metric='auc')
  pred = model.predict_proba(v_X)
  score = roc_auc_score(v_y, pred[:, 1])

  return score

In [3]:
''' Seed '''
seed_everything(42)

''' Data Load '''
train, test, sample_submission = load_data()

X_train = train.drop(columns = ["Click"])
y_train = train["Click"]
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.1, stratify=y_train, random_state=42)

''' preprocessing '''
X_train, X_valid, y_train, y_valid, test = preprocessing(X_train, X_valid, y_train, y_valid, test, True) 

Feature Selection
Start Frequency
Missing Value
---------------- Start MissingValue ----------------
Memory usage of dataframe is 7856.71 MB
Memory usage after optimization is: 3240.89 MB
Decreased by 58.8%
Memory usage of dataframe is 872.97 MB
Memory usage after optimization is: 360.10 MB
Decreased by 58.8%
Memory usage of dataframe is 1385.05 MB
Memory usage after optimization is: 562.68 MB
Decreased by 59.4%


In [6]:
study = optuna.create_study(study_name='LGBMClassifier', direction='maximize', sampler=TPESampler(seed=42))
study.optimize(lambda trial: objective(trial, X_train, y_train, X_valid, y_valid), n_trials=15)

print("Best Score:", study.best_value)
print("Best trial:", study.best_trial.params)

[I 2024-06-02 04:22:46,135] A new study created in memory with name: LGBMClassifier
[I 2024-06-02 04:42:11,168] Trial 0 finished with value: 0.7822841488253525 and parameters: {'n_estimators:': 1936, 'max_depth': 39, 'num_leaves': 367, 'subsample': 0.7190609389379257, 'subsample_freq': 2, 'min_child_samples': 35}. Best is trial 0 with value: 0.7822841488253525.
[I 2024-06-02 05:02:01,719] Trial 1 finished with value: 0.7823709223352215 and parameters: {'n_estimators:': 1145, 'max_depth': 37, 'num_leaves': 301, 'subsample': 0.7956508044572318, 'subsample_freq': 1, 'min_child_samples': 195}. Best is trial 1 with value: 0.7823709223352215.
[I 2024-06-02 05:22:00,830] Trial 2 finished with value: 0.7804707955500804 and parameters: {'n_estimators:': 3081, 'max_depth': 20, 'num_leaves': 92, 'subsample': 0.4283831568974037, 'subsample_freq': 4, 'min_child_samples': 107}. Best is trial 1 with value: 0.7823709223352215.
[I 2024-06-02 05:36:12,512] Trial 3 finished with value: 0.7799230896911223

KeyboardInterrupt: 

In [11]:
X_train.to_csv('Count_target_X_train.csv', index=False)
X_valid.to_csv('Count_target_X_valid.csv', index=False)
y_train.to_csv('Count_target_y_train.csv', index=False)
y_valid.to_csv('Count_target_y_valid.csv', index=False)
test.to_csv('Count_target_test.csv', index=False)

## F17 Group_feature_mean_Optimization

In [14]:
X_train['F17'] = str(X_train['F17'])
X_valid['F17'] = str(X_valid['F17'])

In [15]:
X_train = groupby_mean(X_train, 'F17', X_train.columns.drop('F17'))
X_valid = groupby_mean(X_valid, 'F17', X_train.columns.drop('F17'))
X_train.head()

Unnamed: 0,F01,F02,F03,F04,F05,F06,F07,F08,F09,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28,F29,F30,F31,F32,F33,F34,F35,F36,F37,F38,F39,F17_F01_mean,F17_F02_mean,F17_F03_mean,F17_F04_mean,F17_F05_mean,F17_F06_mean,F17_F07_mean,F17_F08_mean,F17_F09_mean,F17_F10_mean,F17_F11_mean,F17_F12_mean,F17_F13_mean,F17_F14_mean,F17_F15_mean,F17_F16_mean,F17_F18_mean,F17_F19_mean,F17_F20_mean,F17_F21_mean,F17_F22_mean,F17_F23_mean,F17_F24_mean,F17_F25_mean,F17_F26_mean,F17_F27_mean,F17_F28_mean,F17_F29_mean,F17_F30_mean,F17_F31_mean,F17_F32_mean,F17_F33_mean,F17_F34_mean,F17_F35_mean,F17_F36_mean,F17_F37_mean,F17_F38_mean,F17_F39_mean
12066756,24,39,0.169434,46.0,24,0,1033,0.175781,5812,24,514.0,24,9051,20,0.167969,0.258057,12066756 0.239624\n5255865 0.199585\n13...,2.0,1.0,27707,8191097,7577062,4311788,6.0,7094,10062514,11.0,0.186523,5.0,1190,0.203613,0.0,1.0,24,22758893,1.0,9051,0.0,7094,1350092.0,1448610.0,0.194722,24.799086,1347841.0,117.200415,924512.716722,0.194722,132641.919784,1362590.0,364.200378,1350523.0,586614.577554,9.628967,0.194733,0.194728,5.089473,0.311931,4624656.0,4732334.0,3012621.0,7456908.0,78.429916,47254.617938,7544565.0,15.548887,0.19469,2.77794,1625507.0,0.1947,19277.146484,1.830111,1351196.0,20465360.0,6.074394,214385.97745,0.180465,60931.952706
5255865,2211867,2369136,0.218872,0.0,2211863,4,60483,0.215942,233449,2211867,13.0,2211867,1635381,3,0.218872,0.319092,12066756 0.239624\n5255865 0.199585\n13...,3.0,0.0,9490820,8191097,7577062,12862549,7.0,227635,9490820,7.0,0.199707,1.0,2526,0.203613,5152.0,0.0,2211867,22758893,3.0,238589,0.0,558317,1350092.0,1448610.0,0.194722,24.799086,1347841.0,117.200415,924512.716722,0.194722,132641.919784,1362590.0,364.200378,1350523.0,586614.577554,9.628967,0.194733,0.194728,5.089473,0.311931,4624656.0,4732334.0,3012621.0,7456908.0,78.429916,47254.617938,7544565.0,15.548887,0.19469,2.77794,1625507.0,0.1947,19277.146484,1.830111,1351196.0,20465360.0,6.074394,214385.97745,0.180465,60931.952706
13191452,1111326,1111326,0.187256,5.0,1111326,29,2270,0.215942,8186,1111326,266.0,1111326,25173,4,0.223511,0.187256,12066756 0.239624\n5255865 0.199585\n13...,6.0,1.0,436781,8191097,96765,12862549,4.0,3539,1628146,1.0,0.188721,1.0,377455,0.318359,23.0,3.0,1111326,22758893,0.0,25173,0.0,3539,1350092.0,1448610.0,0.194722,24.799086,1347841.0,117.200415,924512.716722,0.194722,132641.919784,1362590.0,364.200378,1350523.0,586614.577554,9.628967,0.194733,0.194728,5.089473,0.311931,4624656.0,4732334.0,3012621.0,7456908.0,78.429916,47254.617938,7544565.0,15.548887,0.19469,2.77794,1625507.0,0.1947,19277.146484,1.830111,1351196.0,20465360.0,6.074394,214385.97745,0.180465,60931.952706
24944338,2211867,2369136,0.218872,0.0,2211863,656,10499,0.242798,7501,2211867,163.0,2211867,736398,5,0.218872,0.104492,12066756 0.239624\n5255865 0.199585\n13...,1.0,0.0,9490820,8191097,3330771,12862549,617.0,10421,9490820,1.0,0.25,1.0,116,0.203613,33685.0,0.0,2211867,2985020,1.0,7501,0.0,10445,1350092.0,1448610.0,0.194722,24.799086,1347841.0,117.200415,924512.716722,0.194722,132641.919784,1362590.0,364.200378,1350523.0,586614.577554,9.628967,0.194733,0.194728,5.089473,0.311931,4624656.0,4732334.0,3012621.0,7456908.0,78.429916,47254.617938,7544565.0,15.548887,0.19469,2.77794,1625507.0,0.1947,19277.146484,1.830111,1351196.0,20465360.0,6.074394,214385.97745,0.180465,60931.952706
9685226,166,155841,0.218872,0.0,1,1,49354,0.215942,1857,9912,183.0,781,8810,0,0.218872,0.09491,12066756 0.239624\n5255865 0.199585\n13...,0.0,0.0,9490820,1241173,7577062,12862549,0.0,10931,9490820,0.0,0.188721,0.0,1611900,0.203613,80686.0,0.0,2743,2985020,0.0,2409,0.0,10931,1350092.0,1448610.0,0.194722,24.799086,1347841.0,117.200415,924512.716722,0.194722,132641.919784,1362590.0,364.200378,1350523.0,586614.577554,9.628967,0.194733,0.194728,5.089473,0.311931,4624656.0,4732334.0,3012621.0,7456908.0,78.429916,47254.617938,7544565.0,15.548887,0.19469,2.77794,1625507.0,0.1947,19277.146484,1.830111,1351196.0,20465360.0,6.074394,214385.97745,0.180465,60931.952706


In [17]:

X_train['F17'] = float(X_train['F17'])
X_valid['F17'] = float(X_valid['F17'])
param = {'n_estimators:': 2000, 'max_depth': 37, 'num_leaves': 301, 'subsample': 0.7956508044572318, 'subsample_freq': 1, 'min_child_samples': 195}
model = lgb.LGBMClassifier(random_state=42, **param)

model.fit(X_train, y_train, eval_metric='auc' )
pred = model.predict_proba(X_valid)
score = roc_auc_score(y_valid, pred[:, 1])
score

TypeError: cannot convert the series to <class 'float'>

In [19]:
temp_t = pd.read_csv('/home/workspace/DACON/Click_predict/Model/LGBM_Experiment/Count_target_X_train.csv') 
temp_v = pd.read_csv('/home/workspace/DACON/Click_predict/Model/LGBM_Experiment/Count_target_X_valid.csv')

X_train['F17'] = temp_t['F17']
X_valid['F17'] = temp_v['F17']
del(temp_t)
del(temp_v)
X_train.head()

: 

In [None]:
param = {'n_estimators:': 2000, 'max_depth': 37, 'num_leaves': 301, 'subsample': 0.7956508044572318, 'subsample_freq': 1, 'min_child_samples': 195}
model = lgb.LGBMClassifier(random_state=42, **param)

model.fit(X_train, y_train, eval_metric='auc' )
pred = model.predict_proba(X_valid)
score = roc_auc_score(y_valid, pred[:, 1])
score