In [1]:
import numpy as np
import polars as pl
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
sys.path.append("../..")
from utils import *
import warnings
warnings.filterwarnings("ignore")
import os
import psutil

# Render up to 1000 columns and 1000 rows before pandas truncates a displayed frame.
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 1000)

import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score 
from xgboost import XGBClassifier
import optuna
from optuna import Trial
from optuna.samplers import TPESampler


In [2]:
def objective(trial, t_X, t_y, v_X, v_y):
  """Optuna objective: fit an XGBoost classifier and return its validation ROC AUC.

  Args:
    trial: Optuna trial used to sample the hyperparameters listed below.
    t_X: Training features passed to ``model.fit``.
    t_y: Training labels passed to ``model.fit``.
    v_X: Validation features scored with ``predict_proba``.
    v_y: Validation labels compared against the predicted probabilities.

  Returns:
    ROC AUC computed from column 1 of ``predict_proba(v_X)`` against ``v_y``.
  """
  # Each trial parameter name matches the XGBClassifier keyword exactly. The
  # original registered "n_estimators:" (stray colon), which made
  # study.best_params unusable as XGBClassifier(**best_params).
  param = {"n_estimators": trial.suggest_int("n_estimators", 100, 1000),
           "max_depth": trial.suggest_int("max_depth", 6, 30),
           "subsample": trial.suggest_float("subsample", 0.3, 1.0),
           "learning_rate":  trial.suggest_float("learning_rate", 0.01, 0.3),
           'lambda': trial.suggest_float('lambda', 1e-3, 0.1),
           'alpha': trial.suggest_float('alpha', 1e-3, 1.0),
           'min_child_weight': trial.suggest_int('min_child_weight', 2, 50)}

  # eval_metric is configured on the estimator: passing it to fit() is
  # deprecated and rejected by recent XGBoost releases.
  # NOTE(review): tree_method='gpu_hist' requires a GPU build and is itself
  # deprecated in newer XGBoost in favour of tree_method='hist' + device='cuda';
  # left unchanged to stay compatible with the installed version.
  model = XGBClassifier(random_state=42, tree_method='gpu_hist', eval_metric='auc', **param)
  model.fit(t_X, t_y)

  # Column 1 holds the positive-class probability.
  pred = model.predict_proba(v_X)
  score = roc_auc_score(v_y, pred[:, 1])

  return score

In [3]:
''' Seed '''
# seed_everything is not defined in this notebook (presumably star-imported from
# utils); it is called with 42, the same value used for random_state below.
seed_everything(42)

''' Data Load '''
# load_data is likewise a project helper; it yields three objects that are bound
# to train, test and sample_submission for use by the following cells.
train, test, sample_submission = load_data()

In [4]:
# Separate the features from the "Click" target column.
X_train = train.drop(columns = ["Click"])
y_train = train["Click"]
# Hold out 10% for validation, stratified on the target; X_train / y_train are
# rebound to the training portion of the split.
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.1, stratify=y_train, random_state=42)

''' preprocessing '''
# preprocessing is a project helper; it returns replacements for all five inputs.
# NOTE(review): `test` is overwritten here, so re-running this cell without
# reloading the data feeds already-processed test data back in.
# The meaning of the trailing True flag is not visible here — TODO confirm.
X_train, X_valid, y_train, y_valid, test = preprocessing(X_train, X_valid, y_train, y_valid, test, True) 

Feature Selection
Start Frequency
Missing Value
---------------- Start MissingValue ----------------
Memory usage of dataframe is 7856.71 MB
Memory usage after optimization is: 2307.91 MB
Decreased by 70.6%
Memory usage of dataframe is 872.97 MB
Memory usage after optimization is: 256.43 MB
Decreased by 70.6%
Memory usage of dataframe is 1385.05 MB
Memory usage after optimization is: 398.20 MB
Decreased by 71.2%


In [5]:
# Display the preprocessed training features (the rendered output is truncated to the first and last rows).
X_train

Unnamed: 0,F01,F02,F03,F04,F05,F06,F07,F08,F09,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28,F29,F30,F31,F32,F33,F34,F35,F36,F37,F38,F39
12066756,9.536743e-07,0.000001,0.169434,46.0,9.536743e-07,0,0.000040,0.175781,0.000226,9.536743e-07,514.0,9.536743e-07,0.000352,20,0.167969,0.258057,0.239624,2.0,1.0,0.001076,0.318115,0.294434,0.167480,6.0,0.000276,0.390869,11.0,0.186523,5.0,0.000046,0.203613,0.0,1.0,9.536743e-07,0.883789,1.0,0.000352,0.0,0.000276
5255865,8.593750e-02,0.092041,0.218872,0.0,8.593750e-02,4,0.002350,0.215942,0.009071,8.593750e-02,13.0,8.593750e-02,0.063538,3,0.218872,0.319092,0.199585,3.0,0.0,0.368652,0.318115,0.294434,0.499512,7.0,0.008842,0.368652,7.0,0.199707,1.0,0.000098,0.203613,5152.0,0.0,8.593750e-02,0.883789,3.0,0.009270,0.0,0.021683
13191452,4.315186e-02,0.043152,0.187256,5.0,4.315186e-02,29,0.000088,0.215942,0.000318,4.315186e-02,266.0,4.315186e-02,0.000978,4,0.223511,0.187256,0.239624,6.0,1.0,0.016968,0.318115,0.003759,0.499512,4.0,0.000137,0.063232,1.0,0.188721,1.0,0.014664,0.318359,23.0,3.0,4.315186e-02,0.883789,0.0,0.000978,0.0,0.000137
24944338,8.593750e-02,0.092041,0.218872,0.0,8.593750e-02,656,0.000408,0.242798,0.000291,8.593750e-02,163.0,8.593750e-02,0.028610,5,0.218872,0.104492,0.147705,1.0,0.0,0.368652,0.318115,0.129395,0.499512,617.0,0.000405,0.368652,1.0,0.250000,1.0,0.000005,0.203613,33685.0,0.0,8.593750e-02,0.115967,1.0,0.000291,0.0,0.000406
9685226,6.437302e-06,0.006054,0.218872,0.0,5.960464e-08,1,0.001917,0.215942,0.000072,3.850460e-04,183.0,3.033876e-05,0.000342,0,0.218872,0.094910,0.172729,0.0,0.0,0.368652,0.048218,0.294434,0.499512,0.0,0.000425,0.368652,0.0,0.188721,0.0,0.062622,0.203613,80686.0,0.0,1.065731e-04,0.115967,0.0,0.000094,0.0,0.000425
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20510049,3.761053e-05,0.001791,0.169434,22.0,3.761053e-05,1,0.000229,0.175781,0.000017,3.299713e-04,760.0,3.761053e-05,0.000313,8,0.167969,0.409668,0.249512,7.0,0.0,0.002052,0.110901,0.129395,0.499512,15.0,0.000663,0.390869,77.0,0.188721,16.0,0.000127,0.318359,2149.0,0.0,3.761053e-05,0.883789,7.0,0.000041,0.0,0.000663
13650257,4.642487e-03,0.023727,0.218872,0.0,4.642487e-03,2,0.188232,0.175781,0.001874,1.820374e-02,0.0,4.642487e-03,0.002249,5,0.218872,0.155884,0.087158,5.0,0.0,0.368652,0.318115,0.076782,0.499512,0.0,0.003839,0.368652,0.0,0.188721,0.0,0.022659,0.203613,41.0,0.0,4.749298e-03,0.883789,5.0,0.002226,0.0,0.003839
9074937,2.054443e-01,0.206055,0.169434,1.0,2.054443e-01,0,0.002169,0.175781,0.001498,2.054443e-01,623.0,2.054443e-01,0.041321,3,0.167969,0.144165,0.239624,1.0,0.0,0.205444,0.010643,0.294434,0.000662,0.0,0.010651,0.390869,22.0,0.188721,3.0,0.000334,0.151733,2867.0,0.0,2.054443e-01,0.883789,1.0,0.001498,0.0,0.010658
25352486,2.241135e-03,0.002337,0.169434,3.0,2.241135e-03,2,0.000221,0.175781,0.000188,2.241135e-03,25.0,2.241135e-03,0.001106,0,0.223511,0.134766,0.147705,3.0,2.0,0.000143,0.318115,0.006428,0.001895,0.0,0.000263,0.390869,0.0,0.139526,0.0,0.000004,0.203613,5.0,6.0,2.241135e-03,0.883789,0.0,0.001106,0.0,0.000263


In [8]:
# Hyperparameters for the LightGBM classifier; `param` is reused by later cells.
param = dict(
    max_depth=25,
    num_leaves=306,
    subsample=0.9757365622458185,
    subsample_freq=8,
    n_estimators=1000,
    min_child_samples=136,
)

# Fit on the training split.
model = lgb.LGBMClassifier(random_state=42, **param)
model.fit(X_train, y_train, eval_metric='auc')

# Score the hold-out split with ROC AUC on the positive-class probability (column 1).
pred_val = model.predict_proba(X_valid)
score = roc_auc_score(y_valid, pred_val[:, 1])
print(score)

0.7817629322117648


In [10]:
# Remove the identifier column before scoring. The membership check keeps this
# cell re-runnable: a second execution no longer fails because "ID" is already gone.
if "ID" in test.columns:
    test = test.drop(columns = ["ID"])

In [11]:
# Class probabilities for the test frame; column 1 is written to the submission below.
pred = model.predict_proba(test)

In [13]:
# Submission: write the positive-class probability (column 1 of `pred`) into the
# "Click" column of the submission template.
# The template location can be overridden with the SAMPLE_SUBMISSION_PATH
# environment variable so the notebook is not tied to one machine's directory
# layout; the original absolute path remains the default.
submission_template_path = os.environ.get(
    "SAMPLE_SUBMISSION_PATH",
    '/home/workspace/DACON/Click_predict/data/sample_submission.csv',
)
sample_submission = pd.read_csv(submission_template_path)
sample_submission['Click'] = pred[:, 1]
sample_submission.to_csv('lgbm_count_label.csv', index=False)

In [None]:
# normalized X

In [2]:
''' Seed '''
# seed_everything is not defined in this notebook (presumably star-imported from utils).
seed_everything(42)

''' Data Load '''
# load_data is a project helper; its three results are bound to train, test and sample_submission.
train, test, sample_submission = load_data()

# Separate the features from the "Click" target column.
X_train = train.drop(columns = ["Click"])
y_train = train["Click"]
# Hold out 10% for validation, stratified on the target; X_train / y_train are
# rebound to the training portion of the split.
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.1, stratify=y_train, random_state=42)

''' preprocessing '''
# preprocessing is a project helper; it returns replacements for all five inputs.
# The meaning of the trailing True flag is not visible here — TODO confirm.
X_train, X_valid, y_train, y_valid, test = preprocessing(X_train, X_valid, y_train, y_valid, test, True) 

# Hyperparameters for the LightGBM classifier; `param` is reused by the K-fold cell.
param = dict(
    max_depth=25,
    num_leaves=306,
    subsample=0.9757365622458185,
    subsample_freq=8,
    n_estimators=1000,
    min_child_samples=136,
)

# Fit on the training split.
model = lgb.LGBMClassifier(random_state=42, **param)
model.fit(X_train, y_train, eval_metric='auc')

# Score the hold-out split with ROC AUC on the positive-class probability (column 1).
pred_val = model.predict_proba(X_valid)
score = roc_auc_score(y_valid, pred_val[:, 1])
print(score)

Feature Selection
Start Frequency
Missing Value
---------------- Start MissingValue ----------------
Memory usage of dataframe is 7856.71 MB
Memory usage after optimization is: 3240.89 MB
Decreased by 58.8%
Memory usage of dataframe is 872.97 MB
Memory usage after optimization is: 360.10 MB
Decreased by 58.8%
Memory usage of dataframe is 1385.05 MB
Memory usage after optimization is: 562.68 MB
Decreased by 59.4%
0.781945008016653


In [4]:
# Recompute the validation ROC AUC from the predictions already held in pred_val.
score = roc_auc_score(y_valid, pred_val[:, 1])
print(score)

0.781945008016653


In [5]:
# Remove the identifier column before scoring. The membership check keeps this
# cell re-runnable: a second execution no longer fails because "ID" is already gone.
if "ID" in test.columns:
    test = test.drop(columns = ["ID"])
# Class probabilities for the test frame; column 1 is written to the submission.
pred = model.predict_proba(test)

In [6]:
# Submission: write the positive-class probability (column 1 of `pred`) into the
# "Click" column of the submission template.
# The template location can be overridden with the SAMPLE_SUBMISSION_PATH
# environment variable so the notebook is not tied to one machine's directory
# layout; the original absolute path remains the default.
submission_template_path = os.environ.get(
    "SAMPLE_SUBMISSION_PATH",
    '/home/workspace/DACON/Click_predict/data/sample_submission.csv',
)
sample_submission = pd.read_csv(submission_template_path)
sample_submission['Click'] = pred[:, 1]
sample_submission.to_csv('lgbm_count_label_nomalized_x.csv', index=False)

In [None]:
# Fresh, unfitted LightGBM classifier built from the existing `param` dict.
model_fold = lgb.LGBMClassifier(random_state=42, **param)
# Kfold is not defined in this notebook (presumably star-imported from utils).
# Judging by the names, 3 is the number of folds and the result is a soft-voting
# prediction for `test` — TODO confirm, including the meaning of the trailing True.
soft_voting_value = Kfold(model_fold, 3, X_train, y_train, test, True)

In [None]:
''' Submission '''
# Write the K-fold result into the "Click" column of the submission template.
# The template location can be overridden with the SAMPLE_SUBMISSION_PATH
# environment variable so the notebook is not tied to one machine's directory
# layout; the original absolute path remains the default.
submission_template_path = os.environ.get(
    "SAMPLE_SUBMISSION_PATH",
    '/home/workspace/DACON/Click_predict/data/sample_submission.csv',
)
sample_submission = pd.read_csv(submission_template_path)
sample_submission['Click'] = soft_voting_value
sample_submission.to_csv('lgbm_kfold.csv', index=False)