In [1]:
import numpy as np
import polars as pl
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
# Add the directory two levels up to the import path; presumably that is
# where the project's `utils` module lives (confirm against the repo layout).
sys.path.append("../..")
# NOTE(review): star import. Helpers used later in this notebook
# (seed_everything, load_data, preprocessing, Kfold) are not imported by name
# anywhere in this cell, so they presumably come from here.
from utils import *
import warnings
# Silences every warning category for the whole session.
warnings.filterwarnings("ignore")
import os
import psutil

# Let pandas render up to 1000 columns / rows before truncating the display.
pd.options.display.max_columns=1000
pd.options.display.max_rows=1000

import lightgbm as lgb
# NOTE(review): matplotlib and seaborn are already imported above.
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score 
from xgboost import XGBClassifier
import optuna
from optuna import Trial
from optuna.samplers import TPESampler


In [2]:
def objective(trial, t_X, t_y, v_X, v_y):
  """Optuna objective: fit an XGBoost classifier and score it on a hold-out set.

  Args:
    trial: Optuna trial used to sample the hyperparameters.
    t_X: training features.
    t_y: training labels.
    v_X: validation features.
    v_y: validation labels.

  Returns:
    ROC AUC of the positive-class probabilities predicted for ``v_X``.
  """
  # Every trial parameter name matches the XGBoost keyword exactly, so that
  # `study.best_params` can be passed straight back into XGBClassifier(**...).
  param = {"n_estimators": trial.suggest_int("n_estimators", 100, 1000),
           "max_depth": trial.suggest_int("max_depth", 6, 30),
           "subsample": trial.suggest_float("subsample", 0.3, 1.0),
           "learning_rate":  trial.suggest_float("learning_rate", 0.01, 0.3),
           'lambda': trial.suggest_float('lambda', 1e-3, 0.1),
           'alpha': trial.suggest_float('alpha', 1e-3, 1.0),
           'min_child_weight': trial.suggest_int('min_child_weight', 2, 50)}

  # eval_metric is configured on the estimator: passing it to fit() is
  # deprecated in XGBoost 1.6 and rejected by XGBoost 2.x.
  model = XGBClassifier(random_state=42, tree_method='gpu_hist',
                        eval_metric='auc', **param)
  model.fit(t_X, t_y)

  # Column 1 of predict_proba holds the positive-class probability.
  pred = model.predict_proba(v_X)
  score = roc_auc_score(v_y, pred[:, 1])

  return score

In [2]:
''' Seed '''
# Project helper (presumably from `utils`); called with 42, the same value
# used as random_state for the split and the models below.
seed_everything(42)

''' Data Load '''
# load_data() takes no arguments and returns three objects, bound here as the
# training frame, the test frame and the submission template.
train, test, sample_submission = load_data()

In [3]:
# Separate the Click target from the features and carve out a stratified
# 10% hold-out split in a single call.
X_train, X_valid, y_train, y_valid = train_test_split(
    train.drop(columns=["Click"]),
    train["Click"],
    test_size=0.1,
    stratify=train["Click"],
    random_state=42,
)

''' preprocessing '''
# Project helper: takes the four split parts plus the test frame and returns
# all five, transformed. The meaning of the trailing True flag is not visible
# in this notebook - check utils.preprocessing.
X_train, X_valid, y_train, y_valid, test = preprocessing(
    X_train, X_valid, y_train, y_valid, test, True
)

Feature Selection
Start Frequency
Missing Value
---------------- Start MissingValue ----------------
Memory usage of dataframe is 7856.71 MB
Memory usage after optimization is: 3240.89 MB
Decreased by 58.8%
Memory usage of dataframe is 872.97 MB
Memory usage after optimization is: 360.10 MB
Decreased by 58.8%
Memory usage of dataframe is 1385.05 MB
Memory usage after optimization is: 562.68 MB
Decreased by 59.4%


In [4]:
# Display the preprocessed training features as the cell's rich output.
X_train

Unnamed: 0,F01,F02,F03,F04,F05,F06,F07,F08,F09,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28,F29,F30,F31,F32,F33,F34,F35,F36,F37,F38,F39
12066756,24,39,0.169434,46.0,24,0,1033,0.175781,5812,24,514.0,24,9051,20,0.167969,0.258057,0.239624,2.0,1.0,27707,8191097,7577062,4311788,6.0,7094,10062514,11.0,0.186523,5.0,1190,0.203613,0.0,1.0,24,22758893,1.0,9051,0.0,7094
5255865,2211867,2369136,0.218872,0.0,2211863,4,60483,0.215942,233449,2211867,13.0,2211867,1635381,3,0.218872,0.319092,0.199585,3.0,0.0,9490820,8191097,7577062,12862549,7.0,227635,9490820,7.0,0.199707,1.0,2526,0.203613,5152.0,0.0,2211867,22758893,3.0,238589,0.0,558317
13191452,1111326,1111326,0.187256,5.0,1111326,29,2270,0.215942,8186,1111326,266.0,1111326,25173,4,0.223511,0.187256,0.239624,6.0,1.0,436781,8191097,96765,12862549,4.0,3539,1628146,1.0,0.188721,1.0,377455,0.318359,23.0,3.0,1111326,22758893,0.0,25173,0.0,3539
24944338,2211867,2369136,0.218872,0.0,2211863,656,10499,0.242798,7501,2211867,163.0,2211867,736398,5,0.218872,0.104492,0.147705,1.0,0.0,9490820,8191097,3330771,12862549,617.0,10421,9490820,1.0,0.250000,1.0,116,0.203613,33685.0,0.0,2211867,2985020,1.0,7501,0.0,10445
9685226,166,155841,0.218872,0.0,1,1,49354,0.215942,1857,9912,183.0,781,8810,0,0.218872,0.094910,0.172729,0.0,0.0,9490820,1241173,7577062,12862549,0.0,10931,9490820,0.0,0.188721,0.0,1611900,0.203613,80686.0,0.0,2743,2985020,0.0,2409,0.0,10931
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20510049,969,46107,0.169434,22.0,969,1,5908,0.175781,429,8492,760.0,969,8051,8,0.167969,0.409668,0.249512,7.0,0.0,52858,2855034,3330771,12862549,15.0,17058,10062514,77.0,0.188721,16.0,3271,0.318359,2149.0,0.0,969,22758893,7.0,1050,0.0,17058
13650257,119569,610845,0.218872,0.0,119569,2,4846140,0.175781,48255,468714,0.0,119569,57873,5,0.218872,0.155884,0.087158,5.0,0.0,9490820,8191097,1977292,12862549,0.0,98854,9490820,0.0,0.188721,0.0,583325,0.203613,41.0,0.0,122242,22758893,5.0,57302,0.0,98856
9074937,5289530,5305278,0.169434,1.0,5289530,0,55834,0.175781,38570,5289530,623.0,5289530,1063558,3,0.167969,0.144165,0.239624,1.0,0.0,5289444,273957,7577062,17046,0.0,274224,10062514,22.0,0.188721,3.0,8586,0.151733,2867.0,0.0,5289530,22758893,1.0,38570,0.0,274420
25352486,57699,60135,0.169434,3.0,57699,2,5699,0.175781,4847,57699,25.0,57699,28481,0,0.223511,0.134766,0.147705,3.0,2.0,3676,8191097,165500,48788,0.0,6774,10062514,0.0,0.139526,0.0,107,0.203613,5.0,6.0,57699,22758893,0.0,28481,0.0,6781


In [8]:
# LightGBM baseline: deep trees, many leaves, 3000 boosting rounds.
param = dict(
    max_depth=30,
    num_leaves=500,
    subsample=0.9757365622458185,
    subsample_freq=8,
    n_estimators=3000,
    min_child_samples=136,
)

# fit() returns the estimator itself, so construction and training chain.
model = lgb.LGBMClassifier(random_state=42, **param).fit(
    X_train, y_train, eval_metric='auc'
)

# Score the hold-out split on the positive-class probability (column 1).
pred_val = model.predict_proba(X_valid)
score = roc_auc_score(y_valid, pred_val[:, 1])
print(score)

0.7817629322117648


In [None]:
# XGBoost counterpart of the LightGBM baseline.
#
# `num_leaves` and `min_child_samples` are LightGBM parameter names. XGBoost
# does not use them (it only warns about unused parameters), so they are left
# out here instead of silently doing nothing. To constrain leaves in XGBoost
# use `max_leaves` (with grow_policy='lossguide') and `min_child_weight`.
#
# The results are bound to *_xgb names so this cell does not overwrite the
# LightGBM `param` / `model` / `pred_val` / `score` that the following
# prediction and submission cells read.
param_xgb = {'max_depth': 30,
    'subsample': 0.9757365622458185,
    'n_estimators' : 3000}

# eval_metric is set on the estimator: passing it to fit() is deprecated in
# XGBoost 1.6 and rejected by XGBoost 2.x.
model_xgb = XGBClassifier(random_state=42, eval_metric='auc', **param_xgb)
model_xgb.fit(X_train, y_train)

# Score the hold-out split on the positive-class probability (column 1).
pred_val_xgb = model_xgb.predict_proba(X_valid)
score_xgb = roc_auc_score(y_valid, pred_val_xgb[:, 1])
print(score_xgb)

In [10]:
# The training features shown above have no ID column, so drop it from the
# test frame before predicting. errors="ignore" keeps the cell re-runnable:
# without it a second execution raises KeyError because "ID" is already gone.
test = test.drop(columns = ["ID"], errors="ignore")

In [11]:
# Class probabilities for the test frame from whichever estimator is currently
# bound to `model`; column 1 is used for the submission below.
pred = model.predict_proba(test)

In [13]:
''' Submission '''
# Load the submission template, fill Click with the positive-class
# probability and write it out without the index column.
sample_submission = pd.read_csv(
    '/home/workspace/DACON/Click_predict/data/sample_submission.csv'
).assign(Click=pred[:, 1])
sample_submission.to_csv('lgbm_count_label.csv', index=False)

In [None]:
# normalized X

In [2]:
''' Seed '''
seed_everything(42)

''' Data Load '''
train, test, sample_submission = load_data()

# Stratified 90/10 hold-out split on the Click target.
X_train, X_valid, y_train, y_valid = train_test_split(
    train.drop(columns=["Click"]),
    train["Click"],
    test_size=0.1,
    stratify=train["Click"],
    random_state=42,
)

''' preprocessing '''
X_train, X_valid, y_train, y_valid, test = preprocessing(
    X_train, X_valid, y_train, y_valid, test, True
)

# Smaller LightGBM configuration than the first baseline
# (depth 25, 306 leaves, 1000 rounds).
param = dict(
    max_depth=25,
    num_leaves=306,
    subsample=0.9757365622458185,
    subsample_freq=8,
    n_estimators=1000,
    min_child_samples=136,
)

# fit() returns the estimator itself, so construction and training chain.
model = lgb.LGBMClassifier(random_state=42, **param).fit(
    X_train, y_train, eval_metric='auc'
)

# Hold-out ROC AUC on the positive-class probability (column 1).
pred_val = model.predict_proba(X_valid)
score = roc_auc_score(y_valid, pred_val[:, 1])
print(score)

Feature Selection
Start Frequency
Missing Value
---------------- Start MissingValue ----------------
Memory usage of dataframe is 7856.71 MB
Memory usage after optimization is: 3240.89 MB
Decreased by 58.8%
Memory usage of dataframe is 872.97 MB
Memory usage after optimization is: 360.10 MB
Decreased by 58.8%
Memory usage of dataframe is 1385.05 MB
Memory usage after optimization is: 562.68 MB
Decreased by 59.4%
0.781945008016653


In [4]:
# Recompute the hold-out ROC AUC from the cached validation probabilities.
score = roc_auc_score(y_true=y_valid, y_score=pred_val[:, 1])
print(score)

0.781945008016653


In [5]:
# Drop the identifier column before predicting. errors="ignore" keeps the cell
# re-runnable: without it a second execution raises KeyError because "ID" has
# already been removed from `test`.
test = test.drop(columns = ["ID"], errors="ignore")
# Class probabilities for the test frame; column 1 is the positive class.
pred = model.predict_proba(test)

In [6]:
''' Submission '''
# Load the submission template, fill Click with the positive-class
# probability and write it out without the index column.
sample_submission = pd.read_csv(
    '/home/workspace/DACON/Click_predict/data/sample_submission.csv'
).assign(Click=pred[:, 1])
sample_submission.to_csv('lgbm_count_label_nomalized_x.csv', index=False)

In [None]:
# Fresh, unfitted LightGBM classifier built from the current `param` dict.
model_fold = lgb.LGBMClassifier(random_state=42, **param)
# Project helper (presumably from `utils`). Judging by the variable name and
# the arguments, it runs 3-fold training and returns averaged test-set
# predictions; the meaning of the trailing True flag is not visible here.
# TODO confirm against utils.Kfold.
soft_voting_value = Kfold(model_fold, 3, X_train, y_train, test, True)

In [None]:
''' Submission '''
# Load the submission template, store the K-fold predictions in Click and
# write the file without the index column.
sample_submission = pd.read_csv(
    filepath_or_buffer='/home/workspace/DACON/Click_predict/data/sample_submission.csv'
)
sample_submission['Click'] = soft_voting_value
sample_submission.to_csv(path_or_buf='lgbm_kfold.csv', index=False)

In [6]:
# Same LightGBM setup as the first baseline, but bagging is re-drawn every
# 3 iterations (subsample_freq=3) instead of every 8.
param = dict(
    max_depth=30,
    num_leaves=500,
    subsample=0.9757365622458185,
    subsample_freq=3,
    n_estimators=3000,
    min_child_samples=136,
)

# fit() returns the estimator itself, so construction and training chain.
model = lgb.LGBMClassifier(random_state=42, **param).fit(
    X_train, y_train, eval_metric='auc'
)

# Hold-out ROC AUC on the positive-class probability (column 1).
pred_val = model.predict_proba(X_valid)
score = roc_auc_score(y_valid, pred_val[:, 1])
print(score)

0.7834617635779335


In [8]:
# Drop the identifier column before predicting. errors="ignore" keeps the cell
# re-runnable: without it a second execution raises KeyError because "ID" has
# already been removed from `test`.
test = test.drop(columns = ["ID"], errors="ignore")
# Class probabilities for the test frame; column 1 is the positive class.
pred = model.predict_proba(test)

''' Submission '''
# Fill the submission template with the positive-class probability and save.
sample_submission = pd.read_csv('/home/workspace/DACON/Click_predict/data/sample_submission.csv')
sample_submission['Click'] = pred[:,1]
sample_submission.to_csv('lgbm_large_estimator.csv', index=False)

### stacking

In [10]:
# `param` was written for LightGBM. These keys are LightGBM-only and XGBoost
# does not use them, so they are filtered out rather than passed through.
xgb_param = {key: value for key, value in param.items()
             if key not in ('num_leaves', 'subsample_freq', 'min_child_samples')}

# eval_metric is set on the estimator: passing it to fit() is deprecated in
# XGBoost 1.6 and rejected by XGBoost 2.x.
model_xgb = XGBClassifier(random_state=42, tree_method= 'gpu_hist',
                          eval_metric='auc', **xgb_param)

model_xgb.fit(X_train, y_train)

pred_xgb = model_xgb.predict_proba(X_valid)

# First-level features for the stacker. Both columns must be predictions on
# X_valid so they have the same length and line up with y_valid. `pred` holds
# test-set predictions, so the LightGBM column uses `pred_val` instead.
first_level = pd.DataFrame(pred_val[:,1], columns=['lgbm'])
first_level['xgb'] = pred_xgb[:,1]

first_level.head(20)

KeyboardInterrupt: 

In [11]:
# Test-set class probabilities from the XGBoost model.
pred = model_xgb.predict_proba(test)

''' Submission '''
# Load the submission template, fill Click with the positive-class
# probability and write it out without the index column.
sample_submission = pd.read_csv(
    '/home/workspace/DACON/Click_predict/data/sample_submission.csv'
).assign(Click=pred[:, 1])
sample_submission.to_csv('xgb_large_estimator.csv', index=False)

NotFittedError: need to call fit or load_model beforehand

In [None]:
from sklearn.linear_model import LinearRegression

# Second-level (meta) model: a linear blend of the base-model probabilities.
meta_model = LinearRegression(n_jobs=-1)

# first_level is built with only the 'lgbm' and 'xgb' columns, so an
# unconditional drop of 'label' raises KeyError. Drop it only if present, and
# without mutating first_level, so the cell can be re-run.
meta_features = first_level.drop(columns='label', errors='ignore')
meta_model.fit(meta_features, y_valid)
ensemble_pred = meta_model.predict(meta_features)

# The meta-model was fit on base-model outputs, so the test input must be the
# same two columns (in the same order), not the raw feature frame.
first_level_test = pd.DataFrame({
    'lgbm': model.predict_proba(test)[:, 1],
    'xgb': model_xgb.predict_proba(test)[:, 1],
})[list(meta_features.columns)]
sub_pred = meta_model.predict(first_level_test)
sub_pred

In [None]:
''' Submission '''
# Load the submission template, store the stacked predictions in Click and
# write the file without the index column.
sample_submission = pd.read_csv(
    filepath_or_buffer='/home/workspace/DACON/Click_predict/data/sample_submission.csv'
)
sample_submission['Click'] = sub_pred
sample_submission.to_csv(path_or_buf='stacking.csv', index=False)