In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

In [2]:
train_data = pd.read_csv("../data/train_dummy_encoded.csv")
test_data = pd.read_csv("../data/test_dummy_encoded.csv")
sample_submission = pd.read_csv("../data/sample_submission.csv")

In [3]:
train_data

Unnamed: 0,Age,Region_Code,Vehicle_Age,Annual_Premium,Policy_Sales_Channel,Vintage,Response,Gender_Female,Gender_Male,Driving_License_0,Driving_License_1,Previously_Insured_0,Previously_Insured_1,Vehicle_Damage_No,Vehicle_Damage_Yes
0,21,35,0,65101.0,119,187,0,0,1,0,1,1,0,0,1
1,43,28,2,58911.0,25,288,1,0,1,0,1,1,0,0,1
2,25,14,1,38043.0,142,254,0,1,0,0,1,0,1,1,0
3,35,1,0,2630.0,146,76,0,1,0,0,1,1,0,0,1
4,36,15,0,31951.0,142,294,0,1,0,0,1,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11504793,48,6,0,27412.0,25,218,0,0,1,0,1,1,0,0,1
11504794,26,36,1,29509.0,142,115,1,1,0,0,1,1,0,0,1
11504795,29,32,1,2630.0,142,189,0,1,0,0,1,0,1,1,0
11504796,51,28,0,48443.0,25,274,1,1,0,0,1,1,0,0,1


In [4]:
from sklearn.model_selection import train_test_split

sample_data = train_data

data = sample_data[:3000000]

X = data.drop('Response', axis=1)
y = data['Response']

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### ROC_AUC_SCORE는 predict_proba() 함수를 사용하여 얻을 수 있습니다.

In [5]:
# 하이퍼파라미터 최적화
from skopt import BayesSearchCV
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

lgbm = LGBMClassifier()

params = {
    'num_leaves': (2, 100),
    'max_depth': (2, 100),
    'learning_rate': (0.001, 0.1),
    'min_child_samples': (2, 100),
    'subsample': (0.1, 1.0),
    'colsample_bytree': (0.1, 1.0),
    'reg_alpha': (0.0, 1.0),
    'reg_lambda': (0.0, 1.0),
}

opt = BayesSearchCV(lgbm, params, n_iter=32, n_jobs=-1, cv=skf, random_state=42)

opt.fit(x_train, y_train)

opt.best_params_

opt.best_score_

[LightGBM] [Info] Number of positive: 235928, number of negative: 1684072
[LightGBM] [Info] Number of positive: 235927, number of negative: 1684073
[LightGBM] [Info] Number of positive: 235927, number of negative: 1684073
[LightGBM] [Info] Number of positive: 235927, number of negative: 1684073
[LightGBM] [Info] Number of positive: 235927, number of negative: 1684073
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.074663 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 746
[LightGBM] [Info] Number of data points in the train set: 1920000, number of used features: 14
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.078391 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 745
[LightGBM] [Info]

0.8802299999999998

- 300만개로 돌렸을 때 15분 정도 소요

In [10]:
roc_auc_score(y_test, opt.predict_proba(x_test)[:,1])

0.8777260582233574

In [11]:
opt.best_estimator_

In [12]:
opt.predict_proba(test_data)[:,1]

sample_submission['Response'] = opt.predict_proba(test_data)[:,1]

sample_submission

Unnamed: 0,id,Response
0,11504798,0.014807
1,11504799,0.517191
2,11504800,0.244998
3,11504801,0.000151
4,11504802,0.031960
...,...,...
7669861,19174659,0.224008
7669862,19174660,0.000193
7669863,19174661,0.000473
7669864,19174662,0.620771


In [9]:
sample_submission.to_csv("../data/submission_lgbm_0725_opt_data3000000train.csv", index=False)