In [2]:
!pip install optuna

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting optuna
  Downloading optuna-3.1.0-py3-none-any.whl (365 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m365.3/365.3 KB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0
  Downloading alembic-1.9.2-py3-none-any.whl (210 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m210.6/210.6 KB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cmaes>=0.9.1
  Downloading cmaes-0.9.1-py3-none-any.whl (21 kB)
Collecting colorlog
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting Mako
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 KB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, cmaes, alembic, optuna
Successfully installed Mako-1.2.4 alembic-1.9.2 cmaes-0.9.1 colorlog-6.7.0 optuna-3.1.0

In [4]:
import numpy as np
import pandas as pd
import optuna
import warnings
warnings.filterwarnings('ignore')

In [5]:
train = pd.read_csv('/content/drive/MyDrive/kaggle/playground/train.csv')
test = pd.read_csv('/content/drive/MyDrive/kaggle/playground/test.csv')

In [8]:
# feature engineering을 위해 합쳐서 진행
df = pd.concat([train,test], axis = 0)

In [9]:
# Time 컬럼을 활용하기 위해 일, 시간으로 나눠서 분리
df['hour'] = df['Time'] % (24*3600) // 3600
df['day'] = (df['Time']//(24*3600))%7

In [11]:
def across_feat(df):
  features = [feat for feat in df.columns if 'V' in feat] # V를 가진 컬럼만 이용
  df['V_Sum'] = df[features].sum(axis = 1)
  df['V_Min'] = df[features].min(axis = 1)
  df['V_Max'] = df[features].max(axis = 1)
  df['V_Avg'] = df[features].mean(axis = 1)
  df['V_Std'] = df[features].std(axis = 1)
  df['V_Pos'] = df[features].gt(0).sum(axis = 1) # gt(0)을 사용하면 0보다 큰지 아닌지 알려준다. TF
  df['V_Neg'] = df[features].lt(0).sum(axis = 1) # lt(0)을 사용하면 0보다 작은 지 알려준다. TF
  df['V_Range'] = abs(df['V_Min'] - df['V_Max'])
  return df

df = across_feat(df)

In [15]:
# 스케일링 작업
# 이상치가 있으므로 이상치에 민감하지 않은 RobustScale 사용

from sklearn.preprocessing import RobustScaler
scale = RobustScaler()

# 타겟 값인 Class 를 y로 빼놓는다.
y = df['Class']
# 사용하지 않을 컬럼인 id를 제외하고 타겟값도 제외한 이후 스케일링 작업 진행
df = df.drop(['id','Class'], axis = 1)
df[df.columns] = scale.fit_transform(df[df.columns])

In [16]:
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,hour,day,V_Sum,V_Min,V_Max,V_Avg,V_Std,V_Pos,V_Neg,V_Range
0,-1.020939,0.792952,-0.114744,-0.714247,0.331701,-0.11179,-0.876467,0.125828,-0.668006,0.679359,...,-1.875,0.0,-0.643684,0.502563,-0.643692,-0.641861,-0.627483,-0.75,0.75,-0.709642
1,-1.020939,0.75849,-0.886988,-0.386497,-0.48085,-0.791447,-0.065376,-0.851448,-0.054594,-0.008917,...,-1.875,0.0,-0.378157,0.456037,-0.378163,-0.374531,-0.340986,-1.25,1.25,-0.430918
2,-1.020939,-0.11205,0.666091,-0.228311,-0.195398,0.516921,-0.684943,0.843749,-0.31857,0.045946,...,-1.875,0.0,-0.550435,0.755813,-0.550443,-0.527858,-0.545991,-0.75,0.75,-0.633198
3,-1.020939,0.749737,-0.152978,-0.675322,0.149772,-0.128308,-0.391757,-0.044359,-0.221921,0.795428,...,-1.875,0.0,-0.221868,0.166088,-0.221874,-0.224165,-0.191052,-1.25,1.25,-0.24598
4,-1.020939,0.314419,-0.143942,0.530475,0.848207,-0.436261,1.117602,-0.773747,0.960421,0.639392,...,-1.875,0.0,0.890122,0.812562,0.890122,0.906381,0.918202,0.5,-0.5,0.854372


In [17]:
# 이제 다시 train과 test 값 분리 시키기
train = df.iloc[:-len(test),:]
test = df.iloc[-len(test):, :].reset_index(drop = True)
train['Class'] = y[:-len(test)]

In [18]:
# 오버 샘플링 / 언더 샘플링 둘 다 진행 해보기
oversample = train[train['Class']==1]
undersample = train[train['Class']==0]

# 오버샘플링의 경우 소수의 것을 크게 키우는 것이므로 목표 타겟인 Class가 1인 것으로 뽑기
# 언더 샘플링의 경우 다수의 것을 작게 하는 것이므로 Class가 0인 것으로 뽑기

X = train.drop(['Class'], axis = 1)
y = train['Class']
X_test = test

In [19]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.1.1-cp38-none-manylinux1_x86_64.whl (76.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.6/76.6 MB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.1.1


In [26]:
import catboost
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score

# 찾고자 하는 하이퍼 파라미터들 값 넣어주기 (러닝 시간 상 몇개 제외외)
def catboost_objective(trial):
  learning_rate = trial.suggest_float('learning_rate', 0,0.3)
  depth = trial.suggest_int('depth',3,10)
#  n_estimators = trial.suggest_int('n_estimators', 50, 500)
#  subsample = trial.suggest_float('subsample',0,1)
#  l2_leaf_reg = trial.suggest_int('l2_leaf_reg',1,100)
#  min_data_in_leaf = trial.suggest_int('min_data_in_leaf',1,100)

# 훈련셋, 테스트 셋 나누기기
  x_train, x_test, y_train, y_test = train_test_split(X,y,test_size = 0.2)

# 하이퍼 파라미터 값 넣어서 fit 해주기기
  model = catboost.CatBoostRegressor(
      random_seed = 1,
      iterations = 3000,
      early_stopping_rounds = 1000,
      eval_metric = 'RMSE',
      verbose = 3000,
      learning_rate = learning_rate, depth = depth 
 #     n_estimators = n_estimators, subsample = subsample,
 #     l2_leaf_reg = l2_leaf_reg, min_data_in_leaf = min_data_in_leaf
  )
  model.fit(x_train, y_train)

# 출력 결과는 이 문제의 평가 점수인 roc_auc_score통해서 나오도록 함함
  return roc_auc_score(y_test, model.predict(x_test))

# optuna를 통해서 최대의 값이 나오도록 n_trials만큼 반복하여 최적 파라미터 값 구하기기
study = optuna.create_study(direction = 'maximize')
study.optimize(catboost_objective, n_trials = 30)

[32m[I 2023-02-06 12:05:41,420][0m A new study created in memory with name: no-name-4b12cbeb-2ca5-4205-b178-fd7baf6db884[0m


0:	learn: 0.0459094	total: 114ms	remaining: 5m 42s
2999:	learn: 0.0066704	total: 3m 24s	remaining: 0us


[32m[I 2023-02-06 12:09:07,308][0m Trial 0 finished with value: 0.6514064874837259 and parameters: {'learning_rate': 0.24387491617961055, 'depth': 8}. Best is trial 0 with value: 0.6514064874837259.[0m


0:	learn: 0.0457440	total: 25.3ms	remaining: 1m 15s


[32m[I 2023-02-06 12:10:36,225][0m Trial 1 finished with value: 0.6408024653330047 and parameters: {'learning_rate': 0.25741042061640523, 'depth': 4}. Best is trial 0 with value: 0.6514064874837259.[0m


2999:	learn: 0.0218898	total: 1m 27s	remaining: 0us
0:	learn: 0.0459516	total: 20.7ms	remaining: 1m 2s


[32m[I 2023-02-06 12:11:50,601][0m Trial 2 finished with value: 0.6251694269631685 and parameters: {'learning_rate': 0.2506347296475029, 'depth': 3}. Best is trial 0 with value: 0.6514064874837259.[0m


2999:	learn: 0.0312955	total: 1m 13s	remaining: 0us
0:	learn: 0.0455380	total: 26.5ms	remaining: 1m 19s
2999:	learn: 0.0136841	total: 1m 47s	remaining: 0us


[32m[I 2023-02-06 12:13:39,320][0m Trial 3 finished with value: 0.6421513515985912 and parameters: {'learning_rate': 0.2822593116283466, 'depth': 5}. Best is trial 0 with value: 0.6514064874837259.[0m


0:	learn: 0.0461913	total: 21.6ms	remaining: 1m 4s


[32m[I 2023-02-06 12:14:54,227][0m Trial 4 finished with value: 0.6555636119374435 and parameters: {'learning_rate': 0.08531365748247745, 'depth': 3}. Best is trial 4 with value: 0.6555636119374435.[0m


2999:	learn: 0.0366291	total: 1m 13s	remaining: 0us
0:	learn: 0.0457988	total: 237ms	remaining: 11m 51s
2999:	learn: 0.0229676	total: 10m 59s	remaining: 0us


[32m[I 2023-02-06 12:25:55,417][0m Trial 5 finished with value: 0.6923270253706827 and parameters: {'learning_rate': 0.0498987755677961, 'depth': 10}. Best is trial 5 with value: 0.6923270253706827.[0m


0:	learn: 0.0470817	total: 122ms	remaining: 6m 7s


[33m[W 2023-02-06 12:29:36,981][0m Trial 6 failed with parameters: {'learning_rate': 0.02009348996109057, 'depth': 9} because of the following error: KeyboardInterrupt('').[0m
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "<ipython-input-26-38ee908ea621>", line 24, in catboost_objective
    model.fit(x_train, y_train)
  File "/usr/local/lib/python3.8/dist-packages/catboost/core.py", line 5730, in fit
    return self._fit(X, y, cat_features, text_features, embedding_features, None, sample_weight, None, None, None, None, baseline,
  File "/usr/local/lib/python3.8/dist-packages/catboost/core.py", line 2355, in _fit
    self._train(
  File "/usr/local/lib/python3.8/dist-packages/catboost/core.py", line 1759, in _train
    self._object._train(train_pool, test_pool, params, allow_clear_pool, init_model._object if init_model else None)
  File "_catboost.pyx", lin

KeyboardInterrupt: ignored

In [None]:
# optuna를 통해 구한 최고의 하이퍼파라미터
study.best_params

In [None]:
# 이후 나오는 최적의 하이퍼 파라미터 값 넣기기
catboost_params = {'n_estimators':500,
                   'learning_rate':0.03,
                   'one_hot_max_size' : 12,
                   'depth' : 4,
                   'l2_leaf_reg' : 0.014,
                   'colsample_bylevel' : 0.06,
                   'min_data_in_leaf' : 12,
                   'bootstrap_type' : 'Bernoulli',
                   'verbose' : False}
repeats = 30
modelsCB = []
predsCB = []

# 언더 샘플링 방식을 통해서 모델 예측 값 구하기기
for i in range(repeats):
  sample = undersample.sample(n_sample_size)
  merged = pd.concat([oversample, sample])
  X = merged.drop('Class', axis = 1)
  y = merged['Class']
  model = catboost.CatBoostRegressor(**catboost_params)
  model.fit(X,y,early_stopping_rounds = 1000)
  modelsCB.append(model)
  predsCB.append(model.predict(test))

> 이렇게 catboost, xgboost, lgbm 반복




In [None]:
def coef_objective(trial):
    a = trial.suggest_float('a', 0, 1)
    b = trial.suggest_float('b', 0, 1)
    c = trial.suggest_float('c', 0, 1)
    d = trial.suggest_float('d', 0, 1)
    e = trial.suggest_float('e', 0, 1)
    f = trial.suggest_float('f', 0, 1)
    
    preds_eval = []
    for model in modelsCB:
        preds_eval.append(model.predict(X))
    
    resCB = np.average(np.array(preds_eval),axis=0)

    preds_eval = []
    for model in modelsCBC:
        preds_eval.append(model.predict_proba(X)[:, 1])
    
    resCBC = np.average(np.array(preds_eval),axis=0)

    preds_eval = []
    for model in modelsXB:
        preds_eval.append(model.predict(X))
    
    resXB = np.average(np.array(preds_eval),axis=0)
    
    preds_eval = []
    for model in modelsXBC:
        preds_eval.append(model.predict_proba(X)[:, 1])
    
    resXBC = np.average(np.array(preds_eval),axis=0)
    
    preds_eval = []
    for model in modelsLBC:
        preds_eval.append(model.predict_proba(X)[:, 1])
    
    resLBC = np.average(np.array(preds_eval),axis=0)
    
    preds_eval = []
    for model in modelsLB:
        preds_eval.append(model.predict(X))
    
    resLB = np.average(np.array(preds_eval),axis=0)



    res = roc_auc_score(y,
                        (resCB * a + resXB * b + resCBC * c + resXBC * d + resLBC * e + 
                         resLB * f )/(a + b + c + d + e + f))

    return res
# optuna를 통해 최적의 값 찾아내기기
study = optuna.create_study(direction= 'maximize')
study.optimize(coef_objective, n_trials= 30)

In [None]:
# 각 모델의 예측 결과들을 평균낸 다음 최적의 a~f 값들을 곱해줘서 결론 도출출
predCB = np.average(np.array(predsCB),axis=0).clip(0,1)
predXB = np.average(np.array(predsXB),axis=0).clip(0,1)
predCBC = np.average(np.array(predsCBC),axis=0).clip(0,1)
predXBC = np.average(np.array(predsXBC),axis=0).clip(0,1)
predLBC = np.average(np.array(predsLBC),axis=0).clip(0,1)
predLB = np.average(np.array(predsLB),axis=0).clip(0,1)
pred = predCB * a + predXB * b + predCBC * c + predXBC * d + predLBC * e + predLB * f
submission['Class'] = pred