## 데이터 불러오기

In [1]:
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import random
import os

import optuna
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.model_selection import cross_val_score
from optuna.samplers import TPESampler

from optuna import Trial

from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import ExtraTreesRegressor
%matplotlib inline

pd.set_option('mode.chained_assignment',  None) # <==== 경고를 끈다

filename = 'data/train.csv'
data_train = pd.read_csv(filename)

filename = 'data/test.csv'
data_test = pd.read_csv(filename)

filename = 'data/sample_submission.csv'
submission = pd.read_csv(filename)

# 데이터 분석

### 데이터 target 분포 확인

In [2]:
from collections import Counter

def print_mode(df, col):

  cnt = Counter(df[col])
  list_cnt = cnt.most_common(3)

  for idx, value in enumerate(list_cnt):

    print(f'{col}의 최빈값 {idx+1}순위 : {value[0]} & {value[-1]}개')

In [3]:
def print_statistics(df, col):

  max = df['착과량(int)'].max()
  min = df['착과량(int)'].min()
  mean = df['착과량(int)'].mean()
  median = df['착과량(int)'].median()

  print(f'{col}의 최대값 : {max}')
  print(f'{col}의 최소값 : {min}')
  print(f'{col}의 평균값 : {mean}')
  print(f'{col}의 중앙값 : {median}')
  print_mode(df, col)

In [4]:
def identify_hist(df, col):

  sns.histplot(data=df[col], kde=True)
  print_statistics(df, col)

# 데이터 전처리

In [5]:
#학습, 정답데이터 분리
y_train = data_train['착과량(int)']
X_drop_list = ['ID']
X_train = data_train.drop(X_drop_list, axis = 1)
X_test = data_test.drop(["ID"], axis = 1)

In [6]:
#feature selection
high_corr = data_train.corr().abs().sort_values(by='착과량(int)',ascending=False).iloc[:,:1]
features_name = high_corr[high_corr['착과량(int)']>0.9].index
features_name = list(features_name)
features_name.remove('착과량(int)')
X,y = X_train.drop(['착과량(int)'], axis=1) , X_train['착과량(int)']

X = X[features_name]
X_test = X_test[features_name]

### 파생변수 생성

# 모델 생성

In [7]:
#seed 고정
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정

### nmae score metric

In [8]:
#base version
def NMAE(true, pred):
    mae = np.mean(np.abs(true - pred))
    score = mae / np.mean(np.abs(true))
    return score

#cross_val custom version
def NMAE_CV(clf, x, y):
    pred = clf.predict(x)
    mae = np.mean(np.abs(y - pred))
    score = mae / np.mean(np.abs(y))
    return score

### TabnetRegressor

In [9]:
from pytorch_tabnet.tab_model import TabNetRegressor
import torch

In [10]:
X_numpy = X.to_numpy()
y_numpy = y.to_numpy().reshape(-1, 1)
X_test_numpy = X_test.to_numpy()

In [11]:
skf = StratifiedKFold(n_splits = 5, random_state = 42, shuffle = True)
skf = KFold(n_splits=5, shuffle=True, random_state=42)

In [12]:
tab_pred = []
i = 0
tab_nmae = []

for tr_idx, val_idx in skf.split(X_numpy, y_numpy):
    
    tr_x, tr_y = X_numpy[tr_idx], y_numpy[tr_idx]
    val_x, val_y = X_numpy[val_idx], y_numpy[val_idx]

    tab = TabNetRegressor(verbose = 100,seed = 42,optimizer_fn=torch.optim.AdamW)
    tab.fit(tr_x, tr_y, eval_set = [(tr_x, tr_y), (val_x, val_y)], patience=100, max_epochs=2000, eval_metric = ['mae'])

    val_pred = tab.predict(val_x).astype(int)
    fold_nmae = NMAE(val_y, val_pred)
    tab_nmae.append(fold_nmae)
    print(f"{i + 1} Fold NMAE = {fold_nmae}")

    i += 1

    fold_pred = tab.predict(X_test_numpy)
    tab_pred.append(fold_pred)

print(f"\nAVG of NMAE = {np.mean(tab_nmae)}")



epoch 0  | loss: 203345.26562| val_0_mae: 402.7505| val_1_mae: 422.28811|  0:00:00s
epoch 100| loss: 39135.34375| val_0_mae: 354.7546| val_1_mae: 369.82226|  0:00:23s
epoch 200| loss: 2019.44006| val_0_mae: 121.28349| val_1_mae: 124.46096|  0:00:48s
epoch 300| loss: 2294.80908| val_0_mae: 62.30393| val_1_mae: 64.05749|  0:01:11s
epoch 400| loss: 1717.67786| val_0_mae: 42.46077| val_1_mae: 45.82905|  0:01:39s
epoch 500| loss: 1609.23145| val_0_mae: 32.93927| val_1_mae: 36.49594|  0:02:01s
epoch 600| loss: 1712.651| val_0_mae: 30.76131| val_1_mae: 34.77767|  0:02:24s
epoch 700| loss: 1515.26062| val_0_mae: 32.61277| val_1_mae: 37.2843 |  0:02:47s

Early stopping occurred at epoch 759 with best_epoch = 659 and best_val_1_mae = 33.82723




1 Fold NMAE = 0.0801673056814221




epoch 0  | loss: 211312.4375| val_0_mae: 402.53396| val_1_mae: 392.62421|  0:00:00s
epoch 100| loss: 36723.79297| val_0_mae: 347.7819| val_1_mae: 342.34951|  0:00:21s
epoch 200| loss: 2061.04175| val_0_mae: 134.3287| val_1_mae: 131.72351|  0:00:45s
epoch 300| loss: 1722.75049| val_0_mae: 62.4487 | val_1_mae: 62.02265|  0:01:07s
epoch 400| loss: 1681.07629| val_0_mae: 34.57418| val_1_mae: 35.25857|  0:01:28s
epoch 500| loss: 1478.57336| val_0_mae: 32.43366| val_1_mae: 34.6181 |  0:01:50s

Early stopping occurred at epoch 590 with best_epoch = 490 and best_val_1_mae = 32.80224
2 Fold NMAE = 0.08243247081546284




epoch 0  | loss: 215585.65625| val_0_mae: 407.09006| val_1_mae: 406.43246|  0:00:00s
epoch 100| loss: 53010.82031| val_0_mae: 350.42987| val_1_mae: 348.85384|  0:00:22s
epoch 200| loss: 2175.54956| val_0_mae: 133.11708| val_1_mae: 132.2091|  0:00:45s
epoch 300| loss: 1871.99731| val_0_mae: 54.58124| val_1_mae: 54.33198|  0:01:10s
epoch 400| loss: 2183.92358| val_0_mae: 40.19396| val_1_mae: 42.38471|  0:01:30s
epoch 500| loss: 1629.64856| val_0_mae: 35.70734| val_1_mae: 36.06475|  0:01:51s
epoch 600| loss: 1973.84399| val_0_mae: 30.39876| val_1_mae: 32.9873 |  0:02:11s

Early stopping occurred at epoch 659 with best_epoch = 559 and best_val_1_mae = 31.38987
3 Fold NMAE = 0.07754606987753823




epoch 0  | loss: 214145.21875| val_0_mae: 403.4314| val_1_mae: 392.13838|  0:00:00s
epoch 100| loss: 43178.80469| val_0_mae: 361.10901| val_1_mae: 355.18266|  0:00:20s
epoch 200| loss: 2109.64062| val_0_mae: 126.98974| val_1_mae: 126.92378|  0:00:40s
epoch 300| loss: 2040.71375| val_0_mae: 57.7657 | val_1_mae: 57.37012|  0:01:01s
epoch 400| loss: 2035.45044| val_0_mae: 38.05   | val_1_mae: 37.93187|  0:01:24s
epoch 500| loss: 1849.74268| val_0_mae: 33.43479| val_1_mae: 34.17388|  0:01:46s
epoch 600| loss: 1827.00146| val_0_mae: 34.3097 | val_1_mae: 35.55346|  0:02:07s
epoch 700| loss: 1767.39392| val_0_mae: 36.47344| val_1_mae: 36.82667|  0:02:28s
epoch 800| loss: 1718.22559| val_0_mae: 39.00296| val_1_mae: 39.54649|  0:02:48s

Early stopping occurred at epoch 829 with best_epoch = 729 and best_val_1_mae = 32.97515
4 Fold NMAE = 0.08299847028470969




epoch 0  | loss: 210957.1875| val_0_mae: 400.54213| val_1_mae: 402.64625|  0:00:00s
epoch 100| loss: 36372.99219| val_0_mae: 347.07001| val_1_mae: 347.60476|  0:00:20s
epoch 200| loss: 2015.81348| val_0_mae: 153.96324| val_1_mae: 150.5215|  0:00:42s
epoch 300| loss: 1804.32593| val_0_mae: 60.16805| val_1_mae: 60.83061|  0:01:02s
epoch 400| loss: 1538.42896| val_0_mae: 38.1356 | val_1_mae: 38.34777|  0:01:22s
epoch 500| loss: 1620.7533| val_0_mae: 34.36039| val_1_mae: 35.87216|  0:01:42s
epoch 600| loss: 1507.26331| val_0_mae: 34.45522| val_1_mae: 37.59365|  0:02:01s

Early stopping occurred at epoch 656 with best_epoch = 556 and best_val_1_mae = 33.16234
5 Fold NMAE = 0.08147122909996443





AVG of NMAE = 0.08092310915181947


In [13]:
tab_pred = np.mean(tab_pred,axis = 0)
tab_pred = pd.Series(tab_pred.flatten())
tab_pred = tab_pred.to_numpy()

### XGBRegressor

In [14]:
sampler = TPESampler()

In [15]:
def objective(trial):
    kf = KFold(n_splits = 5, random_state = 42, shuffle = True)
    #train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.3,random_state=42)
    param = {
        'lambda': trial.suggest_float('lambda', 1e-3, 0.1),
        'alpha': trial.suggest_float('alpha', 1e-3, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1),
        'subsample': trial.suggest_float('subsample', 0.4, 1),
        'learning_rate': trial.suggest_float('learning_rate',0.01, 0.05),
        'n_estimators': trial.suggest_int('n_estimators', 1000, 3000),
        'max_depth': trial.suggest_int('max_depth', 4,24),
        'min_child_weight': trial.suggest_int('min_child_weight', 2, 50),
    }
    model = XGBRegressor(**param)  
    """
    base optuna searching method
    model.fit(train_x,train_y,eval_set=[(test_x,test_y)],early_stopping_rounds=100, eval_metric='mae')
    preds = model.predict(test_x).astype(int)
    nmae = NMAE(test_y, preds)
    return nmae
    """
    #cross_val optuna searching method
    score = cross_val_score(model, X, y, cv = kf, scoring = NMAE_CV).mean()
    return score
          
study_xgb = optuna.create_study(
    direction='minimize',
    study_name = 'Xgboost Optuna', 
    sampler=sampler
)
study_xgb.optimize(objective, n_trials=100)

[32m[I 2022-12-13 19:32:37,154][0m A new study created in memory with name: Xgboost Optuna[0m
[32m[I 2022-12-13 19:33:05,507][0m Trial 0 finished with value: 0.10288669568080196 and parameters: {'lambda': 0.0752451513820885, 'alpha': 0.11734346787248895, 'colsample_bytree': 0.43510036347009773, 'subsample': 0.48182687641445304, 'learning_rate': 0.46572857368142373, 'n_estimators': 1071, 'max_depth': 14, 'min_child_weight': 26}. Best is trial 0 with value: 0.10288669568080196.[0m
[32m[I 2022-12-13 19:33:46,856][0m Trial 1 finished with value: 0.08128326406739081 and parameters: {'lambda': 0.09256396974955594, 'alpha': 0.8261306636288646, 'colsample_bytree': 0.6687992860024254, 'subsample': 0.8917307349708219, 'learning_rate': 0.042634682404854074, 'n_estimators': 1370, 'max_depth': 10, 'min_child_weight': 33}. Best is trial 1 with value: 0.08128326406739081.[0m
[32m[I 2022-12-13 19:35:17,669][0m Trial 2 finished with value: 0.09885929733414592 and parameters: {'lambda': 0.033

[32m[I 2022-12-13 19:52:49,274][0m Trial 21 finished with value: 0.07683371747786825 and parameters: {'lambda': 0.0011479204473983325, 'alpha': 0.923626753858181, 'colsample_bytree': 0.7297822863365432, 'subsample': 0.6664354665636354, 'learning_rate': 0.03082939224895049, 'n_estimators': 1746, 'max_depth': 4, 'min_child_weight': 50}. Best is trial 10 with value: 0.07515195691174334.[0m
[32m[I 2022-12-13 19:53:33,809][0m Trial 22 finished with value: 0.0769923233100939 and parameters: {'lambda': 0.02651218846661548, 'alpha': 0.9164963243278492, 'colsample_bytree': 0.6981492712331444, 'subsample': 0.7361948719424471, 'learning_rate': 0.01657588032965461, 'n_estimators': 2097, 'max_depth': 6, 'min_child_weight': 38}. Best is trial 10 with value: 0.07515195691174334.[0m
[32m[I 2022-12-13 19:54:03,821][0m Trial 23 finished with value: 0.08102033503460922 and parameters: {'lambda': 0.005963385383161409, 'alpha': 0.7584080865341396, 'colsample_bytree': 0.6345461971363299, 'subsample'

[32m[I 2022-12-13 20:08:23,367][0m Trial 42 finished with value: 0.08083620087835144 and parameters: {'lambda': 0.013371778363509224, 'alpha': 0.9384853558279184, 'colsample_bytree': 0.814315156699654, 'subsample': 0.4926063233642244, 'learning_rate': 0.06594993742313962, 'n_estimators': 1664, 'max_depth': 7, 'min_child_weight': 47}. Best is trial 10 with value: 0.07515195691174334.[0m
[32m[I 2022-12-13 20:09:18,739][0m Trial 43 finished with value: 0.07974143338236248 and parameters: {'lambda': 0.031257170110686155, 'alpha': 0.8782946474267448, 'colsample_bytree': 0.7644167478421445, 'subsample': 0.4407496026129335, 'learning_rate': 0.03720913564734785, 'n_estimators': 1883, 'max_depth': 10, 'min_child_weight': 40}. Best is trial 10 with value: 0.07515195691174334.[0m
[32m[I 2022-12-13 20:10:29,842][0m Trial 44 finished with value: 0.09007283479062292 and parameters: {'lambda': 0.0034644571201854235, 'alpha': 0.9537572462533882, 'colsample_bytree': 0.8572727184983976, 'subsamp

[32m[I 2022-12-13 20:27:05,155][0m Trial 63 finished with value: 0.08045226979483422 and parameters: {'lambda': 0.014572522322529896, 'alpha': 0.963894416745883, 'colsample_bytree': 0.7679600679381183, 'subsample': 0.5881181819013023, 'learning_rate': 0.044801949251760884, 'n_estimators': 1662, 'max_depth': 11, 'min_child_weight': 49}. Best is trial 10 with value: 0.07515195691174334.[0m
[32m[I 2022-12-13 20:27:53,612][0m Trial 64 finished with value: 0.08170304920803709 and parameters: {'lambda': 0.025000471285932465, 'alpha': 0.9574678243632119, 'colsample_bytree': 0.9053744456930848, 'subsample': 0.5442904491230743, 'learning_rate': 0.06042857515795302, 'n_estimators': 1470, 'max_depth': 12, 'min_child_weight': 46}. Best is trial 10 with value: 0.07515195691174334.[0m
[32m[I 2022-12-13 20:28:40,344][0m Trial 65 finished with value: 0.0762178386452755 and parameters: {'lambda': 0.01765326310538638, 'alpha': 0.9022170072688273, 'colsample_bytree': 0.8356165580389304, 'subsampl

[32m[I 2022-12-13 20:44:00,571][0m Trial 84 finished with value: 0.08255966516239441 and parameters: {'lambda': 0.020653565697852004, 'alpha': 0.9757466530729391, 'colsample_bytree': 0.7580269668308595, 'subsample': 0.752450579833337, 'learning_rate': 0.039830319317035165, 'n_estimators': 1917, 'max_depth': 10, 'min_child_weight': 37}. Best is trial 10 with value: 0.07515195691174334.[0m
[32m[I 2022-12-13 20:44:43,939][0m Trial 85 finished with value: 0.07793593731634205 and parameters: {'lambda': 0.009555414789189554, 'alpha': 0.8143456759751999, 'colsample_bytree': 0.7387846062401654, 'subsample': 0.6557367657652059, 'learning_rate': 0.023409113214057135, 'n_estimators': 1569, 'max_depth': 9, 'min_child_weight': 49}. Best is trial 10 with value: 0.07515195691174334.[0m


KeyboardInterrupt: 

In [None]:
study_xgb.best_params

In [None]:
xgb_param = {
    'objective' : 'reg:squarederror',
    'tree_method' : 'gpu_hist',
    'predictor' : 'gpu_predictor'
}
xgb_best_params = study_xgb.best_params.copy()
xgb_best_params.update(xgb_param)
xgb_best_params

In [None]:
#multi-kfold
"""
import xgboost as xgb
def custom_NMAE(y_pred, Dmatrix):
    true = Dmatrix.get_label()
    mae = np.mean(np.abs(true - y_pred))
    score = mae / np.mean(np.abs(true))
    return ('custom_NMAE', score)
"""
xgb_pred = []

kfold_list = [2, 3, 4, 5, 6, 10, 20]
for kfold in kfold_list:
    print(f"{kfold} Fold start")
    i = 0
    xgb_nmae = []
    kf = KFold(n_splits=kfold, random_state=42, shuffle=True)
    for fold, (tr_idx, val_idx) in enumerate(kf.split(X)):
        tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
        val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]
        """
        파이썬 래퍼 XGB 데이터 셋 변환
        train = xgb.DMatrix(tr_x, label=tr_y)
        valid = xgb.DMatrix(val_x, label=val_y)
        test = xgb.DMatrix(X_test)
        """
        #사이킷 런 래퍼 XGB 학습
        xgb = XGBRegressor(**xgb_best_params)
        xgb.fit(tr_x, tr_y, eval_set = [(val_x, val_y)], early_stopping_rounds = 100, verbose = 50, eval_metric = 'mae')       
        val_pred = xgb.predict(val_x).astype(int)
        fold_nmae = NMAE(val_y, val_pred)
        xgb_nmae.append(fold_nmae)
        print(f"{i + 1}/{kfold} Fold NMAE = {fold_nmae}")
        i += 1
        fold_pred = xgb.predict(X_test)
        xgb_pred.append(fold_pred)
        
        """
        파이썬 래퍼 XGB 학습
        model = xgb.train(**xgb_best_params, train, 
                          num_boost_round = 1000, 
                          early_stopping_rounds = 100,
                          verbose = 50,
                          evals=[(valid, 'valid')], 
                          feval=custom_NMAE)
        """

    print(f"\nAVG of NMAE = {np.mean(xgb_nmae)}")

In [None]:
xgb_pred_sum = sum(xgb_pred)  
xgb_pred_sum /= len(xgb_pred)
xgb_pred_sum

### LGBMRegressor

In [None]:
def objective(trial):
    kf = KFold(n_splits = 5, random_state = 42, shuffle = True)
    #train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.3,random_state=42)
    param = {'num_leaves': trial.suggest_int('num_leaves', 10, 400), 
            'max_depth': trial.suggest_int('max_depth', 4, 24), 
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.5), 
            'n_estimators': trial.suggest_int('n_estimators', 1000, 3000), 
            'min_child_samples': trial.suggest_int('min_child_samples', 10, 50), 
            'subsample': trial.suggest_uniform('subsample', 0.7, 1.0), 
            'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.7, 1.0),
            'reg_alpha': trial.suggest_uniform('reg_alpha', 0.0, 1.0),
            'reg_lambda': trial.suggest_uniform('reg_lambda', 0.0, 1.0),
            'random_state': 42}
    model =LGBMRegressor(**param)  
    
    """
    model.fit(train_x,train_y,eval_set=[(test_x,test_y)],early_stopping_rounds=100,eval_metric='mae')
    preds = model.predict(test_x).astype(int)
    nmae = NMAE(test_y, preds)
    return nmae
    """
    
    score = cross_val_score(model, X, y, cv = kf, scoring = NMAE_CV).mean()
    return score
          
study_lgb = optuna.create_study(
    direction='minimize',
    study_name = 'LGBM Optuna', 
    sampler=sampler
)
study_lgb.optimize(objective, n_trials=100)

In [None]:
study_lgb.best_params

In [None]:
lgb_param = {
    'objective' : 'regression',
    'device' : 'gpu',
    'metric' : 'mae',
}
lgb_best_params = study_lgb.best_params.copy()
lgb_best_params.update(lgb_param)
lgb_best_params

In [None]:
#multi-kfold
lgb_pred = []

kfold_list = [2, 3, 4, 5, 6, 10, 20]
for kfold in kfold_list:
    print(f"{kfold} Fold start")
    i = 0
    lgb_nmae = []
    kf = KFold(n_splits=kfold, random_state=42, shuffle=True)
    for fold, (tr_idx, val_idx) in enumerate(kf.split(X)):
        tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
        val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]

        lgb = LGBMRegressor(**lgb_param)
        lgb.fit(tr_x, tr_y, eval_set = [(val_x, val_y)], early_stopping_rounds = 100, verbose = 50, eval_metric = 'mae')
        val_pred = lgb.predict(val_x).astype(int)
        fold_nmae = NMAE(val_y, val_pred)
        lgb_nmae.append(fold_nmae)
        print(f"{i + 1}/{kfold} Fold NMAE = {fold_nmae}")
        i += 1
        fold_pred = lgb.predict(X_test)
        lgb_pred.append(fold_pred)

    print(f"\nAVG of NMAE = {np.mean(lgb_nmae)}")

In [None]:
lgb_pred_sum = sum(lgb_pred)  
lgb_pred_sum /= len(lgb_pred)
lgb_pred_sum

### CatboostRegressor

In [None]:
#multi-kfold
cat_pred = []

kfold_list = [2, 3, 4, 5, 6, 10, 20]
for kfold in kfold_list:
    print(f"{kfold} Fold start")
    i = 0
    cat_nmae = []
    kf = KFold(n_splits=kfold, random_state=42, shuffle=True)
    for fold, (tr_idx, val_idx) in enumerate(kf.split(X)):
        tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
        val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]

        cat = CatBoostRegressor(max_depth = 4, learning_rate = 0.01, use_best_model = True, iterations = 3000, eval_metric = 'MAE')
        cat.fit(tr_x, tr_y, eval_set = [(val_x, val_y)], early_stopping_rounds = 100, verbose = 50)
        val_pred = cat.predict(val_x).astype(int)
        fold_nmae = NMAE(val_y, val_pred)
        cat_nmae.append(fold_nmae)
        print(f"{i + 1}/{kfold} Fold NMAE = {fold_nmae}")
        i += 1
        fold_pred = cat.predict(X_test)
        cat_pred.append(fold_pred)

    print(f"\nAVG of NMAE = {np.mean(cat_nmae)}")

In [None]:
cat_pred_sum = sum(cat_pred)  
cat_pred_sum /= len(cat_pred)
cat_pred_sum

## Ensemble

In [None]:
# submission['착과량(int)'] = xgb_pred_sum*0.5 + lgb_pred_sum*0.3 + cat_pred_sum*0.2
#submission['착과량(int)'] = np.round(submission['착과량(int)']) #정수화

In [None]:
# submission

### Submission

In [None]:
# submission.to_csv('./multi_kfold2.csv', index=False)