## 라이브러리 불러오기

In [1]:
import sys
import pandas as pd
import numpy as np

from catboost import CatBoostRegressor
import lightgbm as lgb
from sklearn.ensemble import GradientBoostingRegressor

from pycaret.regression import * 
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, mean_squared_error

from scipy import stats
from scipy.stats import norm, skew

from statsmodels.formula.api import ols

import matplotlib.pyplot as plt
import seaborn as sns

## Experiment 1
- 실험 내용 : 원본 데이터로 학습시키기

In [2]:
# Load the training table from a machine-specific absolute path.
train = pd.read_csv('C:/Users/cdbre/Desktop/MyLife/DACON/data/teacher_train.csv')

In [3]:
# Keep the eleven model features plus Y_LABEL and Score_0 (the regression target used below).
train = train[['ANONYMOUS_1','YEAR','FE','ANONYMOUS_2','ZN','V40','PQINDEX','NI','CU','MO','CR','Y_LABEL','Score_0']]

In [4]:
train.head()

Unnamed: 0,ANONYMOUS_1,YEAR,FE,ANONYMOUS_2,ZN,V40,PQINDEX,NI,CU,MO,CR,Y_LABEL,Score_0
0,1486,2011,888,200,75,154.0,8504,6,78,1,13,0,0.9949
1,1350,2021,2,375,652,44.0,19,0,31,0,0,0,0.9958
2,2415,2015,4,200,412,72.6,17,0,2,0,1,1,0.0017
3,7389,2010,37,200,7,133.3,44,0,1,0,0,0,0.7446
4,3954,2015,71,200,128,133.1,217,0,0,0,0,0,0.9915


In [6]:
train.describe()[['Score_0']]

Unnamed: 0,Score_0
count,14095.0
mean,0.914513
std,0.220244
min,0.0005
25%,0.9643
50%,0.9868
75%,0.9941
max,0.9996


In [6]:
# Split into train and hold-out test sets (80/20), stratified on Y_LABEL.
# The fixed random_state makes the split reproducible.
train, test = train_test_split(train, test_size=0.2, stratify=train.Y_LABEL, random_state=42)

In [7]:
print(train.shape)
print(test.shape)

(11276, 13)
(2819, 13)


In [5]:
# Load the test table (the frame predicted on further down) from a machine-specific absolute path.
real_test = pd.read_csv('C:/Users/cdbre/Desktop/MyLife/DACON/data/test.csv')

In [7]:
# Keep the same eleven feature columns, in the same order, as the training frame.
real_test = real_test[['ANONYMOUS_1','YEAR','FE','ANONYMOUS_2','ZN','V40','PQINDEX','NI','CU','MO','CR']]

## Metric
- MSE 활용

## AutoML

In [8]:
# Initialise the PyCaret regression experiment on the training split.
# NOTE(review): the ordinal levels for YEAR are listed as strings, while the raw
# column holds integers (e.g. 2011) — confirm PyCaret matches them as intended.
clf = setup(data = train,  
            target = 'Score_0',  # regression target
            ignore_features = ['Y_LABEL'],
            ordinal_features = {'YEAR' : ['2007','2008','2009','2010','2011','2012','2013','2014',
                                          '2015','2016','2017','2018','2019','2020','2021','2022']},  # YEAR is treated as an ordinal feature
            numeric_features = ['ANONYMOUS_1', 'FE','ANONYMOUS_2', 'ZN', 'V40', 'PQINDEX', 'NI', 'CU', 'MO', 'CR'], 
            use_gpu = True,
            normalize=True, normalize_method = 'robust',  # numeric features contain many outliers, so use robust scaling
            session_id = 42)

Unnamed: 0,Description,Value
0,session_id,42
1,Target,Score_0
2,Original Data,"(11276, 13)"
3,Missing Values,False
4,Numeric Features,10
5,Categorical Features,1
6,Ordinal Features,True
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(7893, 11)"


In [9]:
best_model = compare_models(sort='MSE')  # rank the candidate models by MSE

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,0.1082,0.0445,0.2107,0.0692,0.1425,24.7027,0.894
catboost,CatBoost Regressor,0.1096,0.0451,0.2121,0.0563,0.1427,24.578,2.195
lightgbm,Light Gradient Boosting Machine,0.1093,0.0454,0.2128,0.0501,0.1431,24.6483,1.064
rf,Random Forest Regressor,0.116,0.0465,0.2153,0.0259,0.1438,24.0434,1.2
lr,Linear Regression,0.1137,0.0469,0.2162,0.021,0.1462,25.8184,0.018
ridge,Ridge Regression,0.1137,0.0469,0.2162,0.021,0.1462,25.8184,0.008
lar,Least Angle Regression,0.1137,0.0469,0.2162,0.021,0.1462,25.8184,0.01
br,Bayesian Ridge,0.1138,0.0469,0.2162,0.0211,0.1462,25.8489,0.009
et,Extra Trees Regressor,0.1172,0.0474,0.2174,0.0078,0.1446,24.0471,1.051
omp,Orthogonal Matching Pursuit,0.115,0.0474,0.2174,0.0109,0.1469,26.0644,0.007


In [10]:
# Fit the three best-ranked models from the comparison table once each,
# without cross-validation.
gbr = create_model('gbr', cross_validation = False)
cat = create_model('catboost', cross_validation = False)
lgbm = create_model('lightgbm', cross_validation = False)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,0.1138,0.0504,0.2245,0.064,0.1517,29.1192


In [12]:
print(gbr, '\n')
print(cat.get_all_params(), '\n')
print(lgbm)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=42, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False) 

{'nan_mode': 'Min', 'eval_metric': 'RMSE', 'iterations': 1000, 'sampling_frequency': 'PerTree', 'leaf_estimation_method': 'Newton', 'grow_policy': 'SymmetricTree', 'penalties_coefficient': 1, 'boosting_type': 'Plain', 'model_shrink_mode': 'Constant', 'feature_border_type': 'GreedyLogSum', 'bayesian_matrix_reg': 0.10000000149011612, 

In [13]:
# Tune the GBR hyperparameters: 10 Optuna trials optimising MSE.
# choose_better=True asks PyCaret to return the untuned model when no trial improves on it.
tuned_gbr = tune_model(gbr, optimize='MSE', n_iter=10, search_library='optuna', choose_better=True)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.1148,0.0484,0.2201,0.0709,0.1499,30.2011
1,0.1069,0.0402,0.2005,0.0799,0.1357,22.6723
2,0.114,0.0459,0.2142,0.0465,0.1445,22.4264
3,0.1118,0.046,0.2146,0.0731,0.145,21.258
4,0.1133,0.0485,0.2203,0.0706,0.1513,29.1974
5,0.0994,0.0356,0.1887,0.0539,0.1279,22.7183
6,0.1127,0.0466,0.2158,0.0327,0.1464,32.4184
7,0.1157,0.0496,0.2228,0.0873,0.1524,25.8567
8,0.1158,0.0496,0.2227,0.0892,0.1526,30.5003
9,0.1018,0.0367,0.1916,0.0555,0.1285,16.4747


In [14]:
print(gbr, '\n')
print(tuned_gbr)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=42, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False) 

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_sa

In [15]:
# Tune the CatBoost hyperparameters: 10 Optuna trials optimising MSE.
# choose_better=True asks PyCaret to return the untuned model when no trial improves on it.
tuned_cat = tune_model(cat, optimize='MSE', n_iter=10, search_library='optuna', choose_better=True)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.1121,0.0481,0.2193,0.0779,0.1482,29.3897
1,0.1025,0.0389,0.1973,0.1092,0.1333,21.951
2,0.1137,0.0468,0.2163,0.0283,0.1449,22.0325
3,0.106,0.0445,0.2109,0.1044,0.1418,20.3612
4,0.108,0.0473,0.2175,0.0936,0.1493,28.6591
5,0.098,0.0362,0.1903,0.0386,0.1281,22.1305
6,0.1111,0.047,0.2169,0.0233,0.1465,31.7139
7,0.1093,0.0469,0.2166,0.1372,0.1476,24.8226
8,0.1136,0.0489,0.2211,0.1026,0.1502,29.1467
9,0.0973,0.0364,0.1909,0.0625,0.1277,16.2634


In [16]:
print(cat.get_all_params(), '\n')
print(tuned_cat.get_all_params())

{'nan_mode': 'Min', 'eval_metric': 'RMSE', 'iterations': 1000, 'sampling_frequency': 'PerTree', 'leaf_estimation_method': 'Newton', 'grow_policy': 'SymmetricTree', 'penalties_coefficient': 1, 'boosting_type': 'Plain', 'model_shrink_mode': 'Constant', 'feature_border_type': 'GreedyLogSum', 'bayesian_matrix_reg': 0.10000000149011612, 'eval_fraction': 0, 'force_unit_auto_pair_weights': False, 'l2_leaf_reg': 3, 'random_strength': 1, 'rsm': 1, 'boost_from_average': True, 'model_size_reg': 0.5, 'pool_metainfo_options': {'tags': {}}, 'subsample': 0.800000011920929, 'use_best_model': False, 'random_seed': 42, 'depth': 6, 'posterior_sampling': False, 'border_count': 254, 'classes_count': 0, 'auto_class_weights': 'None', 'sparse_features_conflict_fraction': 0, 'leaf_estimation_backtracking': 'AnyImprovement', 'best_model_min_trees': 1, 'model_shrink_rate': 0, 'min_data_in_leaf': 1, 'loss_function': 'RMSE', 'learning_rate': 0.05674700066447258, 'score_function': 'Cosine', 'task_type': 'CPU', 'lea

In [17]:
# Tune the LightGBM hyperparameters: 10 Optuna trials optimising MSE.
# choose_better=True asks PyCaret to return the untuned model when no trial improves on it.
tuned_lgbm = tune_model(lgbm, optimize='MSE', n_iter=10, search_library='optuna', choose_better=True)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.1157,0.0501,0.2237,0.0401,0.1517,30.1733
1,0.1042,0.0403,0.2008,0.0772,0.1357,22.3846
2,0.1135,0.0467,0.2162,0.0292,0.1452,22.1771
3,0.1114,0.0473,0.2175,0.0476,0.1465,21.2682
4,0.1116,0.048,0.219,0.0812,0.1502,28.7484
5,0.1006,0.0361,0.19,0.0414,0.1279,22.0567
6,0.1119,0.0468,0.2162,0.0289,0.1462,31.7328
7,0.1159,0.0495,0.2225,0.0894,0.1518,25.4753
8,0.1143,0.0501,0.2239,0.0794,0.153,30.1921
9,0.101,0.0363,0.1905,0.0664,0.1274,16.1593


In [18]:
print(lgbm, '\n')
print(tuned_lgbm)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              device='gpu', importance_type='split', learning_rate=0.1,
              max_depth=-1, min_child_samples=20, min_child_weight=0.001,
              min_split_gain=0.0, n_estimators=100, n_jobs=-1, num_leaves=31,
              objective=None, random_state=42, reg_alpha=0.0, reg_lambda=0.0,
              silent='warn', subsample=1.0, subsample_for_bin=200000,
              subsample_freq=0) 

LGBMRegressor(bagging_fraction=0.7148538589793427, bagging_freq=3,
              boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              device='gpu', feature_fraction=0.5825453457757226,
              importance_type='split', learning_rate=0.3368892510399297,
              max_depth=-1, min_child_samples=30, min_child_weight=0.001,
              min_split_gain=0.21233911067827616, n_estimators=252, n_jobs=-1,
              num_leaves=7, objective=None, random_state=42,
              reg_alp

In [12]:
# Keep an independent copy of the training frame before the following cells modify it.
real_train = train.copy()

In [13]:
# Remove the Y_LABEL column from the training frame in place.
train.drop(columns=['Y_LABEL'], inplace=True)

In [18]:
# Separate the regression target from the feature matrix.
train_y = train['Score_0']
train_X = train.drop(columns=['Score_0'])

In [15]:
# YEAR: encode calendar years as consecutive integer codes.
# LabelEncoder is not among the imports at the top of the notebook, so it is
# imported here to keep this cell runnable.
from sklearn.preprocessing import LabelEncoder

# Fit ONE encoder on the years present in either frame and reuse it for both.
# Re-fitting on real_test would silently assign different codes to the same
# year whenever the two frames do not contain the identical set of years.
le = LabelEncoder()
le.fit(pd.concat([train['YEAR'], real_test['YEAR']]))
train['YEAR'] = le.transform(train['YEAR'])
real_test['YEAR'] = le.transform(real_test['YEAR'])

# Rebuild the feature matrix from the encoded frame so the model is trained on
# the same YEAR codes it will see in real_test, even if train_X was derived
# from `train` before this cell ran. This must run before the scaling cell.
train_X = train.drop(columns=['Score_0'])

In [24]:
def get_values(value):
    """Return the data behind ``value`` as a single-column 2-D array.

    ``value`` must expose a ``.values`` array (e.g. a pandas Series); that
    array is reshaped to ``(-1, 1)``, the layout handed to the scaler below.
    """
    flat = value.values
    return flat.reshape(-1, 1)

# Robust-scale every feature except YEAR, one column at a time.
# Each scaler is fitted on the training column only and then applied to the
# matching column of real_test.
for col in train_X.columns:
    if col == 'YEAR':
        continue
    scaler = RobustScaler().fit(get_values(train_X[col]))
    train_X[col] = scaler.transform(get_values(train_X[col]))
    real_test[col] = scaler.transform(get_values(real_test[col]))

In [25]:
train_X

Unnamed: 0,ANONYMOUS_1,YEAR,FE,ANONYMOUS_2,ZN,V40,PQINDEX,NI,CU,MO,CR
0,-0.345938,4,6.776,0.000000,-0.411275,0.652905,50.147929,6.0,4.6250,0.000000,4.000000
1,-0.409430,14,-0.312,0.833333,0.121996,-1.029052,-0.059172,0.0,1.6875,-0.142857,-0.333333
2,0.087768,8,-0.296,0.000000,-0.099815,-0.591743,-0.071006,0.0,-0.1250,-0.142857,0.000000
3,2.409897,3,-0.032,0.000000,-0.474122,0.336391,0.088757,0.0,-0.1875,-0.142857,-0.333333
4,0.806256,8,0.240,0.000000,-0.362292,0.333333,1.112426,0.0,-0.2500,-0.142857,-0.333333
...,...,...,...,...,...,...,...,...,...,...,...
14090,-0.285247,7,-0.144,0.000000,-0.465804,0.368502,0.035503,0.0,-0.0625,-0.142857,-0.333333
14091,0.260037,6,-0.232,0.000000,0.820702,0.094801,-0.118343,0.0,-0.1250,31.857143,-0.333333
14092,-0.204949,1,2.992,1.666667,0.721811,-0.876147,3.644970,1.0,0.1875,1.285714,1.000000
14093,0.126517,2,-0.176,1.666667,0.121996,-1.024465,-0.106509,0.0,10.3750,0.000000,-0.333333


In [38]:
# Rebuild the three regressors outside PyCaret from explicit hyperparameters.
# The LightGBM values match the tuned model printed above.
# NOTE(review): the CatBoost values presumably come from tuned_cat, whose
# printout is truncated above — confirm.

cat_params = dict(
    nan_mode='Min',
    eval_metric='RMSE',
    iterations=102,
    sampling_frequency='PerTree',
    leaf_estimation_method='Newton',
    grow_policy='SymmetricTree',
    penalties_coefficient=1,
    boosting_type='Plain',
    model_shrink_mode='Constant',
    feature_border_type='GreedyLogSum',
    eval_fraction=0,
    l2_leaf_reg=4,
    random_strength=0.7446107864379883,
    rsm=1,
    boost_from_average=True,
    model_size_reg=0.5,
    subsample=0.800000011920929,
    use_best_model=False,
    random_seed=42,
    depth=8,
    posterior_sampling=False,
    border_count=254,
    sparse_features_conflict_fraction=0,
    leaf_estimation_backtracking='AnyImprovement',
    best_model_min_trees=1,
    model_shrink_rate=0,
    min_data_in_leaf=1,
    loss_function='RMSE',
    learning_rate=0.1271568387746811,
    score_function='Cosine',
    task_type='CPU',
    leaf_estimation_iterations=1,
    bootstrap_type='MVS',
    max_leaves=256,
)
clf_cat = CatBoostRegressor(**cat_params)

# NOTE(review): loss='ls' is the spelling used by the scikit-learn version this
# notebook ran on; newer releases name it 'squared_error' — revisit on upgrade.
gbr_params = dict(
    alpha=0.9,
    ccp_alpha=0.0,
    criterion='friedman_mse',
    init=None,
    learning_rate=0.1,
    loss='ls',
    max_depth=3,
    max_features=None,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_samples_leaf=1,
    min_samples_split=2,
    min_weight_fraction_leaf=0.0,
    n_estimators=100,
    n_iter_no_change=None,
    random_state=42,
    subsample=1.0,
    tol=0.0001,
    validation_fraction=0.1,
    verbose=0,
    warm_start=False,
)
clf_gbr = GradientBoostingRegressor(**gbr_params)

lgb_params = dict(
    bagging_fraction=0.7148538589793427,
    bagging_freq=3,
    boosting_type='gbdt',
    class_weight=None,
    colsample_bytree=1.0,
    device='gpu',
    feature_fraction=0.5825453457757226,
    importance_type='split',
    learning_rate=0.3368892510399297,
    max_depth=-1,
    min_child_samples=30,
    min_child_weight=0.001,
    min_split_gain=0.21233911067827616,
    n_estimators=252,
    n_jobs=-1,
    num_leaves=7,
    objective=None,
    random_state=42,
    reg_alpha=1.0001718780584612e-08,
    reg_lambda=1.040997312682199e-08,
    silent='warn',
    subsample=1.0,
    subsample_for_bin=200000,
    subsample_freq=0,
)
clf_lgb = lgb.LGBMRegressor(**lgb_params)

In [39]:
# Train all three regressors on the same scaled feature matrix and target.
clf_cat.fit(train_X, train_y)
clf_gbr.fit(train_X, train_y)
clf_lgb.fit(train_X, train_y)

0:	learn: 0.2183755	total: 148ms	remaining: 14.9s
1:	learn: 0.2169114	total: 153ms	remaining: 7.66s
2:	learn: 0.2156178	total: 159ms	remaining: 5.24s
3:	learn: 0.2146855	total: 164ms	remaining: 4.03s
4:	learn: 0.2137734	total: 170ms	remaining: 3.29s
5:	learn: 0.2130940	total: 175ms	remaining: 2.8s
6:	learn: 0.2123636	total: 180ms	remaining: 2.45s
7:	learn: 0.2116038	total: 186ms	remaining: 2.18s
8:	learn: 0.2109342	total: 190ms	remaining: 1.96s
9:	learn: 0.2104393	total: 195ms	remaining: 1.79s
10:	learn: 0.2099574	total: 199ms	remaining: 1.65s
11:	learn: 0.2094465	total: 205ms	remaining: 1.54s
12:	learn: 0.2089618	total: 211ms	remaining: 1.44s
13:	learn: 0.2085669	total: 217ms	remaining: 1.36s
14:	learn: 0.2083312	total: 222ms	remaining: 1.29s
15:	learn: 0.2079585	total: 228ms	remaining: 1.23s
16:	learn: 0.2076622	total: 234ms	remaining: 1.17s
17:	learn: 0.2073677	total: 240ms	remaining: 1.12s
18:	learn: 0.2070764	total: 246ms	remaining: 1.07s
19:	learn: 0.2067978	total: 252ms	remainin

In [41]:
# Predict Score_0 for the test frame with each fitted model.
# real_test was reduced to the same feature columns, in the same order, as train_X.
pred_cat = clf_cat.predict(real_test)
pred_gbr = clf_gbr.predict(real_test)
pred_lgb = clf_lgb.predict(real_test)

In [47]:
# Collect the per-model predictions side by side, one column per model.
pred = pd.DataFrame({'cat': pred_cat, 'gbr': pred_gbr, 'lgb': pred_lgb})
pred

Unnamed: 0,cat,gbr,lgb
0,0.942207,0.940200,0.942598
1,0.922106,0.881264,0.916034
2,0.883490,0.858878,0.878191
3,0.887884,0.912664,0.885946
4,0.821230,0.832220,0.826438
...,...,...,...
6036,0.964216,0.943723,0.962968
6037,0.920347,0.918789,0.914226
6038,0.876644,0.868024,0.866024
6039,0.938225,0.944663,0.941373


In [49]:
# Ensemble by simple averaging: the row-wise mean over the prediction columns.
pred['mean'] = pred.mean(axis='columns')
pred

Unnamed: 0,cat,gbr,lgb,mean
0,0.942207,0.940200,0.942598,0.941668
1,0.922106,0.881264,0.916034,0.906468
2,0.883490,0.858878,0.878191,0.873520
3,0.887884,0.912664,0.885946,0.895498
4,0.821230,0.832220,0.826438,0.826629
...,...,...,...,...
6036,0.964216,0.943723,0.962968,0.956969
6037,0.920347,0.918789,0.914226,0.917787
6038,0.876644,0.868024,0.866024,0.870231
6039,0.938225,0.944663,0.941373,0.941420


In [66]:
def label(score, threshold=0.85):
    """Convert a predicted score into a binary label.

    Args:
        score: Predicted score (the ensemble mean of Score_0 in this notebook).
        threshold: Cut-off; defaults to 0.85, the value previously hard-coded.

    Returns:
        0 when ``score`` is strictly greater than ``threshold``, otherwise 1.
        A score exactly equal to the threshold, or NaN, therefore yields 1.
    """
    # NOTE(review): Score_0 looks like a confidence for class 0 (high scores
    # pair with Y_LABEL == 0 in the training head) — confirm with the data owner.
    return 0 if score > threshold else 1

In [67]:
# Turn every ensemble score into a 0/1 label, in row order.
labels = list(map(label, pred['mean']))

In [68]:
pd.DataFrame(labels).value_counts()

0    5423
1     618
dtype: int64

In [69]:
# Write the predicted labels into the sample submission frame.
# NOTE(review): assumes the sample file's rows are in the same order as real_test — confirm.
submit = pd.read_csv('C:/Users/cdbre/Desktop/MyLife/DACON/data/sample_submission.csv')
submit['Y_LABEL'] = labels
submit.head()

Unnamed: 0,ID,Y_LABEL
0,TEST_0000,0
1,TEST_0001,0
2,TEST_0002,0
3,TEST_0003,0
4,TEST_0004,1


In [70]:
# Save the submission file without writing the DataFrame index as a column.
submit.to_csv('C:/Users/cdbre/Desktop/MyLife/DACON/data/submit_fuck.csv', index=False)