# 제출용 메인 코드

# 데이터 확인 및 전처리
- 결측 및 이상치 확인
- Train의 X, y 변수 시각화
- 상관관계 확인
- 특징(클러스터링, PCA, K-means, 도메인 이용 등..) 추출

# 다중 회귀 예측 모델링
- 타겟변수 그룹별 모델링
- 타겟변수 별 특정 서브 모델
- 선형 / 비선형 / 트리 및 앙상블 / 신경망
- 자체 성능 평가

# 성능 개선 작업
- 주요 특징 선택(유전알고리즘, 변수중요도, 라쏘 등..)
- 최적화(그리드, 베이지안, 하이퍼밴드)
- 과적합 방지(일반화)

In [56]:

# Library
import os
import sys
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
%matplotlib inline

# modeling
import sklearn
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LassoCV, Lasso, LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, KFold
import lightgbm as lgbm
from lightgbm.sklearn import LGBMRegressor

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Fix Seed 
def seed_everything(seed):
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정

In [2]:
# Load Data
train_df = pd.read_csv('./open/train.csv')
train_x = train_df.filter(regex="X")
train_y = train_df.filter(regex="Y")

test = pd.read_csv("./open/test.csv").drop(columns='ID')

In [3]:
# 무의미 변수 제거(표준편차 0, 데이터 변동 없음)
# X_04, 23, 47, 48
X_tr_std = pd.Series(np.std(train_x))
Del_idx = X_tr_std[X_tr_std==0].index

train_x = train_x.drop(Del_idx, axis=1)
test = test.drop(Del_idx, axis=1)

train_x.shape, test.shape

((39607, 52), (39608, 52))

In [93]:
X_train, X_val, y_train, y_val = train_test_split(train_x, train_y, test_size=0.2)
X_train.shape, X_val.shape

((31685, 52), (7922, 52))

In [94]:
y_train = y_train['Y_04']
y_val = y_val['Y_04']

y_val

28103    11.355
31300    14.553
9743     17.256
12914    11.235
29741    14.904
          ...  
32215    13.205
7973     15.225
21044    12.902
20162    13.733
1784     17.801
Name: Y_04, Length: 7922, dtype: float64

In [28]:
# 대회 평가 산식
def lg_nrmse(true, pred):
    # 각 Y Feature별 NRMSE 총합
    # Y_01 ~ Y_08 까지 20% 가중치 부여
    all_nrmse = []
    
    for idx in range(0,14): # ignore 'ID'
        rmse = mean_squared_error(true[:,idx], pred[:,idx], squared=False)
        nrmse = rmse/np.mean(np.abs(true[:,idx]))
        all_nrmse.append(nrmse)
    score = 1.2 * np.sum(all_nrmse[:8]) + 1.0 * np.sum(all_nrmse[8:15])
    
    return score

In [29]:
## 모델 학습 및 score 출력 ##
def train_score(model, x_train, y_train, x_test, y_test):
    lgb_model = model.fit(x_train,y_train)
    pred = lgb_model.predict(x_test)
    rmse = lg_nrmse(y_test,pred)
    return rmse

In [96]:
def RMSE(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [81]:
##파라미터 튜닝 범위
hyper_space = {'n_estimators': hp.choice('n_estimators',np.arange(50, 500, 50, dtype=int)),
               'learning_rate': hp.quniform('learning_rate', 0.01, 0.1, 0.005),
               'max_depth': hp.choice('max_depth',np.arange(-1, 15, 1, dtype=int)),
               'min_child_weight': hp.choice('min_child_weight',np.arange(1, 10, 1, dtype=int)),
               'num_leaves': hp.choice('num_leaves',np.arange(16, 1024, 16, dtype=int))
              }

In [97]:
def hyper_tuning(hyper_space):
  # Initilize instance of estimator
  model = LGBMRegressor(**hyper_space)
  
  evaluation = [(X_train, y_train), (X_val, y_val)]
  
  model.fit(X_train, y_train,
            eval_set = evaluation,
            eval_metric='rmse',
            early_stopping_rounds=30,
            verbose=1)
  #score
  pred = model.predict(X_val)
  rmse = RMSE(y_val, pred)
  
  return {'loss':rmse, 'status':STATUS_OK, 'model':model}

In [98]:
# Trials 객체 선언
trials = Trials()

# best에 최적 하이퍼 파라미터를 return
best = fmin(fn = hyper_tuning,
            space=hyper_space,
            algo=tpe.suggest,
            max_evals=50,
            trials=trials)

best['max_depth'] = int(best['max_depth'])
best['min_child_weight'] = int(best['min_child_weight'])
best['n_estimators'] = int(best['n_estimators'])
best['reg_alpha'] = reg_candidate[int(best['reg_alpha'])]
best['reg_lambda'] = reg_candidate[int(best['reg_lambda'])]
best['random_state'] = 42
print (best)


100%|██████████| 50/50 [00:18<00:00,  2.68trial/s, best loss: 2.738781760793041] 
{'learning_rate': 0.08, 'max_depth': 8, 'min_child_weight': 2, 'n_estimators': 8, 'reg_alpha': 10, 'reg_lambda': 100, 'random_state': 42}


In [99]:
# best parameters
lgb = lgbm.LGBMRegressor(**best)
lgb.fit(X_train, y_train)
pred = lgb.predict(X_val)
print('Done.')

print(RMSE(y_val, pred))

Done.
2.7965626533051067


In [None]:
lg_nrmse(np.array(y_val), preds)

In [79]:
mean_squared_error(np.expm1(y_val).iloc[:,0], np.expm1(preds)[:,0], squared=False)

0.3505521957207792

In [76]:
print(mean_squared_error(np.array(y_val)[:,0], preds[:,0], squared=False),
mean_squared_error(np.array(y_val)[:,1], preds[:,1], squared=False),
mean_squared_error(np.array(y_val)[:,2], preds[:,2], squared=False))

0.15207048672261406 0.18641542617028176 0.17866352571495792


In [19]:
# predictions for submission
final_preds = Model.predict(test)
print('Done.')

Done.


In [20]:
submit = pd.read_csv('./open/sample_submission.csv')

for id, col in enumerate(submit.columns):
    if col=='ID':
        continue
    submit[col] = final_preds[:,id-1]
print('Done.')

Done.


In [21]:
submit.to_csv('./submit_3.csv',index=False)