# 데이터 확인 및 전처리
- 결측 및 이상치 확인
- Train의 X, y 변수 시각화
- 상관관계 확인
- 특징(클러스터링, PCA, K-means, 도메인 이용 등..) 추출

# 다중 회귀 예측 모델링
- 타겟변수 그룹별 모델링
- 타겟변수 별 특정 서브 모델
- 선형 / 비선형 / 트리 및 앙상블 / 신경망
- 자체 성능 평가

# 성능 개선 작업
- 주요 특징 선택(유전알고리즘, 변수중요도, 라쏘 등..)
- 최적화(그리드, 베이지안, 하이퍼밴드)
- 과적합 방지(일반화)

In [29]:

# Library
import os
import sys
import numpy as np
import pandas as pd
np.random.seed(55)
os.environ['PYTHONHASHSEED'] = str(55)

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
%matplotlib inline

import sklearn
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LassoCV, Lasso, LinearRegression
from sklearn.multioutput import RegressorChain
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, KFold

import warnings
warnings.filterwarnings('ignore')



In [17]:
# Load the training table, then split it by column name: columns whose
# name matches "X" become the features, those matching "Y" the targets.
train_df = pd.read_csv('./open/train.csv')
train_x, train_y = (train_df.filter(regex=pattern) for pattern in ("X", "Y"))

# Test table without its 'ID' column, so only feature columns remain.
test = pd.read_csv("./open/test.csv")
test = test.drop(columns='ID')

In [18]:
# Remove uninformative features: columns whose standard deviation on the
# training set is exactly 0 (the value never changes).
# Original author's note: X_04, 23, 47, 48.
# The per-column reduction is requested explicitly (axis=0) instead of going
# through np.std(DataFrame), whose implicit axis handling is deprecated in
# pandas. ddof=0 matches NumPy's population standard deviation.
X_tr_std = train_x.std(axis=0, ddof=0)
Del_idx = X_tr_std[X_tr_std == 0].index

# Drop the same columns from both tables so their feature layout stays aligned.
train_x = train_x.drop(columns=Del_idx)
test = test.drop(columns=Del_idx)

train_x.shape, test.shape

In [20]:
# Standardise the features. The scaler is fitted on the training features
# ONLY and then reused for the test features: re-fitting on the test set
# would scale the two sets with different means/variances, so the model
# would see test inputs on a different scale than the one it was trained on.
scaler = StandardScaler()
train_x = scaler.fit_transform(train_x)

test_x = scaler.transform(test)

In [32]:
# Hold out 20% of the training rows for validation. The seed is passed
# explicitly (same value the notebook uses for np.random.seed) so the split
# does not depend on how much of the global NumPy RNG earlier cells consumed.
X_train, X_val, y_train, y_val = train_test_split(
    train_x, train_y, test_size=0.2, random_state=55
)

In [33]:
# Display the (rows, columns) of the training and validation feature sets.
(X_train.shape, X_val.shape)

((31685, 52), (7922, 52))

In [21]:
# Build a RegressorChain around a LinearRegression base estimator and fit it
# on the training split (all target columns at once).
LR = RegressorChain(LinearRegression())
LR.fit(X_train, y_train)
print("Done.")

Done.


In [35]:
# Predict every target column for the validation split.
preds = LR.predict(X_val)
print("Done.")

Done.


In [56]:
def lg_nrmse(gt, preds):
    """Return the weighted sum of per-target NRMSE over the 14 targets.

    For each of the first 14 columns, the NRMSE is the RMSE between ``gt``
    and ``preds`` divided by the mean absolute ground-truth value of that
    column. The first 8 columns (Y_01 to Y_08) carry a 20% extra weight
    (factor 1.2); the remaining 6 columns are weighted 1.0.

    Args:
        gt: 2-D array-like of ground-truth values with at least 14 columns.
            Columns beyond the 14th are ignored.
        preds: 2-D array-like of predictions with the same layout as ``gt``.

    Returns:
        The weighted NRMSE total as a NumPy float.

    Raises:
        IndexError: if either input is not 2-D or has fewer than 14 columns.
    """
    n_targets = 14   # number of scored target columns
    n_weighted = 8   # leading columns that receive the 1.2 weight

    gt = np.asarray(gt, dtype=float)
    preds = np.asarray(preds, dtype=float)
    if gt.shape[1] < n_targets or preds.shape[1] < n_targets:
        raise IndexError(
            f"expected at least {n_targets} columns, got "
            f"{gt.shape[1]} (gt) and {preds.shape[1]} (preds)"
        )
    gt = gt[:, :n_targets]
    preds = preds[:, :n_targets]

    # RMSE computed directly with NumPy, column by column, so the metric does
    # not depend on the `squared=False` argument of mean_squared_error.
    rmse = np.sqrt(np.mean((gt - preds) ** 2, axis=0))
    all_nrmse = rmse / np.mean(np.abs(gt), axis=0)

    score = 1.2 * np.sum(all_nrmse[:n_weighted]) + 1.0 * np.sum(all_nrmse[n_weighted:])

    return score

In [57]:
# Competition score on the validation split (targets converted to an ndarray).
lg_nrmse(np.asarray(y_val), preds)

1.9412112530832923

In [54]:
# Plain MSE of the first target column on the validation split.
mean_squared_error(np.asarray(y_val)[:, 0], preds[:, 0])

0.12321742554320692

In [40]:
# Ground-truth targets of the first validation row, for a visual comparison.
y_val.iloc[0]

Y_01     1.599
Y_02     1.630
Y_03     1.419
Y_04    14.057
Y_05    31.797
Y_06    16.832
Y_07     3.372
Y_08   -26.063
Y_09   -26.234
Y_10   -21.858
Y_11    24.551
Y_12   -26.214
Y_13   -26.275
Y_14   -26.304
Name: 34539, dtype: float64

In [36]:
# Predicted targets for the first validation row.
preds[0, :]

array([  1.31452732,   0.98553195,   0.94392472,  13.94011028,
        31.43871335,  16.5899187 ,   3.14440539, -26.3783108 ,
       -26.40587942, -22.27802306,  24.23350992, -26.32782841,
       -26.31894918, -26.32977016])

In [26]:
submit = pd.read_csv('./open/sample_submission.csv')

# The submission must hold predictions for the TEST features. `preds` holds
# the validation-split predictions, so predict on the scaled test set here.
test_preds = LR.predict(test_x)

# Fill every non-'ID' column, in order, with the matching prediction column.
target_cols = [col for col in submit.columns if col != 'ID']
for col_idx, col in enumerate(target_cols):
    submit[col] = test_preds[:, col_idx]
print('Done.')

Done.


In [27]:
# Write the filled submission table to disk without the row index.
submit.to_csv('./submit_2.csv', index=False)