In [16]:
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.preprocessing import MinMaxScaler

from xgboost import XGBClassifier
from xgboost import plot_importance

%matplotlib inline


### Data Modeling

#### Normalization : MinMaxScaler

In [2]:
# Load the modelling dataset from CSV.
# NOTE(review): this is an absolute, machine-specific path — consider a configurable data directory.
d_final_path = 'C://workspaces//AI//Final_PJT//final pjt//d_final.csv'
d_final = pd.read_csv(d_final_path, encoding = 'utf-8')

In [3]:
# Preview the first five rows to sanity-check the loaded columns.
d_final.head(n=5)

Unnamed: 0,id,sex,intubed,pneumonia,age,pregnancy,diabetes,copd,asthma,inmsupr,...,tobacco,contact_other_covid,covid_res,icu,DEATH,d_ent,d_sym,d_last,diff_days,age_grp
0,167386,1,2,2,54,2,2,2,2,2,...,2,3,1,2,0,2020-04-06,2020-04-01,2020-06-29,89,3.0
1,0b5948,2,2,1,30,2,2,2,2,2,...,2,3,1,2,0,2020-04-17,2020-04-10,2020-06-29,80,2.0
2,0d01b5,1,2,2,60,2,1,2,2,2,...,2,3,1,2,1,2020-04-13,2020-04-13,2020-04-22,9,4.0
3,1beec8,2,2,1,47,2,1,2,2,2,...,2,3,1,1,1,2020-04-16,2020-04-16,2020-04-29,13,3.0
4,1.75E+56,2,2,2,63,2,2,2,2,2,...,2,3,1,2,0,2020-04-22,2020-04-13,2020-06-29,77,4.0


In [4]:
# Columns that will be min-max scaled; only these are stored in d_normal0.
feature_columns = [
    'sex', 'intubed', 'pneumonia', 'age_grp', 'pregnancy', 'diabetes',
    'copd', 'asthma', 'inmsupr', 'hypertension', 'other_disease', 'obesity',
    'cardiovascular', 'renal_chronic', 'tobacco', 'contact_other_covid',
    'covid_res', 'icu',
]
d_normal0 = d_final[feature_columns]

# The dependent variables (y) are kept separately and are not scaled.
target_columns = ['DEATH', 'diff_days']
d_y = d_final[target_columns]

In [5]:
# Normalization: rescale every selected feature column to the [0, 1] range.
# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split further down — consider fitting it on the training split only.
feature_scaler = MinMaxScaler(feature_range=(0, 1))
d_normal = feature_scaler.fit_transform(d_normal0)
d_normal

array([[0. , 0.5, 0.5, ..., 1. , 0. , 0.5],
       [1. , 0.5, 0. , ..., 1. , 0. , 0.5],
       [0. , 0.5, 0.5, ..., 1. , 0. , 0.5],
       ...,
       [1. , 0. , 0. , ..., 0.5, 1. , 0. ],
       [0. , 0.5, 0. , ..., 1. , 1. , 0.5],
       [0. , 0.5, 0. , ..., 1. , 1. , 0.5]])

#### Train, Test set 분리

In [6]:
# Split the scaled features and the targets into train/test parts.
# No test_size is given, so the library's default proportion applies;
# the fixed random_state makes the shuffled split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    d_normal,
    d_y,
    shuffle=True,
    random_state=1004,
)

In [7]:
# Report the shape of each split (train/test features, then train/test targets).
split_shapes = (X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)
print(' \n '.join(str(shape) for shape in split_shapes))

(91046, 18) 
 (30349, 18) 
 (91046, 2) 
 (30349, 2)


#### Ensemble model - XGBoost
- 모델 사용 이유 : kaggle 등에서 성능이 좋아서 최근 많이 쓰임
- Gradient Boosting model 보다 계산 속도 빠름
- 참고 : https://lsjsj92.tistory.com/547

##### XGBoost Hyper Parameter Tuning
- Grid Search 방법으로 XGBoost Hyper Parameter Tuning함
- https://m.blog.naver.com/PostView.nhn?blogId=gustn3964&logNo=221431933811&proxyReferer=https:%2F%2Fwww.google.com%2F
- https://m.blog.naver.com/gustn3964/221431963819

In [28]:
# Model Building

# Base estimator; its hyper-parameters are supplied by the grid search below.
xgb = XGBClassifier()

# Hyper-parameter grid explored exhaustively by GridSearchCV.
xgb_params_grid = {
    'n_estimators' : [100, 200, 300, 400, 500], # number of decision trees
    'learning_rate' : [0.001, 0.01, 0.05, 0.10, 0.15, 0.2], # learning rate
    'max_depth' : [4, 6, 8, 10 ,12], # tree depth
    'random_state' : [1004] # seed (a single value, so it is fixed rather than searched)
}

# 5-fold CV splitter. shuffle=True is required for random_state to have any
# effect; without it recent scikit-learn versions raise a ValueError.
xgb_cv = KFold(n_splits = 5, shuffle = True, random_state = 1004)

xgb_model = GridSearchCV(xgb, param_grid = xgb_params_grid,
                        scoring = "accuracy",
                        cv = xgb_cv, # run 5-fold CV
                        n_jobs = -1, # number of parallel jobs: -1 uses all processors
                        verbose = 1)



In [None]:
# Model Learning
# Only the DEATH column of the targets is used as the classification label.
Y_train1 = Y_train["DEATH"]
xgb_model.fit(X_train, Y_train1)

In [None]:
# Inspect the best hyper-parameters and their mean cross-validated score.
# The fitted search object is `xgb_model`; both values are printed because a
# cell only displays its last bare expression.
print("best params:", xgb_model.best_params_)
print("best CV accuracy:", xgb_model.best_score_)