In [1]:
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import sklearn.svm as svm
from sklearn.inspection import permutation_importance
from sklearn.metrics import roc_curve, precision_score, recall_score, f1_score, accuracy_score, auc
from sklearn.model_selection import GridSearchCV, KFold, train_test_split, cross_val_score, cross_validate
from sklearn.preprocessing import MinMaxScaler

# Data Modeling

## 데이터 수집

In [2]:
# Read the input table from d_final.csv (UTF-8 encoded) into a DataFrame.
df = pd.read_csv("d_final.csv", encoding="utf-8")

In [3]:
# Preview the first five rows to sanity-check the load.
df.head(5)

Unnamed: 0,id,sex,intubed,pneumonia,age,pregnancy,diabetes,copd,asthma,inmsupr,...,tobacco,contact_other_covid,covid_res,icu,DEATH,d_ent,d_sym,d_last,diff_days,age_grp
0,167386,1,2,2,54,2,2,2,2,2,...,2,3,1,2,0,2020-04-06,2020-04-01,2020-06-29,89,3.0
1,0b5948,2,2,1,30,2,2,2,2,2,...,2,3,1,2,0,2020-04-17,2020-04-10,2020-06-29,80,2.0
2,0d01b5,1,2,2,60,2,1,2,2,2,...,2,3,1,2,1,2020-04-13,2020-04-13,2020-04-22,9,4.0
3,1beec8,2,2,1,47,2,1,2,2,2,...,2,3,1,1,1,2020-04-16,2020-04-16,2020-04-29,13,3.0
4,1.75E+56,2,2,2,63,2,2,2,2,2,...,2,3,1,2,0,2020-04-22,2020-04-13,2020-06-29,77,4.0


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,sex,intubed,pneumonia,age,pregnancy,diabetes,copd,asthma,inmsupr,hypertension,...,obesity,cardiovascular,renal_chronic,tobacco,contact_other_covid,covid_res,icu,DEATH,diff_days,age_grp
count,121395.0,121395.0,121395.0,121395.0,121395.0,121395.0,121395.0,121395.0,121395.0,121395.0,...,121395.0,121395.0,121395.0,121395.0,121395.0,121395.0,121395.0,121395.0,121395.0,121395.0
mean,1.599374,1.919198,1.39495,52.957577,1.992561,1.71456,1.959776,1.979027,1.96677,1.674105,...,1.799275,1.953655,1.945583,1.917081,2.367676,1.557519,1.917987,0.266914,32.845092,3.18738
std,0.490027,0.276166,0.488859,19.237785,0.100507,0.463632,0.219534,0.173651,0.208078,0.479277,...,0.413378,0.234145,0.247939,0.294203,0.718388,0.694507,0.278025,0.442349,27.360004,0.970514
min,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0
25%,1.0,2.0,1.0,42.0,2.0,1.0,2.0,2.0,2.0,1.0,...,2.0,2.0,2.0,2.0,2.0,1.0,2.0,0.0,11.0,3.0
50%,2.0,2.0,1.0,55.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,3.0,1.0,2.0,0.0,24.0,3.0
75%,2.0,2.0,2.0,66.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,3.0,2.0,2.0,1.0,49.0,4.0
max,2.0,3.0,3.0,118.0,3.0,3.0,3.0,3.0,3.0,3.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,1.0,180.0,5.0


## 변수 설정

In [5]:
# Feature matrix: the 18 columns used as model inputs.
X = df[[
    'sex', 'intubed', 'age_grp', 'pneumonia', 'pregnancy',
    'diabetes', 'copd', 'asthma', 'inmsupr', 'hypertension',
    'other_disease', 'cardiovascular', 'obesity', 'renal_chronic',
    'tobacco', 'contact_other_covid', 'covid_res', 'icu',
]]

# Target frame: the DEATH label together with diff_days.
Y = df[['DEATH', 'diff_days']]

## 정규화

In [6]:
# Rescale every feature column of X to the [0, 1] range (min-max normalisation).
t_train_raw = MinMaxScaler(feature_range=(0, 1)).fit(X).transform(X)

In [7]:
# Display the min-max scaled feature array.
t_train_raw

array([[0.  , 0.5 , 0.5 , ..., 1.  , 0.  , 0.5 ],
       [1.  , 0.5 , 0.25, ..., 1.  , 0.  , 0.5 ],
       [0.  , 0.5 , 0.75, ..., 1.  , 0.  , 0.5 ],
       ...,
       [1.  , 0.  , 0.5 , ..., 0.5 , 1.  , 0.  ],
       [0.  , 0.5 , 0.25, ..., 1.  , 1.  , 0.5 ],
       [0.  , 0.5 , 0.5 , ..., 1.  , 1.  , 0.5 ]])

##### train/test 데이터 셋 분리

In [8]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, shuffle=True, random_state=1004
)

# Apply min-max scaling to the features that are actually fed to the SVM.
# The scaler is fitted on the training rows only and then reused for the test
# rows, so no information from the held-out set leaks into preprocessing.
# Results are wrapped back into DataFrames to keep column names and row indices.
feature_scaler = MinMaxScaler(feature_range=(0, 1)).fit(x_train)
x_train = pd.DataFrame(
    feature_scaler.transform(x_train), columns=x_train.columns, index=x_train.index
)
x_test = pd.DataFrame(
    feature_scaler.transform(x_test), columns=x_test.columns, index=x_test.index
)

In [9]:
# Report the (rows, columns) shape of each split, in train/test order for X then Y.
print(*(part.shape for part in (x_train, x_test, y_train, y_test)))

(97116, 18) (24279, 18) (97116, 2) (24279, 2)


In [10]:
# Classification target for training: the DEATH column of the training labels.
Y_train2 = y_train["DEATH"]

## Support Vector Machine

#####  SVM Hyper Parameter Tuning <br>

- 선형/비선형 커널 중 어떤 것이 나을지는 3-fold CV로 결정
- GridSearchCV로 파라미터 결정
- 코드참고 https://blog.naver.com/PostView.nhn?blogId=winddori2002&logNo=221667083964

#### 1) Linear, Non-linear kernel (3-fold CV)

In [11]:
# Disabled experiment (linear-kernel SVC setup). The code is wrapped in a string
# literal, so nothing here runs; the cell only echoes the text.
"""
# Linear kernel

Y_train2 = y_train.loc[:, "DEATH"]
SVM =svm.SVC(kernel = 'linear')     # 선형 분리 커널 
"""

'\n# Linear kernel\n\nY_train2 = y_train.loc[:, "DEATH"]\nSVM =svm.SVC(kernel = \'linear\')     # 선형 분리 커널 \n'

In [12]:
# Disabled experiment (3-fold cross-validation of the linear-kernel SVC). The code
# is wrapped in a string literal, so nothing here runs; the cell only echoes the text.
"""
# CV1 (linear)

scores = cross_val_score(SVM, x_train, Y_train2, cv = KFold(3, random_state=1004, shuffle=True))
scores 
"""

'\n# CV1 (linear)\n\nscores = cross_val_score(SVM, x_train, Y_train2, cv = KFold(3, random_state=1004, shuffle=True))\nscores \n'

In [13]:
# Disabled experiment (cross_validate table and mean CV score). The code is wrapped
# in a string literal, so nothing here runs; the cell only echoes the text.
# NOTE(review): if re-enabled, the DataFrame expression is not the last statement
# of the cell, so it would not be displayed.
"""
pd.DataFrame(cross_validate(SVM, x_train, Y_train2, cv = KFold(3, random_state=1004, shuffle=True)))
print('교차검증 평균: ', scores.mean())
"""

"\npd.DataFrame(cross_validate(SVM, x_train, Y_train2, cv = KFold(3, random_state=1004, shuffle=True)))\nprint('교차검증 평균: ', scores.mean())\n"

In [14]:
# Disabled experiment (RBF-kernel SVC setup). The code is wrapped in a string
# literal, so nothing here runs; the cell only echoes the text.
"""
# Non-Linear Kernel

SVM =svm.SVC(kernel = 'rbf')
"""

"\n# Non-Linear Kernel\n\nSVM =svm.SVC(kernel = 'rbf')\n"

In [15]:
# Disabled experiment (3-fold cross-validation of the RBF-kernel SVC). The code is
# wrapped in a string literal, so nothing here runs; the cell only echoes the text.
"""
# CV2 (NonLinear)

scores = cross_val_score(SVM, x_train, Y_train2, cv = KFold(3, random_state=1004, shuffle=True))
scores
"""

'\n# CV2 (NonLinear)\n\nscores = cross_val_score(SVM, x_train, Y_train2, cv = KFold(3, random_state=1004, shuffle=True))\nscores\n'

In [16]:
# Disabled experiment (cross_validate table and mean CV score for the RBF kernel).
# The code is wrapped in a string literal, so nothing here runs; the cell only
# echoes the text.
# NOTE(review): if re-enabled, the DataFrame expression is not the last statement
# of the cell, so it would not be displayed.
"""
pd.DataFrame(cross_validate(SVM, x_train, Y_train2, cv = KFold(3, random_state=1004, shuffle=True)))
print('교차검증 평균: ', scores.mean())
"""

"\npd.DataFrame(cross_validate(SVM, x_train, Y_train2, cv = KFold(3, random_state=1004, shuffle=True)))\nprint('교차검증 평균: ', scores.mean())\n"

#### 2) Hyperparameter Tuning (GridSearchCV) -- Non-Linear

In [17]:
# RBF-kernel SVC; probability=True enables predict_proba on the fitted model.
svm_model = svm.SVC(kernel="rbf", probability=True, random_state=1004)

# Search grid: the same candidate values are tried for both C and gamma (7 x 7 combinations).
parameters = {
    name: [0.01, 0.1, 1, 10, 25, 50, 100]
    for name in ("C", "gamma")
}

In [18]:
# Exhaustive grid search over `parameters`, scored with shuffled 3-fold
# cross-validation (fixed seed) and run on 4 parallel workers.
grid_svm = GridSearchCV(
    estimator=svm_model,
    param_grid=parameters,
    cv=KFold(n_splits=3, shuffle=True, random_state=1004),
    n_jobs=4,
)

In [None]:
%%time
grid_svm.fit(x_train, Y_train2)

##### 최적 parameter 확인

In [None]:
# Report the winning hyper-parameters and their mean CV accuracy. The fold count
# is read from the fitted search object so the message always matches the CV
# actually used (the search runs 3-fold CV, not 5-fold).
n_folds = grid_svm.n_splits_
print("Best parameters (%d-fold CV, grid search): " % n_folds, grid_svm.best_params_)
print("Best accuracy (%d-fold CV, grid search): " % n_folds, grid_svm.best_score_)

In [None]:
# Tabulate every grid-search candidate, ordered from best rank to worst.
cv_result_df = (
    pd.DataFrame(grid_svm.cv_results_)
    .sort_values(by="rank_test_score")
)

In [None]:
# Show the five best-ranked parameter combinations with their mean CV score.
cv_result_df.loc[:, ['params', 'mean_test_score', 'rank_test_score']].head(5)

##### 최적 parameter로 SVM 모델 생성 및 예측

In [None]:
# The estimator refitted by the grid search with the best parameters.
svm_best = grid_svm.best_estimator_

# Predict class labels for the held-out test features with that estimator.
Y_pred = svm_best.predict(x_test)
Y_pred

In [None]:
# Display the selected estimator and its hyper-parameters.
svm_best

##### Test set 성능 평가 <br>

- test set에서의 결과 평가

In [None]:
# Ground-truth labels for evaluation: the DEATH column of the test labels.
Y_test2 = y_test["DEATH"]

In [None]:
# Test-set metrics comparing the true labels with the predicted labels.
print('accuracy : %.3f' % accuracy_score(Y_test2, Y_pred))
print('recall : %.3f' % recall_score(Y_test2, Y_pred))
print('f1_score : %.3f' % f1_score(Y_test2, Y_pred))

##### ROC curve 작성

In [None]:
# Class probabilities for the test set from the model tuned on the training data.
# The estimator must NOT be refitted here: fitting on (x_test, Y_test2) would
# train on the very labels being scored and overwrite the tuned model.
f_value = svm_best.predict_proba(x_test)

# ROC points computed from the second probability column (index 1).
fpr, tpr, thresholds = roc_curve(Y_test2, f_value[:, 1])
fpr, tpr, thresholds

In [None]:
# ROC curve of the SVM against the chance diagonal. The legend label names the
# model being plotted and carries the actual AUC value; plt.legend() is needed
# for the labels to be drawn at all.
plt.plot(fpr, tpr, 'o-', label="SVM (AUC=%0.3f)" % auc(fpr, tpr))
plt.plot([0, 1], [0, 1], 'k--', label="random guess")
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('Classification - Support Vector Machine')
plt.legend(loc='lower right')
plt.show()

In [None]:
# ROC curve of the SVM against the chance diagonal. The AUC value is formatted
# into the legend label, and plt.legend() is called so the labels are shown.
plt.plot(fpr, tpr, 'o-', label="SVM (AUC=%0.3f)" % auc(fpr, tpr))
plt.plot([0, 1], [0, 1], 'k--', label="random guess")
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('Classification - Support Vector Machine')
plt.legend(loc='lower right')
plt.show()

In [None]:
# Area under the ROC curve, computed from the (fpr, tpr) points.
print('AUC : %.3f' % auc(x=fpr, y=tpr))

##### TPR, FPR export (나중에 그래프 그릴 때 사용)

In [None]:
# Wrap the ROC arrays in single-column frames named after their contents.
d_tpr = pd.Series(tpr, name='tpr').to_frame()
d_fpr = pd.Series(fpr, name='fpr').to_frame()

In [None]:
# Place the tpr and fpr columns side by side in one table (tpr first).
clf_svm = pd.concat([d_tpr, d_fpr], axis="columns")
clf_svm

In [None]:
# Export the ROC points as UTF-8 CSV with '\n' line endings and no index column.
# The `line_terminator` keyword was removed in pandas 2.0 (renamed to
# `lineterminator` in 1.5), so the text is rendered first and the line endings
# are normalised by hand; this works on every pandas version and platform.
csv_text = clf_svm.to_csv(index=False).replace('\r\n', '\n')
with open('clf_svm.csv', 'w', encoding='utf-8', newline='') as csv_file:
    # newline='' disables newline translation so the '\n' endings are written as-is.
    csv_file.write(csv_text)

##### Feature importance plot

In [None]:
# SVC does not expose a `feature_importances_` attribute, so reading it raises
# AttributeError. Estimate the importances with permutation importance instead:
# the drop in test-set score when a single feature column is shuffled, averaged
# over n_repeats shuffles.
perm_result = permutation_importance(
    svm_best, x_test, Y_test2, n_repeats=5, random_state=1004, n_jobs=4
)
print("특성 중요도 : \n{}".format(perm_result.importances_mean))

##### 특성 중요도 시각화

In [None]:
def plot_feature_importances_df(model, importances=None):
    """Draw a horizontal bar chart of per-feature importances for `model`.

    Parameters
    ----------
    model : fitted estimator
        Used to obtain importances when `importances` is not supplied.
    importances : array-like, optional
        Precomputed importance values, one per feature column of `x_test`.
        When omitted, `model.feature_importances_` is used if the estimator
        has it; otherwise (e.g. for SVC, which has no such attribute) the mean
        permutation importance on the notebook-level `x_test` / `Y_test2` is
        computed.

    Notes
    -----
    Reads the notebook-level names `x_test`, `Y_test2` and `X`.
    """
    if importances is None:
        importances = getattr(model, "feature_importances_", None)
    if importances is None:
        # Model-agnostic fallback: score drop when each feature is shuffled.
        importances = permutation_importance(
            model, x_test, Y_test2, n_repeats=5, random_state=1004, n_jobs=4
        ).importances_mean

    n_features = x_test.shape[1]
    # Take the tick labels from the data itself so they always follow the real
    # column order; a hand-written list can silently mislabel the bars.
    feature_names = list(getattr(x_test, "columns", X.columns))

    plt.barh(range(n_features), importances, align='center')
    plt.yticks(np.arange(n_features), feature_names)
    plt.xlabel("Importances")
    plt.ylabel("Feature")
    plt.title("Classification - Support Vector Machine")
    plt.ylim(-1, n_features)
    # Render only after the chart has been fully assembled.
    plt.show()


plot_feature_importances_df(svm_best)