In [1]:
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.svm as svm
from sklearn.inspection import permutation_importance
from sklearn.metrics import roc_curve, precision_score, recall_score, f1_score, accuracy_score, auc
from sklearn.model_selection import GridSearchCV, KFold, train_test_split, cross_val_score, cross_validate
from sklearn.preprocessing import MinMaxScaler

%matplotlib inline

### Data Modeling

#### Normalization : MinMaxScaler

In [2]:
# Load the prepared dataset. The first CSV column appears to be a previously saved
# row index (it is displayed as 'Unnamed: 0' when read as data), so it is consumed
# as the index instead of being kept as a junk column.
d_final = pd.read_csv('d_final.csv', encoding='utf-8', index_col=0)

In [3]:
# Preview the first five rows of the loaded frame.
d_final.head(n=5)

Unnamed: 0,id,sex,intubed,pneumonia,age,pregnancy,diabetes,copd,asthma,inmsupr,...,tobacco,contact_other_covid,covid_res,icu,DEATH,d_ent,d_sym,d_last,diff_days,age_grp
0,167386,1,2,2,54,2,2,2,2,2,...,2,3,1,2,0,2020-04-06,2020-04-01,2020-06-29,89,3.0
1,0b5948,2,2,1,30,2,2,2,2,2,...,2,3,1,2,0,2020-04-17,2020-04-10,2020-06-29,80,2.0
2,0d01b5,1,2,2,60,2,1,2,2,2,...,2,3,1,2,1,2020-04-13,2020-04-13,2020-04-22,9,4.0
3,1beec8,2,2,1,47,2,1,2,2,2,...,2,3,1,1,1,2020-04-16,2020-04-16,2020-04-29,13,3.0
4,1.75E+56,2,2,2,63,2,2,2,2,2,...,2,3,1,2,0,2020-04-22,2020-04-13,2020-06-29,77,4.0


In [4]:
# Keep only the columns that will be min-max scaled in d_normal0
d_normal0 = d_final[[
    'sex', 'intubed', 'pneumonia', 'age_grp', 'pregnancy', 'diabetes',
    'copd', 'asthma', 'inmsupr', 'hypertension', 'other_disease', 'obesity',
    'cardiovascular', 'renal_chronic', 'tobacco', 'contact_other_covid',
    'covid_res', 'icu',
]]

# The targets (dependent variables) are stored separately and are not scaled
d_y = d_final[['DEATH', 'diff_days']]

In [5]:
# Min-max scale every selected feature into the [0, 1] range.
# NOTE(review): the scaler is fitted on all rows here, i.e. before the
# train/test split that happens in a later cell.
d_normal = MinMaxScaler(feature_range=(0, 1)).fit(d_normal0).transform(d_normal0)
d_normal

array([[0.        , 0.01020408, 0.5       , ..., 1.        , 0.        ,
        0.5       ],
       [1.        , 0.01020408, 0.        , ..., 1.        , 0.        ,
        0.5       ],
       [0.        , 0.01020408, 0.5       , ..., 1.        , 0.        ,
        0.5       ],
       ...,
       [1.        , 0.        , 0.        , ..., 0.5       , 1.        ,
        0.        ],
       [0.        , 0.01020408, 0.        , ..., 1.        , 1.        ,
        0.5       ],
       [0.        , 0.01020408, 0.        , ..., 1.        , 1.        ,
        0.5       ]])

#### Train, Test set 분리

In [6]:
# Split the *unscaled* features first, then fit the scaler on the training rows only,
# so that no statistics from the test rows leak into preprocessing.
# The row count and random_state are unchanged, so the train/test partition should be
# the same one the pre-scaled array would have produced.
X_train_raw, X_test_raw, Y_train, Y_test = train_test_split(
    d_normal0, d_y, shuffle=True, test_size=0.2, random_state=1004
)

train_scaler = MinMaxScaler(feature_range=(0, 1)).fit(X_train_raw)
X_train = train_scaler.transform(X_train_raw)  # ndarray, as before
X_test = train_scaler.transform(X_test_raw)    # scaled with train-set min/max

In [7]:
# Show the shapes of the four split pieces, one per line.
split_shapes = (X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)
print(*split_shapes, sep=' \n ')

(97116, 18) 
 (24279, 18) 
 (97116, 2) 
 (24279, 2)


In [8]:
# Classification target for training: the DEATH column only.
Y_train1 = Y_train["DEATH"]

### Support Vector Machine

#### SVM Hyper Parameter Tuning
- 선형/비선형 커널 중 어떤것이 나을지는 3-fold CV로 결정
- GridSearchCV로 파라미터 결정
- 코드참고 https://blog.naver.com/PostView.nhn?blogId=winddori2002&logNo=221667083964

##### 1) Linear, Non-linear kernel (3-fold CV)

In [9]:
# Disabled experiment: the linear-kernel SVC setup is wrapped in a string literal,
# so running this cell only echoes the text and executes nothing.
"""# Linear kernel
Y_train1 = Y_train.loc[:, "DEATH"]
SVM =svm.SVC(kernel = 'linear') # 선형분리 커널 """

'# Linear kernel\nY_train1 = Y_train.loc[:, "DEATH"]\nSVM =svm.SVC(kernel = \'linear\') # 선형분리 커널 '

In [10]:
# Disabled experiment (string literal, not executed): 3-fold shuffled
# cross-validation scores for the linear-kernel SVM.
"""# CV1 (linear)
scores = cross_val_score(SVM, X_train, Y_train1, cv = KFold(3, random_state=1004, shuffle=True))
scores """

'# CV1 (linear)\nscores = cross_val_score(SVM, X_train, Y_train1, cv = KFold(3, random_state=1004, shuffle=True))\nscores '

In [11]:
# Disabled experiment (string literal, not executed): tabulate cross_validate
# results and print the mean of `scores` from the linear-kernel run.
"""pd.DataFrame(cross_validate(SVM, X_train, Y_train1, cv = KFold(3, random_state=1004, shuffle=True)))
print('교차검증 평균: ', scores.mean())"""

"pd.DataFrame(cross_validate(SVM, X_train, Y_train1, cv = KFold(3, random_state=1004, shuffle=True)))\nprint('교차검증 평균: ', scores.mean())"

In [12]:
# Disabled experiment (string literal, not executed): rebinds SVM to an
# RBF-kernel SVC for the non-linear comparison.
"""# Non-Linear Kernel
SVM =svm.SVC(kernel = 'rbf')"""

"# Non-Linear Kernel\nSVM =svm.SVC(kernel = 'rbf')"

In [13]:
# Disabled experiment (string literal, not executed): 3-fold shuffled
# cross-validation scores for the RBF-kernel SVM.
"""# CV2 (NonLinear)
scores = cross_val_score(SVM, X_train, Y_train1, cv = KFold(3, random_state=1004, shuffle=True))
scores """

'# CV2 (NonLinear)\nscores = cross_val_score(SVM, X_train, Y_train1, cv = KFold(3, random_state=1004, shuffle=True))\nscores '

In [14]:
# Disabled experiment (string literal, not executed): tabulate cross_validate
# results and print the mean of `scores` from the RBF-kernel run.
# NOTE(review): the text ends with a stray `0` after the print(...) call; it would
# be a syntax error if this code were ever re-enabled.
"""pd.DataFrame(cross_validate(SVM, X_train, Y_train1, cv = KFold(3, random_state=1004, shuffle=True)))
print('교차검증 평균: ', scores.mean())0"""

"pd.DataFrame(cross_validate(SVM, X_train, Y_train1, cv = KFold(3, random_state=1004, shuffle=True)))\nprint('교차검증 평균: ', scores.mean())0"

##### 2) Hyperparameter Tuning (GridSearchCV) -- Non-Linear로 하기!

In [15]:
# Base RBF-kernel classifier; probability=True is needed later for predict_proba.
svm_model = svm.SVC(random_state=1004, probability=True, kernel='rbf')

# Candidate values searched for the regularisation strength C and the kernel width gamma.
parameters = dict(
    C=[0.01, 0.1, 1, 10, 25, 50, 100],
    gamma=[0.01, 0.1, 1, 10, 25, 50, 100],
)

In [16]:
# Exhaustive search over `parameters`, scored with shuffled 3-fold CV on 4 workers.
grid_svm = GridSearchCV(
    estimator=svm_model,
    param_grid=parameters,
    cv=KFold(n_splits=3, shuffle=True, random_state=1004),
    n_jobs=4,
)

In [None]:
# Run the grid search on the training split (DEATH as the target).
grid_svm.fit(X=X_train, y=Y_train1)

In [None]:
# Report the best hyper-parameters and their mean cross-validated score.
# The search is configured with KFold(3), so the messages say 3-fold
# (they previously claimed 5-fold and misspelled "grid").
print("Best parameters (3-fold CV, grid search): ", grid_svm.best_params_)
print("Best accuracy (3-fold CV, grid search): ", grid_svm.best_score_)

In [None]:
# All grid-search results as a table, best-ranked combination first.
cv_result_df = pd.DataFrame(grid_svm.cv_results_).sort_values(by=['rank_test_score'])

In [None]:
# Top five parameter combinations with their mean test score and rank.
cv_result_df.head().loc[:, ['params', 'mean_test_score', 'rank_test_score']]

In [None]:
# Take the estimator built with the best parameters and predict on the test split
svm_best = grid_svm.best_estimator_
Y_pred = grid_svm.predict(X_test)  # predicted labels
Y_pred

In [None]:
# Echo the selected estimator so its repr (and hyper-parameters) appears as cell output.
svm_best

#### Test set 성능 평가
- test set에서의 결과 평가

In [None]:
# Ground-truth DEATH labels for the test split
Y_test1 = Y_test["DEATH"]

In [None]:
# Test-set metrics for the predicted DEATH labels, printed to three decimals.
test_metrics = (
    ('accuracy', accuracy_score),
    ('recall', recall_score),
    ('f1_score', f1_score),
)
for metric_name, metric_fn in test_metrics:
    print('%s : %.3f' % (metric_name, metric_fn(y_true=Y_test1, y_pred=Y_pred)))

#### ROC curve 작성

In [None]:
# Class probabilities on the test split from the model fitted on the training data.
# The estimator must NOT be refitted on (X_test, Y_test1) here: doing so scores the
# model on the very rows it was trained on and inflates the ROC/AUC.
f_value = svm_best.predict_proba(X_test)

# Column 1 holds the probability of the positive class (DEATH == 1).
fpr, tpr, thresholds = roc_curve(Y_test1, f_value[:, 1])
fpr, tpr, thresholds

In [None]:
# ROC curve of the SVM on the test split.
# The legend label names the correct model (it previously said "Xgboost"), the AUC
# placeholder is actually filled in, and the legend is drawn so the labels are visible.
plt.plot(fpr, tpr, 'o-', label="SVM (AUC=%0.3f)" % auc(fpr, tpr))
plt.plot([0, 1], [0, 1], 'k--', label="random guess")
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('Classification - Support Vector Machine')
plt.legend(loc='lower right')
plt.show()

In [None]:
# ROC curve of the SVM on the test split.
# The AUC placeholder in the label is filled in and the legend is drawn so that the
# curve labels are actually shown; the x-axis label typo ("Talse") is corrected.
plt.plot(fpr, tpr, 'o-', label="SVM (AUC=%0.3f)" % auc(fpr, tpr))
plt.plot([0, 1], [0, 1], 'k--', label="random guess")
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('Classification - Support Vector Machine')
plt.legend(loc='lower right')
plt.show()


In [None]:
# Area under the ROC curve, to three decimals.
print(f'AUC : {auc(fpr, tpr):.3f}')

#### TPR, FPR export (나중에 그래프 그릴 때 쓰게)

In [None]:
# One single-column frame per ROC coordinate array.
d_tpr = pd.DataFrame(tpr, columns=['tpr'])
d_fpr = pd.DataFrame(fpr, columns=['fpr'])

In [None]:
# Place the tpr and fpr columns side by side for export.
clf_svm = pd.concat(objs=[d_tpr, d_fpr], axis='columns')
clf_svm

In [None]:
# Export the ROC coordinates for later plotting.
# `line_terminator` was renamed to `lineterminator` in pandas 1.5 and the old
# spelling was removed in pandas 2.0, where it raises a TypeError.
# NOTE(review): the destination is an absolute, machine-specific path.
clf_svm.to_csv('C://workspaces//AI//Final_PJT//final pjt//clf_svm.csv', encoding='utf-8', index=False, lineterminator='\n')

#### Feature importance plot

In [None]:
# An SVC has no `feature_importances_` attribute, so reading it raises AttributeError.
# Use model-agnostic permutation importance on the test split instead: the mean drop
# in score when each feature column is shuffled.
# This re-scores the model n_repeats times per feature, so it can take a while.
perm_result = permutation_importance(
    svm_best, X_test, Y_test1, n_repeats=5, random_state=1004, n_jobs=4
)
print("특성 중요도 : \n{}".format(perm_result.importances_mean))

In [None]:
# Visualise the feature importances


def plot_feature_importances_df(model, X=None, y=None, feature_names=None, n_repeats=5):
    """Draw a horizontal bar chart of per-feature importances for ``model``.

    ``model.feature_importances_`` is used when the estimator exposes it. Estimators
    without that attribute (such as the SVC used in this notebook) fall back to
    permutation importance evaluated on ``X`` / ``y``.

    Parameters
    ----------
    model : fitted estimator
        The classifier whose feature importances are plotted.
    X : array-like, optional
        Feature matrix; defaults to the notebook-level ``X_test``.
    y : array-like, optional
        Labels for the permutation fallback; defaults to the notebook-level ``Y_test1``.
    feature_names : sequence of str, optional
        Y-axis tick labels; defaults to the 18 feature columns selected for scaling.
    n_repeats : int, default 5
        Number of shuffles per feature in the permutation fallback.
    """
    if X is None:
        X = X_test
    if feature_names is None:
        feature_names = ['sex', 'intubed', 'pneumonia', 'age_grp', 'pregnancy', 'diabetes',
                         'copd', 'asthma', 'inmsupr', 'hypertension', 'other_disease', 'obesity',
                         'cardiovascular', 'renal_chronic', 'tobacco', 'contact_other_covid',
                         'covid_res', 'icu']

    if hasattr(model, 'feature_importances_'):
        importances = model.feature_importances_
    else:
        # Model-agnostic fallback: mean score drop when each column is shuffled.
        if y is None:
            y = Y_test1
        importances = permutation_importance(
            model, X, y, n_repeats=n_repeats, random_state=1004
        ).importances_mean

    n_features = X.shape[1]
    plt.barh(range(n_features), importances, align='center')
    plt.yticks(np.arange(n_features), feature_names)
    plt.xlabel("Importances")
    plt.ylabel("Feature")
    plt.title("Classification - Support Vector Machine")
    plt.ylim(-1, n_features)
    # Kept inside the function so the figure is rendered when the function is called
    # (it previously ran once at definition time, before anything was drawn).
    plt.show()


plot_feature_importances_df(svm_best)