In [1]:
!pip install hyperopt
!pip install imblearn
!pip install missingno
import missingno as msno
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

import warnings
from IPython.display import Image

%config Completer.use_jedi = False
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

mpl.rc('font', family = 'D2coding')
mpl.rc('axes', unicode_minus = False)

sns.set(font = 'D2coding', rc = {'axes.unicode_minus':False}, style = 'darkgrid')
plt.rc('figure', figsize = (10, 8))

warnings.filterwarnings('ignore')



In [2]:
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, roc_curve, classification_report

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import Binarizer


# 평가지표 분류
def get_clf_eval(y_test, pred = None, pred_proba = None):
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    roc_auc = roc_auc_score(y_test, pred_proba)
    print('오차 행렬')
    print(confusion)
    print('정확도 : {:.4f}, 정밀도 : {:.4f}, 재현율 : {:.4f}, F1 : {:.4f}, AUC : {:.4f}'.
          format(accuracy, precision, recall, f1, roc_auc))

# 재현율과 정밀도의 시각화
def precision_recall_curve_plot(y_test, pred_proba_c1):
    precisions, recalls, thresholds = precision_recall_curve(y_test, pred_proba_c1)
    
    plt.figure(figsize = (10, 7))
    plt.rc('font', family = 'D2coding')
    threshold_boundary = thresholds.shape[0]
    a = thresholds[precisions[0:threshold_boundary] == recalls[0:threshold_boundary]]
    loc = np.where(thresholds == a[0])[0][0]
    plt.plot(thresholds, precisions[0:threshold_boundary], linestyle = '--', label = 'precision')
    plt.plot(thresholds, recalls[0:threshold_boundary], label = 'recall')
    plt.scatter(thresholds[loc], precisions[loc], c = 'black', s = 60)
    plt.text(0.02 + thresholds[loc], precisions[loc], '임계값 : {:.3f}'.format(thresholds[loc]), size = 15)
    
    start, end = plt.xlim()
    plt.xticks(np.round(np.arange(start, end, 0.1), 2), size = 15)
    plt.xlabel('Threshold value', size = 15); plt.ylabel('Precision and Recall value', size = 15)
    plt.legend()
    plt.rc('legend', fontsize = 15)
    plt.rc('ytick', labelsize = 15)
    plt.show()
    

# 임계값 조정 함수
def get_clf_eval_thres(y_test, pred_po, threshold, f1_show=False, auc_show=False):
    eval_df = pd.DataFrame()
    
    for thres in threshold:
        # threshold에 따른 예측 분류 값
        binarizer = Binarizer(threshold = thres)
        binarizer.fit(pred_po)
        thres_pred = binarizer.transform(pred_po)
        
        # 평가지표  
        accuracy = accuracy_score(y_test, thres_pred)
        precision = precision_score(y_test, thres_pred)
        recall = recall_score(y_test, thres_pred)
        f1 = f1_score(y_test, thres_pred)
        auc = roc_auc_score(y_test, pred_po)
        
        # 데이터 프레임 형태
        eval_lst = np.array([accuracy, precision, recall, f1, auc]).reshape(-1,1)
        temp = pd.DataFrame(eval_lst, columns=[thres], 
                            index = ["정확도", "정밀도", "재현율", "F1스코어", "AUC"])
        eval_df = pd.concat([eval_df,temp], axis=1)
        
    eval_df.columns.names = ["임계값"]
    
    if f1_show == False:
        eval_df.drop("F1스코어", axis=0, inplace=True)
        
    if auc_show == False:
        eval_df.drop("AUC", axis=0, inplace=True)
        
    return round(eval_df, 4)

In [3]:
sample = pd.read_csv('C:/k_digital/source/data/open/sample_submission.csv')
test = pd.read_csv('C:/k_digital/source/data/open/test.csv')
train = pd.read_csv('C:/k_digital/source/data/open/train.csv')
oil_info = pd.read_csv('C:/k_digital/source/data/open/data_info.csv')

In [6]:
train  = train[['ID', 'COMPONENT_ARBITRARY', 'ANONYMOUS_1', 'YEAR' , 'ANONYMOUS_2', 
   'AG', 'CO', 'CR', 'CU', 'FE', 'H2O', 
   'MN', 'MO', 'NI', 'PQINDEX', 'TI', 'V', 'V40', 'ZN', 'Y_LABEL']]
train

Unnamed: 0,ID,COMPONENT_ARBITRARY,ANONYMOUS_1,YEAR,ANONYMOUS_2,AG,CO,CR,CU,FE,H2O,MN,MO,NI,PQINDEX,TI,V,V40,ZN,Y_LABEL
0,TRAIN_00000,2,1486,4,200,0,0,13,78,888,0.0,16,1,6,8504,5,0,154.0,75,0
1,TRAIN_00001,1,1350,14,375,0,0,0,31,2,0.0,0,0,0,19,0,0,44.0,652,0
2,TRAIN_00002,1,2415,8,200,0,0,1,2,4,0.0,0,0,0,17,0,0,72.6,412,1
3,TRAIN_00003,2,7389,3,200,0,0,0,1,37,0.0,1,0,0,44,0,0,133.3,7,0
4,TRAIN_00004,2,3954,8,200,0,0,0,0,71,0.0,0,0,0,217,0,0,133.1,128,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14090,TRAIN_14090,2,1616,7,200,0,0,0,3,23,0.0,0,0,0,35,0,0,135.4,16,0
14091,TRAIN_14091,0,2784,6,200,0,0,0,2,12,0.0,0,224,0,9,0,0,117.5,1408,0
14092,TRAIN_14092,2,1788,1,550,0,0,4,7,415,0.0,7,10,1,645,0,0,54.0,1301,0
14093,TRAIN_14093,1,2498,2,550,0,0,0,170,19,0.0,0,1,0,11,0,0,44.3,652,0


In [7]:
categorical_features = ['COMPONENT_ARBITRARY', 'YEAR']

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
for col in categorical_features:    
    train[col] = le.fit_transform(train[col])
    
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
scaled = ss.fit_transform(train.drop(['ID','COMPONENT_ARBITRARY', 'YEAR', 'Y_LABEL'], axis = 1))
columns = ['ANONYMOUS_1', 'ANONYMOUS_2', 
   'AG', 'CO', 'CR', 'CU', 'FE', 'H2O', 
   'MN', 'MO', 'NI', 'PQINDEX', 'TI', 'V', 'V40', 'ZN']
scaled = pd.DataFrame(scaled, columns = columns)
scaled.head()

onehot = train[['ID','COMPONENT_ARBITRARY', 'YEAR']]
scaled = pd.concat([onehot, scaled], axis = 1).set_index('ID')
scaled

Unnamed: 0_level_0,COMPONENT_ARBITRARY,YEAR,ANONYMOUS_1,ANONYMOUS_2,AG,CO,CR,CU,FE,H2O,MN,MO,NI,PQINDEX,TI,V,V40,ZN
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
TRAIN_00000,2,4,-0.393763,-0.340760,-0.150214,-0.089633,0.339245,0.336858,1.331290,-0.041588,1.186914,-0.384284,1.384414,5.293270,0.622282,-0.10655,0.899892,-0.966002
TRAIN_00001,1,14,-0.426022,-0.022576,-0.150214,-0.089633,-0.115388,-0.027612,-0.330406,-0.041588,-0.250456,-0.400998,-0.191804,-0.259244,-0.102635,-0.10655,-1.317376,0.119147
TRAIN_00002,1,8,-0.173409,-0.340760,-0.150214,-0.089633,-0.080416,-0.252497,-0.326655,-0.041588,-0.250456,-0.400998,-0.191804,-0.260552,-0.102635,-0.10655,-0.740886,-0.332215
TRAIN_00003,2,3,1.006399,-0.340760,-0.150214,-0.089633,-0.115388,-0.260252,-0.264764,-0.041588,-0.160621,-0.400998,-0.191804,-0.242884,-0.102635,-0.10655,0.482642,-1.093888
TRAIN_00004,2,8,0.191634,-0.340760,-0.150214,-0.089633,-0.115388,-0.268007,-0.200996,-0.041588,-0.250456,-0.400998,-0.191804,-0.129674,-0.102635,-0.10655,0.478611,-0.866326
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TRAIN_14090,2,7,-0.362928,-0.340760,-0.150214,-0.089633,-0.115388,-0.244743,-0.291021,-0.041588,-0.250456,-0.400998,-0.191804,-0.248773,-0.102635,-0.10655,0.524972,-1.076961
TRAIN_14091,0,6,-0.085884,-0.340760,-0.150214,-0.089633,-0.115388,-0.252497,-0.311651,-0.041588,-0.250456,3.342831,-0.191804,-0.265787,-0.102635,-0.10655,0.164162,1.540935
TRAIN_14092,2,1,-0.322130,0.295608,-0.150214,-0.089633,0.024499,-0.213724,0.444177,-0.041588,0.378393,-0.233862,0.070899,0.150406,-0.102635,-0.10655,-1.115806,1.339703
TRAIN_14093,1,2,-0.153722,0.295608,-0.150214,-0.089633,-0.115388,1.050289,-0.298523,-0.041588,-0.250456,-0.384284,-0.191804,-0.264479,-0.102635,-0.10655,-1.311328,0.119147


## train, test

In [8]:
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(scaled, train['Y_LABEL'], test_size = 0.2, random_state = 2022, stratify = train['Y_LABEL'])

In [9]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_over, y_train_over = smote.fit_resample(X_train, y_train)
print("SMOTE 적용 전 학습용 피처/레이블 데이터 세트 : ", X_train.shape, y_train.shape)
print('SMOTE 적용 후 학습용 피처/레이블 데이터 세트 :', X_train_over.shape, y_train_over.shape)
print('SMOTE 적용 후 값의 분포 :\n',pd.Series(y_train_over).value_counts())

SMOTE 적용 전 학습용 피처/레이블 데이터 세트 :  (11276, 18) (11276,)
SMOTE 적용 후 학습용 피처/레이블 데이터 세트 : (20628, 18) (20628,)
SMOTE 적용 후 값의 분포 :
 0    10314
1    10314
Name: Y_LABEL, dtype: int64


In [15]:
X_train

Unnamed: 0_level_0,COMPONENT_ARBITRARY,YEAR,ANONYMOUS_1,ANONYMOUS_2,AG,CO,CR,CU,FE,H2O,MN,MO,NI,PQINDEX,TI,V,V40,ZN
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
TRAIN_06576,0,10,-0.338734,-0.340760,-0.150214,-0.089633,-0.010473,-0.252497,-0.287270,-0.041588,-0.250456,-0.400998,-0.191804,-0.262516,-0.102635,-0.10655,-0.243009,1.621804
TRAIN_05159,2,8,-0.278487,-0.340760,-0.150214,-0.089633,0.024499,-0.205969,-0.020948,-0.041588,0.198722,-0.400998,0.070899,0.507049,-0.102635,-0.10655,0.383873,-1.052513
TRAIN_06330,2,2,-0.464922,-0.002576,-0.150214,-0.089633,-0.045445,-0.244743,-0.180366,-0.041588,-0.160621,-0.350857,-0.191804,-0.245501,-0.102635,-0.10655,-0.988817,1.270119
TRAIN_06953,1,1,-0.288449,-0.340760,-0.150214,-0.089633,-0.080416,0.546235,-0.306025,-0.041588,0.019051,-0.384284,-0.191804,-0.261207,-0.102635,-0.10655,-1.539102,-0.871968
TRAIN_00428,0,2,-0.327349,-0.340760,-0.150214,-0.089633,-0.115388,-0.252497,-0.304149,-0.041588,-0.160621,3.209123,-0.191804,-0.261207,-0.102635,-0.10655,0.458454,1.177966
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TRAIN_01120,2,7,-0.250260,-0.340760,-0.150214,-0.089633,-0.010473,-0.198214,0.080329,-0.041588,0.019051,-0.317430,-0.191804,-0.197731,-0.102635,-0.10655,0.645914,-0.676378
TRAIN_12553,1,8,-0.176967,-0.340760,-0.150214,-0.089633,-0.115388,-0.182705,-0.319153,-0.041588,-0.250456,-0.400998,-0.191804,-0.267096,-0.102635,-0.10655,-1.301250,0.186851
TRAIN_01790,2,6,-0.323316,-0.340760,-0.150214,-0.089633,0.059471,-0.260252,0.421671,-0.041588,0.108886,-0.400998,0.070899,0.097400,0.477299,-0.10655,0.494737,-1.052513
TRAIN_07443,1,4,-0.418432,-0.340760,-0.150214,-0.089633,-0.115388,1.050289,-0.328531,-0.041588,-0.250456,-0.400998,-0.191804,-0.262516,-0.102635,-0.10655,-1.327454,0.051442


In [10]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# 개별 ML 모델을 위한 Classifier 생성.
knn_clf  = KNeighborsClassifier(n_neighbors=4)
rf_clf = RandomForestClassifier(n_estimators=100, random_state=0)
dt_clf = DecisionTreeClassifier()
ada_clf = AdaBoostClassifier(n_estimators=100)

# 최종 Stacking 모델을 위한 Classifier생성. 
lr_final = LogisticRegression(C=10)

# 개별 모델들을 학습. 
knn_clf.fit(X_train, y_train)
rf_clf.fit(X_train , y_train)
dt_clf.fit(X_train , y_train)
ada_clf.fit(X_train, y_train)

# 학습된 개별 모델들이 각자 반환하는 예측 데이터 셋을 생성하고 개별 모델의 정확도 측정. 
knn_pred = knn_clf.predict(X_test)
rf_pred = rf_clf.predict(X_test)
dt_pred = dt_clf.predict(X_test)
ada_pred = ada_clf.predict(X_test)

print('KNN 정확도: {0:.4f}'.format(accuracy_score(y_test, knn_pred)))
print('랜덤 포레스트 정확도: {0:.4f}'.format(accuracy_score(y_test, rf_pred)))
print('결정 트리 정확도: {0:.4f}'.format(accuracy_score(y_test, dt_pred)))
print('에이다부스트 정확도: {0:.4f}'.format(accuracy_score(y_test, ada_pred)))
print('KNN macro macro f1 score : ', f1_score(y_test, knn_pred, average = 'macro'))
print('랜덤 포레스트  macro f1 score : ', f1_score(y_test, rf_pred, average = 'macro'))
print('결정 트리 macro f1 score : ', f1_score(y_test, dt_pred, average = 'macro'))
print('에이다부스트 macro f1 score : ', f1_score(y_test, ada_pred, average = 'macro'))

KNN 정확도: 0.9117
랜덤 포레스트 정확도: 0.9149
결정 트리 정확도: 0.8531
에이다부스트 정확도: 0.9138
KNN macro macro f1 score :  0.4924272389449258
랜덤 포레스트  macro f1 score :  0.4938655814231851
결정 트리 macro f1 score :  0.5303920180010236
에이다부스트 macro f1 score :  0.48155242812867777


In [11]:
pred = np.array([knn_pred, rf_pred, dt_pred, ada_pred])

# transpose를 이용해 행과 열의 위치 교환. 컬럼 레벨로 각 알고리즘의 예측 결과를 피처로 만듦. 
pred = np.transpose(pred)

lr_final.fit(pred, y_test)
final = lr_final.predict(pred)
print('최종 메타 모델의 예측 정확도: {0:.4f}'.format(accuracy_score(y_test , final)))

최종 메타 모델의 예측 정확도: 0.9145


In [57]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
import re

# 개별 기반 모델에서 최종 메타 모델이 사용할 학습 및 테스트용 데이터를 생성하기 위한 함수. 
def get_stacking_base_datasets(model, X_train_n, y_train_n, X_test_n, n_folds ):
    # 지정된 n_folds값으로 KFold 생성.
    kf = KFold(n_splits=n_folds, shuffle=False)
    #추후에 메타 모델이 사용할 학습 데이터 반환을 위한 넘파이 배열 초기화 
    train_fold_pred = np.zeros((X_train_n.shape[0] ,1 ))
    test_pred = np.zeros((X_test_n.shape[0],n_folds))

    for folder_counter , (train_index, valid_index) in enumerate(kf.split(X_train_n)):
        #입력된 학습 데이터에서 기반 모델이 학습/예측할 폴드 데이터 셋 추출 

        X_tr = X_train_n[train_index] 
        y_tr = y_train_n[train_index] 
        X_te = X_train_n[valid_index]  

        #폴드 세트 내부에서 다시 만들어진 학습 데이터로 기반 모델의 학습 수행.
        model.fit(X_tr , y_tr)       
        #폴드 세트 내부에서 다시 만들어진 검증 데이터로 기반 모델 예측 후 데이터 저장.
        train_fold_pred[valid_index, :] = model.predict(X_te).reshape(-1,1)
        #입력된 원본 테스트 데이터를 폴드 세트내 학습된 기반 모델에서 예측 후 데이터 저장. 
        test_pred[:, folder_counter] = model.predict(X_test_n)
   
    # 폴드 세트 내에서 원본 테스트 데이터를 예측한 데이터를 평균하여 테스트 데이터로 생성 
    test_pred_mean = np.mean(test_pred, axis=1).reshape(-1,1)    

    #train_fold_pred는 최종 메타 모델이 사용하는 학습 데이터, test_pred_mean은 테스트 데이터
    return train_fold_pred , test_pred_mean

knn_train, knn_test = get_stacking_base_datasets(knn_clf, X_train, y_train, X_test, 7)
rf_train, rf_test = get_stacking_base_datasets(rf_clf, X_train, y_train, X_test, 7)
dt_train, dt_test = get_stacking_base_datasets(dt_clf, X_train, y_train, X_test,  7)    
ada_train, ada_test = get_stacking_base_datasets(ada_clf, X_train, y_train, X_test, 7)

Stack_final_X_train = np.concatenate((knn_train, rf_train, dt_train, ada_train), axis=1)
Stack_final_X_test = np.concatenate((knn_test, rf_test, dt_test, ada_test), axis=1)
print('원본 학습 피처 데이터 Shape:',X_train.shape, '원본 테스트 피처 Shape:',X_test.shape)
print('스태킹 학습 피처 데이터 Shape:', Stack_final_X_train.shape,
      '스태킹 테스트 피처 데이터 Shape:',Stack_final_X_test.shape)

KeyError: "None of [Int64Index([ 1611,  1612,  1613,  1614,  1615,  1616,  1617,  1618,  1619,\n             1620,\n            ...\n            11266, 11267, 11268, 11269, 11270, 11271, 11272, 11273, 11274,\n            11275],\n           dtype='int64', length=9665)] are in the [columns]"

In [68]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=7, shuffle=False)
train_index, valid_index in kf.split(X_train)

train_index

array([   0,    1,    2, ..., 9663, 9664, 9665])

In [20]:
train_index, valid_index in kf.split(X_train)

(array([ 1611,  1612,  1613, ..., 11273, 11274, 11275]), False)

In [47]:
a = []
for i in train_index:
    a.append(i)
a = list(map(str, a))
a
# a = np.char.lstrip(np.array(list(map(str,a))))
# a

['1611',
 '1612',
 '1613',
 '1614',
 '1615',
 '1616',
 '1617',
 '1618',
 '1619',
 '1620',
 '1621',
 '1622',
 '1623',
 '1624',
 '1625',
 '1626',
 '1627',
 '1628',
 '1629',
 '1630',
 '1631',
 '1632',
 '1633',
 '1634',
 '1635',
 '1636',
 '1637',
 '1638',
 '1639',
 '1640',
 '1641',
 '1642',
 '1643',
 '1644',
 '1645',
 '1646',
 '1647',
 '1648',
 '1649',
 '1650',
 '1651',
 '1652',
 '1653',
 '1654',
 '1655',
 '1656',
 '1657',
 '1658',
 '1659',
 '1660',
 '1661',
 '1662',
 '1663',
 '1664',
 '1665',
 '1666',
 '1667',
 '1668',
 '1669',
 '1670',
 '1671',
 '1672',
 '1673',
 '1674',
 '1675',
 '1676',
 '1677',
 '1678',
 '1679',
 '1680',
 '1681',
 '1682',
 '1683',
 '1684',
 '1685',
 '1686',
 '1687',
 '1688',
 '1689',
 '1690',
 '1691',
 '1692',
 '1693',
 '1694',
 '1695',
 '1696',
 '1697',
 '1698',
 '1699',
 '1700',
 '1701',
 '1702',
 '1703',
 '1704',
 '1705',
 '1706',
 '1707',
 '1708',
 '1709',
 '1710',
 '1711',
 '1712',
 '1713',
 '1714',
 '1715',
 '1716',
 '1717',
 '1718',
 '1719',
 '1720',
 '1721',
 

In [23]:
X_train[train_index]

KeyError: "None of [Int64Index([ 1611,  1612,  1613,  1614,  1615,  1616,  1617,  1618,  1619,\n             1620,\n            ...\n            11266, 11267, 11268, 11269, 11270, 11271, 11272, 11273, 11274,\n            11275],\n           dtype='int64', length=9665)] are in the [columns]"

In [22]:
valid_index

array([   0,    1,    2, ..., 1608, 1609, 1610])

In [None]:
for folder_counter , (train_index, valid_index) in enumerate(kf.split(X_train)):
    #입력된 학습 데이터에서 기반 모델이 학습/예측할 폴드 데이터 셋 추출 
    X_tr = X_train[train_index] 
    y_tr = y_train[train_index] 
    X_te = X_train[valid_index]  