In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from matplotlib import rc  
rc('font', family='NanumGothic') 			
plt.rcParams['axes.unicode_minus'] = False 

from glob import glob
from tqdm import tqdm
import warnings
warnings.filterwarnings(action='ignore')

In [2]:
# Single seed reused for NumPy's global RNG and for every estimator's
# `random_state` argument further down.
RANDOM_STATE = 42
# Fix: `np.seed = 42` merely attached an unused attribute to the numpy module
# and left the global RNG unseeded; np.random.seed() actually seeds it.
np.random.seed(RANDOM_STATE)
# Base directory of the data files (relative path).
DATA_PATH = "../data_0103/"

# One training CSV per survey year.
PATH_2017 = DATA_PATH + "train/KNOW_2017.csv"
PATH_2018 = DATA_PATH + "train/KNOW_2018.csv"
PATH_2019 = DATA_PATH + "train/KNOW_2019.csv"
PATH_2020 = DATA_PATH + "train/KNOW_2020.csv"

paths = [PATH_2017, PATH_2018, PATH_2019, PATH_2020]

# Load every file into its own DataFrame; the list order follows `paths`
# (2017 -> 2020).
know_train = list(map(pd.read_csv, paths))

# 전처리
* baseline 모델과 동일하게 전처리
    - 공백(' ')은 '0'으로 대체
    - object column은 LabelEncoder를 이용해 정수로 변환
    - 학습된 encoder는 연도별, 컬럼별로 dictionary에 저장

In [3]:
# Replace cells that contain exactly one space with the string '0' in every
# yearly frame.
# The replacement is done on the DataFrame itself rather than via
# `df[col].replace(..., inplace=True)`: that chained form operates on a
# temporary Series and silently leaves `df` untouched when pandas
# Copy-on-Write is active.
for df in know_train:
    df.replace(' ', '0', inplace=True)

In [4]:
from sklearn.preprocessing import LabelEncoder
years = ['2017', '2018', '2019', '2020']

# year -> {column name -> fitted LabelEncoder}; only columns that could not be
# converted to int get an entry.
year_encoder = {}

for year, df in zip(years, know_train):
    print(year)
    encoders = {}

    for col in df.columns:
        if col == 'idx':
            # NOTE(review): "HEY" looks like a leftover debug marker; it is kept
            # so the recorded cell output stays in sync with the code.
            print("HEY")
            continue

        try:
            # Columns whose every value converts cleanly become integer columns.
            df[col] = df[col].map(int)
        except (ValueError, TypeError, OverflowError):
            # Only the failures int() itself can raise are handled here, so that
            # KeyboardInterrupt or unrelated bugs are no longer swallowed by a
            # bare `except:`. Such columns are label-encoded from their string
            # form and the fitted encoder is kept for later reuse.
            encoder = LabelEncoder()
            df[col] = df[col].map(str)
            df[col] = encoder.fit_transform(df[col])
            encoders[col] = encoder

    year_encoder[year] = encoders

2017
HEY
2018
HEY
2019
HEY
2020
HEY


# X, y 구분 및 모델 학습

* random forest를 이용해 연도별 feature importance(`feature_importances_`) 측정

In [5]:
# Per-year split: every column between the leading idx column and the last
# column is a feature; the last column is the target.
train_data = {
    year: {
        'X': df.iloc[:, 1:-1],  # exclude idx
        'y': df.iloc[:, -1],
    }
    for year, df in zip(years, know_train)
}

In [6]:
# Inspect the 2017 entry: a dict holding the feature frame under 'X' and the
# target series under 'y'.
train_data['2017']

{'X':       aq1_1  aq1_2  aq2_1  aq2_2  aq3_1  aq3_2  aq4_1  aq4_2  aq5_1  aq5_2  \
 0         3      3      3      3      3      3      4      4      3      4   
 1         4      5      4      5      3      4      3      4      3      4   
 2         3      4      3      4      3      4      5      6      4      5   
 3         3      3      3      3      3      5      4      5      4      6   
 4         4      5      3      4      3      4      4      5      3      4   
 ...     ...    ...    ...    ...    ...    ...    ...    ...    ...    ...   
 9481      3      5      2      4      3      3      2      2      2      3   
 9482      5      5      5      5      5      5      3      4      4      5   
 9483      3      3      4      6      3      3      4      5      4      5   
 9484      3      5      3      5      4      5      3      4      3      5   
 9485      3      4      3      4      3      4      3      4      3      4   
 
       ...  bq36  bq37  bq38  bq38_1  bq39_1 

In [7]:
from sklearn.ensemble import RandomForestClassifier

# year -> RandomForestClassifier fitted on that year's data
rf_models = {}

for year in tqdm(years):
    model = RandomForestClassifier(
        n_estimators=100,
        random_state=RANDOM_STATE,
        n_jobs=8,
    )
    # fit() returns the estimator itself, so the fitted model can be stored directly.
    rf_models[year] = model.fit(train_data[year]['X'], train_data[year]['y'])

100%|██████████| 4/4 [00:11<00:00,  2.93s/it]


In [8]:
# Print, for each year, the ten features with the highest random-forest importance.
for i, year in enumerate(years):
    print(f"{year} top 10 ranking features")

    importance = rf_models[year].feature_importances_
    # argsort is ascending; reverse it so the most important feature comes first.
    indices = np.argsort(importance)[::-1]
    # Same column slice that was used to build the training features.
    features = know_train[i].columns[1:-1]

    # Reorder names and scores once instead of on every inner iteration.
    ranked_names = features[indices]
    ranked_scores = importance[indices]
    for ix in range(10):
        print(f"{ix+1:2}. feature {ranked_names[ix]:7} ({ranked_scores[ix]:.4f})")

    print("------------------------------")

2017 top 10 ranking features
 1. feature bq1     (0.0213)
 2. feature bq38_1  (0.0185)
 3. feature bq31    (0.0171)
 4. feature bq41_3  (0.0162)
 5. feature bq4_1a  (0.0159)
 6. feature bq41_1  (0.0152)
 7. feature bq19_1  (0.0141)
 8. feature bq37    (0.0139)
 9. feature bq3     (0.0111)
10. feature bq2     (0.0103)
------------------------------
2018 top 10 ranking features
 1. feature bq1     (0.0266)
 2. feature bq37_1  (0.0202)
 3. feature bq4_1a  (0.0201)
 4. feature bq30    (0.0191)
 5. feature bq36    (0.0175)
 6. feature bq41_2  (0.0166)
 7. feature bq41_1  (0.0163)
 8. feature bq28_1  (0.0155)
 9. feature bq40    (0.0154)
10. feature bq3     (0.0131)
------------------------------
2019 top 10 ranking features
 1. feature bq1     (0.0235)
 2. feature bq4_1a  (0.0188)
 3. feature bq27_1  (0.0178)
 4. feature bq26    (0.0169)
 5. feature bq31_2  (0.0151)
 6. feature bq30    (0.0150)
 7. feature bq31_1  (0.0145)
 8. feature bq20_1  (0.0132)
 9. feature bq3     (0.0116)
10. featur