# Train & Test 데이터 불러오기

각 년도별 DataFrame을 리스트에 append합니다.

리스트 인덱스별로

0: 2017년도 데이터   

1: 2018년도 데이터 

2: 2019년도 데이터 

3: 2020년도 데이터

입니다

In [1]:
import pandas as pd
import numpy as np
from glob import glob
from tqdm import tqdm
import warnings

import matplotlib.pyplot as plt
import seaborn as sns

# Single seed shared by NumPy's global RNG and the estimators' `random_state`.
RANDOM_STATE = 42
# Seed NumPy's global random generator. Assigning to `np.seed` would only
# create an unused attribute on the numpy module and seed nothing.
np.random.seed(RANDOM_STATE)
# Root directory of the dataset; train/test CSV paths are built from it.
DATA_PATH = "../data_0119/"

# Suppress every warning for the rest of the session.
warnings.filterwarnings(action='ignore')
# Training CSVs, one file per year from 2017 to 2020.
PATH_2017 = f"{DATA_PATH}train/KNOW_2017.csv"
PATH_2018 = f"{DATA_PATH}train/KNOW_2018.csv"
PATH_2019 = f"{DATA_PATH}train/KNOW_2019.csv"
PATH_2020 = f"{DATA_PATH}train/KNOW_2020.csv"

paths = [PATH_2017, PATH_2018, PATH_2019, PATH_2020]
# know_train[i] is the DataFrame read from paths[i] (index 0 -> 2017 file).
know_train = list(map(pd.read_csv, paths))

# Test CSVs, in the same year order as the training files.
TEST_PATH_2017 = f"{DATA_PATH}test/KNOW_2017_test.csv"
TEST_PATH_2018 = f"{DATA_PATH}test/KNOW_2018_test.csv"
TEST_PATH_2019 = f"{DATA_PATH}test/KNOW_2019_test.csv"
TEST_PATH_2020 = f"{DATA_PATH}test/KNOW_2020_test.csv"

TEST_PATHs = [TEST_PATH_2017, TEST_PATH_2018, TEST_PATH_2019, TEST_PATH_2020]
# know_test[i] is the DataFrame read from TEST_PATHs[i].
know_test = list(map(pd.read_csv, TEST_PATHs))

In [6]:
# Column names to drop for each year. Every list is a consecutive run of
# "<prefix><N>_2" names, so it is generated instead of spelled out.
skip_cols_2017 = [f'aq{n}_2' for n in range(1, 42)]    # aq1_2 .. aq41_2
skip_cols_2018 = []                                    # nothing dropped
skip_cols_2019 = [f'kq{n}_2' for n in range(1, 34)]    # kq1_2 .. kq33_2
skip_cols_2020 = [f'saq{n}_2' for n in range(1, 45)]   # saq1_2 .. saq44_2

# Ordered 2017, 2018, 2019, 2020.
skip_cols_list = [skip_cols_2017, skip_cols_2018, skip_cols_2019, skip_cols_2020]

# Drop each year's skip columns from the train frame and the test frame
# stored at the same list position.
for position, unwanted in enumerate(skip_cols_list):
    know_train[position] = know_train[position].drop(columns=unwanted)
    know_test[position] = know_test[position].drop(columns=unwanted)

# 전처리 

`data_0119`는 이미 전처리된 데이터

In [7]:
years = ['2017', '2018', '2019', '2020']
# Print the (rows, columns) shape of each year's training frame. Labels are
# paired with frames directly rather than hard-coding the frame count and
# the base year.
for year, df in zip(years, know_train):
    print(f"{year}: {df.shape}")

2017: (9486, 230)
2018: (9072, 241)
2019: (8555, 218)
2020: (8122, 246)


## 라벨 인코딩

숫자로 변환할 수 있는 컬럼은 라벨 인코딩을 사용하지 않았습니다.

string이나 object컬럼은 라벨인코더를 이용해 변환하였으며 추후 test셋에 사용해야하기 때문에 년도별, 컬럼별로 dictionary를 이용해 저장하였습니다

In [8]:
from sklearn.preprocessing import LabelEncoder
years = ['2017', '2018', '2019', '2020']

# year -> {column name -> fitted LabelEncoder}. Kept so the test set can
# later be encoded with the same category codes.
year_encoder = {}

for year, df in zip(years, know_train):
    print(year)
    encoders = {}

    for col in df.columns:
        if col == 'idx':
            # The id column is left untouched; the print is a debug marker
            # showing that the skip branch was taken.
            print("HEHE")
            continue

        try:
            # Columns whose every value converts to int are stored as ints.
            df[col] = df[col].map(int)
        except (ValueError, TypeError, OverflowError):
            # int() rejected at least one value (non-numeric text, NaN,
            # None, infinity): stringify the column and label-encode it.
            # Only these errors are caught so that interrupts and
            # unrelated failures still surface.
            encoder = LabelEncoder()
            df[col] = encoder.fit_transform(df[col].map(str))
            encoders[col] = encoder

    year_encoder[year] = encoders

2017
HEHE
2018
HEHE
2019
HEHE
2020
HEHE


In [9]:
# encode test data
years = ['2017', '2018', '2019', '2020']

for year, df in zip(years, know_test):
    print(year)
    encoders = {}
    
    for col in df.columns:
        
        try:
            df[col] = df[col].map(int)
        except:
            encoder = year_encoder[year][col]
            df[col] = df[col].map(str)
            category_map = {category: idx for idx, category in enumerate(encoder.classes_)}
            df[col] = df[col].apply(lambda x: category_map[x] if x in category_map else -1) # train set에서 보지못한 카테고리변수 -1(UNK) 처리
            

2017
2018
2019
2020


# X, y 구분 및 모델 학습

이번 대회에서 맞춰야 할 값은 knowcode입니다.

ID와 knowcode를 제외한 나머지 feature를 X, knowcode를 정답 y로 두어 모델을 학습하였습니다.

베이스라인에서는 의사결정나무와 랜덤포레스트를 선정하였습니다

df.drop(['idx','knowcode', 'description'], axis=1)

In [10]:
# Per-year training inputs: 'X' is every column except idx, knowcode and
# description; 'y' is the knowcode column.
train_data = {
    year: {
        'X': frame.drop(columns=['idx', 'knowcode', 'description']),
        'y': frame['knowcode'],
    }
    for year, frame in zip(years, know_train)
}

# Per-year test inputs, restricted to (and ordered like) the training
# feature columns of the same year.
test_data = {}
for year, df in zip(years, know_test):
    print(year)
    train_columns = train_data[year]['X'].columns
    test_features = df[train_columns]
    test_data[year] = {'X': test_features}

2017
2018
2019
2020


In [11]:
# Show the train and test feature-matrix shapes side by side for each year.
for year in years:
    print(year)
    train_shape = train_data[year]['X'].shape
    test_shape = test_data[year]['X'].shape
    print(f"train: {train_shape} test: {test_shape}")

2017
train: (9486, 227) test: (9486, 227)
2018
train: (9072, 238) test: (9069, 238)
2019
train: (8555, 215) test: (8554, 215)
2020
train: (8122, 243) test: (8122, 243)


# Extra Trees Classifier

`n_estimators = 200` (2017), `n_estimators = 400` (2018, 2019, 2020)

In [12]:
from sklearn.ensemble import ExtraTreesClassifier

et_models = {}
# Predictions for all years, concatenated in the order of `years`.
et_predicts = []

for year in tqdm(years):
    # train: 2017 uses 200 trees, every other year uses 400.
    tree_count = 200 if year == '2017' else 400
    model = ExtraTreesClassifier(n_estimators=tree_count, random_state=RANDOM_STATE, n_jobs=8)
    features = train_data[year]['X']
    target = train_data[year]['y']
    model.fit(features, target)

    # predict: append this year's test predictions to the running list.
    pred = model.predict(test_data[year]['X'])
    et_predicts.extend(pred)

100%|██████████| 4/4 [04:12<00:00, 63.18s/it]


# 제출

In [13]:
submission = pd.read_csv('../data_0103/sample_submission.csv') # load the sample submission file

In [14]:
# Put the concatenated predictions into the knowcode column and write the
# result to CSV without the index column.
submission_path = '../submission_files/et_400_with_data_0119_dropcols.csv'
submission['knowcode'] = et_predicts
submission.to_csv(submission_path, index=False)