# Train데이터 불러오기

각 년도별 DataFrame을 리스트에 append합니다.

리스트 인덱스별로

0: 2017년도 데이터   

1: 2018년도 데이터 

2: 2019년도 데이터 

3: 2020년도 데이터 

입니다

In [2]:
import pandas as pd
import numpy as np
from glob import glob
from tqdm import tqdm
import warnings

import matplotlib.pyplot as plt
import seaborn as sns

# Single seed shared by NumPy and the scikit-learn models for reproducibility.
RANDOM_STATE = 42
# Seed NumPy's global RNG. The previous `np.seed = 42` only attached a stray
# attribute to the numpy module and did not seed anything.
np.random.seed(RANDOM_STATE)
DATA_PATH = "../data_0112/"

warnings.filterwarnings(action='ignore')
PATH_2017 = DATA_PATH + "train/KNOW_2017.csv"
PATH_2018 = DATA_PATH + "train/KNOW_2018.csv"
PATH_2019 = DATA_PATH + "train/KNOW_2019.csv"
PATH_2020 = DATA_PATH + "train/KNOW_2020.csv"

paths = [PATH_2017, PATH_2018, PATH_2019, PATH_2020]

# One DataFrame per survey year, in the same order as `paths`
# (index 0 = 2017, ..., index 3 = 2020).
know_train = [pd.read_csv(path) for path in paths]

# delete useless feature manually
for train_df in know_train:
    train_df.drop(['상태', '이용', '것이므', '사용'], axis=1, inplace=True)

In [3]:
# Display the 2020 training DataFrame (index 3 of know_train).
know_train[3]

Unnamed: 0,idx,saq1_1,saq1_2,saq2_1,saq2_2,saq3_1,saq3_2,saq4_1,saq4_2,saq5_1,...,분석,의료,한글,교재,시스템,회사원,노트북,생산,ubda_cnt,knowcode
0,27127,4,4,4,4,4,5,4,5,3,...,0,0,0,0,0,0,0,0,3,304003
1,27128,5,6,5,6,4,5,4,5,4,...,0,0,0,0,0,0,0,0,1,221401
2,27129,3,4,3,4,3,4,3,5,3,...,0,0,0,0,0,0,0,0,1,815001
3,27130,4,5,2,3,3,3,4,5,3,...,0,0,0,0,0,0,0,0,0,304003
4,27131,5,6,4,5,3,5,4,4,4,...,0,0,0,0,0,0,0,0,1,140201
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8117,35244,4,6,3,5,3,3,1,0,5,...,0,0,0,0,0,0,0,0,2,121105
8118,35245,3,4,4,5,2,4,4,6,1,...,0,0,0,0,0,0,0,0,0,26102
8119,35246,5,6,3,4,3,4,4,5,5,...,0,0,0,0,0,0,0,0,4,31301
8120,35247,5,6,5,5,4,5,3,3,3,...,0,0,0,0,0,0,0,0,2,154105


# 전처리 

`data_0112`는 이미 전처리된 데이터

In [4]:
# Report the (rows, columns) shape of each year's training DataFrame,
# one "<year>: <shape>" line per year.
years = ['2017', '2018', '2019', '2020']
shape_lines = [f"{2017+i}: {know_train[i].shape}" for i in range(4)]
print("\n".join(shape_lines))

2017: (9486, 265)
2018: (9072, 250)
2019: (8555, 263)
2020: (8122, 298)


## 라벨 인코딩

숫자로 변환할 수 있는 컬럼은 라벨 인코딩을 사용하지 않았습니다.

string이나 object 타입 컬럼은 라벨 인코더를 이용해 변환하였으며, 추후 test셋에도 동일한 인코더를 사용해야 하기 때문에 년도별·컬럼별로 dictionary에 저장하였습니다.

In [8]:
from sklearn.preprocessing import LabelEncoder
years = ['2017', '2018', '2019', '2020']

# year -> {column name -> fitted LabelEncoder}. Kept so that the test set can
# later be encoded with exactly the same category-to-integer mapping.
year_encoder = {}

for year, df in zip(years, know_train):
    print(year)
    encoders = {}

    for col in df.columns:
        # The ID column is left untouched.
        if col == 'idx':
            print("HEHE")
            continue

        try:
            # Columns whose every value is accepted by int() stay numeric.
            # Note that int() truncates any fractional part.
            df[col] = df[col].map(int)
        except (ValueError, TypeError, OverflowError):
            # int() rejected at least one value (e.g. text, NaN, None), so the
            # column is treated as categorical: stringify, then label-encode.
            # Only the errors int() raises are caught, so unrelated failures
            # are no longer silently swallowed.
            encoder = LabelEncoder()
            df[col] = encoder.fit_transform(df[col].map(str))
            encoders[col] = encoder

    year_encoder[year] = encoders

2017
HEHE
2018
HEHE
2019
HEHE
2020
HEHE


# X, y 구분 및 모델 학습

이번 대회에서 맞춰야 할 값은 knowcode입니다.

ID와 knowcode를 제외한 나머지 feature를 X, knowcode를 정답 y로 두어 모델을 학습하였습니다.

베이스라인에서는 의사결정나무와 랜덤포레스트를 선정하였습니다

`df.drop(['idx','knowcode'], axis=1)`

In [9]:
# For every year, split the training frame into the feature matrix X
# (all columns except 'idx' and 'knowcode') and the target y ('knowcode').
train_data = {
    year: {
        'X': frame.drop(columns=['idx', 'knowcode']),
        'y': frame['knowcode'],
    }
    for year, frame in zip(years, know_train)
}

In [10]:
# Display the X / y dictionary built for the 2019 data.
train_data['2019']

{'X':       sq1  sq2  sq3  sq4  sq5  sq6  sq7  sq8  sq9  sq10  ...  관리  분석  의료  한글  \
 0       4    4    4    3    4    4    4    4    4     4  ...   0   0   0   0   
 1       4    3    4    4    4    4    3    4    5     4  ...   0   0   0   0   
 2       2    3    2    2    2    2    2    2    3     3  ...   0   0   0   0   
 3       5    5    5    5    5    5    5    5    5     5  ...   0   0   0   0   
 4       1    4    4    1    1    2    4    3    4     4  ...   0   0   0   0   
 ...   ...  ...  ...  ...  ...  ...  ...  ...  ...   ...  ...  ..  ..  ..  ..   
 8550    3    4    4    3    4    3    3    3    3     3  ...   0   0   0   0   
 8551    5    5    4    4    4    5    4    4    5     5  ...   0   0   0   0   
 8552    3    3    4    4    4    4    4    3    3     4  ...   0   0   0   0   
 8553    3    3    3    1    4    4    3    2    3     3  ...   0   0   0   0   
 8554    5    5    5    5    5    3    3    4    4     3  ...   0   0   0   0   
 
       교재  시스템  회사원  

# Extra Trees Classifier

`n_estimators = 300`

In [11]:
from sklearn.ensemble import ExtraTreesClassifier

# year -> ExtraTreesClassifier fitted on that year's training data only.
et_models = {}

for year in tqdm(years):
    features, target = train_data[year]['X'], train_data[year]['y']
    # fit() returns the estimator itself, so construction and training chain.
    model = ExtraTreesClassifier(
        n_estimators=300,
        random_state=RANDOM_STATE,
        n_jobs=8,
    ).fit(features, target)
    et_models[year] = model

100%|██████████| 4/4 [00:42<00:00, 10.72s/it]


## Testset 불러오기

마찬가지로 년도별로 DataFrame으로 불러온 후 리스트에 할당합니다.


In [12]:
# Per-year test CSV locations under DATA_PATH, ordered 2017 -> 2020.
TEST_PATHs = [
    DATA_PATH + relative_path
    for relative_path in (
        "test/KNOW_2017_test.csv",
        "test/KNOW_2018_test.csv",
        "test/KNOW_2019_test.csv",
        "test/KNOW_2020_test.csv",
    )
]

# Individual names kept for direct access to a single year's path.
TEST_PATH_2017, TEST_PATH_2018, TEST_PATH_2019, TEST_PATH_2020 = TEST_PATHs

In [13]:
# Load one test DataFrame per year, in the same order as TEST_PATHs.
know_test = list(map(pd.read_csv, TEST_PATHs))
# delete useless feature manually
for test_frame in know_test:
    test_frame.drop(columns=['상태', '이용', '것이므', '사용'], inplace=True)
know_test[0].head()  # sample of the 2017 test data

Unnamed: 0,idx,aq1_1,aq1_2,aq2_1,aq2_2,aq3_1,aq3_2,aq4_1,aq4_2,aq5_1,...,의료,한글,교재,시스템,회사원,이용,노트북,생산,것이므,ubda_cnt
0,0,3,4,2,2,3,3,1,0,3,...,0,0,0,0,0,0,0,0,0,4
1,1,5,5,3,5,5,5,5,5,4,...,0,0,0,1,0,0,0,0,0,3
2,2,5,5,5,4,5,4,1,0,1,...,0,0,0,0,0,0,0,0,0,2
3,3,4,5,5,6,4,6,3,4,4,...,0,0,0,0,0,0,0,0,0,4
4,4,5,6,4,5,4,5,1,0,1,...,0,0,0,0,0,0,0,0,0,1


In [14]:
# encode test data
years = ['2017', '2018', '2019', '2020']

# Encode each year's test frame in place, mirroring the train-side logic:
# columns that int() accepts stay numeric, everything else is mapped through
# the LabelEncoder fitted on the same year/column of the training data.
for year, df in zip(years, know_test):
    print(year)

    for col in df.columns:
        try:
            df[col] = df[col].map(int)
        except (ValueError, TypeError, OverflowError):
            # Only the errors int() raises are caught, so unrelated failures
            # are no longer silently swallowed.
            try:
                encoder = year_encoder[year][col]
            except KeyError as err:
                # The column was numeric in train (or absent there), so no
                # encoder was stored for it. Fail with an actionable message.
                raise KeyError(
                    f"no LabelEncoder fitted on the {year} train data for "
                    f"column {col!r}, but its test values are not all integers"
                ) from err

            category_map = {category: idx for idx, category in enumerate(encoder.classes_)}
            # Categories never seen in the train set map to -1 (UNK).
            df[col] = (
                df[col].map(str).map(category_map).fillna(-1).astype('int64')
            )
            

2017
2018
2019
2020


## 테스트셋 추출 및 추론

ID 값을 제외한 나머지 데이터를 이용하여 모델에 넣어 추론합니다.

In [15]:
# year -> {'X': test features}, restricted to (and ordered like) the columns
# the model for that year was trained on.
test_data = {}
for year, test_frame in zip(years, know_test):
    print(year)
    train_columns = train_data[year]['X'].columns
    selected_features = test_frame[train_columns]
    test_data[year] = {'X': selected_features}

2017
2018
2019
2020


In [16]:
# Sanity check: train and test feature matrices must have the same number of
# columns for every year.
for year in years:
    print(
        year,
        f"train: {train_data[year]['X'].shape} test: {test_data[year]['X'].shape}",
        sep="\n",
    )

2017
train: (9486, 263) test: (9486, 263)
2018
train: (9072, 248) test: (9069, 248)
2019
train: (8555, 261) test: (8554, 261)
2020
train: (8122, 296) test: (8122, 296)


### ET로 예측

In [17]:
# Flat list of predicted knowcodes: all 2017 rows first, then 2018, 2019, 2020.
et_predicts = []

for year in tqdm(years):
    year_model = et_models[year]
    year_features = test_data[year]['X']
    pred = year_model.predict(year_features)
    et_predicts += list(pred)

100%|██████████| 4/4 [00:28<00:00,  7.20s/it]


# 제출

In [18]:
# NOTE(review): this reads from '../data_0103/' while the rest of the notebook
# uses DATA_PATH ('../data_0112/') — confirm the sample file location is intended.
submission = pd.read_csv('../data_0103/sample_submission.csv') # load the sample submission

In [19]:
# Fill the 'knowcode' column with the ExtraTrees predictions.
# NOTE(review): assumes the sample submission rows are ordered 2017 -> 2020 to
# match the order in which et_predicts was built — confirm.
submission['knowcode'] = et_predicts

# Write the submission file without the DataFrame index column.
submission.to_csv('../submission_files/et_300_with_data_0112.csv', index=False)