# String MultiClassifier Model

1. 데이터  
    * data_0112를 사용함  
    * 데이터에 knowcode의 설명인 description열을 추가함(data_pdf_description_ver2 파일 참조)  
    * train set과 test set의 str열인 major와 description에 대해서 Stopwords들을 제거함  
    * idx열을 인덱스로 만듦   
# 
2. fitting  
    * STEP 1 : Major열을 기준으로 정합성을 테스트하고, 올바르지 못한 것은 et로 예측할 것이므로 따로 빼두기 **(string_compare fitting)**   
    * STEP 2 : Major열과 description을 jaro_distance로 비교하여 유사도를 도출함 **(string_compare fitting)**  
    * STEP 3 : 전체 train 데이터를 기준으로 et_300으로 학습함 **(et fitting)**
#          
3. Prediction  
    * STEP 1 : Similarity와 사전에 정한 threshold를 비교하여 높은 레코드에 대해서만 string_compare로 예측을 진행 **(string_compare predict)**
    * STEP 2 : 그렇지 못한 데이터에 대해서는 et_300으로 예측값을 도출함 **(et predict)**

# 라이브러리 import & 데이터 로딩 & 전처리

In [1]:
# 데이터 가져오기 및 라이브러리 가져오기

import pandas as pd
import numpy as np
from glob import glob
from tqdm import tqdm
import warnings

RANDOM_STATE = 42
# Seed NumPy's global RNG. The previous `np.seed = 42` merely attached an
# attribute named `seed` to the numpy module and never seeded anything.
np.random.seed(RANDOM_STATE)
DATA_PATH = "../data_0112/"

# Suppress every warning for the rest of the session.
warnings.filterwarnings(action='ignore')

# Yearly training CSVs.
PATH_2017 = DATA_PATH + "train/KNOW_2017.csv"
PATH_2018 = DATA_PATH + "train/KNOW_2018.csv"
PATH_2019 = DATA_PATH + "train/KNOW_2019.csv"
PATH_2020 = DATA_PATH + "train/KNOW_2020.csv"

paths = [PATH_2017, PATH_2018, PATH_2019, PATH_2020]

# One DataFrame per year, in the same order as `years` below.
know_train = [pd.read_csv(path) for path in paths]

# Yearly test CSVs.
TEST_PATH_2017 = DATA_PATH + "test/KNOW_2017_test.csv"
TEST_PATH_2018 = DATA_PATH + "test/KNOW_2018_test.csv"
TEST_PATH_2019 = DATA_PATH + "test/KNOW_2019_test.csv"
TEST_PATH_2020 = DATA_PATH + "test/KNOW_2020_test.csv"

TEST_PATHs = [TEST_PATH_2017, TEST_PATH_2018, TEST_PATH_2019, TEST_PATH_2020]

know_test = [pd.read_csv(path) for path in TEST_PATHs]

# Year keys; position i of know_train / know_test belongs to years[i].
years = ["2017", "2018", "2019", "2020"]

# Light preprocessing so the data can be fed to the models.
# (A per-year `text_info_cols` dict used to be defined here; it was overwritten
# below before ever being read, so it has been removed.)

# Per-year description tables, keyed by year string.
description_dfs = {
    year: pd.read_csv('../data_pdf_description_ver2/pdf_description_' + year + '.csv')
    for year in years
}
# Keep the individual per-year names available as before.
description_2017, description_2018, description_2019, description_2020 = (
    description_dfs[year] for year in years
)

# Attach the description column to each training frame. Rows whose knowcode has
# no match (and any other missing value) are filled with the string '0'.
for i, year in enumerate(years):
    know_train[i] = pd.merge(know_train[i], description_dfs[year], on='knowcode',how='left').fillna('0')

# Text columns cleaned in the next step.
text_info_cols = ['major','description']

# Normalise the major / description text columns: answers that only say
# "nothing / unknown / blank" are replaced with the empty string.
_EMPTY_ANSWERS = ['없다', '없음', '0', '무', '모름', '공란']

## train
for train_df in know_train:
    # Remove columns that are not used downstream.
    train_df.drop(columns=['상태', '이용', '것이므', '사용'], inplace=True)
    for text_col in text_info_cols:
        train_df.loc[train_df[text_col].isin(_EMPTY_ANSWERS), text_col] = ''

## test (only the major column is cleaned here)
for test_df in know_test:
    test_df.loc[test_df['major'].isin(_EMPTY_ANSWERS), 'major'] = ''
        
# Promote the idx column to the row index of every train and test frame.
for train_df, test_df in zip(know_train, know_test):
    train_df.set_index('idx', inplace=True)
    test_df.set_index('idx', inplace=True)
    

In [2]:
# Show every column when a DataFrame is displayed, then preview the first two
# rows of the first training frame.
pd.options.display.max_columns = None
know_train[0].head(2)

Unnamed: 0_level_0,aq1_1,aq1_2,aq2_1,aq2_2,aq3_1,aq3_2,aq4_1,aq4_2,aq5_1,aq5_2,aq6_1,aq6_2,aq7_1,aq7_2,aq8_1,aq8_2,aq9_1,aq9_2,aq10_1,aq10_2,aq11_1,aq11_2,aq12_1,aq12_2,aq13_1,aq13_2,aq14_1,aq14_2,aq15_1,aq15_2,aq16_1,aq16_2,aq17_1,aq17_2,aq18_1,aq18_2,aq19_1,aq19_2,aq20_1,aq20_2,aq21_1,aq21_2,aq22_1,aq22_2,aq23_1,aq23_2,aq24_1,aq24_2,aq25_1,aq25_2,aq26_1,aq26_2,aq27_1,aq27_2,aq28_1,aq28_2,aq29_1,aq29_2,aq30_1,aq30_2,aq31_1,aq31_2,aq32_1,aq32_2,aq33_1,aq33_2,aq34_1,aq34_2,aq35_1,aq35_2,aq36_1,aq36_2,aq37_1,aq37_2,aq38_1,aq38_2,aq39_1,aq39_2,aq40_1,aq40_2,aq41_1,aq41_2,bq1,bq2,bq3,bq4,bq5,bq5_1,bq6,bq7,bq8_1,bq8_2,bq8_3,bq9,bq10,bq11,bq12_1,bq12_2,bq12_3,bq12_4,bq12_5,bq13,bq14,bq15_1,bq15_2,bq15_3,bq16,bq17,bq18_1,bq18_2,bq18_3,bq18_4,bq18_5,bq18_6,bq18_7,bq19,bq20,bq21,bq22,bq23,bq24_1,bq24_2,bq24_3,bq24_4,bq24_5,bq24_6,bq24_7,bq24_8,bq25,bq26,bq27,bq28,bq29,bq35,bq36,bq37,bq38,bq39_1,bq39_2,bq40,bq41_1,bq41_2,bq41_3,major,연구원,기술,고령화,카메라,엑셀,자료,공구,교육,서비스,캐드,정보,예상,전기,운영,전산,자동차,고객,침체,기사,기계화,불황,포화상태,통계,환자,현상,선생님,강사,장비,기계,마이크,오피스,훈련,경기,교수,실험,도면,크게,업종,지능,시설,기기,공무원,기업,습득,무전기,방송,학생,현재,인식,전화기,주부,변화,건설,수가,인구,사업,가위,자격증,자동화,안전교육,영향,포토샵,파워포인트,이해,사무직,교사,연구,장갑,안전,지속,기구,정책,설계,제품,전문의,정보처리기사,숙지,문화,정부,활성화,업체,사무원,드라이버,간호사,변동,망치,검사,직무교육,작업,프로그램,선호,자영업,실무교육,단기,컴퓨터,설비,인원,워드,전기기사,인공,개발,인터넷,볼펜,시장,일러스트,측정기,상품,거절,디자인,관리,분석,의료,한글,교재,시스템,회사원,노트북,생산,ubda_cnt,knowcode,description,error
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1,Unnamed: 187_level_1,Unnamed: 188_level_1,Unnamed: 189_level_1,Unnamed: 190_level_1,Unnamed: 191_level_1,Unnamed: 192_level_1,Unnamed: 193_level_1,Unnamed: 194_level_1,Unnamed: 
195_level_1,Unnamed: 196_level_1,Unnamed: 197_level_1,Unnamed: 198_level_1,Unnamed: 199_level_1,Unnamed: 200_level_1,Unnamed: 201_level_1,Unnamed: 202_level_1,Unnamed: 203_level_1,Unnamed: 204_level_1,Unnamed: 205_level_1,Unnamed: 206_level_1,Unnamed: 207_level_1,Unnamed: 208_level_1,Unnamed: 209_level_1,Unnamed: 210_level_1,Unnamed: 211_level_1,Unnamed: 212_level_1,Unnamed: 213_level_1,Unnamed: 214_level_1,Unnamed: 215_level_1,Unnamed: 216_level_1,Unnamed: 217_level_1,Unnamed: 218_level_1,Unnamed: 219_level_1,Unnamed: 220_level_1,Unnamed: 221_level_1,Unnamed: 222_level_1,Unnamed: 223_level_1,Unnamed: 224_level_1,Unnamed: 225_level_1,Unnamed: 226_level_1,Unnamed: 227_level_1,Unnamed: 228_level_1,Unnamed: 229_level_1,Unnamed: 230_level_1,Unnamed: 231_level_1,Unnamed: 232_level_1,Unnamed: 233_level_1,Unnamed: 234_level_1,Unnamed: 235_level_1,Unnamed: 236_level_1,Unnamed: 237_level_1,Unnamed: 238_level_1,Unnamed: 239_level_1,Unnamed: 240_level_1,Unnamed: 241_level_1,Unnamed: 242_level_1,Unnamed: 243_level_1,Unnamed: 244_level_1,Unnamed: 245_level_1,Unnamed: 246_level_1,Unnamed: 247_level_1,Unnamed: 248_level_1,Unnamed: 249_level_1,Unnamed: 250_level_1,Unnamed: 251_level_1,Unnamed: 252_level_1,Unnamed: 253_level_1,Unnamed: 254_level_1,Unnamed: 255_level_1,Unnamed: 256_level_1,Unnamed: 257_level_1,Unnamed: 258_level_1,Unnamed: 259_level_1,Unnamed: 260_level_1,Unnamed: 261_level_1,Unnamed: 262_level_1,Unnamed: 263_level_1,Unnamed: 264_level_1,Unnamed: 265_level_1,Unnamed: 266_level_1
0,3,3,3,3,3,3,4,4,3,4,3,3,2,2,2,2,2,2,3,3,3,3,4,4,3,3,3,2,3,3,5,6,5,6,4,5,2,2,5,6,3,4,4,5,3,4,4,4,2,2,3,4,4,5,3,4,1,0,2,2,3,3,1,0,2,2,2,5,3,4,3,4,2,2,2,2,5,2,2,2,2,2,3,1,6,1,1,2,5,2,4,3,3,4,4,3,4,3,4,4,4,4,3,2,4,3,2,2,4,4,2,3,2,3,3,2,2,3,5,10,1,1,1,2,2,2,2,2,3,3,4,3,2,4,1,52,2,1,1,1,4000,0,2200,실업,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,3,825101,도장원(도장기조작원),0.0
1,4,5,4,5,3,4,3,4,3,4,1,0,1,0,1,0,1,0,2,3,3,4,3,4,4,5,2,2,3,4,3,4,3,4,3,4,1,0,2,3,1,0,3,4,3,4,2,3,1,0,3,4,2,3,3,4,1,0,1,0,3,4,1,0,3,4,3,4,1,0,3,4,3,4,3,4,2,2,1,0,1,0,13,1,3,1,2,0,6,5,4,3,3,4,4,2,3,3,4,4,3,3,3,2,3,3,4,3,4,2,2,4,2,2,4,2,3,3,6,50,2,2,2,2,2,2,2,2,4,3,4,3,2,2,1,38,4,1,1,1,0,0,2400,건축공학,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,140204,건축설비 기술자,0.0


# String_compare fitting

In [3]:
import jellyfish
from difflib import SequenceMatcher
from tqdm.notebook import tqdm
from sklearn.ensemble import ExtraTreesClassifier

# Maps each string-comparison method name to the two column-name prefixes
# [predicted-knowcode prefix, similarity prefix]; the compared column name is
# appended to each prefix to form the result column names.
methods = {'SequenceMatcher':['knowcode_','similarity_'],
           'levenshtein_distance':['knowcode_lev_','similarity_lev_'],
           'damerau':['knowcode_dlev_','similarity_dlev_'],
           'jaro_distance':['knowcode_jaro_','similarity_jaro_'],
           'jaro_winkler':['knowcode_jarow_','similarity_jarow_'],
           'hamming_distance':['knowcode_ham_','similarity_ham_']}
years = ["2017", "2018", "2019", "2020"] # class variable

# NOTE: list.copy() is shallow - each new list holds the very same DataFrame
# objects as know_test / know_train, so in-place changes to a frame are visible
# through every one of these names.
compare_fit_data = know_test.copy()

et_train_data = know_train.copy()
et_test_data = know_test.copy()

In [None]:
# Column whose text is compared against the descriptions, and the comparison
# method to use (must be a key of `methods`).
col = 'major'
method = 'jaro_distance'

def similar(a, b):
    """Return the difflib ``SequenceMatcher`` ratio of *a* and *b*.

    Used when the 'SequenceMatcher' comparison method is selected.
    """
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()

## STEP 1
# Check the validity of the column used for string matching. Rows that cannot
# be matched must always be predicted by the ExtraTrees model, so their ids
# are set aside here.

# Exception: rows whose reference column is blank are excluded from the
# similarity-based prediction (only `col` is checked here).
error_indice = []
for frame in compare_fit_data:
    blank_rows = frame[col] == ''
    error_indice.extend(frame.index[blank_rows])

# Drop duplicated ids.
error_indice = list(set(error_indice))

## STEP 2
# string_compare를 model을 정의하고, 진행해보자

def _resolve_scorer(method):
    """Return a ``(a, b) -> float`` scorer for *method*; higher means more alike."""

    def _as_similarity(distance_fn):
        # Edit / Hamming distances count differences (lower is better), so they
        # are turned into 1 - distance / longest_length before being maximised.
        def _score(a, b):
            longest = max(len(a), len(b))
            if longest == 0:
                return 1.0
            return 1.0 - distance_fn(a, b) / longest
        return _score

    if method == 'SequenceMatcher':
        return similar
    if method == 'levenshtein_distance':
        return _as_similarity(jellyfish.levenshtein_distance)
    if method == 'damerau':
        return _as_similarity(jellyfish.damerau_levenshtein_distance)
    if method == 'hamming_distance':
        return _as_similarity(jellyfish.hamming_distance)
    if method == 'jaro_distance':
        # Newer jellyfish releases expose this as jaro_similarity.
        return getattr(jellyfish, 'jaro_similarity', None) or jellyfish.jaro_distance
    if method == 'jaro_winkler':
        # Newer jellyfish releases expose this as jaro_winkler_similarity.
        return (getattr(jellyfish, 'jaro_winkler_similarity', None)
                or jellyfish.jaro_winkler)
    raise ValueError("unknown string comparison method: %r" % (method,))


def _best_match(text, descriptions, scorer):
    """Return ``(position, score)`` of the first best-scoring description.

    Returns ``(None, 0.0)`` when no description scores above zero.
    """
    best_pos, best_score = None, 0.0
    for pos, description in enumerate(descriptions):
        if not isinstance(description, str):
            # Missing descriptions cannot be compared.
            continue
        score = scorer(text, description)
        if score > best_score:  # strict: the earliest maximum wins
            best_pos, best_score = pos, score
    return best_pos, best_score


def string_compare(know_data, col, description_dfs, method):
    """Match each row's *col* text to the most similar yearly description.

    Args:
        know_data: list of DataFrames, one per entry of the global ``years``,
            in the same order.
        col: name of the text column to compare.
        description_dfs: dict mapping a year string to a DataFrame that has a
            ``description`` column; the value stored in its first column is
            reported as the predicted code.
        method: comparison method; must be a key of the global ``methods``.

    Returns:
        A new list of DataFrame copies. Each copy gains two columns named
        ``methods[method][0] + col`` (best-matching code, or "0" when nothing
        matched) and ``methods[method][1] + col`` (the similarity score, 0.0
        when nothing matched). The input frames are left untouched.

    Raises:
        ValueError: if *method* is not a supported method name.
    """
    scorer = _resolve_scorer(method)
    code_col = methods[method][0] + col
    sim_col = methods[method][1] + col

    # Copy every frame: copying only the list would add the result columns to
    # the caller's DataFrames as well.
    data = [df.copy() for df in know_data]

    for i, year in enumerate(years):
        # One pass per year.
        print(year)
        print(col)
        descr_df = description_dfs[year]
        descriptions = list(descr_df['description'])

        # Identical strings always yield the same match, so score each distinct
        # string only once per year.
        match_cache = {}
        knowcodes = []
        similarities = []

        for possible_answer in tqdm(list(data[i][col])):
            if isinstance(possible_answer, str) and possible_answer != '':
                if possible_answer not in match_cache:
                    pos, score = _best_match(possible_answer, descriptions, scorer)
                    # Positional lookup, so a non-default index on the
                    # description table cannot select the wrong row.
                    code = "0" if pos is None else descr_df.iloc[pos, 0]
                    match_cache[possible_answer] = (code, score)
                code, score = match_cache[possible_answer]
            else:
                # Blank or missing text: no match.
                code, score = "0", 0.0
            knowcodes.append(code)
            similarities.append(score)

        data[i][code_col] = knowcodes
        data[i][sim_col] = similarities

    return data

# Run the string matching on the test frames using the column and method
# chosen above.
compare_data = string_compare(compare_fit_data, col, description_dfs, method)

# String_compare prediction

In [5]:
# predict
## Emit the knowcode found by string matching as the answer, but only for rows
## whose similarity exceeds the threshold.
col = 'major'

threshold = 0.99

# Result column names produced by string_compare for this method and column.
code_col, sim_col = [prefix + col for prefix in methods[method]]

compare_pred = {}
for i, year in enumerate(years):
    frame = compare_data[i]

    # Rows with a blank reference column are never answered by string matching.
    # This is decided per year, so a blank row in one year cannot remove a
    # valid row that happens to share the same idx in another year.
    has_text = frame[col] != ''
    confident = frame[sim_col] > threshold

    # Result: Series indexed by idx, named 'knowcode'.
    compare_pred[year] = frame.loc[has_text & confident, code_col].rename('knowcode')

## et_fitting & prediction
> 메모리 문제로 인해 인스턴스를 연도마다 만들어서 학습하고 내보냈습니다

In [7]:
# Prepare the training data: per year, features X (everything except the
# target and text/helper columns) and target y (knowcode).
_NON_FEATURE_COLS = ['knowcode','error','description','major']
et_train_data_dict = {
    year: {'X': df.drop(columns=_NON_FEATURE_COLS), 'y': df['knowcode']}
    for year, df in zip(years, et_train_data)
}

In [8]:
# Prepare the test data.
# Only rows that string matching did NOT answer are passed to the ExtraTrees
# model, restricted to the columns the model was trained on.
et_test_data_dict = {}
for year, df in zip(years, et_test_data):
    train_columns = et_train_data_dict[year]['X'].columns

    # A boolean mask keeps the rows in file order (the former set difference
    # gave an arbitrary order) and slices the frame once instead of twice.
    needs_et = ~df.index.isin(compare_pred[year].index)
    et_test_data_dict[year] = df.loc[needs_et, train_columns]

In [10]:
# 2017: fit a 300-tree ExtraTrees classifier on that year's training data
train_2017 = et_train_data_dict['2017']
clf2017 = ExtraTreesClassifier(n_estimators=300, random_state=RANDOM_STATE)
clf2017.fit(train_2017['X'], train_2017['y'])

ExtraTreesClassifier(n_estimators=300, random_state=42)

In [11]:
# Predict the 2017 test rows reserved for the ExtraTrees model and pair each
# prediction with its idx.
et_indice_2017 = et_test_data_dict['2017'].index.tolist()
pred2017 = clf2017.predict(et_test_data_dict['2017'])

pred2017_df = pd.DataFrame({'idx': et_indice_2017, 'knowcode': pred2017})

In [12]:
# Persist the 2017 predictions so they can be reloaded later.
pred2017_df.to_csv('pred2017_df.csv',index=False)

In [7]:
# 2018: fit a 300-tree ExtraTrees classifier on that year's training data
train_2018 = et_train_data_dict['2018']
clf2018 = ExtraTreesClassifier(n_estimators=300, random_state=RANDOM_STATE)
clf2018.fit(train_2018['X'], train_2018['y'])

ExtraTreesClassifier(n_estimators=300, random_state=42)

In [8]:
# Predict the 2018 test rows reserved for the ExtraTrees model and pair each
# prediction with its idx.
et_indice_2018 = et_test_data_dict['2018'].index.tolist()
pred2018 = clf2018.predict(et_test_data_dict['2018'])

pred2018_df = pd.DataFrame({'idx': et_indice_2018, 'knowcode': pred2018})

In [9]:
# Persist the 2018 predictions so they can be reloaded later.
pred2018_df.to_csv('pred2018_df.csv',index=False)

In [7]:
# 2019: fit a 300-tree ExtraTrees classifier on that year's training data
train_2019 = et_train_data_dict['2019']
clf2019 = ExtraTreesClassifier(n_estimators=300, random_state=RANDOM_STATE)
clf2019.fit(train_2019['X'], train_2019['y'])

ExtraTreesClassifier(n_estimators=300, random_state=42)

In [8]:
# Predict the 2019 test rows reserved for the ExtraTrees model and pair each
# prediction with its idx.
et_indice_2019 = et_test_data_dict['2019'].index.tolist()
pred2019 = clf2019.predict(et_test_data_dict['2019'])

pred2019_df = pd.DataFrame({'idx': et_indice_2019, 'knowcode': pred2019})

In [9]:
# Persist the 2019 predictions so they can be reloaded later.
pred2019_df.to_csv('pred2019_df.csv',index=False)

In [9]:
# 2020: fit a 300-tree ExtraTrees classifier on that year's training data
train_2020 = et_train_data_dict['2020']
clf2020 = ExtraTreesClassifier(n_estimators=300, random_state=RANDOM_STATE)
clf2020.fit(train_2020['X'], train_2020['y'])

ExtraTreesClassifier(n_estimators=300, random_state=42)

In [10]:
# Predict the 2020 test rows reserved for the ExtraTrees model and pair each
# prediction with its idx.
et_indice_2020 = et_test_data_dict['2020'].index.tolist()
pred2020 = clf2020.predict(et_test_data_dict['2020'])

pred2020_df = pd.DataFrame({'idx': et_indice_2020, 'knowcode': pred2020})

In [11]:
# Persist the 2020 predictions so they can be reloaded later.
pred2020_df.to_csv('pred2020_df.csv',index=False)

# result concatenate

In [12]:
# Reload the saved per-year ExtraTrees predictions, using idx as the row index.
pred2017_df = pd.read_csv('pred2017_df.csv', index_col='idx')

pred2018_df = pd.read_csv('pred2018_df.csv', index_col='idx')

pred2019_df = pd.read_csv('pred2019_df.csv', index_col='idx')

pred2020_df = pd.read_csv('pred2020_df.csv', index_col='idx')

In [13]:
# ExtraTrees knowcode predictions, one Series per year, in `years` order.
et_pred = [
    frame['knowcode']
    for frame in (pred2017_df, pred2018_df, pred2019_df, pred2020_df)
]

In [14]:
## STEP 3: merge the ExtraTrees predictions with the string-matching ones.
# Per year, both Series are stacked and ordered by idx; the values are then
# appended year after year into one flat list.
final_pred = []
for year, et_series in zip(years, et_pred):
    merged = pd.concat([compare_pred[year], et_series]).sort_index()
    final_pred.extend(merged)

In [15]:
# Load the sample submission, overwrite its knowcode column with the final
# predictions (same row order assumed - TODO confirm), and write the result.
submission = pd.read_csv('../data_0103/sample_submission.csv').assign(knowcode=final_pred)
submission.to_csv('../submission_files/et_300_string_compare_with_data_0112_para99.csv', index=False)