# Soft Voting with et_400 and catboost_300

1. 데이터  
    * data_0119 사용함  
    * label encoding  
#
2. Ensemble  
    * et_400으로 Modeling하고 prediction을 도출함
    * catboost_300으로 Modeling하고 prediction을 도출함
    * prediction에 대한 proba를 비교하여 더 높은 예측값을 채택함

# 라이브러리 import & 데이터 로딩 & 전처리

In [1]:
import pandas as pd
import numpy as np
from glob import glob
from tqdm import tqdm
import warnings

import matplotlib.pyplot as plt
import seaborn as sns

# Single seed shared by every model in this notebook.
RANDOM_STATE = 42
# Seed NumPy's global RNG. The previous `np.seed = 42` only attached an
# unused attribute to the numpy module and did not seed anything.
np.random.seed(RANDOM_STATE)
DATA_PATH = "../data_0119/"

# Silence all warnings for the rest of the run.
warnings.filterwarnings(action='ignore')

# Training CSVs, one file per year.
PATH_2017 = DATA_PATH + "train/KNOW_2017.csv"
PATH_2018 = DATA_PATH + "train/KNOW_2018.csv"
PATH_2019 = DATA_PATH + "train/KNOW_2019.csv"
PATH_2020 = DATA_PATH + "train/KNOW_2020.csv"

paths = [PATH_2017, PATH_2018, PATH_2019, PATH_2020]
# List of DataFrames in the same order as `years` below.
know_train = [pd.read_csv(path) for path in paths]

# Test CSVs, one file per year.
TEST_PATH_2017 = DATA_PATH + "test/KNOW_2017_test.csv"
TEST_PATH_2018 = DATA_PATH + "test/KNOW_2018_test.csv"
TEST_PATH_2019 = DATA_PATH + "test/KNOW_2019_test.csv"
TEST_PATH_2020 = DATA_PATH + "test/KNOW_2020_test.csv"

TEST_PATHs = [TEST_PATH_2017, TEST_PATH_2018, TEST_PATH_2019, TEST_PATH_2020]
# List of DataFrames in the same order as `years` below.
know_test = [pd.read_csv(path) for path in TEST_PATHs]

# Year keys; positions line up with `know_train` and `know_test`.
years = ['2017', '2018', '2019', '2020']

from sklearn.preprocessing import LabelEncoder
# Encode the training frames in place, one set of encoders per year.
years = ['2017', '2018', '2019', '2020']

# year -> {column name -> LabelEncoder fitted on that year's training column}.
# Only columns that could not be converted to int get an entry.
year_encoder = {}

for year, df in zip(years, know_train):
    print(year)
    encoders = {}

    for col in df.columns:
        if col == 'idx':
            # 'idx' is left exactly as loaded.
            continue

        try:
            # Columns whose every value converts to int are stored as ints.
            df[col] = df[col].map(int)
        except (ValueError, TypeError, OverflowError):
            # int() raised on at least one value (e.g. free text, NaN, None,
            # inf): treat the whole column as categorical strings instead.
            # Only the exceptions int() can raise are caught so that
            # unrelated failures (and KeyboardInterrupt) are not swallowed.
            encoder = LabelEncoder()
            df[col] = df[col].map(str)
            df[col] = encoder.fit_transform(df[col])
            encoders[col] = encoder

    year_encoder[year] = encoders

# Encode the test frames in place, reusing the encoders fitted on train.
years = ['2017', '2018', '2019', '2020']

for year, df in zip(years, know_test):
    print(year)
    train_encoders = year_encoder[year]

    for col in df.columns:
        encoder = train_encoders.get(col)

        if encoder is None:
            # No encoder was fitted for this column on the training side, so
            # it is handled as an integer column here as well.
            df[col] = df[col].map(int)
            continue

        # The column was label-encoded in train, so it must be encoded with
        # the same mapping here. Deciding from the test values alone would
        # leave raw integers in test whenever the test column happens to be
        # all-numeric, and those would not match the training codes.
        category_map = {category: code for code, category in enumerate(encoder.classes_)}
        # Categories never seen in the training set are mapped to -1 (UNK).
        df[col] = df[col].map(str).map(lambda value: category_map.get(value, -1))

# Per-year training set: features are every column except the id, the target
# and the description column; the target is 'knowcode'.
train_data = {
    year: {
        'X': frame.drop(['idx', 'knowcode', 'description'], axis=1),
        'y': frame['knowcode'],
    }
    for year, frame in zip(years, know_train)
}

# Per-year test set, restricted to (and ordered like) the training features.
test_data = {}
for year, df in zip(years, know_test):
    print(year)
    train_columns = train_data[year]['X'].columns
    test_data[year] = {'X': df[train_columns]}

# Sanity check: print the train and test feature-matrix shapes for each year.
for year in years:
    print(year)
    train_shape = train_data[year]['X'].shape
    test_shape = test_data[year]['X'].shape
    print(f"train: {train_shape} test: {test_shape}")

# Ensemble

In [49]:
from sklearn.ensemble import ExtraTreesClassifier
from catboost import CatBoostClassifier

def _confidence_table(fitted_model, features):
    """Return the top class and its probability for every row of *features*.

    Args:
        fitted_model: A fitted classifier exposing ``predict_proba`` and
            ``classes_``.
        features: Feature matrix to predict on.

    Returns:
        DataFrame with one row per input row and two columns: ``'prop'``
        (the highest class probability) and ``'label'`` (the class with that
        probability; the first such class when several tie).
    """
    prob_df = pd.DataFrame(fitted_model.predict_proba(features),
                           columns=fitted_model.classes_)
    results_df = pd.DataFrame(index=range(prob_df.shape[0]))
    results_df['prop'] = prob_df.max(axis=1)
    # idxmax returns the first column holding the row maximum, which matches
    # taking the first class whose probability equals the maximum.
    results_df['label'] = prob_df.idxmax(axis=1)
    return results_df


# Predictions for all years, concatenated in the order of `years`.
ensemble_preds = []
for year in years:
    print(year)
    X_train = train_data[year]['X']
    y_train = train_data[year]['y']
    X_test = test_data[year]['X']

    #####################################################################################################
    print(' et modeling...')
    model = ExtraTreesClassifier(n_estimators=400, random_state=RANDOM_STATE, n_jobs=8)
    model.fit(X_train, y_train)

    print(' et prediction...')
    # Top class and its probability per test row, according to ExtraTrees.
    et_results_df = _confidence_table(model, X_test)

    #####################################################################################################

    print(' cat modeling...')
    model = CatBoostClassifier(iterations=300,
                               random_state=RANDOM_STATE,
                               task_type='GPU',
                               loss_function='MultiClass',
                               eval_metric='TotalF1'
                               )
    model.fit(X_train, y_train)

    print(' cat prediction...')
    # Top class and its probability per test row, according to CatBoost.
    cat_results_df = _confidence_table(model, X_test)

    #####################################################################################################
    # For each row keep the label from whichever model is more confident;
    # CatBoost wins when the two probabilities are equal.
    year_pred = [
        cat_label if et_prop <= cat_prop else et_label
        for et_prop, et_label, cat_prop, cat_label in zip(
            et_results_df['prop'], et_results_df['label'],
            cat_results_df['prop'], cat_results_df['label'],
        )
    ]

    ensemble_preds.extend(year_pred)
    print('##'*20)

et modeling...
et prediction...
cat modeling...
Learning rate set to 0.5
0:	learn: 0.0246571	total: 11.6s	remaining: 11.6s
1:	learn: 0.0305460	total: 23s	remaining: 0us
cat prediction...
et modeling...
et prediction...
cat modeling...
Learning rate set to 0.5
0:	learn: 0.0131363	total: 10.6s	remaining: 10.6s
1:	learn: 0.0259929	total: 20.9s	remaining: 0us
cat prediction...
et modeling...
et prediction...
cat modeling...
Learning rate set to 0.5
0:	learn: 0.0083804	total: 10.8s	remaining: 10.8s
1:	learn: 0.0305945	total: 22s	remaining: 0us
cat prediction...
et modeling...
et prediction...
cat modeling...
Learning rate set to 0.5
0:	learn: 0.0101639	total: 10s	remaining: 10s
1:	learn: 0.0209163	total: 20.1s	remaining: 0us
cat prediction...


# 제출

In [51]:
# Load the sample submission as a template and set its 'knowcode' column to
# the ensemble predictions.
sample_submission = pd.read_csv('../data_0103/sample_submission.csv')
submission = sample_submission.assign(knowcode=ensemble_preds)

In [None]:
# Write the submission to disk without the DataFrame index.
# NOTE(review): the file name mentions only et_400, while the predictions
# assigned above come from the ET + CatBoost ensemble; confirm the intended name.
submission.to_csv(
    path_or_buf='../submission_files/et_400_with_data_0119_dropcols.csv',
    index=False,
)