In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from glob import glob
import warnings
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

In [2]:
# Notebook-wide settings: silence all warnings and never truncate columns when a frame is displayed.
warnings.filterwarnings(action='ignore')
pd.options.display.max_columns = None

# Two snapshots of the data, read in sorted filename order.
# Presumably each directory holds one CSV per survey year (four in total) - TODO confirm.
know_train = [pd.read_csv(csv_path) for csv_path in sorted(glob('../data_0112/train/*.csv'))]
know_test = [pd.read_csv(csv_path) for csv_path in sorted(glob('../data_0112/test/*.csv'))]

other_train = [pd.read_csv(csv_path) for csv_path in sorted(glob('../data_0105/train/*.csv'))]
other_test = [pd.read_csv(csv_path) for csv_path in sorted(glob('../data_0105/test/*.csv'))]

# Copy the two free-text columns from the 0105 snapshot into the 0112 frames.
# Series assignment aligns on the row index, so this assumes both snapshots list
# the same rows in the same order - TODO confirm.
for num in range(4):
    for text_col in ('text_response', 'major'):
        know_train[num][text_col] = other_train[num][text_col]
        know_test[num][text_col] = other_test[num][text_col]

In [3]:
def isEnglishOrKorean(input_s):
    """Classify a string as Korean ("k") or not ("e") by counting Hangul syllables.

    Parameters
    ----------
    input_s : str
        Text to inspect (in this notebook: a column name).

    Returns
    -------
    str
        "k" when the string contains MORE THAN ONE character in the Hangul
        syllable range '가'..'힣', otherwise "e". Note that a string with exactly
        one Hangul character is therefore reported as "e"; this threshold is kept
        unchanged from the original implementation.
    """
    hangul_first, hangul_last = ord('가'), ord('힣')

    # Only the Hangul count decides the result. The former English-letter counter
    # was never read, and its ord(c.lower()) call raised TypeError for characters
    # whose lowercase form is more than one code point (e.g. U+0130).
    k_count = sum(1 for c in input_s if hangul_first <= ord(c) <= hangul_last)

    return "k" if k_count > 1 else "e"

In [4]:
# Let's look at the overall distribution of erroneous 0 values
# Year labels as strings; later cells pair them positionally with the per-year frames.
years = ['2017','2018','2019','2020']


def fill_mean(x):
    """Replace the zeros in a Series with the rounded mean of its non-zero values.

    Parameters
    ----------
    x : pandas.Series
        Column in which the value 0 marks an entry to be imputed.

    Returns
    -------
    pandas.Series
        ``x`` itself when it contains no zeros; otherwise a copy in which every
        zero is replaced by the mean of the non-zero entries, rounded to zero
        decimals. When there is no usable non-zero value (e.g. the column is all
        zeros) an unchanged copy is returned instead of filling with NaN.
    """
    zero_mask = x == 0
    if not zero_mask.any():
        return x

    filled_x = x.copy()
    mean_value = x[~zero_mask].mean()

    # The mean of an empty / all-NaN selection is NaN; writing it back would turn
    # every zero into NaN, so leave the column untouched in that case.
    if pd.isna(mean_value):
        return filled_x

    filled_x.loc[zero_mask] = round(mean_value, 0)
    return filled_x

def fill_mode(x):
    """Replace the zeros in a Series with the mode of its non-zero values.

    Parameters
    ----------
    x : pandas.Series
        Column in which the value 0 marks an entry to be imputed.

    Returns
    -------
    pandas.Series
        ``x`` itself when it contains no zeros; otherwise a copy in which every
        zero is replaced by the first value returned by ``Series.mode()`` over
        the non-zero entries, rounded to zero decimals. When no mode exists
        (e.g. the column is all zeros) an unchanged copy is returned instead of
        raising.
    """
    zero_mask = x == 0
    if not zero_mask.any():
        # Nothing to fill, so skip the mode computation altogether.
        return x

    filled_x = x.copy()
    mode_values = x[~zero_mask].mode()

    # An empty mode result means there is no non-zero value to impute from;
    # indexing it would raise, so return the column unchanged.
    if mode_values.empty:
        return filled_x

    filled_x.loc[zero_mask] = mode_values.round(0).iloc[0]
    return filled_x


def data_imputation(skip_cols_year, numeric_pure_cols_year, train_data, test_data):
    """Impute zero-coded entries in one year's train and test frames.

    Columns are left untouched when they are listed in ``skip_cols_year``, when
    ``isEnglishOrKorean`` classifies their name as Korean, or when they are one
    of 'major', 'knowcode', 'idx', 'text_response', 'ubda_cnt'. Every remaining
    column is filled with ``fill_mean`` if it appears in
    ``numeric_pure_cols_year`` and with ``fill_mode`` otherwise.

    Because the skip check comes first, a column that is in both lists is never
    imputed. Each frame is imputed from its own values (the test frame does not
    reuse train statistics).

    Parameters
    ----------
    skip_cols_year : list of str
        Column names that must not be imputed for this year.
    numeric_pure_cols_year : list of str
        Column names to be mean-filled rather than mode-filled.
    train_data, test_data : pandas.DataFrame
        Frames to impute. They are modified in place.

    Returns
    -------
    tuple of pandas.DataFrame
        The same ``train_data`` and ``test_data`` objects, after imputation.
    """
    always_skipped = ['major', 'knowcode', 'idx', 'text_response', 'ubda_cnt']

    for frame in (train_data, test_data):
        korean_named = [col for col in frame.columns if isEnglishOrKorean(col) == 'k']
        excluded = set(skip_cols_year) | set(korean_named) | set(always_skipped)

        for col in list(frame.columns):
            if col in excluded:
                continue
            # Same rule for train and test: the test "mean" columns used to be
            # filled with fill_mode by mistake.
            if col in numeric_pure_cols_year:
                frame[col] = fill_mean(frame[col])
            else:
                frame[col] = fill_mode(frame[col])

    return train_data, test_data

## 2017
# Items the questionnaire says may be skipped: aq1_2 .. aq41_2 plus a few bq items
skip_cols_2017 = (
    ['aq{}_2'.format(item_no) for item_no in range(1, 42)]
    + ['bq5_1', 'bq40', 'bq41_1', 'bq41_2', 'bq41_3']
)
# Columns that are mean-filled (instead of mode-filled) during imputation
numeric_pure_cols_2017 = ['bq23', 'bq37', 'bq41_1', 'bq41_2', 'bq41_3']

## 2018
# Items the questionnaire says may be skipped
skip_cols_2018 = [
    'bq5_1',
    'bq25_1',
    'bq39',
    'bq40',
    'bq41_1',
    'bq41_2',
    'bq41_3',
]
# Columns that are mean-filled (instead of mode-filled) during imputation
numeric_pure_cols_2018 = [
    'bq21',
    'bq36',
    'bq40',
    'bq41_1',
    'bq41_2',
    'bq41_3',
]

## 2019
# Items the questionnaire says may be skipped: kq1_2 .. kq33_2 plus a few bq items.
# The names are generated rather than typed out: the hand-written list was missing a
# comma between 'kq27_2' and 'kq28_2', which fused them into the single bogus name
# 'kq27_2kq28_2', so neither column was actually treated as skippable.
skip_cols_2019 = (
    ['kq{}_2'.format(item_no) for item_no in range(1, 34)]
    + ['bq5_1', 'bq29', 'bq30', 'bq31_1', 'bq31_2', 'bq31_3']
)
# Columns that are mean-filled (instead of mode-filled) during imputation
numeric_pure_cols_2019 = ['bq26', 'bq30', 'bq31_1', 'bq31_2', 'bq31_3']

## 2020
# Items the questionnaire says may be skipped: saq1_2 .. saq44_2 plus a few bq items
skip_cols_2020 = (
    ['saq{}_2'.format(item_no) for item_no in range(1, 45)]
    + ['bq5_1', 'bq28', 'bq29', 'bq30_1', 'bq30_2', 'bq30_3']
)
# Columns that are mean-filled (instead of mode-filled) during imputation
numeric_pure_cols_2020 = ['bq25', 'bq29', 'bq30_1', 'bq30_2', 'bq30_3']


## Export
# Per-year settings, in the same order as `years` and as the frames in know_train / know_test.
years = ['2017','2018','2019','2020']
skip_cols_list = [skip_cols_2017, skip_cols_2018, skip_cols_2019, skip_cols_2020]
numeric_pure_cols_list = [numeric_pure_cols_2017, numeric_pure_cols_2018, numeric_pure_cols_2019,numeric_pure_cols_2020]

# Impute each year's train/test pair and store the results back in place.
for idx, (skip_cols_year, numeric_pure_cols_year) in enumerate(zip(skip_cols_list, numeric_pure_cols_list)):
    train_data, test_data = data_imputation(skip_cols_year, numeric_pure_cols_year, know_train[idx], know_test[idx])
    know_train[idx], know_test[idx] = train_data, test_data

In [5]:
# STEP 1: build sim_df from the train_set and test_set of the same year
def get_tf_idf_sim_mat(train_data, test_data):
    """Build a document-by-document TF-IDF similarity matrix for one year.

    One document is made per row by joining 'text_response' and 'major' with a
    space. Train documents come first, followed by the test documents, so row /
    column ``i`` is train row ``i`` for ``i < len(train_data)`` and test row
    ``i - len(train_data)`` otherwise.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Frames that contain the 'text_response' and 'major' columns.

    Returns
    -------
    pandas.DataFrame
        Dense square matrix (default integer labels) holding the product of the
        TF-IDF matrix with its transpose. With the vectorizer's default L2 row
        normalisation, each entry is the cosine similarity of two documents.
    """
    def _documents(frame):
        # One string per row: "<text_response> <major>"
        return list(frame['text_response'] + ' ' + frame['major'])

    corpus = _documents(train_data) + _documents(test_data)

    # Placeholder answers to ignore (roughly "none", "blank", and "0").
    stopwords = ['없다', '공란', '0']
    vectorizer = TfidfVectorizer(min_df=1, stop_words=stopwords)
    doc_term_matrix = vectorizer.fit_transform(corpus)

    # Sparse matrix product of the TF-IDF matrix with its own transpose.
    similarities = doc_term_matrix @ doc_term_matrix.T

    return pd.DataFrame(similarities.toarray())

# STEP 2: for one test_set record, find the most similar train_set record and return its knowcode
def sim_pred(idx, sim_df, threshold, train_data):
    """Predict a knowcode for one test row from its most similar train row.

    Parameters
    ----------
    idx : int
        Position of the test record within its test set (0-based).
    sim_df : pandas.DataFrame
        Square similarity matrix whose first ``len(train_data)`` rows/columns
        are the train records, followed by the test records.
    threshold : float
        The best similarity must be strictly greater than this value for the
        match to be used.
    train_data : pandas.DataFrame
        Train frame with a 'knowcode' column, in the same row order that was
        used to build ``sim_df``.

    Returns
    -------
    The 'knowcode' of the most similar train row, or 0 (meaning "skipped") when
    the best similarity does not exceed ``threshold`` or there are no train rows.
    On ties the train row with the smallest position wins.
    """
    train_len = train_data.shape[0]

    # Test records sit after the train records in sim_df.
    test_row = sim_df.loc[idx + train_len]
    # Keep only the similarities against train records.
    train_sims = test_row.loc[test_row.index < train_len]
    if train_sims.empty:
        return 0

    # A single max lookup replaces sorting the whole row.
    best_train_pos = train_sims.idxmax()
    if train_sims.max() > threshold:
        # best_train_pos is a row POSITION in train_data, so use iloc; a
        # label-based lookup breaks when train_data has a non-default index.
        return train_data['knowcode'].iloc[best_train_pos]

    # Not similar enough: return 0 to signal that this record was skipped.
    return 0

In [6]:
# Let's derive the models first: one ExtraTrees classifier per year
years = ['2017','2018','2019','2020']

# Features are every column except the text/id columns and the target itself.
train_data = {
    year: {'X': df.drop(columns=['text_response','idx','major','knowcode']),
           'y': df['knowcode']}
    for year, df in zip(years, know_train)
}

RANDOM_STATE = 42
et_models = {}

for year in tqdm(years):
    year_features = train_data[year]['X']
    year_target = train_data[year]['y']
    et_models[year] = ExtraTreesClassifier(
        n_estimators=100, random_state=RANDOM_STATE, n_jobs=8
    ).fit(year_features, year_target)

100%|██████████| 4/4 [00:37<00:00,  9.45s/it]


In [7]:
# Prepare the test data: keep exactly the columns (and column order) the models were trained on
test_data = {
    year: {'X': df[train_data[year]['X'].columns]}
    for year, df in zip(years, know_test)
}

In [8]:
## Let's run the predictions

sim_predicts = []   # one prediction per test row, all years concatenated in order
threshold = 0.4     # a match is used only when its similarity is strictly above this

for year, num in tqdm(zip(years,range(4))):
    # First compute the similarity matrix for this year's train + test documents
    sim_df = get_tf_idf_sim_mat(know_train[num], know_test[num])
    n_test_rows = test_data[year]['X'].shape[0]

    # 0 marks rows without a similar enough train record
    sim_predicts.extend(
        sim_pred(row_pos, sim_df, threshold, know_train[num])
        for row_pos in range(n_test_rows)
    )

4it [01:33, 23.50s/it]


In [9]:
# Load the sample submission and overwrite its 'knowcode' column with the similarity-based predictions
submission = pd.read_csv('../data_0103/sample_submission.csv').assign(knowcode=sim_predicts)

In [10]:
# Fill every row that the similarity step left at 0 with the per-year ExtraTrees prediction.
non_sim_predicts = []   # not used in the visible cells; kept in case other cells reference it
idx_start = 0
idx_end = 0

# Collect the per-year pieces and concatenate once at the end instead of growing
# a DataFrame inside the loop (which started from an empty frame).
year_submissions = []

for year in tqdm(years):
    test_features = test_data[year]['X']

    # Rows of `submission` are the years' test sets laid end to end, so take the
    # next block of len(test_features) rows by position.
    idx_end = idx_start + test_features.shape[0]
    submission_year = submission.iloc[idx_start:idx_end].reset_index(drop=True)
    idx_start = idx_end

    # Positions of the rows that the similarity step marked as 0 (skipped)
    zero_indice = submission_year[submission_year['knowcode']==0].index

    # Predict only when something is left to fill: scikit-learn's predict()
    # raises on an input with zero rows.
    if len(zero_indice) > 0:
        # Select just those rows and let the model predict them ...
        test_data_zero = test_features.iloc[zero_indice]
        pred = et_models[year].predict(test_data_zero)

        # ... then write the predictions into the submission
        submission_year.loc[zero_indice,'knowcode'] = pred

    year_submissions.append(submission_year)

final_submission = pd.concat(year_submissions).reset_index(drop=True)

100%|██████████| 4/4 [00:06<00:00,  1.75s/it]


In [11]:
# Write the final predictions to a CSV in the working directory, without the DataFrame index column.
final_submission.to_csv('et_data0112_tf_idf_matrix_sim_ver5.csv', index=False)