# et_tf_idf_matrix_pca_Model

1. 데이터  
    * data_0119를 사용함  
    * data_0105에 있는 text_response열을 추가함
    * 기타 categorical 열에 대해 라벨인코딩 진행
# 
2. pca after getting tf-idf matrix  
    * STEP 1 : text_response열에 대한 tf_idf_matrix를 구함  
    * STEP 2 : tf_idf_matrix를 2차원으로 pca해서 train_data에 append 함
#      
3. fitting & Prediction  
    * et_400(ExtraTreesClassifier, n_estimators=400)으로 data를 학습하고, 예측함

# 라이브러리 import & 데이터 로딩 & 전처리

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from glob import glob
import warnings
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

# Notebook-wide settings: silence all warnings and show every DataFrame column.
warnings.filterwarnings(action='ignore')
pd.options.display.max_columns = None


def _read_csv_dir(pattern):
    """Read every CSV file matching ``pattern`` and return the frames in sorted path order."""
    return [pd.read_csv(csv_path) for csv_path in sorted(glob(pattern))]


# data_0119: main feature tables, one CSV per file found in each directory.
know_train = _read_csv_dir('../data_0119/train/*.csv')
know_test = _read_csv_dir('../data_0119/test/*.csv')

# data_0105: read only for its 'text_response' column (copied over further down).
other_train = _read_csv_dir('../data_0105/train/*.csv')
other_test = _read_csv_dir('../data_0105/test/*.csv')

# Add the 'text_response' column from data_0105 to the data_0119 frames, blank out
# placeholder answers, and only then append the 'major' text to it.
text_info_cols = ['text_response']

# Answers that are treated as "no answer" and replaced by an empty string.
EMPTY_ANSWERS = ['없다', '없음', '0', '무', '모름', '공란']

## train and test are processed identically
for know_frames, other_frames in ((know_train, other_train), (know_test, other_test)):
    for know_df, other_df in zip(know_frames, other_frames):
        # NaN + str is NaN, which would wipe out the whole document, so missing
        # answers are turned into empty strings first.
        know_df['text_response'] = other_df['text_response'].fillna('')

        for text_info_col in text_info_cols:
            # The comparison is an exact match, so it has to run BEFORE 'major' is
            # appended; afterwards the value is "<answer> <major>" and never equals
            # a bare placeholder.
            is_placeholder = know_df[text_info_col].isin(EMPTY_ANSWERS)
            know_df.loc[is_placeholder, text_info_col] = ''

        know_df['text_response'] = know_df['text_response'] + ' ' + know_df['major'].fillna('')

# encode train data
from sklearn.preprocessing import LabelEncoder
years = ['2017', '2018', '2019', '2020']

# year -> {column name -> LabelEncoder fitted on that year's training column}.
# Only columns that could not be converted with int() get an entry.
year_encoder = {}

for year, df in zip(years, know_train):
    print(year)
    encoders = {}

    for col in df.columns.difference(['text_response']):
        if col == 'idx':
            # 'idx' is left untouched; the print is a leftover marker that the
            # skip branch was reached.
            print("HEHE")
            continue

        try:
            df[col] = df[col].map(int)
        except (ValueError, TypeError, OverflowError):
            # int() failed on at least one value (non-numeric text, NaN, None, inf):
            # treat the column as categorical. Only the errors int() can raise are
            # caught so that real problems and Ctrl-C are not silently swallowed.
            encoder = LabelEncoder()
            df[col] = df[col].map(str)
            df[col] = encoder.fit_transform(df[col])
            encoders[col] = encoder

    year_encoder[year] = encoders

# encode test data
years = ['2017', '2018', '2019', '2020']

for year, df in zip(years, know_test):
    print(year)
    fitted_encoders = year_encoder[year]

    for col in df.columns.difference(['text_response']):
        if col == 'idx':
            # Mirrors the train loop, which leaves 'idx' untouched.
            continue

        # Use the encoding chosen on the TRAIN data instead of re-deciding from the
        # test values: a column that was label-encoded in train must go through the
        # same encoder even if its test values happen to be int-convertible.
        encoder = fitted_encoders.get(col)
        if encoder is None:
            # Column was converted with int() in train; do the same here. A failure
            # means the test column has values the train column never had
            # (e.g. NaN) and is raised rather than hidden.
            df[col] = df[col].map(int)
            continue

        category_map = {category: idx for idx, category in enumerate(encoder.classes_)}
        # Categories not seen in the train set are mapped to -1 (UNK).
        df[col] = df[col].map(str).map(category_map).fillna(-1).astype(int)


2017
HEHE
2018
HEHE
2019
HEHE
2020
HEHE
2017
2018
2019
2020


# pca after getting tf-idf matrix 

In [3]:
def get_tf_idf_df(know_data):
    """Build the dense TF-IDF matrix of the 'text_response' column.

    Parameters
    ----------
    know_data : DataFrame with a 'text_response' column; each value is used as
        one document.

    Returns
    -------
    DataFrame with one row per document and one column per term found by a
    freshly fitted ``TfidfVectorizer(min_df=1)``; columns are labelled 0..n-1.
    """
    documents = list(know_data['text_response'])

    vectorizer = TfidfVectorizer(min_df=1)
    # .toarray() materialises the sparse result as a dense array.
    dense_weights = vectorizer.fit_transform(documents).toarray()

    return pd.DataFrame(dense_weights)

def use_pca(preprocessed_data, n_component):
    """Standardise the input and project it onto ``n_component`` principal components.

    Parameters
    ----------
    preprocessed_data : 2-D numeric data accepted by ``StandardScaler``.
    n_component : number of principal components to keep.

    Returns
    -------
    DataFrame with one row per input row and columns ``0 .. n_component - 1``.
    Both the scaler and the PCA are fitted on the data passed in.
    """
    # Standardise each column with StandardScaler before PCA.
    standardized = StandardScaler().fit_transform(preprocessed_data)

    reducer = PCA(n_components=n_component)
    projected = reducer.fit_transform(standardized)

    return pd.DataFrame(projected, columns=range(n_component))

In [8]:
## Data preparation: append a 2-D PCA of the TF-IDF text features to each year's frame

years = ['2017','2018','2019','2020']
train_data = {}
test_data = {}
n_components = {}

# year -> (vectorizer, scaler, pca) fitted on that year's TRAIN text. The test split
# is only transformed with these, so its PCA columns live in the same feature space
# as the training columns (fitting separately on test gives unrelated axes).
text_pca_models = {}

# Fixed seed so the PCA result is reproducible even if a randomized solver is used.
PCA_RANDOM_STATE = 42


def _pca_frame(components, index):
    """Wrap PCA output in a DataFrame aligned to ``index`` with string column names.

    String names keep every column label of the final feature frame a string;
    mixing str and int labels is rejected by newer scikit-learn versions.
    """
    names = ['tfidf_pca_' + str(k) for k in range(components.shape[1])]
    return pd.DataFrame(components, columns=names, index=index)


print('train data 준비')
for year, df in tqdm(zip(years, know_train), total=len(years)):

    n_component = 2

    vectorizer = TfidfVectorizer(min_df=1)
    scaler = StandardScaler()
    pca = PCA(n_components=n_component, random_state=PCA_RANDOM_STATE)

    # Fit TF-IDF -> standardisation -> PCA on the training text only.
    tf_idf_matrix = vectorizer.fit_transform(list(df['text_response'])).toarray()
    tf_idf_pca = _pca_frame(pca.fit_transform(scaler.fit_transform(tf_idf_matrix)), df.index)

    train_pca = pd.concat([df,tf_idf_pca],axis=1).drop(['idx','knowcode','text_response','description'], axis = 1)

    train_data[year] = {'X': train_pca, # features without idx / target / raw text
                        'y': df['knowcode']}

    n_components[year] = n_component
    text_pca_models[year] = (vectorizer, scaler, pca)

print('train data 완료')
print('--------')
print('test_data 준비')
for year, df in tqdm(zip(years, know_test), total=len(years)):

    vectorizer, scaler, pca = text_pca_models[year]

    # Re-use the transformers fitted on train; nothing is fitted on the test split.
    tf_idf_matrix = vectorizer.transform(list(df['text_response'])).toarray()
    tf_idf_pca = _pca_frame(pca.transform(scaler.transform(tf_idf_matrix)), df.index)

    test_pca = pd.concat([df,tf_idf_pca],axis=1).drop(['idx','text_response'], axis = 1)

    test_data[year] = test_pca

print('test_data 완료')

train data 준비


4it [00:39,  9.84s/it]


train data 완료
--------
test_data 준비


4it [00:43, 10.83s/it]

test_data 완료





# fitting & Prediction  

In [10]:
## Model training: one ExtraTrees model per year, predictions collected in one list
from sklearn.ensemble import ExtraTreesClassifier

RANDOM_STATE = 42

# Predictions are appended in the iteration order of `years`.
et_predicts = []
for year in tqdm(years):
    year_train = train_data[year]

    # train
    model = ExtraTreesClassifier(n_estimators=400, random_state=RANDOM_STATE, n_jobs=8)
    model.fit(year_train['X'], year_train['y'])

    # predict
    pred = model.predict(test_data[year])
    et_predicts += list(pred)

100%|██████████| 4/4 [07:58<00:00, 119.73s/it]


In [7]:
# Load the sample submission, overwrite its 'knowcode' column with the collected
# predictions, and write the result next to the notebook.
submission = pd.read_csv('../data_0103/sample_submission.csv').assign(knowcode=et_predicts)
submission.to_csv('et_400_data0119_append_tf_idf_matrix_pca.csv', index=False)