# 데이터 임포트

In [5]:
import pandas as pd
import numpy as np
from glob import glob
from tqdm import tqdm
import warnings

RANDOM_STATE = 42
np.seed = 42
DATA_PATH = "../data_0115/"

warnings.filterwarnings(action='ignore')
PATH_2017 = DATA_PATH + "train/KNOW_2017.csv"
PATH_2018 = DATA_PATH + "train/KNOW_2018.csv"
PATH_2019 = DATA_PATH + "train/KNOW_2019.csv"
PATH_2020 = DATA_PATH + "train/KNOW_2020.csv"

paths = [PATH_2017, PATH_2018, PATH_2019, PATH_2020]

know_train = [pd.read_csv(path) for path in paths]

TEST_PATH_2017 = DATA_PATH + "test/KNOW_2017_test.csv"
TEST_PATH_2018 = DATA_PATH + "test/KNOW_2018_test.csv"
TEST_PATH_2019 = DATA_PATH + "test/KNOW_2019_test.csv"
TEST_PATH_2020 = DATA_PATH + "test/KNOW_2020_test.csv"

TEST_PATHs = [TEST_PATH_2017, TEST_PATH_2018, TEST_PATH_2019, TEST_PATH_2020]

know_test = [pd.read_csv(path) for path in TEST_PATHs]

# String Compare

In [6]:
text_info_cols = {"2017": ['sim_job','bef_job','able_job','major'],
                  "2018": ['sim_job','bef_job','able_job','major'],
                  "2019": ['bef_job','able_job','major'],
                  "2020": ['major'],}

In [7]:
description_2017 = pd.read_csv('../data_pdf_description/pdf_description_2017.csv')
description_2018 = pd.read_csv('../data_pdf_description/pdf_description_2018.csv')
description_2019 = pd.read_csv('../data_pdf_description/pdf_description_2019.csv')
description_2020 = pd.read_csv('../data_pdf_description/pdf_description_2020.csv')
description_dfs = {"2017": description_2017,
                  "2018": description_2018,
                  "2019": description_2019,
                  "2020": description_2020,}

In [8]:
years = ["2017", "2018", "2019", "2020"]

for i, year in enumerate(years):
    for text_info_col in text_info_cols[year]:
        know_train[i].loc[know_train[i][text_info_col]=='없다', text_info_col] = ''
        know_train[i].loc[know_train[i][text_info_col]=='없음', text_info_col] = ''
        know_train[i].loc[know_train[i][text_info_col]=='0', text_info_col] = ''
        know_train[i].loc[know_train[i][text_info_col]=='무', text_info_col] = ''
        know_train[i].loc[know_train[i][text_info_col]=='모름', text_info_col] = ''

In [9]:
!pip install jellyfish

Collecting jellyfish
  Downloading jellyfish-0.9.0-cp38-cp38-win_amd64.whl (26 kB)
Installing collected packages: jellyfish
Successfully installed jellyfish-0.9.0


In [11]:
import jellyfish
from difflib import SequenceMatcher
from tqdm.notebook import tqdm

def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

In [12]:
years = ["2017", "2018", "2019", "2020"]
methods = {'SequenceMatcher':['knowcode_','similarity_'],
           'levenshtein_distance':['knowcode_lev_','similarity_lev_'],
           'damerau':['knowcode_dlev_','similarity_dlev_'],
           'jaro_distance':['knowcode_jaro_','similarity_jaro_'],
           'jaro_winkler':['knowcode_jarow_','similarity_jarow_'],
           'hamming_distance':['knowcode_ham_','similarity_ham_']}

def string_compare(know_data, text_info_cols, description_dfs, method):

    data = know_data.copy()
    
    for i, year in enumerate(years):
        # iterate 4 years
        print(year)
        for text_info_col in text_info_cols[year]:
            print(text_info_col)
            # iterate sim_job, bef_job, ...
            text_info_list = list(data[i][text_info_col])
        
            knowcode_text_info_col = []
            similarity_text_info_col = []

            for possible_answer in tqdm(text_info_list):
                # iterate each string in know_train[i][sim_job, bef_job, ...]
                knowcode = "0"
                similarity = 0.0
                max_similarity_index = 0
                for descr_row in description_dfs[year].itertuples():
                    # iterate each row in description_dfs[year]
                    if possible_answer != '':
                        if method == 'SequenceMatcher':
                            score = similar(possible_answer, descr_row.description)
                        elif method == 'levenshtein_distance':
                            score = jellyfish.levenshtein_distance(possible_answer, descr_row.description)
                        elif method == 'damerau':
                            score = jellyfish.damerau_levenshtein_distance(possible_answer, descr_row.description)
                        elif method == 'jaro_distance':
                            score = jellyfish.jaro_distance(possible_answer, descr_row.description)
                        elif method == 'jaro_winkler':
                            score = jellyfish.jaro_winkler(possible_answer, descr_row.description)
                        elif method == 'hamming_distance':
                            score = jellyfish.hamming_distance(possible_answer, descr_row.description)
                        
                        if score > similarity:
                            similarity = score
                            max_similarity_index = descr_row.Index
                if similarity == 0:
                    knowcode_text_info_col.append("0")
                else:
                    knowcode_text_info_col.append(description_dfs[year].iloc[max_similarity_index, 0])
                similarity_text_info_col.append(similarity)
            data[i][methods[method][0] + text_info_col] = knowcode_text_info_col
            data[i][methods[method][1] + text_info_col] = similarity_text_info_col
        
    return data

In [None]:
# 6가지의 방법을 통해 결과를 구해봅시다
# 한 시간정도 걸리네요(6가지 방법 x 4개 연도 = 24개의 df를 생성하는 것이니 꽤 걸려여)

six_method_df_list = []
for method in tqdm(methods.keys()):
    data = string_compare(know_train, text_info_cols, description_dfs, method)
    six_method_df_list.append(data)

In [29]:
# 데이터 프레임에 각 연도별, 방법별 칼럼을 저장합니다
# 중요한 모든 칼럼을 가지고 있음
year_methods_df = pd.DataFrame(index=[years],columns=list(methods.keys()))

for method in methods.keys():
    for year in years:
        value_list = []
        text_cols = text_info_cols[year]
        methods_cols = methods[method]
        for text_col in text_cols:
            for methods_col in methods_cols:
                value_list.append(methods_col + text_col)
        value_list.append('knowcode')
        value_string = ''
        for value in value_list:
            value_string += value
            value_string += ' '
        year_methods_df.loc[year, method] = value_string[:-1]


In [31]:
# 맞춘 개수를 연도별로, 방법별로 모아봅시다.
Accuracy_dict = {}
for method in methods.keys():
    Accuracy_dict[method] = {}
for method in methods.keys():
    for year in years:
       Accuracy_dict[method][year] = pd.DataFrame() 

In [70]:
# 연도와 방법을 순회하면서 몇 개를 맞췄는지 비교해봅니다
# Accuracy_dict에 결과를 담을것이고, Structure는 다음과 같습니다
# method 선정 -> year 4개년도 순회 -> 각 연도별로 많이 맞춘 컬럼찾기
# {method_1 : {year 1 : [dataFrame(cols = [text_info_cols], index = Accuracy)],
#              year 2 : [dataFrame(cols = [text_info_cols], index = Accuracy)],
#              year 3 : [dataFrame(cols = [text_info_cols], index = Accuracy)],
#              year 4 : [dataFrame(cols = [text_info_cols], index = Accuracy)]}....}
def get_best_method(df):
    df_T = df.T
    max_p = list(df_T.max())[0]
    best_col = list(df_T[df_T['Accuracy']==max_p].index)[0]
    return best_col

for i, method in enumerate(methods.keys()):
    data = six_method_df_list[i]
    for k, year in enumerate(years):
        accuracy_cols_string = list(year_methods_df.loc[year, method])[0] # 필요한 칼럼들을 가져옵니다
        accuracy_cols = accuracy_cols_string.split(' ')
        result_df = pd.DataFrame(index=['Accuracy'],columns=accuracy_cols) # 데이터 프레임 준비
        
        for col in accuracy_cols: # Accuracy 구하기
            accuracy = data[k].loc[data[k]['knowcode']==data[k][col]].shape[0]
            result_df[col] = accuracy
        
        result_df = result_df.drop('knowcode',axis=1)
        best_col = get_best_method(result_df)
        result_df['best_col'] = best_col
        Accuracy_dict[method][year] = result_df

In [77]:
# 결과 요약 : 가장 성능이 좋았던 칼럼 찾기
best_col_df = pd.DataFrame(index=[years],columns=list(methods.keys()))

for year in years:
    for method in methods:
        best_col_df.loc[year,method] = list(Accuracy_dict[method][year]['best_col'])[0]
        
# 결과 요약 : 성능이 좋았던 칼럼이 맞춘 개수 찾기
best_acc_df = pd.DataFrame(index=[years],columns=list(methods.keys()))

for year in years:
    for method in methods:
        best_col = list(Accuracy_dict[method][year]['best_col'])[0]
        best_acc_df.loc[year,method] = list(Accuracy_dict[method][year][best_col])[0]

In [78]:
best_col_df

Unnamed: 0,SequenceMatcher,levenshtein_distance,damerau,jaro_distance,jaro_winkler,hamming_distance
2017,knowcode_major,knowcode_lev_major,knowcode_dlev_major,knowcode_jaro_major,knowcode_jarow_major,knowcode_ham_major
2018,knowcode_major,knowcode_lev_sim_job,knowcode_dlev_sim_job,knowcode_jaro_major,knowcode_jarow_major,knowcode_ham_sim_job
2019,knowcode_major,knowcode_lev_major,knowcode_dlev_major,knowcode_jaro_major,knowcode_jarow_major,knowcode_ham_major
2020,knowcode_major,knowcode_lev_major,knowcode_dlev_major,knowcode_jaro_major,knowcode_jarow_major,knowcode_ham_major


In [79]:
best_acc_df

Unnamed: 0,SequenceMatcher,levenshtein_distance,damerau,jaro_distance,jaro_winkler,hamming_distance
2017,692,9,9,689,696,15
2018,524,0,0,724,705,0
2019,512,14,14,703,694,15
2020,703,12,12,701,700,15
