In [2]:
from konlpy.tag import Kkma
import pandas as pd
import tensorflow as tf
import enum
import os
import re
from sklearn.model_selection import train_test_split
import numpy as np
from configs import DEFINES


# Special vocabulary tokens: padding, decoder start, sequence end, and
# the placeholder for words missing from the dictionary.
PAD, STD, END, UNK = "<PAD>", "<SOS>", "<END>", "<UNK>"

# Positions of the special tokens; they mirror the order of MARKER below.
PAD_INDEX, STD_INDEX, END_INDEX, UNK_INDEX = 0, 1, 2, 3

# Special tokens in index order.
MARKER = [PAD, STD, END, UNK]

ModuleNotFoundError: No module named 'configs'

In [3]:
def load_data():
    """Read the chatbot corpus and split it into train and test columns.

    The CSV at ``./data_in/ChatBotData.csv`` is partitioned with
    ``train_test_split`` using its default settings.

    Returns:
        tuple: ``(train_q, train_a, test_q, test_a)`` -- the "Q" and "A"
        columns of the train partition followed by those of the test
        partition.
    """
    corpus = pd.read_csv("./data_in/ChatBotData.csv")
    train_part, test_part = train_test_split(corpus)

    return train_part["Q"], train_part["A"], test_part["Q"], test_part["A"]

In [5]:
def prepro_noise_canceling(data):
    """Strip punctuation noise from a sentence or a collection of sentences.

    The characters ``~ . , ! ? " ' : ; ) (`` are removed.

    Args:
        data: A single string, or an iterable of strings (list, pandas
            Series, ...).

    Returns:
        The cleaned string when ``data`` is a string; otherwise a list with
        one cleaned string per element of ``data``.
    """
    noise_pattern = "[~.,!?\"':;)(]"

    if isinstance(data, str):
        return re.sub(noise_pattern, "", data)

    # re.sub only accepts a single string, so collections are cleaned
    # element by element.
    return [re.sub(noise_pattern, "", sentence) for sentence in data]

In [6]:
def tokenizing_data(data):
    """Tokenize each sentence with the Kkma analyzer.

    All spaces are removed from a sentence before it is passed to
    ``Kkma.morphs``; the resulting tokens are joined back with single
    spaces.

    Args:
        data: Iterable of sentence strings.

    Returns:
        list: One space-joined token string per input sentence.
    """
    # Imported here because the notebook's import cell does not provide
    # tqdm; without it the loop below fails with NameError.
    from tqdm import tqdm

    kkma = Kkma()
    tokens_list = []

    for sentence in tqdm(data):
        tokens = " ".join(kkma.morphs(sentence.replace(" ", "")))
        tokens_list.append(tokens)

    return tokens_list

In [7]:
def enc_processing(value, dictionary):
    """Convert sentences into fixed-length index sequences for the encoder.

    Each sentence is cleaned with ``prepro_noise_canceling``, split on
    whitespace, mapped to indices (unknown words map to the ``UNK`` index),
    truncated to ``DEFINES.max_sequence_length`` and right-padded with the
    ``PAD`` index.

    Args:
        value: Iterable of sentence strings.
        dictionary: Mapping from word to integer index; must contain the
            ``PAD`` token, and the ``UNK`` token if unknown words occur.

    Returns:
        tuple: ``(indices, lengths)`` where ``indices`` is a NumPy array with
        one row of ``DEFINES.max_sequence_length`` indices per sentence and
        ``lengths`` is a list holding the number of non-padding entries of
        each row.
    """
    max_len = DEFINES.max_sequence_length
    seq_input_index = []
    seq_len = []

    for seq in value:
        # Clean one sentence at a time: the regex substitution works on a
        # single string, not on the whole collection.
        seq = prepro_noise_canceling(seq)

        # Dictionary values are scalar indices, so they are collected one
        # by one (list.extend would try to iterate over the integer).
        seq_index = [
            dictionary[word] if dictionary.get(word) is not None else dictionary[UNK]
            for word in seq.split()
        ]

        seq_index = seq_index[:max_len]
        seq_len.append(len(seq_index))

        seq_index += (max_len - len(seq_index)) * [dictionary[PAD]]
        seq_input_index.append(seq_index)

    return np.asarray(seq_input_index), seq_len

In [1]:
def dec_input_processing(value, dictionary):
    """Convert sentences into fixed-length index sequences for the decoder input.

    Each sentence is cleaned with ``prepro_noise_canceling``, split on
    whitespace and mapped to indices (unknown words map to the ``UNK``
    index). The ``STD`` index is placed in front of the word indices, the
    result is truncated to ``DEFINES.max_sequence_length`` and right-padded
    with the ``PAD`` index.

    Args:
        value: Iterable of sentence strings.
        dictionary: Mapping from word to integer index; must contain the
            ``STD`` and ``PAD`` tokens, and the ``UNK`` token if unknown
            words occur.

    Returns:
        tuple: ``(indices, lengths)`` where ``indices`` is a NumPy array with
        one row of ``DEFINES.max_sequence_length`` indices per sentence and
        ``lengths`` is a list holding the number of non-padding entries of
        each row (the start token included).
    """
    max_len = DEFINES.max_sequence_length
    seq_input_index = []
    seq_len = []

    for seq in value:
        # Clean one sentence at a time: the regex substitution works on a
        # single string, not on the whole collection.
        seq = prepro_noise_canceling(seq)

        # The start token is added once, before the words; resetting it
        # inside the word loop would discard every word but the last.
        seq_index = [dictionary[STD]]

        for word in seq.split():
            if dictionary.get(word) is not None:
                seq_index.append(dictionary[word])
            else:
                # Use the UNK *index*, not the characters of the token text.
                seq_index.append(dictionary[UNK])

        seq_index = seq_index[:max_len]
        seq_len.append(len(seq_index))

        seq_index += (max_len - len(seq_index)) * [dictionary[PAD]]
        seq_input_index.append(seq_index)

    return np.asarray(seq_input_index), seq_len

In [1]:
def dec_target_processing(value, dictionary):
    """Convert sentences into fixed-length index sequences for the decoder target.

    Each sentence is cleaned with ``prepro_noise_canceling``, split on
    whitespace and mapped to indices (unknown words map to the ``UNK``
    index, matching ``enc_processing`` and ``dec_input_processing``). At
    most ``DEFINES.max_sequence_length - 1`` word indices are kept, the
    ``END`` index is appended, and the row is right-padded with the ``PAD``
    index.

    Args:
        value: Iterable of sentence strings.
        dictionary: Mapping from word to integer index; must contain the
            ``END`` and ``PAD`` tokens, and the ``UNK`` token if unknown
            words occur.

    Returns:
        numpy.ndarray: One row of ``DEFINES.max_sequence_length`` indices per
        sentence.
    """
    max_len = DEFINES.max_sequence_length
    seq_input_index = []

    for seq in value:
        # Clean one sentence at a time: the regex substitution works on a
        # single string, not on the whole collection.
        seq = prepro_noise_canceling(seq)

        # Fall back to UNK instead of raising KeyError on unseen words.
        seq_index = [
            dictionary[word] if dictionary.get(word) is not None else dictionary[UNK]
            for word in seq.split()
        ]

        # Leave room for the end token when truncating.
        seq_index = seq_index[:max_len - 1] + [dictionary[END]]

        seq_index += (max_len - len(seq_index)) * [dictionary[PAD]]
        seq_input_index.append(seq_index)

    return np.asarray(seq_input_index)

In [2]:
def load_voc():
    """Load the vocabulary, building and caching it on first use.

    When no file exists at ``DEFINES.vocabulary_path``, the vocabulary is
    built from the "Q" and "A" columns of ``./data_in/ChatBotData.csv``:
    every sentence is tokenized with ``tokenizing_data``, cleaned with
    ``prepro_noise_canceling`` and split into individual words. The unique
    words, preceded by the special tokens in ``MARKER``, are written one
    per line to the vocabulary file. The file is then read back.

    Returns:
        tuple: ``(char2idx, idx2char, vocabulary_size)`` -- the word-to-index
        mapping, the index-to-word mapping, and the number of entries in
        the word-to-index mapping.
    """
    if not os.path.exists(DEFINES.vocabulary_path):
        data_df = pd.read_csv("./data_in/ChatBotData.csv")

        sentences = []
        sentences.extend(data_df["Q"])
        sentences.extend(data_df["A"])

        # tokenizing_data yields one space-joined string per sentence, so
        # each result must be split to obtain word-level entries; the same
        # noise removal as in the encoding functions keeps lookups
        # consistent with the stored vocabulary.
        unique_words = set()
        for tokenized in tokenizing_data(sentences):
            unique_words.update(prepro_noise_canceling(tokenized).split())

        # The special tokens are written first, exactly once.
        unique_words.difference_update(MARKER)

        # Sorting makes the generated file independent of set ordering.
        words = MARKER + sorted(unique_words)

        with open(DEFINES.vocabulary_path, 'w', encoding='utf-8') as voc_file:
            for word in words:
                voc_file.write(word + "\n")

    with open(DEFINES.vocabulary_path, 'r', encoding='utf-8') as voc_file:
        voc_list = [line.strip() for line in voc_file]

    char2idx, idx2char = make_voc(voc_list)

    return char2idx, idx2char, len(char2idx)

In [1]:
def make_voc(voc_list):
    """Build the two lookup tables for a vocabulary.

    Args:
        voc_list: Ordered collection of vocabulary words; a word's position
            is its index.

    Returns:
        tuple: ``(word_to_idx, idx_to_word)`` -- a dict mapping each word to
        its position and a dict mapping each position to its word. If a
        word occurs more than once, ``word_to_idx`` keeps its last position.
    """
    word_to_idx = {}
    for position, token in enumerate(voc_list):
        word_to_idx[token] = position

    idx_to_word = dict(enumerate(voc_list))

    return word_to_idx, idx_to_word