## 1. Module Import

In [1]:
# Self-defined modules
from myModules.utils import DataLoader
from myModules.preprocess.korean import cleaning, remove_stopword, tagging, tokenizing, to_pickle


# General Module
import pandas as pd
import warnings
from tqdm.notebook import tqdm

warnings.filterwarnings('ignore')

# Read File
import glob

## 2. Data Load

In [19]:
# Root directory of the raw input data. Every directory constant in this cell
# ends with '/' so it can be concatenated directly with a file name or pattern.
DATA_ROOT = './Data/3구간/'

# Input directories, one per period.
PERIOD_1 = DATA_ROOT + '1시기/1시기_TT/'
PERIOD_2 = DATA_ROOT + '2시기/2시기_TT/'
PERIOD_3 = DATA_ROOT + '3시기/3시기_TT/'

# Root directory for the preprocessed output.
RESULT_ROOT = './processed-data/'

# Output directories, one per period. RESULT_ROOT already ends with '/', so the
# sub-paths must not start with one; a leading '/' here would yield
# './processed-data//period-N/TT/'.
RESULT_1 = RESULT_ROOT + 'period-1/TT/'
RESULT_2 = RESULT_ROOT + 'period-2/TT/'
RESULT_3 = RESULT_ROOT + 'period-3/TT/'

In [3]:
# Collect every .txt file of each period. glob.glob() returns matches in
# arbitrary, filesystem-dependent order, so each list is sorted to keep the
# document order (and everything derived from it) reproducible across runs.
files_1 = sorted(glob.glob(PERIOD_1 + '*.txt'))
files_2 = sorted(glob.glob(PERIOD_2 + '*.txt'))
files_3 = sorted(glob.glob(PERIOD_3 + '*.txt'))

# Load the files of each period through the project's DataLoader in 'TT' mode.
texts_1 = DataLoader(files_1, mode='TT')
texts_2 = DataLoader(files_2, mode='TT')
texts_3 = DataLoader(files_3, mode='TT')

## 3. PreProcess

### 3-1. Data Cleaning

In [4]:
# Apply the project's `cleaning` step to the loaded texts of each period,
# in period order (1, 2, 3).
cleaned_1, cleaned_2, cleaned_3 = (
    cleaning(period_texts) for period_texts in (texts_1, texts_2, texts_3)
)

### 3-2. Tagging

In [5]:
# Run the project's `tagging` step on the cleaned texts of each period.
# map() is consumed by the unpacking, so the calls happen in period order.
tagged_1, tagged_2, tagged_3 = map(tagging, (cleaned_1, cleaned_2, cleaned_3))

POS tagging:   0%|          | 0/10 [00:00<?, ?it/s]

POS tagging:   0%|          | 0/10 [00:00<?, ?it/s]

POS tagging:   0%|          | 0/11 [00:00<?, ?it/s]

### 3-3. Remove Stopword

In [6]:
# Load the pickled POS table; it is passed to `tokenizing` as its second
# argument further down.
pos_table_path = './processed-data/pos-table.pkl'
tagList = pd.read_pickle(pos_table_path)

In [7]:
# POS tags treated as stop tags; handed to `remove_stopword` together with the
# stopword list built below.
stop_tag_list = [
    'IC', 'JC', 'JK', 'JKC', 'JKG', 'JKI', 'JKM', 'JKO', 'JKQ', 'JKS', 'JX', 'EPH',
    'EPT', 'EPP', 'EFN', 'EFQ', 'EFO', 'EFA', 'EFI', 'EFR', 'ECE', 'ECD', 'ECS', 'ETN', 'ETD',
    'XSN', 'XSV', 'XSA', 'UN', 'OH', 'OL', 'ON', 'XPN', 'XPV', 'XR',
]

# Read the raw stopword file (one entry per line).
with open("./Data/Kor_stopwords.txt", 'r', encoding='utf-8') as f:
    stopword = f.read()

# Build the stopword list: strip surrounding whitespace from each line and skip
# blank lines, so a trailing newline in the file no longer adds an empty-string
# entry and whitespace-padded entries still match their tokens.
Kor_stopwords = [entry.strip() for entry in stopword.splitlines() if entry.strip()]

In [8]:
# Run `remove_stopword` on the tagged output of each period, using the same
# stop-tag list and stopword list for all three.
wo_stopword_1, wo_stopword_2, wo_stopword_3 = [
    remove_stopword(period_tagged, stop_tag_list, Kor_stopwords)
    for period_tagged in (tagged_1, tagged_2, tagged_3)
]

Removing Stop Words:   0%|          | 0/10 [00:00<?, ?it/s]

Removing Stop Words:   0%|          | 0/10 [00:00<?, ?it/s]

Removing Stop Words:   0%|          | 0/11 [00:00<?, ?it/s]

### 3-4. Tokenize

#### All pos

In [9]:
# Tokenize each period with the 'all' selector
# (presumably keeps every part of speech; confirm in myModules.preprocess.korean).
all_1, all_2, all_3 = [
    tokenizing(period_tokens, tagList, 'all')
    for period_tokens in (wo_stopword_1, wo_stopword_2, wo_stopword_3)
]

#### Noun

In [10]:
# Tokenize each period with the 'noun' selector.
noun_1, noun_2, noun_3 = [
    tokenizing(period_tokens, tagList, 'noun')
    for period_tokens in (wo_stopword_1, wo_stopword_2, wo_stopword_3)
]

#### Verb

In [11]:
# Tokenize each period with the 'verb' selector.
verb_1, verb_2, verb_3 = [
    tokenizing(period_tokens, tagList, 'verb')
    for period_tokens in (wo_stopword_1, wo_stopword_2, wo_stopword_3)
]

#### Adjective

In [12]:
# Tokenize each period with the 'adjective' selector.
adjective_1, adjective_2, adjective_3 = [
    tokenizing(period_tokens, tagList, 'adjective')
    for period_tokens in (wo_stopword_1, wo_stopword_2, wo_stopword_3)
]

#### Adverb

In [13]:
# Tokenize each period with the 'adverb' selector.
adverb_1, adverb_2, adverb_3 = [
    tokenizing(period_tokens, tagList, 'adverb')
    for period_tokens in (wo_stopword_1, wo_stopword_2, wo_stopword_3)
]

## 4. Save PreProcessed Data

#### All pos

In [20]:
# Hand the 'all' tokens of each period to `to_pickle` with that period's
# result directory (presumably written as a pickle named after file_name;
# confirm in myModules.preprocess.korean).
for _tokens, _root in zip((all_1, all_2, all_3), (RESULT_1, RESULT_2, RESULT_3)):
    to_pickle(_tokens, file_name='all', root=_root)

#### Noun

In [21]:
# Hand the 'noun' tokens of each period to `to_pickle` with that period's
# result directory.
for _tokens, _root in zip((noun_1, noun_2, noun_3), (RESULT_1, RESULT_2, RESULT_3)):
    to_pickle(_tokens, file_name='noun', root=_root)

#### Verb

In [22]:
# Hand the 'verb' tokens of each period to `to_pickle` with that period's
# result directory.
for _tokens, _root in zip((verb_1, verb_2, verb_3), (RESULT_1, RESULT_2, RESULT_3)):
    to_pickle(_tokens, file_name='verb', root=_root)

#### Adjective

In [23]:
# Hand the 'adjective' tokens of each period to `to_pickle` with that period's
# result directory.
for _tokens, _root in zip(
    (adjective_1, adjective_2, adjective_3), (RESULT_1, RESULT_2, RESULT_3)
):
    to_pickle(_tokens, file_name='adjective', root=_root)

#### Adverb

In [24]:
# Hand the 'adverb' tokens of each period to `to_pickle` with that period's
# result directory.
for _tokens, _root in zip((adverb_1, adverb_2, adverb_3), (RESULT_1, RESULT_2, RESULT_3)):
    to_pickle(_tokens, file_name='adverb', root=_root)