# English Data PreProcessing

## 1. Module Import

In [53]:
# self defined Modules
from myModules.utils import merge
from myModules.preprocess.english import cleaning, remove_stopwords, tagging, dot_and_apostrophe, lemmatization_spacy, to_pickle, check_pos, pos_correction

# General Modules
import pandas as pd
import warnings

warnings.filterwarnings('ignore')

# NLP
import nltk
from nltk.tokenize import word_tokenize

## 2. Data Loader

In [54]:
# Input document (stored as UTF-16) and the output directory for this corpus.
DATA_ROOT = './Data/조선사진첩(1925) 영문 텍스트.txt'
RESULT_ROOT = './Result/chosun/'

# Read the whole file as a single string and wrap it in a one-element list,
# since the downstream steps iterate over a list of documents.
with open(DATA_ROOT, 'r', encoding='UTF-16') as f:
    text = [f.read()]

## 3. PreProcess

### 3-1. Data Cleaning

- `dot(.)`과 `apostrophe(')`는 제거하지 않음

In [55]:
# Clean the raw documents. Dots (.) and apostrophes (') are intentionally
# kept at this stage; they are handled after tokenization.
cleaned = cleaning(data=text)

### 3-2. Tokenizing

In [56]:
# Tokenize each cleaned document with NLTK; the result is one token list per document.
tokenized = [word_tokenize(document) for document in cleaned]

In [57]:
# Helper object used in the following cells to inspect and process tokens
# that contain an apostrophe or a dot.
symbol = dot_and_apostrophe(tokenized)

##### apostrophe와 dot을 가진 token들 시각화

In [58]:
# Print the distinct tokens that contain an apostrophe, then those that
# contain a dot, so the exception lists can be chosen by inspection.
symbol.token_with_apostrophe()
symbol.token_with_dot()

apostrophe를 가진 token : 
{"'s", "girls'higher", "'"}
dot을 가진 token : 
{'.'}


##### exception 목록 설정

In [59]:
# Tokens that should be left untouched by the apostrophe / dot processing.
# The possessive marker "'s" is kept; no dot-containing token is exempted.
apostrophe_exception = ["'s"]
dot_exception = []

# Register both exception lists on the helper object.
symbol.set_exception(apostrophe_exception=apostrophe_exception, dot_exception=dot_exception)

In [60]:
# Print the registered exception lists to confirm they were set as intended.
symbol.print_exception()

apostrophe exceptions : 
["'s"]
dot exceptions : 
[]


##### apostrophe 처리

In [61]:
# Process apostrophes in the tokenized data; the call prints the set of
# tokens it processed (tokens in the exception list are not among them).
tokenized_ = symbol.remove_apostrophe(data=tokenized)

Processed Tokens : 
{"girls'higher", "'"}


##### dot 처리

In [62]:
# Process dots in the apostrophe-processed data; the call prints the set of
# tokens it processed.
tokenized__ = symbol.remove_dot(data=tokenized_)

Processed Tokens : 
{'', '.'}


##### 제거해야할 token 검사

In [63]:
# Report tokens that are still considered invalid after symbol processing
# (here: the empty string and the single-character token 'a').
symbol.check_invalid_tokens(data=tokenized__)

Remaining invalid Symbol : {'', 'a'}


##### 길이가 1이거나 필요없는 특수문자인 Token들 삭제

In [64]:
# Drop the invalid tokens reported by the check; the call prints the set of
# removed tokens.
tokenized___ = symbol.remove_invalid_tokens(data=tokenized__)

Removed Tokens : 
{'', 'a'}


##### 남아있는 invalid한 token이 있는지 검사

In [65]:
# Sanity check: re-run the invalid-token check on the final token lists to
# confirm that nothing invalid remains.
symbol.check_invalid_tokens(data=tokenized___)

There is no invalid symbol


### 3-3. Remove StopWords

In [66]:
# Base English stop-word list shipped with NLTK.
stopwords = nltk.corpus.stopwords.words('english')

# Corpus-specific additions: modal verbs, filler words, contraction
# fragments produced by the tokenizer ("'ll", "'ve", "n't"), and 'ofthe'
# (presumably a run-together "of the" in the source text -- confirm).
# Each word is listed exactly once.
new_stopwords = [
    'would', 'could', 'might', 'need', 'can', 'must',
    'one', 'two', 'upon', 'may', 'perhaps', 'living', 'seem', 'also', 'ii', 'ofthe',
    'much', 'therefore', "'ll", "'ve", "n't",
]

# Filter the tokens using both the base list and the additions.
wo_stopword = remove_stopwords(tokenized___, stopwords, new_stopwords)

### 3-4. Tagging

In [67]:
# Load the pickled POS table; it is passed to lemmatization_spacy as
# `pos_table` in the lemmatization step.
# NOTE: pickle files should only be loaded from trusted sources.
pos_table = pd.read_pickle("processed-data/pos-table.pkl")

In [68]:
# POS-tag the stop-word-filtered tokens (shows a progress bar, one step per document).
tagged = tagging(wo_stopword)

  0%|          | 0/1 [00:00<?, ?it/s]

In [69]:
# Helper object for inspecting the tags assigned to symbol-related tokens.
pos = check_pos(tagged)

In [70]:
# Print the tags given to tokens that still contain an apostrophe or a dot.
pos.pos_with_symbol()

tagged token with apostrophe : 
{"'s": {'POS'}}
tagged token with dot : 
{}


In [71]:
# Print the tags given to the same tokens with the symbol stripped
# (e.g. 's' for "'s") -- presumably to compare against the tags printed by
# pos_with_symbol; confirm against check_pos.
pos.pos_without_symbol()

tagged token without apostrophe : 
{'s': ['NN']}
tagged token without dot : 
{}


### 3-5. Correct POS Tags

In [72]:
# Manual POS overrides: each of these tokens is forced to the noun tag 'NN'.
pos_correction_dict = {
    'keijo': 'NN',
    'temple': 'NN',
    'japan': 'NN',
    'heian': 'NN',
    'cluster': 'NN',
    'bogundai': 'NN',
    'kongo': 'NN',
    'view': 'NN',
}

In [73]:
# Apply the manual POS overrides to the tagged data; the corrected result is
# what the lemmatization step consumes.
corrected_tagged = pos_correction(tagged, pos_correction_dict)

### 3-6. Lemmatization

#### All pos

In [74]:
# Manual lemma override passed to lemmatize(); presumably forces 'found' to
# be reduced to 'find' -- confirm against lemmatization_spacy.
lemma_dict = {'found': 'find'}

# No allowed_pos filter is given here, so this run covers all parts of speech.
lemmatizer = lemmatization_spacy(
    data=corrected_tagged,
    pos_table=pos_table,
)
lemmatized_all = lemmatizer.lemmatize(lemma_dict=lemma_dict)

#### Nouns

In [75]:
# Manual lemma override passed to lemmatize(); presumably forces 'found' to
# be reduced to 'find' -- confirm against lemmatization_spacy.
lemma_dict = {'found': 'find'}

# Same pipeline, restricted to nouns via allowed_pos.
lemmatizer = lemmatization_spacy(
    data=corrected_tagged,
    pos_table=pos_table,
    allowed_pos=['noun'],
)
lemmatized_noun = lemmatizer.lemmatize(lemma_dict=lemma_dict)

#### Verbs

In [76]:
# Manual lemma override passed to lemmatize(); presumably forces 'found' to
# be reduced to 'find' -- confirm against lemmatization_spacy.
lemma_dict = {'found': 'find'}

# Same pipeline, restricted to verbs via allowed_pos.
lemmatizer = lemmatization_spacy(
    data=corrected_tagged,
    pos_table=pos_table,
    allowed_pos=['verb'],
)
lemmatized_verb = lemmatizer.lemmatize(lemma_dict=lemma_dict)

#### Adjectives

In [77]:
# Manual lemma override passed to lemmatize(); presumably forces 'found' to
# be reduced to 'find' -- confirm against lemmatization_spacy.
lemma_dict = {'found': 'find'}

# Same pipeline, restricted to adjectives via allowed_pos.
lemmatizer = lemmatization_spacy(
    data=corrected_tagged,
    pos_table=pos_table,
    allowed_pos=['adjective'],
)
lemmatized_adjective = lemmatizer.lemmatize(lemma_dict=lemma_dict)

#### Adverbs

In [78]:
# Manual lemma override passed to lemmatize(); presumably forces 'found' to
# be reduced to 'find' -- confirm against lemmatization_spacy.
lemma_dict = {'found': 'find'}

# Same pipeline, restricted to adverbs via allowed_pos.
lemmatizer = lemmatization_spacy(
    data=corrected_tagged,
    pos_table=pos_table,
    allowed_pos=['adverb'],
)
lemmatized_adverb = lemmatizer.lemmatize(lemma_dict=lemma_dict)

## 4. Save PreProcessed Data

In [79]:
# Directory passed as `root` to every to_pickle call for this corpus.
SAVE_ROOT = './processed-data/chosun/'

### Lemmatized data to pickle file

#### all pos

In [80]:
# Persist the all-POS lemmatization result under SAVE_ROOT.
to_pickle(data=lemmatized_all, file_name="lemmatized-all", root=SAVE_ROOT)

#### noun

In [81]:
# Persist the noun-only lemmatization result under SAVE_ROOT.
to_pickle(data=lemmatized_noun, file_name="lemmatized-noun", root=SAVE_ROOT)

#### verb

In [82]:
# Persist the verb-only lemmatization result under SAVE_ROOT.
to_pickle(data=lemmatized_verb, file_name="lemmatized-verb", root=SAVE_ROOT)

#### adjective

In [83]:
# Persist the adjective-only lemmatization result under SAVE_ROOT.
to_pickle(data=lemmatized_adjective, file_name="lemmatized-adjective", root=SAVE_ROOT)

#### adverb

In [84]:
# Persist the adverb-only lemmatization result under SAVE_ROOT.
to_pickle(data=lemmatized_adverb, file_name="lemmatized-adverb", root=SAVE_ROOT)

### Tagged data to pickle file

In [85]:
# Persist the POS-tagged data under SAVE_ROOT.
# NOTE(review): this saves `tagged` (before the manual POS overrides), while
# lemmatization used `corrected_tagged` -- confirm that saving the
# uncorrected tags is intended.
to_pickle(data=tagged, file_name="tagged", root=SAVE_ROOT)