# English Data PreProcessing

## 1. Module Import

In [1]:
# self defined Modules
from myModules.utils import merge
from myModules.preprocess.english import cleaning, remove_stopwords, tagging, dot_and_apostrophe, convert_pos, lemmatization, to_pickle, check_pos

# General Modules
import pandas as pd
import warnings
from tqdm.notebook import tqdm

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# imported here — consider a narrower filter.
warnings.filterwarnings('ignore')

# NLP
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

## 2. Data Loader

In [2]:
# Path of the UTF-16 encoded English source text, and a result directory
# (RESULT_ROOT is not referenced by the visible cells).
DATA_ROOT = './Data/조선사진첩(1925) 영문 텍스트.txt'
RESULT_ROOT = './Result/chosun/'

# Read the whole file at once and wrap it in a one-element list so the
# following steps receive a list of documents.
with open(DATA_ROOT, 'r', encoding='UTF-16') as f:
    text = [f.read()]

## 3. PreProcess

### 3-1. Data Cleaning

- `dot(.)`과 `apostrophe(')`는 제거하지 않음

In [3]:
# Run the project's cleaning step on the one-element document list; per the
# section note, dots (.) and apostrophes (') are kept at this stage.
cleaned = cleaning(data=text)

### 3-2. Tokenizing

In [4]:
# Tokenize each cleaned document with NLTK's word tokenizer, producing one
# token list per document.
tokenized = [word_tokenize(document) for document in cleaned]

In [5]:
# Helper object used by the following cells to inspect and process tokens
# that contain apostrophes or dots.
symbol = dot_and_apostrophe(tokenized)

##### apostrophe와 dot을 가진 token들 시각화

In [6]:
# Print the distinct tokens containing an apostrophe, then those containing a
# dot, to decide which ones should be registered as exceptions.
symbol.token_with_apostrophe()
symbol.token_with_dot()

apostrophe를 가진 token : 
{"'", "'s", "girls'higher"}
dot을 가진 token : 
{'.'}


##### exception 목록 설정

In [7]:
# Tokens to register as exceptions for the apostrophe and dot handling:
# the possessive "'s" for apostrophes, nothing for dots.
apostrophe_exception = ["'s"]
dot_exception = []

symbol.set_exception(
    apostrophe_exception=apostrophe_exception,
    dot_exception=dot_exception,
)

In [8]:
# Print the currently registered apostrophe/dot exception lists as a sanity check.
symbol.print_exception()

apostrophe exceptions : 
["'s"]
dot exceptions : 
[]


##### apostrophe 처리

In [9]:
# Process apostrophe-bearing tokens; in the recorded run the registered
# exception "'s" is not among the processed tokens.
tokenized_ = symbol.remove_apostrophe(data=tokenized)

Processed Tokens : 
{"'", "girls'higher"}


##### dot 처리

In [10]:
# Apply the dot handling to the apostrophe-processed tokens.
tokenized__ = symbol.remove_dot(data=tokenized_)

Processed Tokens : 
{'', '.'}


##### 제거해야할 token 검사

In [11]:
# Report tokens still considered invalid (the recorded run lists '' and 'a').
symbol.check_invalid_tokens(data=tokenized__)

Remaining invalid Symbol : {'', 'a'}


##### 길이가 1이거나 필요없는 특수문자인 Token들 삭제

In [12]:
# Drop the invalid tokens (length-1 tokens and unwanted symbols, per the
# section heading) and keep the result for stop-word removal.
tokenized___ = symbol.remove_invalid_tokens(data=tokenized__)

Removed Tokens : 
{'', 'a'}


##### 남아있는 invalid한 token이 있는지 검사

In [13]:
# Re-run the check on the filtered tokens to confirm nothing invalid remains.
symbol.check_invalid_tokens(data=tokenized___)

There is no invalid symbol


### 3-3. Remove StopWords

In [14]:
# Base list: NLTK's English stop words (needs the NLTK "stopwords" corpus to
# be available locally).
stopwords = nltk.corpus.stopwords.words('english')

# Additional stop words for this corpus: modal verbs, filler words, and
# contraction fragments such as "'ll" / "n't" (presumably left over from
# tokenization — confirm). Each entry is listed once.
new_stopwords = [
    'would', 'could', 'might', 'need', 'can', 'must',
    'one', 'two', 'upon', 'may', 'perhaps', 'living', 'seem', 'also', 'ii', 'ofthe',
    'much', 'therefore', "'ll", "'ve", "n't",
]

# Filter both stop-word lists out of the validated tokens.
wo_stopword = remove_stopwords(tokenized___, stopwords, new_stopwords)

### 3-4. Tagging

In [15]:
# Load the table stored in pos-table.pkl; it is handed to lemmatization()
# in the cells below (presumably a POS-tag lookup — confirm its schema).
# NOTE(review): unpickling can execute arbitrary code — only load this file
# from a trusted source.
pos_table = pd.read_pickle("processed-data/pos-table.pkl")

In [17]:
# POS-tag the stop-word-filtered tokens (the recorded run shows a progress bar).
tagged = tagging(wo_stopword)

  0%|          | 0/1 [00:00<?, ?it/s]

In [18]:
# Helper used by the next cells to inspect the tags assigned to tokens.
pos = check_pos(tagged)

In [19]:
# Print the tags given to tokens that still contain an apostrophe or a dot.
pos.pos_with_symbol()

tagged token with apostrophe : 
{"'s": {'POS'}}
tagged token with dot : 
{}


In [20]:
# Print the tags for the symbol-free counterparts (the recorded run shows 's'
# tagged NN); presumably the same tokens with the symbol stripped — confirm.
pos.pos_without_symbol()

tagged token without apostrophe : 
{'s': ['NN']}
tagged token without dot : 
{}


### 3-5. Lemmatization

In [21]:
# One WordNet lemmatizer instance, reused by every lemmatization run below.
lemmatizer = WordNetLemmatizer()

#### All pos

In [22]:
# Lemmatization run with allowed_pos covering nouns, verbs, adjectives and adverbs.
lemmatize = lemmatization(
    tagged,
    lemmatizer,
    pos_table,
    allowed_pos=['noun', 'verb', 'adjective', 'adverb'],
)
lemmatized_all = lemmatize.lemmatize()

#### Nouns

In [23]:
# Lemmatization run with allowed_pos limited to nouns.
lemmatize = lemmatization(
    tagged,
    lemmatizer,
    pos_table,
    allowed_pos=['noun'],
)
lemmatized_noun = lemmatize.lemmatize()

#### Verbs

In [24]:
# Lemmatization run with allowed_pos limited to verbs.
lemmatize = lemmatization(
    tagged,
    lemmatizer,
    pos_table,
    allowed_pos=['verb'],
)
lemmatized_verb = lemmatize.lemmatize()

#### Adjectives

In [25]:
# Lemmatization run with allowed_pos limited to adjectives.
lemmatize = lemmatization(
    tagged,
    lemmatizer,
    pos_table,
    allowed_pos=['adjective'],
)
lemmatized_adjective = lemmatize.lemmatize()

#### Adverbs

In [26]:
# Lemmatization run with allowed_pos limited to adverbs.
lemmatize = lemmatization(
    tagged,
    lemmatizer,
    pos_table,
    allowed_pos=['adverb'],
)
lemmatized_adverb = lemmatize.lemmatize()

## 4. Save PreProcessed Data

In [27]:
# Directory passed as `root` to every to_pickle() call below.
SAVE_ROOT = './processed-data/chosun/'

### Lemmatized data to pickle file

#### all pos

In [28]:
# Persist the all-POS lemmatized result, passing SAVE_ROOT as the root.
to_pickle(data=lemmatized_all, file_name="lemmatized-all", root=SAVE_ROOT)

#### noun

In [29]:
# Persist the noun-only lemmatized result, passing SAVE_ROOT as the root.
to_pickle(data=lemmatized_noun, file_name="lemmatized-noun", root=SAVE_ROOT)

#### verb

In [30]:
# Persist the verb-only lemmatized result, passing SAVE_ROOT as the root.
to_pickle(data=lemmatized_verb, file_name="lemmatized-verb", root=SAVE_ROOT)

#### adjective

In [31]:
# Persist the adjective-only lemmatized result, passing SAVE_ROOT as the root.
to_pickle(data=lemmatized_adjective, file_name="lemmatized-adjective", root=SAVE_ROOT)

#### adverb

In [32]:
# Persist the adverb-only lemmatized result, passing SAVE_ROOT as the root.
to_pickle(data=lemmatized_adverb, file_name="lemmatized-adverb", root=SAVE_ROOT)

### Tagged data to pickle file

In [33]:
# Persist the POS-tagged tokens (before lemmatization), passing SAVE_ROOT as the root.
to_pickle(data=tagged, file_name="tagged", root=SAVE_ROOT)