# English Data PreProcessing

## 1. Module Import

In [1]:
# self defined Modules
from myModules.utils.data.DataLoader import DataLoader
from myModules.utils.merge.mergeOverPeriod import merge
from myModules.preprocess import cleaning, removeStopWords_ST, tagging

# General Modules
import pandas as pd
import numpy as np
import warnings
from tqdm.notebook import tqdm
import pickle
import re
import glob

warnings.filterwarnings('ignore')

# Read File
import glob

# NLP
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## 2. Data Loader

In [21]:
# Output directory and input text file used by this notebook.
RESULT_ROOT = './Result/chosun/'
DATA_ROOT = './Data/조선사진첩(1925) 영문 텍스트.txt'

# The source file is decoded as UTF-16. Its whole content is kept as a
# one-element list because `cleaning` is later called with `texts=text`.
with open(DATA_ROOT, mode='r', encoding='UTF-16') as source_file:
    raw_content = source_file.read()

text = [raw_content]

## 3. PreProcess

### 3-1. Data Cleaning

- `dot(.)`과 `apostrophe(')`는 제거하지 않음

In [22]:
cleaned = cleaning(texts=text, mode='ST')

### 3-2. Tokenizing

In [24]:
class dot_and_apostrophe:
    """Inspect and strip dots / apostrophes in tokenized documents.

    ``data`` is a list of documents, each document being a list of string
    tokens. The notebook uses the methods in this order:
    ``token_with_apostrophe`` / ``token_with_dot`` -> ``set_exception`` ->
    ``remove_apostrophe`` -> ``remove_dot`` -> ``check_invalid_tokens`` ->
    ``remove_invalid_tokens``.

    Every attribute is initialised in ``__init__`` so that calling a method
    out of that order no longer raises ``AttributeError``.
    """

    # Raw strings: the former "[^a-z\.]" literal contained an invalid escape
    # sequence. Inside a character class the dot needs no escaping, so these
    # patterns match exactly the same characters as before.
    _NOT_LETTER_OR_DOT = re.compile(r"[^a-z.]")
    _NOT_LETTER_OR_APOSTROPHE = re.compile(r"[^a-z']")

    def __init__(self, data):
        """Store the tokenized documents and set empty defaults.

        Args:
            data: list of token lists to inspect.
        """
        self.data = data
        self.apostrophes = set()
        self.dots = set()
        self.apostrophe_exception = []
        self.dot_exception = []
        self.invalid_symbol = set()

    def _tokens_containing(self, symbol):
        """Return the set of tokens in ``self.data`` that contain ``symbol``."""
        return {token for tokens in self.data for token in tokens if symbol in token}

    def _strip_tokens(self, data, exceptions, kept_symbol, pattern):
        """Apply ``pattern`` removal to every token that is not an exception.

        Tokens listed in ``exceptions`` are copied unchanged. All other tokens
        have every character matched by ``pattern`` removed. Tokens that are
        not purely alphanumeric and do not contain ``kept_symbol`` are
        reported in the printed "Processed Tokens" set.
        """
        result = []
        processed = set()

        for tokens in data:
            arr = []
            for token in tokens:
                if token in exceptions:
                    arr.append(token)
                    continue
                if not token.isalnum() and kept_symbol not in token:
                    processed.add(token)
                arr.append(pattern.sub("", token))
            result.append(arr)

        print(f"Processed Tokens : \n{processed}")

        return result

    def token_with_apostrophe(self):
        """Collect and print the distinct tokens that contain an apostrophe."""
        self.apostrophes = self._tokens_containing("'")

        print(f"apostrophe를 가진 token : \n{self.apostrophes}")

    def token_with_dot(self):
        """Collect and print the distinct tokens that contain a dot."""
        self.dots = self._tokens_containing(".")

        print(f"dot을 가진 token : \n{self.dots}")

    def set_exception(self, apostrophe_exception, dot_exception):
        """Register tokens that must be kept untouched by the remove_* steps.

        Args:
            apostrophe_exception: tokens skipped by ``remove_apostrophe``.
            dot_exception: tokens skipped by ``remove_dot``.
        """
        self.apostrophe_exception = apostrophe_exception
        self.dot_exception = dot_exception

    def print_exception(self):
        """Print the currently registered exception tokens."""
        print(f"apostrophe exceptions : \n{self.apostrophe_exception}")
        print(f"dot exceptions : \n{self.dot_exception}")

    def remove_apostrophe(self, data):
        """Strip everything except ``a-z`` and ``.`` from non-exception tokens.

        Dots are deliberately kept here; they are removed by ``remove_dot``
        so that dot exceptions can be honoured there.

        Args:
            data: list of token lists.

        Returns:
            A new list of token lists; the input is not modified.
        """
        return self._strip_tokens(
            data, self.apostrophe_exception, ".", self._NOT_LETTER_OR_DOT
        )

    def remove_dot(self, data):
        """Strip everything except ``a-z`` and ``'`` from non-exception tokens.

        Apostrophes are deliberately kept here; they are handled by
        ``remove_apostrophe`` together with its own exception list.

        Args:
            data: list of token lists.

        Returns:
            A new list of token lists; the input is not modified.
        """
        return self._strip_tokens(
            data, self.dot_exception, "'", self._NOT_LETTER_OR_APOSTROPHE
        )

    def check_invalid_tokens(self, data):
        """Find tokens that still look invalid and store them.

        A token is flagged when it is not purely alphanumeric (this includes
        the empty string) or when it is a single character. Registered
        exception tokens are excluded. The result is stored in
        ``self.invalid_symbol`` and reported on stdout.

        Args:
            data: list of token lists.
        """
        flagged = {
            token
            for tokens in data
            for token in tokens
            if not token.isalnum() or len(token) == 1
        }
        exception = set(self.apostrophe_exception).union(set(self.dot_exception))
        self.invalid_symbol = flagged.difference(exception)

        if len(self.invalid_symbol) == 0:
            print("There is no invalid symbol")
        else :
            print(f"Remaining invalid Symbol : {self.invalid_symbol}")

    def remove_invalid_tokens(self, data):
        """Drop single-character tokens and tokens in ``self.invalid_symbol``.

        If ``check_invalid_tokens`` has not been run yet,
        ``self.invalid_symbol`` is empty and only single-character tokens are
        dropped.

        Args:
            data: list of token lists.

        Returns:
            A new list of token lists without the removed tokens.
        """
        result = []
        removed = set()

        for tokens in data:
            arr = []
            for token in tokens:
                if len(token) == 1 or token in self.invalid_symbol:
                    removed.add(token)
                else:
                    arr.append(token)
            result.append(arr)

        print(f"Removed Tokens : \n{removed}")

        return result

In [27]:
# Tokenize each cleaned document separately; the loop variable has its own
# name so it is not confused with the global `text` list defined earlier.
tokenized = [word_tokenize(document) for document in cleaned]

In [29]:
symbol = dot_and_apostrophe(tokenized)

##### apostrophe와 dot을 가진 token들 시각화

In [30]:
symbol.token_with_apostrophe()
symbol.token_with_dot()

apostrophe를 가진 token : 
{"girls'higher", "'s", "'"}
dot을 가진 token : 
{'.'}


##### exception 목록 설정

In [31]:
apostrophe_exception = ["'s"]
dot_exception = []

symbol.set_exception(apostrophe_exception=apostrophe_exception, dot_exception=dot_exception)

In [32]:
symbol.print_exception()

apostrophe exceptions : 
["'s"]
dot exceptions : 
[]


##### apostrophe 처리

In [33]:
tokenized_ = symbol.remove_apostrophe(data=tokenized)

Processed Tokens : 
{"girls'higher", "'"}


##### dot 처리

In [35]:
tokenized__ = symbol.remove_dot(data=tokenized_)

Processed Tokens : 
{'', '.'}


##### 제거해야할 token 검사

In [36]:
symbol.check_invalid_tokens(data=tokenized__)

Remaining invalid Symbol : {'', 'a'}


##### 길이가 1이거나 필요없는 특수문자인 Token들 삭제

In [39]:
tokenized___ = symbol.remove_invalid_tokens(data=tokenized__)

Removed Tokens : 
{'', 'a'}


##### 남아있는 invalid한 token이 있는지 검사

In [40]:
symbol.check_invalid_tokens(data=tokenized___)

There is no invalid symbol


### 3-3. Remove StopWords

In [42]:
# NLTK's English stop-word list plus a notebook-specific list; both are
# handed to removeStopWords_ST together with the cleaned tokens.
stopwords = nltk.corpus.stopwords.words('english')
new_stopwords = [
    'would', 'could', 'might', 'need', 'can', 'must',
    'one', 'two', 'upon', 'may', 'perhaps', 'living',
    'seem', 'also', 'ii', 'ofthe',
    'also', 'much', 'therefore',
    "'ll", "'ve", "n't",
]

wo_stopword = removeStopWords_ST(tokenized___, stopwords, new_stopwords)

### 3-4. Tagging

In [44]:
pos_table = pd.read_pickle("processed-data/pos-table.pkl")

In [47]:
tagged = tagging(wo_stopword, mode='ST')

  0%|          | 0/1 [00:00<?, ?it/s]

In [48]:
class check_pos:
    """Compare POS tags of tokens containing '.' or "'" with and without the symbol.

    ``data`` is a list of documents, each a sequence of ``(token, tag)``
    pairs. On construction four dictionaries are built:

    * ``dots`` / ``apostrophes``: token -> set of tags found for that token
      in ``merge(self.data)`` (presumably the flattened pairs; confirm
      against the ``merge`` helper).
    * ``dots_wo`` / ``apostrophes_wo``: the token with every character
      outside ``a-z`` removed -> one-element list holding the tag that
      ``nltk.pos_tag`` assigns to that stripped token on its own.

    A token containing both symbols is only counted as a dot token.
    """

    def __init__(self, data):
        self.data = data

        with_dot = []
        with_apostrophe = []
        for sentence in self.data:
            for pair in sentence:
                word = pair[0]
                if "." in word:
                    with_dot.append(word)
                elif "'" in word:
                    with_apostrophe.append(word)

        unique_dot = set(with_dot)
        unique_apostrophe = set(with_apostrophe)

        # Tags observed in the already-tagged data, symbol still attached.
        self.dots = {
            word: set([pair[1] for pair in merge(self.data) if pair[0] == word])
            for word in unique_dot
        }
        self.apostrophes = {
            word: set([pair[1] for pair in merge(self.data) if pair[0] == word])
            for word in unique_apostrophe
        }

        # Tags produced when the symbol-free token is tagged in isolation.
        self.dots_wo = {}
        for word in unique_dot:
            retagged = nltk.pos_tag([re.sub("[^a-z]", "", word)])
            first = retagged[0]
            self.dots_wo[first[0]] = [first[1]]

        self.apostrophes_wo = {}
        for word in unique_apostrophe:
            retagged = nltk.pos_tag([re.sub("[^a-z]", "", word)])
            first = retagged[0]
            self.apostrophes_wo[first[0]] = [first[1]]

    def pos_with_symbol(self):
        """Print the tags found for tokens that still carry their symbol."""
        print(f"tagged token with apostrophe : \n{self.apostrophes}")
        print(f"tagged token with dot : \n{self.dots}") 

    def pos_without_symbol(self):
        """Print the tags obtained after stripping the symbol from each token."""
        print(f"tagged token without apostrophe : \n{self.apostrophes_wo}")
        print(f"tagged token without dot : \n{self.dots_wo}") 

In [49]:
pos = check_pos(tagged)

In [50]:
pos.pos_with_symbol()

tagged token with apostrophe : 
{"'s": {'POS'}}
tagged token with dot : 
{}


In [51]:
pos.pos_without_symbol()

tagged token without apostrophe : 
{'s': ['NN']}
tagged token without dot : 
{}


### 3-5. Lemmatization

In [52]:
class lemmatization:
    """Lemmatize POS-tagged tokens, keeping only selected word classes.

    Args:
        data: list of documents, each a sequence of ``(token, tag)`` pairs.
        lemmatizer: object exposing ``lemmatize(token, pos)``.
        pos_table: object whose ``Eng_tag`` can be indexed with 0, 2, 3 and 4
            to obtain the tag collections for noun, verb, adjective and
            adverb respectively (layout of the table is defined outside this
            notebook section; confirm against the pickle).
        allowed_pos: iterable of category names among ``'noun'``, ``'verb'``,
            ``'adjective'``, ``'adverb'``. Unknown names are ignored.
    """

    # Index into pos_table.Eng_tag for each supported category name.
    _TAG_ROW = {'noun': 0, 'verb': 2, 'adjective': 3, 'adverb': 4}

    def __init__(self, data, lemmatizer, pos_table, allowed_pos=('noun', 'verb', 'adjective', 'adverb')):
        self.data = data
        self.lemmatizer = lemmatizer
        self.allowed_pos = []
        for pos in allowed_pos:
            if pos in self._TAG_ROW:
                self.allowed_pos.extend(pos_table.Eng_tag[self._TAG_ROW[pos]])

    def lemmatize(self):
        """Return the lemmas of all tokens whose tag is in ``allowed_pos``.

        The lemmatizer's ``pos`` argument is the lower-cased first letter of
        the tag, with ``'j'`` mapped to ``'a'`` and ``'w'`` mapped to ``'r'``.
        A token whose lemmatization raises is reported on stdout and left out
        of the result (best effort). Only ``Exception`` subclasses are
        handled, so ``KeyboardInterrupt`` / ``SystemExit`` still propagate.

        Returns:
            One list of lemmas per input document.
        """
        result = []

        for tags in self.data:
            arr = []
            for token, pos in tags:
                if pos not in self.allowed_pos:
                    continue
                pos_info = pos[0].lower()
                if pos_info == 'j':
                    pos_info = 'a'
                elif pos_info == 'w':
                    pos_info = 'r'
                try:
                    arr.append(self.lemmatizer.lemmatize(token, pos_info))
                except Exception:
                    # Keep going, but show which token could not be handled.
                    print(token, pos, pos_info)
            result.append(arr)

        return result

In [53]:
lemmatizer = WordNetLemmatizer()

#### All pos

In [55]:
lemmatize = lemmatization(tagged, lemmatizer, pos_table, allowed_pos=['noun', 'verb', 'adjective', 'adverb'])
lemmatized_all = lemmatize.lemmatize()

#### Nouns

In [56]:
lemmatize = lemmatization(tagged, lemmatizer, pos_table, allowed_pos=['noun'])
lemmatized_noun = lemmatize.lemmatize()

#### Verbs

In [57]:
lemmatize = lemmatization(tagged, lemmatizer, pos_table, allowed_pos=['verb'])
lemmatized_verb = lemmatize.lemmatize()

#### Adjectives

In [58]:
lemmatize = lemmatization(tagged, lemmatizer, pos_table, allowed_pos=['adjective'])
lemmatized_adjective = lemmatize.lemmatize()

#### Adverbs

In [59]:
lemmatize = lemmatization(tagged, lemmatizer, pos_table, allowed_pos=['adverb'])
lemmatized_adverb = lemmatize.lemmatize()

## 4. Save PreProcessed Data

In [62]:
SAVE_ROOT = './processed-data/chosun/'

def to_pickle(data, file_name, root='./'):
    """Pickle ``data`` to ``<root>/<file_name>.pkl``.

    Args:
        data: any picklable object.
        file_name: file name without the ``.pkl`` extension.
        root: target directory, with or without a trailing separator. It is
            created (including parents) when it does not exist yet.
    """
    import os  # local import: `os` is not among the notebook's top-level imports

    if root:
        os.makedirs(root, exist_ok=True)
    # os.path.join inserts a separator only when `root` does not end with one.
    with open(os.path.join(root, file_name + '.pkl'), 'wb') as f:
        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)

### Lemmatized data to pickle file

#### all pos

In [63]:
to_pickle(data=lemmatized_all, file_name="lemmatized-all", root=SAVE_ROOT)

#### noun

In [64]:
to_pickle(data=lemmatized_noun, file_name="lemmatized-noun", root=SAVE_ROOT)

#### verb

In [65]:
to_pickle(data=lemmatized_verb, file_name="lemmatized-verb", root=SAVE_ROOT)

#### adjective

In [66]:
to_pickle(data=lemmatized_adjective, file_name="lemmatized-adjective", root=SAVE_ROOT)

#### adverb

In [67]:
to_pickle(data=lemmatized_adverb, file_name="lemmatized-adverb", root=SAVE_ROOT)

### Tagged data to pickle file

In [68]:
to_pickle(data=tagged, file_name="tagged", root=SAVE_ROOT)