# English Data PreProcessing

## [주요 고려 사항]
1. `dot(.)`과 `apostrophe(')` 처리
    - 'u.s.'와 'u.s.s.r.'과 같은 약자처리를 어떻게 할 것인가?
    - 'america's'와 같은 소유격을 어떻게 처리할 것인가?
        1. 처음 Cleaning 때, `dot(.)`과 `apostrophe(')`는 제거하지 않음
            - `dot(.)`
                - 'u.s', 'u.s.s.r'과 같은 약자를 유지시키기 위한 처리
            - `apostrophe(')`
                - 'america's'와 같은 소유격을 유지시켜서 Tokenizing때 's를 분리시키기 위함.
        2. Tokenizing 이후, `dot(.)`과 `apostrophe(')`를 유지시켜야 하는 Token들 외에는 특수문자 제거
            1. `apostrophe(')`와 `dot(.)`을 가진 Token들을 출력해보고 유지시킬 Token들의 목록을 결정
            2. `apostrophe(')`를 유지시킬 Token들 외의 모든 Token들에서 `apostrophe(')` 및 특수문자 제거
                - `dot(.)`은 다음 단계에서 예외처리를 하며 제거해야 하므로, 이 단계에서는 모든 `dot(.)`을 유지시킴
            3. `dot(.)`을 유지시킬 Token들 외의 모든 Token들에서 `dot(.)` 및 특수문자 제거

## Module Import

In [1]:
# self defined Modules
from myModules.utils.data.DataLoader import DataLoader
from myModules.utils.merge.mergeOverPeriod import merge
from myModules.preprocess import cleaning, removeStopWords_ST, tagging, extract_some_pos_ST

# General Modules
import pandas as pd
import numpy as np
import warnings
from tqdm.notebook import tqdm
import pickle
import re
import glob

warnings.filterwarnings('ignore')

# Read File
import glob

# NLP
import nltk
from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Data Loader

In [2]:
DATA_ROOT = './Data/3구간/'

PERIOD_1 = DATA_ROOT + '1시기/1시기_ST/'
PERIOD_2 = DATA_ROOT + '2시기/2시기_ST/'
PERIOD_3 = DATA_ROOT + '3시기/3시기_ST/'

RESULT_ROOT = './Result/3구간/'

RESULT_1 = RESULT_ROOT + '/1시기/ST/'
RESULT_2 = RESULT_ROOT + '/2시기/ST/'
RESULT_3 = RESULT_ROOT + '/3시기/ST/'

In [3]:
files_1 = glob.glob(PERIOD_1+'*.txt')
files_2 = glob.glob(PERIOD_2+'*.txt')
files_3 = glob.glob(PERIOD_3+'*.txt')

texts_1 = DataLoader(files_1, mode='ST')
texts_2 = DataLoader(files_2, mode='ST')
texts_3 = DataLoader(files_3, mode='ST')

## PreProcess

### 3-1. Data Cleaning

- `dot(.)`과 `apostrophe(')`는 제거하지 않음

In [4]:
cleaned_1 = cleaning(texts=texts_1, mode='ST')
cleaned_2 = cleaning(texts=texts_2, mode='ST')
cleaned_3 = cleaning(texts=texts_3, mode='ST')

### 3-2. Tokenizing

In [5]:
class dot_and_apostrophe:
    def __init__(self, data):
        self.data = data
    
    def token_with_apostrophe(self):
        apostrophe = []

        for tokens in self.data:
            for token in tokens:
                if "'" in token : apostrophe.append(token)
        
        self.apostrophes = set(apostrophe)

        print(f"apostrophe를 가진 token : \n{self.apostrophes}")
    
    def token_with_dot(self):
        dot = []

        for tokens in self.data:
            for token in tokens:
                if "." in token : dot.append(token)
        
        self.dots = set(dot)

        print(f"dot을 가진 token : \n{self.dots}")
        
    def set_exception(self, apostrophe_exception, dot_exception):
        self.apostrophe_exception = apostrophe_exception
        self.dot_exception = dot_exception
    
    def print_exception(self):
        print(f"apostrophe exceptions : \n{self.apostrophe_exception}")
        print(f"dot exceptions : \n{self.dot_exception}")
    
    def remove_apostrophe(self, data):
        result = []
        processed = []

        for tokens in data:
            arr = []
            for token in tokens:
                if token not in self.apostrophe_exception:
                    if not token.isalnum() : 
                        if "." not in token : processed.append(token)
                    # dot은 삭제하지 않음. -> 예외처리하면서 삭제해야함
                    arr.append(re.sub("[^a-z\.]", "", token))
                else : arr.append(token)
            result.append(arr)
        
        processed = set(processed)

        print(f"Processed Tokens : \n{processed}")
        
        return result
    
    def remove_dot(self, data):
        result = []
        processed = []

        for tokens in data:
            arr = []
            for token in tokens:
                if token not in self.dot_exception:
                    if not token.isalnum() : 
                        if "'" not in token : processed.append(token)
                    # apostrophe는 삭제하지 않음. -> 예외처리하면서 삭제
                    arr.append(re.sub("[^a-z']", "", token))
                else : arr.append(token)
            result.append(arr)
        
        processed = set(processed)

        print(f"Processed Tokens : \n{processed}")
        
        return result
    
    def check_invalid_tokens(self, data):
        # 예외처리한 Token들 외에 특수문자를 가진 Token들이 있는지 확인
        invalid_tokens = []

        for tokens in data:
            for token in tokens:
                if not token.isalnum() : invalid_tokens.append(token)
                elif len(token) == 1 : invalid_tokens.append(token)
        
        invalid_tokens = set(invalid_tokens)
        exception = set(self.apostrophe_exception).union(set(self.dot_exception))
        self.invalid_symbol = invalid_tokens.difference(exception)

        if len(self.invalid_symbol) == 0:
            print("There is no invalid symbol")
        else :
            print(f"Remaining invalid Symbol : {self.invalid_symbol}")
    
    def remove_invalid_tokens(self, data):
        # 남아있는 특수문자 + 길이가 1인 token들을 삭제
        
        result = []
        removed = []

        for tokens in data:
            arr = []
            for token in tokens:
                if len(token) == 1 : removed.append(token)
                elif token in self.invalid_symbol : removed.append(token)
                else : arr.append(token)
            result.append(arr)

        removed = set(removed)
        
        print(f"Removed Tokens : \n{removed}")

        return result

In [6]:
tokenized_1 = [word_tokenize(text) for text in cleaned_1]
tokenized_2 = [word_tokenize(text) for text in cleaned_2]
tokenized_3 = [word_tokenize(text) for text in cleaned_3]

#### Period 1

In [7]:
symbol = dot_and_apostrophe(tokenized_1)

##### apostrophe와 dot을 가진 token들 시각화

In [8]:
symbol.token_with_apostrophe()
symbol.token_with_dot()

apostrophe를 가진 token : 
{"'structure", "'liberty", "n't", "'german", "'d", "'ve", "'ll", "'heat", "'s", "'blamed", "'democracy", "'mvd", "o'clock", "'m", "'", "'into", "'madam", "'system", "'are"}
dot을 가진 token : 
{'p.m.', 'col.', 'camps.if', 'u.s.s.r.', 'e.', 'w.', '...', 'gen.', 'u.s.', '..', 'u.', 'ph.d.', 'frightened.to', 'jr.', 'f.', 's.', '.', 'u.n.', 'i.', 'n.', 't.', 'a.', 'mrs.', 'a.m.', 'messrs.', 'v.', 'm.', 'oct.', 'dr.', 'mr.', 'co.', 'p.', 'st.'}


##### exception 목록 설정

In [9]:
apostrophe_exception = ["'ll", "'s", "'ve"]
dot_exception = ["u.s.s.r.", "dr.", "messrs.", "gen.", "u.n.", "a.m.", "st.", "u.s.", "ph.d", "jr.", "p.m.", "mrs.", "mr."]

symbol.set_exception(apostrophe_exception=apostrophe_exception, dot_exception=dot_exception)

In [10]:
symbol.print_exception()

apostrophe exceptions : 
["'ll", "'s", "'ve"]
dot exceptions : 
['u.s.s.r.', 'dr.', 'messrs.', 'gen.', 'u.n.', 'a.m.', 'st.', 'u.s.', 'ph.d', 'jr.', 'p.m.', 'mrs.', 'mr.']


##### apostrophe 처리

In [11]:
tokenized_1_ = symbol.remove_apostrophe(data=tokenized_1)

Processed Tokens : 
{"o'clock", "'german", "'into", "'d", "'mvd", "'madam", "'heat", "'system", "'blamed", "'m", "'democracy", "'structure", "'are", "'liberty", "'", "n't"}


##### dot 처리

In [12]:
tokenized_1__ = symbol.remove_dot(data=tokenized_1_)

Processed Tokens : 
{'', 'col.', 'camps.if', 'e.', 'w.', '...', '..', 'u.', 'ph.d.', 'frightened.to', 'f.', 's.', '.', 'i.', 'n.', 't.', 'a.', 'v.', 'm.', 'oct.', 'co.', 'p.'}


##### 제거해야할 token 검사

In [13]:
symbol.check_invalid_tokens(data=tokenized_1__)

Remaining invalid Symbol : {'', 'a', 'b', 'h', 'n', 'g', 'o', 't', 'y', 'v', 'e', 'r', 'p', 'u', 's', 'm', 'w', 'd', 'f', 'x', 'k', 'i', 'j'}


##### 길이가 1이거나 필요없는 특수문자인 Token들 삭제

In [14]:
tokenized_1___ = symbol.remove_invalid_tokens(data=tokenized_1__)

Removed Tokens : 
{'', 'a', 'b', 'h', 'n', 'g', 'o', 't', 'y', 'v', 'e', 'r', 'p', 'u', 's', 'm', 'w', 'd', 'f', 'x', 'k', 'i', 'j'}


##### 남아있는 invalid한 token이 있는지 검사

In [15]:
symbol.check_invalid_tokens(data=tokenized_1___)

There is no invalid symbol


#### Peiod 2

In [16]:
symbol = dot_and_apostrophe(tokenized_2)

##### apostrophe와 dot을 가진 token들 시각화

In [17]:
symbol.token_with_apostrophe()
symbol.token_with_dot()

apostrophe를 가진 token : 
{"'s", "'m", "'reprisals", "'for", "'", "n't"}
dot을 가진 token : 
{'p.m.', 'u.s.s.r.', 'e.', 'w.', 'gen.', '..', 'u.s.a.', 'r.', 'g.', 's.', '.', 'i.', 'n.', 't.', 'a.', 'c.', 'b.', 'messrs.', 'v.', 'm.', 'l.', 'h.', 'tyranny.the', 'mr.', 'dr.', 'p.', 'o.'}


##### exception 목록 설정

In [18]:
apostrophe_exception = ["'s"]
dot_exception = ["u.s.s.r.", "dr.", "messrs.", "gen.", "u.s.a.", "p.m.", "mr."]

symbol.set_exception(apostrophe_exception=apostrophe_exception, dot_exception=dot_exception)

In [19]:
symbol.print_exception()

apostrophe exceptions : 
["'s"]
dot exceptions : 
['u.s.s.r.', 'dr.', 'messrs.', 'gen.', 'u.s.a.', 'p.m.', 'mr.']


##### apostrophe 처리

In [20]:
tokenized_2_ = symbol.remove_apostrophe(data=tokenized_2)

Processed Tokens : 
{"'m", "'reprisals", "'for", "'", "n't"}


##### dot 처리

In [21]:
tokenized_2__ = symbol.remove_dot(data=tokenized_2_)

Processed Tokens : 
{'', 'e.', 'w.', '..', 'r.', 'g.', 's.', '.', 'i.', 'n.', 't.', 'a.', 'c.', 'b.', 'v.', 'm.', 'l.', 'h.', 'tyranny.the', 'p.', 'o.'}


##### 제거해야할 Token들 검사

In [22]:
symbol.check_invalid_tokens(data=tokenized_2__)

Remaining invalid Symbol : {'', 'a', 'b', 'c', 'h', 'n', 'g', 'o', 't', 'v', 'e', 'r', 'p', 's', 'm', 'd', 'w', 'l', 'f', 'i'}


##### 길이가 1이거나 필요없는 특수문자인 token 제거

In [23]:
tokenized_2___ = symbol.remove_invalid_tokens(data=tokenized_2__)

Removed Tokens : 
{'', 'a', 'b', 'c', 'h', 'n', 'g', 'o', 't', 'v', 'e', 'r', 'p', 's', 'm', 'd', 'w', 'l', 'f', 'i'}


##### 남아있는 Invalid한 Token이 있는지 확인

In [24]:
symbol.check_invalid_tokens(data=tokenized_2___)

There is no invalid symbol


#### period 3

In [25]:
symbol = dot_and_apostrophe(tokenized_3)

##### apostrophe와 dot을 가진 token들 시각화

In [26]:
symbol.token_with_apostrophe()
symbol.token_with_dot()

apostrophe를 가진 token : 
{"o'clock", "'d", "'ve", "'ll", "'vas", "'s", "'has", "'spontaneous", "'recession", "'", "n't"}
dot을 가진 token : 
{'p.m.', 'col.', 'u.n.r.r.a', '..................', 'u.s.s.r.', 'e.', 'w.', '...', 'gen.', 'u.', 'prof.', 'r.', 'jr.', 'g.', 'f.', 's.', '.', 'i.', 'n.', 't.', 'c.', 'a.', 'mrs.', 'a.m.', 'b.', 'v.', 'm.', 'maj.', 'l.', 'h.', 'dr.', 'mr.', 'p.', 's.s.r', 'st.', 'j.', 'o.', 'd.'}


##### exception 목록 설정

In [27]:
apostrophe_exception = ["'ll", "'s", "'ve"]
dot_exception = ["u.s.s.r.", "dr.", "s.s.r", "a.m.", "st.", "prof.", "u.n.r.r.a", "jr.", "maj.", "p.m.", "mrs.", "mr."]

symbol.set_exception(apostrophe_exception=apostrophe_exception, dot_exception=dot_exception)

In [28]:
symbol.print_exception()

apostrophe exceptions : 
["'ll", "'s", "'ve"]
dot exceptions : 
['u.s.s.r.', 'dr.', 's.s.r', 'a.m.', 'st.', 'prof.', 'u.n.r.r.a', 'jr.', 'maj.', 'p.m.', 'mrs.', 'mr.']


##### apostrophe 처리

In [29]:
tokenized_3_ = symbol.remove_apostrophe(tokenized_3)

Processed Tokens : 
{"o'clock", "'d", "'vas", '``', "'has", "'spontaneous", "'recession", "'", "n't"}


##### dot 처리

In [30]:
tokenized_3__ = symbol.remove_dot(tokenized_3_)

Processed Tokens : 
{'', 'col.', '..................', 'e.', 'w.', '...', 'gen.', 'u.', 'r.', 'g.', 'f.', 's.', '.', 'i.', 'n.', 't.', 'c.', 'a.', 'b.', 'v.', 'm.', 'l.', 'h.', 'p.', 'j.', 'o.', 'd.'}


##### 제거해야할 token 확인

In [31]:
symbol.check_invalid_tokens(tokenized_3__)

Remaining invalid Symbol : {'', 'a', 'b', 'c', 'h', 'n', 'g', 'o', 't', 'v', 'e', 'r', 'p', 'u', 's', 'm', 'w', 'd', 'l', 'f', 'x', 'i', 'j'}


##### 길이가 1이거나 필요없는 특수문자인 token 제거

In [32]:
tokenized_3___ = symbol.remove_invalid_tokens(tokenized_3__)

Removed Tokens : 
{'', 'a', 'b', 'c', 'h', 'n', 'g', 'o', 't', 'v', 'e', 'r', 'p', 'u', 's', 'm', 'd', 'w', 'l', 'f', 'x', 'i', 'j'}


##### 남아있는 INvalid한 token이 있는지 확인

In [33]:
symbol.check_invalid_tokens(tokenized_3___)

There is no invalid symbol


### 3-3. Remove StopWords

In [34]:
stopwords = nltk.corpus.stopwords.words('english')
new_stopwords = ['would', 'could', 'might', 'need', 'can', 'must', \
    'one', 'two', 'upon', 'may', 'perhaps', 'living', 'seem', 'also', 'ii', 'ofthe',
    'also', 'much', 'therefore']

wo_stopword_1 = removeStopWords_ST(tokenized_1___, stopwords, new_stopwords)
wo_stopword_2 = removeStopWords_ST(tokenized_2___, stopwords, new_stopwords)
wo_stopword_3 = removeStopWords_ST(tokenized_3___, stopwords, new_stopwords)

### 3-4. Tagging

In [35]:
pos_table = pd.read_pickle("processed-data/pos-table.pkl")

In [36]:
tagged_1 = tagging(wo_stopword_1, mode='ST')
tagged_2 = tagging(wo_stopword_2, mode='ST')
tagged_3 = tagging(wo_stopword_3, mode='ST')

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

In [37]:
class check_pos:
    def __init__(self, data):
        self.data = data
        self.dots = {}
        self.apostrophes = {}
        self.dots_wo = {}
        self.apostrophes_wo = {}

        arr_dot = []
        arr_apostrophe = []

        for tags in self.data:
            for tag in tags:
                if "." in tag[0] : arr_dot.append(tag[0])
                elif "'" in tag[0] : arr_apostrophe.append(tag[0])
        
        for dot in set(arr_dot):
            self.dots[dot] = set([tag[1] for tag in merge(self.data) if tag[0] == dot])
        
        for apos in set(arr_apostrophe):
            self.apostrophes[apos] = set([tag[1] for tag in merge(self.data) if tag[0] == apos])
        
        for dot in set(arr_dot):
            removed = nltk.pos_tag([re.sub("[^a-z]", "", dot)])
            self.dots_wo[removed[0][0]] = [removed[0][1]]
        
        for apos in set(arr_apostrophe):
            removed = nltk.pos_tag([re.sub("[^a-z]", "", apos)])
            self.apostrophes_wo[removed[0][0]] = [removed[0][1]]
        
    
    def pos_with_symbol(self):
        print(f"tagged token with apostrophe : \n{self.apostrophes}")
        print(f"tagged token with dot : \n{self.dots}") 

    def pos_without_symbol(self):
        print(f"tagged token without apostrophe : \n{self.apostrophes_wo}")
        print(f"tagged token without dot : \n{self.dots_wo}") 

#### Period 1

In [38]:
pos = check_pos(tagged_1)

In [39]:
pos.pos_with_symbol()

tagged token with apostrophe : 
{"'ve": {'VBP'}, "'s": {'POS'}, "'ll": {'MD'}}
tagged token with dot : 
{'p.m.': {'RB'}, 'u.s.': {'JJ'}, 'u.n.': {'NN'}, 'dr.': {'JJ', 'VBP'}, 'mr.': {'NN', 'NNP', 'RBS', 'RB', 'VBP', 'JJ'}, 'u.s.s.r.': {'JJ', 'VBP'}, 'mrs.': {'NNS'}, 'st.': {'NN', 'JJ'}, 'a.m.': {'JJ'}, 'messrs.': {'NN'}, 'jr.': {'NN', 'VBP'}, 'gen.': {'NN', 'JJ', 'VBP'}}


In [40]:
pos.pos_without_symbol()

tagged token without apostrophe : 
{'ve': ['NN'], 's': ['NN'], 'll': ['NN']}
tagged token without dot : 
{'pm': ['NN'], 'us': ['PRP'], 'un': ['NN'], 'dr': ['NN'], 'mr': ['NN'], 'ussr': ['NN'], 'mrs': ['NN'], 'st': ['NN'], 'am': ['VBP'], 'messrs': ['NN'], 'jr': ['NN'], 'gen': ['NN']}


#### Period 2

In [41]:
pos = check_pos(tagged_2)

In [42]:
pos.pos_with_symbol()

tagged token with apostrophe : 
{"'s": {'POS'}}
tagged token with dot : 
{'p.m.': {'JJ'}, 'dr.': {'NN'}, 'u.s.a.': {'NN'}, 'mr.': {'NN', 'NNP', 'RBS', 'VBZ', 'FW', 'VBP', 'NNS', 'JJ', 'VB'}, 'u.s.s.r.': {'JJ'}, 'messrs.': {'NNS'}, 'gen.': {'JJ'}}


In [43]:
pos.pos_without_symbol()

tagged token without apostrophe : 
{'s': ['NN']}
tagged token without dot : 
{'pm': ['NN'], 'dr': ['NN'], 'usa': ['NN'], 'mr': ['NN'], 'ussr': ['NN'], 'messrs': ['NN'], 'gen': ['NN']}


#### Period 3

In [44]:
pos = check_pos(tagged_3)

In [45]:
pos.pos_with_symbol()

tagged token with apostrophe : 
{"'ve": {'VBP'}, "'s": {'POS'}, "'ll": {'MD'}}
tagged token with dot : 
{'p.m.': {'NN', 'RB', 'VBP'}, 'maj.': {'NN'}, 'u.n.r.r.a': {'RB', 'JJ'}, 'dr.': {'NN', 'JJ', 'VBZ', 'VBP'}, 'mr.': {'NN', 'VBD', 'NNP', 'RBS', 'RBR', 'VBZ', 'FW', 'RB', 'VBP', 'NNS', 'JJ', 'VB'}, 'u.s.s.r.': {'JJ'}, 's.s.r': {'NN'}, 'mrs.': {'NN'}, 'st.': {'JJ'}, 'prof.': {'NN'}, 'a.m.': {'VBD', 'RB', 'NN'}, 'jr.': {'NN'}}


In [46]:
pos.pos_without_symbol()

tagged token without apostrophe : 
{'ve': ['NN'], 's': ['NN'], 'll': ['NN']}
tagged token without dot : 
{'pm': ['NN'], 'maj': ['NN'], 'unrra': ['NN'], 'dr': ['NN'], 'mr': ['NN'], 'ussr': ['NN'], 'ssr': ['NN'], 'mrs': ['NN'], 'st': ['NN'], 'prof': ['NN'], 'am': ['VBP'], 'jr': ['NN']}


### 3-5. adress POS of token with symbols

In [47]:
def convert_pos(data, target_pos="NN"):
    result = []

    for tags in data:
        arr = []
        for tag in tags:
            if "." in tag[0] : arr.append((tag[0], target_pos))
            else : arr.append(tag)
        result.append(arr)
    
    return result

In [48]:
tagged_1_ = convert_pos(data=tagged_1, target_pos="NN")
tagged_2_ = convert_pos(data=tagged_2, target_pos="NN")
tagged_3_ = convert_pos(data=tagged_3, target_pos="NN")

### 3-6. Lemmatization

In [49]:
class lemmatization:
    def __init__(self, data, lemmatizer, pos_table, allowed_pos=['noun', 'verb', 'adjective', 'adverb']):
        self.data = data
        self.lemmatizer = lemmatizer
        self.allowed_pos = []
        for pos in allowed_pos:
            if pos == 'noun' : self.allowed_pos.extend(pos_table.Eng_tag[0])
            elif pos == 'verb' : self.allowed_pos.extend(pos_table.Eng_tag[2])
            elif pos == 'adjective' : self.allowed_pos.extend(pos_table.Eng_tag[3])
            elif pos == 'adverb' : self.allowed_pos.extend(pos_table.Eng_tag[4])
    
    def append(self, token, arr):
        if "." in token : arr.append(token)
        elif "'" in token : arr.append(token)
        elif token == "us" : arr.append(token)
        else : 
            arr.append(self.lemmatizer.lemmatize(token))
        
        return arr

    def lemmatize(self):
        result = []

        for tags in self.data:
            arr = []
            for token, pos in tags:
                if pos in self.allowed_pos : self.append(token, arr)
            result.append(arr)
        
        return result

In [50]:
lemmatizer = WordNetLemmatizer()

#### All pos

In [51]:
lemmatize = lemmatization(tagged_1, lemmatizer, pos_table, allowed_pos=['noun', 'verb', 'adjective', 'adverb'])
lemmatized_1_all = lemmatize.lemmatize()

lemmatize = lemmatization(tagged_2, lemmatizer, pos_table, allowed_pos=['noun', 'verb', 'adjective', 'adverb'])
lemmatized_2_all = lemmatize.lemmatize()

lemmatize = lemmatization(tagged_3, lemmatizer, pos_table, allowed_pos=['noun', 'verb', 'adjective', 'adverb'])
lemmatized_3_all = lemmatize.lemmatize()

#### Nouns

In [52]:
lemmatize = lemmatization(tagged_1, lemmatizer, pos_table, allowed_pos=['noun'])
lemmatized_1_noun = lemmatize.lemmatize()

lemmatize = lemmatization(tagged_2, lemmatizer, pos_table, allowed_pos=['noun'])
lemmatized_2_noun = lemmatize.lemmatize()

lemmatize = lemmatization(tagged_3, lemmatizer, pos_table, allowed_pos=['noun'])
lemmatized_3_noun = lemmatize.lemmatize()

#### Verbs

In [53]:
lemmatize = lemmatization(tagged_1, lemmatizer, pos_table, allowed_pos=['verb'])
lemmatized_1_verb = lemmatize.lemmatize()

lemmatize = lemmatization(tagged_2, lemmatizer, pos_table, allowed_pos=['verb'])
lemmatized_2_verb = lemmatize.lemmatize()

lemmatize = lemmatization(tagged_3, lemmatizer, pos_table, allowed_pos=['verb'])
lemmatized_3_verb = lemmatize.lemmatize()

#### Adjectives

In [54]:
lemmatize = lemmatization(tagged_1, lemmatizer, pos_table, allowed_pos=['adjective'])
lemmatized_1_adjective = lemmatize.lemmatize()

lemmatize = lemmatization(tagged_2, lemmatizer, pos_table, allowed_pos=['adjective'])
lemmatized_2_adjective = lemmatize.lemmatize()

lemmatize = lemmatization(tagged_3, lemmatizer, pos_table, allowed_pos=['adjective'])
lemmatized_3_adjective = lemmatize.lemmatize()

#### Adverbs

In [55]:
lemmatize = lemmatization(tagged_1, lemmatizer, pos_table, allowed_pos=['adverb'])
lemmatized_1_adverb = lemmatize.lemmatize()

lemmatize = lemmatization(tagged_2, lemmatizer, pos_table, allowed_pos=['adverb'])
lemmatized_2_adverb = lemmatize.lemmatize()

lemmatize = lemmatization(tagged_3, lemmatizer, pos_table, allowed_pos=['adverb'])
lemmatized_3_adverb = lemmatize.lemmatize()

## 4. Save PreProcessed Data

In [56]:
SAVE_ROOT = './processed-data/'

SAVE_1 = SAVE_ROOT + 'period-1/'
SAVE_2 = SAVE_ROOT + 'period-2/'
SAVE_3 = SAVE_ROOT + 'period-3/'

def to_pickle(data, file_name, root='./'):
    with open(root+file_name+'.pkl', 'wb') as f:
            pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)

### Lemmatized data to pickle file

#### all pos

In [57]:
to_pickle(data=lemmatized_1_all, file_name="lemmatized-all", root=SAVE_1)
to_pickle(data=lemmatized_2_all, file_name="lemmatized-all", root=SAVE_2)
to_pickle(data=lemmatized_3_all, file_name="lemmatized-all", root=SAVE_3)

#### noun

In [58]:
to_pickle(data=lemmatized_1_noun, file_name="lemmatized-noun", root=SAVE_1)
to_pickle(data=lemmatized_2_noun, file_name="lemmatized-noun", root=SAVE_2)
to_pickle(data=lemmatized_3_noun, file_name="lemmatized-noun", root=SAVE_3)

#### verb

In [59]:
to_pickle(data=lemmatized_1_verb, file_name="lemmatized-verb", root=SAVE_1)
to_pickle(data=lemmatized_2_verb, file_name="lemmatized-verb", root=SAVE_2)
to_pickle(data=lemmatized_3_verb, file_name="lemmatized-verb", root=SAVE_3)

#### adjective

In [60]:
to_pickle(data=lemmatized_1_adjective, file_name="lemmatized-adjective", root=SAVE_1)
to_pickle(data=lemmatized_2_adjective, file_name="lemmatized-adjective", root=SAVE_2)
to_pickle(data=lemmatized_3_adjective, file_name="lemmatized-adjective", root=SAVE_3)

#### adverb

In [61]:
to_pickle(data=lemmatized_1_adverb, file_name="lemmatized-adverb", root=SAVE_1)
to_pickle(data=lemmatized_2_adverb, file_name="lemmatized-adverb", root=SAVE_2)
to_pickle(data=lemmatized_3_adverb, file_name="lemmatized-adverb", root=SAVE_3)

### Tagged data to pickle file

In [62]:
to_pickle(data=tagged_1_, file_name="tagged", root=SAVE_1)
to_pickle(data=tagged_2_, file_name="tagged", root=SAVE_2)
to_pickle(data=tagged_3_, file_name="tagged", root=SAVE_3)