# English Data PreProcessing

## [주요 고려 사항]
1. `dot(.)`과 `apostrophe(')` 처리
    - 'u.s.'와 'u.s.s.r.'과 같은 약자처리를 어떻게 할 것인가?
    - 'america's'와 같은 소유격을 어떻게 처리할 것인가?
        1. 처음 Cleaning 때, `dot(.)`과 `apostrophe(')`는 제거하지 않음
            - `dot(.)`
                - 'u.s', 'u.s.s.r'과 같은 약자를 유지시키기 위한 처리
            - `apostrophe(')`
                - 'america's'와 같은 소유격을 유지시켜서 Tokenizing때 's를 분리시키기 위함.
        2. Tokenizing 이후, `dot(.)`과 `apostrophe(')`를 유지시켜야 하는 Token들 외에는 특수문자 제거
            1. `apostrophe(')`와 `dot(.)`을 가진 Token들을 출력해보고 유지시킬 Token들의 목록을 결정
            2. `apostrophe(')`를 유지시킬 Token들 외의 모든 Token들에서 `apostrophe(')` 및 특수문자 제거
                - `dot(.)`은 다음 단계에서 예외처리를 하며 제거해야 하므로, 이 단계에서는 모든 `dot(.)`을 유지시킴
            3. `dot(.)`을 유지시킬 Token들 외의 모든 Token들에서 `dot(.)` 및 특수문자 제거

## 1. Module Import

In [1]:
# self defined Modules
from myModules.utils import DataLoader, merge
from myModules.preprocess.english import cleaning, remove_stopwords, tagging, dot_and_apostrophe, convert_pos, lemmatization, to_pickle, to_csv, check_pos

# General Modules
import pandas as pd
import numpy as np
import warnings
from tqdm.notebook import tqdm
import pickle
import re
import glob

warnings.filterwarnings('ignore')

# Read File
import glob

# NLP
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## 2. Data Loader

In [2]:
DATA_ROOT = './Data/3구간/'

PERIOD_1 = DATA_ROOT + '1시기/1시기_ST/'
PERIOD_2 = DATA_ROOT + '2시기/2시기_ST/'
PERIOD_3 = DATA_ROOT + '3시기/3시기_ST/'

RESULT_ROOT = './Result/3구간/'

RESULT_1 = RESULT_ROOT + '/1시기/ST/'
RESULT_2 = RESULT_ROOT + '/2시기/ST/'
RESULT_3 = RESULT_ROOT + '/3시기/ST/'

In [3]:
files_list_1 = glob.glob(PERIOD_1+'*.txt')
files_list_2 = glob.glob(PERIOD_2+'*.txt')
files_list_3 = glob.glob(PERIOD_3+'*.txt')

texts_1 = DataLoader(files_list_1, mode='ST')
texts_2 = DataLoader(files_list_2, mode='ST')
texts_3 = DataLoader(files_list_3, mode='ST')

## 3. PreProcess

### 3-1. Data Cleaning

- `dot(.)`과 `apostrophe(')`는 제거하지 않음

In [4]:
cleaned_1 = cleaning(data=texts_1)
cleaned_2 = cleaning(data=texts_2)
cleaned_3 = cleaning(data=texts_3)

### 3-2. Tokenizing

In [5]:
tokenized_1 = [word_tokenize(text) for text in cleaned_1]
tokenized_2 = [word_tokenize(text) for text in cleaned_2]
tokenized_3 = [word_tokenize(text) for text in cleaned_3]

#### Period 1

In [6]:
symbol = dot_and_apostrophe(data=tokenized_1)

##### apostrophe와 dot을 가진 token들 시각화

In [7]:
symbol.token_with_apostrophe()
symbol.token_with_dot()

apostrophe를 가진 token : 
{"'liberty", "'system", "'into", "'german", "'d", "'", "'s", "'are", "o'clock", "'heat", "'ll", "'ve", "'m", "'blamed", "'mvd", "'structure", "n't", "'democracy", "'madam"}
dot을 가진 token : 
{'...', 'camps.if', 'n.', 'a.m.', 'u.n.', 'mr.', 'gen.', 'co.', '..', 'st.', 'mrs.', 'i.', 'dr.', 'u.s.s.r.', 'p.m.', 't.', 'p.', 'frightened.to', 'jr.', 's.', 'f.', 'w.', 'm.', 'ph.d.', 'col.', 'e.', 'messrs.', 'oct.', '.', 'v.', 'u.s.', 'a.', 'u.'}


##### exception 목록 설정

In [8]:
apostrophe_exception = ["'ll", "'s", "'ve", "n't"]
dot_exception = ["u.s.s.r.", "dr.", "messrs.", "gen.", "u.n.", "a.m.", "st.", "u.s.", "ph.d", "jr.", "p.m.", "mrs.", "mr."]

symbol.set_exception(apostrophe_exception=apostrophe_exception, dot_exception=dot_exception)

In [9]:
symbol.print_exception()

apostrophe exceptions : 
["'ll", "'s", "'ve", "n't"]
dot exceptions : 
['u.s.s.r.', 'dr.', 'messrs.', 'gen.', 'u.n.', 'a.m.', 'st.', 'u.s.', 'ph.d', 'jr.', 'p.m.', 'mrs.', 'mr.']


##### apostrophe 처리

In [10]:
tokenized_1_ = symbol.remove_apostrophe(data=tokenized_1)

Processed Tokens : 
{"'structure", "'are", "'democracy", "'german", "'liberty", "'d", "'", "'m", "'blamed", "'mvd", "'system", "'madam", "'into", "o'clock", "'heat"}


##### dot 처리

In [11]:
tokenized_1__ = symbol.remove_dot(data=tokenized_1_)

Processed Tokens : 
{'', '...', 'camps.if', 'n.', 'co.', '..', 'i.', 't.', 'p.', 'frightened.to', 's.', 'f.', 'w.', 'm.', 'ph.d.', 'col.', 'e.', 'oct.', '.', 'v.', 'a.', 'u.'}


##### 제거해야할 token 검사

In [12]:
symbol.check_invalid_tokens(data=tokenized_1__)

Remaining invalid Symbol : {'', 'x', 'o', 'i', 't', 'p', 'g', 'h', 'j', 'v', 'w', 'u', 'a', 'k', 'b', 'y', 'f', 's', 'e', 'm', 'n', 'r', 'd'}


##### 길이가 1이거나 필요없는 특수문자인 Token들 삭제

In [13]:
tokenized_1___ = symbol.remove_invalid_tokens(data=tokenized_1__)

Removed Tokens : 
{'', 'x', 'o', 'i', 't', 'p', 'g', 'h', 'j', 'v', 'w', 'u', 'a', 'k', 'b', 'y', 'f', 's', 'e', 'm', 'n', 'r', 'd'}


##### 남아있는 invalid한 token이 있는지 검사

In [14]:
symbol.check_invalid_tokens(data=tokenized_1___)

There is no invalid symbol


#### Peiod 2

In [15]:
symbol = dot_and_apostrophe(data=tokenized_2)

##### apostrophe와 dot을 가진 token들 시각화

In [16]:
symbol.token_with_apostrophe()
symbol.token_with_dot()

apostrophe를 가진 token : 
{"'for", "n't", "'", "'m", "'s", "'reprisals"}
dot을 가진 token : 
{'n.', 'mr.', 'gen.', 'b.', '..', 'l.', 'i.', 'dr.', 'tyranny.the', 'u.s.s.r.', 'u.s.a.', 'p.m.', 'p.', 't.', 'o.', 'g.', 'h.', 'c.', 's.', 'w.', 'm.', 'e.', 'messrs.', '.', 'v.', 'a.', 'r.'}


##### exception 목록 설정

In [17]:
apostrophe_exception = ["'s", "n't"]
dot_exception = ["u.s.s.r.", "dr.", "messrs.", "gen.", "u.s.a.", "p.m.", "mr."]

symbol.set_exception(apostrophe_exception=apostrophe_exception, dot_exception=dot_exception)

In [18]:
symbol.print_exception()

apostrophe exceptions : 
["'s", "n't"]
dot exceptions : 
['u.s.s.r.', 'dr.', 'messrs.', 'gen.', 'u.s.a.', 'p.m.', 'mr.']


##### apostrophe 처리

In [19]:
tokenized_2_ = symbol.remove_apostrophe(data=tokenized_2)

Processed Tokens : 
{"'for", "'reprisals", "'", "'m"}


##### dot 처리

In [20]:
tokenized_2__ = symbol.remove_dot(data=tokenized_2_)

Processed Tokens : 
{'', 'n.', 'b.', '..', 'l.', 'i.', 'tyranny.the', 'p.', 't.', 'o.', 'g.', 'h.', 'c.', 's.', 'w.', 'm.', 'e.', '.', 'v.', 'a.', 'r.'}


##### 제거해야할 Token들 검사

In [21]:
symbol.check_invalid_tokens(data=tokenized_2__)

Remaining invalid Symbol : {'', 'o', 'i', 't', 'p', 'g', 'h', 'v', 'w', 'a', 'b', 'f', 's', 'e', 'c', 'm', 'n', 'r', 'd', 'l'}


##### 길이가 1이거나 필요없는 특수문자인 token 제거

In [22]:
tokenized_2___ = symbol.remove_invalid_tokens(data=tokenized_2__)

Removed Tokens : 
{'', 'o', 'i', 't', 'p', 'g', 'h', 'v', 'w', 'a', 'b', 'f', 's', 'e', 'c', 'm', 'n', 'r', 'd', 'l'}


##### 남아있는 Invalid한 Token이 있는지 확인

In [23]:
symbol.check_invalid_tokens(data=tokenized_2___)

There is no invalid symbol


#### period 3

In [24]:
symbol = dot_and_apostrophe(tokenized_3)

##### apostrophe와 dot을 가진 token들 시각화

In [25]:
symbol.token_with_apostrophe()
symbol.token_with_dot()

apostrophe를 가진 token : 
{"o'clock", "'ll", "'d", "n't", "'ve", "'", "'s", "'spontaneous", "'has", "'vas", "'recession"}
dot을 가진 token : 
{'..................', '...', 'n.', 'a.m.', 'mr.', 'gen.', 'b.', 'prof.', 'st.', 'mrs.', 'l.', 'r.', 'u.n.r.r.a', 'i.', 'dr.', 'd.', 'u.s.s.r.', 'p.m.', 'p.', 't.', 'o.', 'g.', 'h.', 'c.', 's.', 'jr.', 'f.', 'j.', 'm.', 'w.', 'col.', 'e.', '.', 'v.', 's.s.r', 'maj.', 'a.', 'u.'}


##### exception 목록 설정

In [26]:
apostrophe_exception = ["'ll", "'s", "'ve", "n't"]
dot_exception = ["u.s.s.r.", "dr.", "s.s.r", "a.m.", "st.", "prof.", "u.n.r.r.a", "jr.", "maj.", "p.m.", "mrs.", "mr."]

symbol.set_exception(apostrophe_exception=apostrophe_exception, dot_exception=dot_exception)

In [27]:
symbol.print_exception()

apostrophe exceptions : 
["'ll", "'s", "'ve", "n't"]
dot exceptions : 
['u.s.s.r.', 'dr.', 's.s.r', 'a.m.', 'st.', 'prof.', 'u.n.r.r.a', 'jr.', 'maj.', 'p.m.', 'mrs.', 'mr.']


##### apostrophe 처리

In [28]:
tokenized_3_ = symbol.remove_apostrophe(tokenized_3)

Processed Tokens : 
{"o'clock", "'d", "'", "'spontaneous", '``', "'vas", "'has", "'recession"}


##### dot 처리

In [29]:
tokenized_3__ = symbol.remove_dot(tokenized_3_)

Processed Tokens : 
{'', '..................', '...', 'n.', 'gen.', 'u.', 'b.', 'l.', 'i.', 'd.', 'p.', 't.', 'o.', 'g.', 'h.', 'c.', 's.', 'f.', 'j.', 'm.', 'w.', 'col.', 'e.', '.', 'v.', 'a.', 'r.'}


##### 제거해야할 token 확인

In [30]:
symbol.check_invalid_tokens(tokenized_3__)

Remaining invalid Symbol : {'', 'x', 'o', 'i', 't', 'p', 'g', 'h', 'j', 'v', 'w', 'u', 'a', 'b', 'f', 's', 'e', 'c', 'm', 'n', 'r', 'd', 'l'}


##### 길이가 1이거나 필요없는 특수문자인 token 제거

In [31]:
tokenized_3___ = symbol.remove_invalid_tokens(tokenized_3__)

Removed Tokens : 
{'', 'x', 'o', 'i', 't', 'p', 'g', 'h', 'j', 'v', 'w', 'u', 'a', 'b', 'f', 's', 'e', 'c', 'm', 'n', 'r', 'd', 'l'}


##### 남아있는 INvalid한 token이 있는지 확인

In [32]:
symbol.check_invalid_tokens(tokenized_3___)

There is no invalid symbol


### 3-3. Remove StopWords

In [33]:
stopwords = nltk.corpus.stopwords.words('english')
new_stopwords = ['would', 'could', 'might', 'need', 'can', 'must', \
    'one', 'two', 'upon', 'may', 'perhaps', 'living', 'seem', 'also', 'ii', 'ofthe',
    'also', 'much', 'therefore', "'ll", "'ve", "n't"]

wo_stopword_1 = remove_stopwords(tokenized_1___, stopwords, new_stopwords)
wo_stopword_2 = remove_stopwords(tokenized_2___, stopwords, new_stopwords)
wo_stopword_3 = remove_stopwords(tokenized_3___, stopwords, new_stopwords)

### 3-4. Tagging

In [34]:
pos_table = pd.read_pickle("processed-data/pos-table.pkl")

In [35]:
tagged_1 = tagging(wo_stopword_1)
tagged_2 = tagging(wo_stopword_2)
tagged_3 = tagging(wo_stopword_3)

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

#### Period 1

In [36]:
pos = check_pos(tagged_1)

In [37]:
pos.pos_with_symbol()

tagged token with apostrophe : 
{"'s": {'POS'}}
tagged token with dot : 
{'p.m.': {'RB'}, 'st.': {'NN', 'JJ'}, 'messrs.': {'NN'}, 'mrs.': {'NNS'}, 'a.m.': {'JJ'}, 'u.n.': {'NN'}, 'mr.': {'RB', 'NN', 'NNP', 'RBS', 'JJ', 'VBP'}, 'gen.': {'NN', 'JJ', 'VBP'}, 'dr.': {'JJ', 'VBP'}, 'u.s.': {'JJ'}, 'jr.': {'NN', 'VBP'}, 'u.s.s.r.': {'JJ', 'VBP'}}


In [38]:
pos.pos_without_symbol()

tagged token without apostrophe : 
{'s': ['NN']}
tagged token without dot : 
{'pm': ['NN'], 'st': ['NN'], 'messrs': ['NN'], 'mrs': ['NN'], 'am': ['VBP'], 'un': ['NN'], 'mr': ['NN'], 'gen': ['NN'], 'dr': ['NN'], 'us': ['PRP'], 'jr': ['NN'], 'ussr': ['NN']}


#### Period 2

In [39]:
pos = check_pos(tagged_2)

In [40]:
pos.pos_with_symbol()

tagged token with apostrophe : 
{"'s": {'POS'}}
tagged token with dot : 
{'p.m.': {'JJ'}, 'messrs.': {'NNS'}, 'mr.': {'NN', 'NNS', 'NNP', 'RBS', 'VBZ', 'FW', 'VB', 'JJ', 'VBP'}, 'gen.': {'JJ'}, 'dr.': {'NN'}, 'u.s.s.r.': {'JJ'}, 'u.s.a.': {'NN'}}


In [41]:
pos.pos_without_symbol()

tagged token without apostrophe : 
{'s': ['NN']}
tagged token without dot : 
{'pm': ['NN'], 'messrs': ['NN'], 'mr': ['NN'], 'gen': ['NN'], 'dr': ['NN'], 'ussr': ['NN'], 'usa': ['NN']}


#### Period 3

In [42]:
pos = check_pos(tagged_3)

In [43]:
pos.pos_with_symbol()

tagged token with apostrophe : 
{"'s": {'POS'}}
tagged token with dot : 
{'s.s.r': {'NN'}, 'p.m.': {'RB', 'NN', 'VBP'}, 'st.': {'JJ'}, 'mrs.': {'NN'}, 'u.n.r.r.a': {'RB', 'JJ'}, 'a.m.': {'RB', 'NN', 'VBD'}, 'mr.': {'RB', 'NN', 'VBD', 'NNS', 'NNP', 'RBS', 'VBZ', 'RBR', 'FW', 'VB', 'JJ', 'VBP'}, 'jr.': {'NN'}, 'dr.': {'NN', 'JJ', 'VBZ', 'VBP'}, 'u.s.s.r.': {'JJ'}, 'maj.': {'NN'}, 'prof.': {'NN'}}


In [44]:
pos.pos_without_symbol()

tagged token without apostrophe : 
{'s': ['NN']}
tagged token without dot : 
{'ssr': ['NN'], 'pm': ['NN'], 'st': ['NN'], 'mrs': ['NN'], 'unrra': ['NN'], 'am': ['VBP'], 'mr': ['NN'], 'jr': ['NN'], 'dr': ['NN'], 'ussr': ['NN'], 'maj': ['NN'], 'prof': ['NN']}


### 3-5. adress POS of token with symbols

In [45]:
tagged_1_ = convert_pos(data=tagged_1, key=".", target_pos="NN")
tagged_2_ = convert_pos(data=tagged_2, key=".",  target_pos="NN")
tagged_3_ = convert_pos(data=tagged_3, key=".",  target_pos="NN")

### 3-6. Lemmatization

In [46]:
lemmatizer = WordNetLemmatizer()

#### All pos

In [47]:
lemmatize = lemmatization(tagged_1, lemmatizer, pos_table, allowed_pos=['noun', 'verb', 'adjective', 'adverb'])
lemmatized_1_all = lemmatize.lemmatize()

lemmatize = lemmatization(tagged_2, lemmatizer, pos_table, allowed_pos=['noun', 'verb', 'adjective', 'adverb'])
lemmatized_2_all = lemmatize.lemmatize()

lemmatize = lemmatization(tagged_3, lemmatizer, pos_table, allowed_pos=['noun', 'verb', 'adjective', 'adverb'])
lemmatized_3_all = lemmatize.lemmatize()

#### Nouns

In [48]:
lemmatize = lemmatization(tagged_1, lemmatizer, pos_table, allowed_pos=['noun'])
lemmatized_1_noun = lemmatize.lemmatize()

lemmatize = lemmatization(tagged_2, lemmatizer, pos_table, allowed_pos=['noun'])
lemmatized_2_noun = lemmatize.lemmatize()

lemmatize = lemmatization(tagged_3, lemmatizer, pos_table, allowed_pos=['noun'])
lemmatized_3_noun = lemmatize.lemmatize()

#### Verbs

In [49]:
lemmatize = lemmatization(tagged_1, lemmatizer, pos_table, allowed_pos=['verb'])
lemmatized_1_verb = lemmatize.lemmatize()

lemmatize = lemmatization(tagged_2, lemmatizer, pos_table, allowed_pos=['verb'])
lemmatized_2_verb = lemmatize.lemmatize()

lemmatize = lemmatization(tagged_3, lemmatizer, pos_table, allowed_pos=['verb'])
lemmatized_3_verb = lemmatize.lemmatize()

#### Adjectives

In [50]:
lemmatize = lemmatization(tagged_1, lemmatizer, pos_table, allowed_pos=['adjective'])
lemmatized_1_adjective = lemmatize.lemmatize()

lemmatize = lemmatization(tagged_2, lemmatizer, pos_table, allowed_pos=['adjective'])
lemmatized_2_adjective = lemmatize.lemmatize()

lemmatize = lemmatization(tagged_3, lemmatizer, pos_table, allowed_pos=['adjective'])
lemmatized_3_adjective = lemmatize.lemmatize()

#### Adverbs

In [51]:
lemmatize = lemmatization(tagged_1, lemmatizer, pos_table, allowed_pos=['adverb'])
lemmatized_1_adverb = lemmatize.lemmatize()

lemmatize = lemmatization(tagged_2, lemmatizer, pos_table, allowed_pos=['adverb'])
lemmatized_2_adverb = lemmatize.lemmatize()

lemmatize = lemmatization(tagged_3, lemmatizer, pos_table, allowed_pos=['adverb'])
lemmatized_3_adverb = lemmatize.lemmatize()

## 4. Save PreProcessed Data

In [52]:
SAVE_ROOT = './processed-data/'

SAVE_1 = SAVE_ROOT + 'period-1/'
SAVE_2 = SAVE_ROOT + 'period-2/'
SAVE_3 = SAVE_ROOT + 'period-3/'

### 4-1. Preprocessed data to pickle file

#### all pos

In [53]:
to_pickle(data=lemmatized_1_all, file_name="lemmatized-all", root=SAVE_1)
to_pickle(data=lemmatized_2_all, file_name="lemmatized-all", root=SAVE_2)
to_pickle(data=lemmatized_3_all, file_name="lemmatized-all", root=SAVE_3)

#### noun

In [54]:
to_pickle(data=lemmatized_1_noun, file_name="lemmatized-noun", root=SAVE_1)
to_pickle(data=lemmatized_2_noun, file_name="lemmatized-noun", root=SAVE_2)
to_pickle(data=lemmatized_3_noun, file_name="lemmatized-noun", root=SAVE_3)

#### verb

In [55]:
to_pickle(data=lemmatized_1_verb, file_name="lemmatized-verb", root=SAVE_1)
to_pickle(data=lemmatized_2_verb, file_name="lemmatized-verb", root=SAVE_2)
to_pickle(data=lemmatized_3_verb, file_name="lemmatized-verb", root=SAVE_3)

#### adjective

In [56]:
to_pickle(data=lemmatized_1_adjective, file_name="lemmatized-adjective", root=SAVE_1)
to_pickle(data=lemmatized_2_adjective, file_name="lemmatized-adjective", root=SAVE_2)
to_pickle(data=lemmatized_3_adjective, file_name="lemmatized-adjective", root=SAVE_3)

#### adverb

In [57]:
to_pickle(data=lemmatized_1_adverb, file_name="lemmatized-adverb", root=SAVE_1)
to_pickle(data=lemmatized_2_adverb, file_name="lemmatized-adverb", root=SAVE_2)
to_pickle(data=lemmatized_3_adverb, file_name="lemmatized-adverb", root=SAVE_3)

### 4-2. Preprocessed data to csv file

#### all pos

In [58]:
to_csv(data=lemmatized_1_all, file_name="lemmatized-all", root=SAVE_1)
to_csv(data=lemmatized_2_all, file_name="lemmatized-all", root=SAVE_2)
to_csv(data=lemmatized_3_all, file_name="lemmatized-all", root=SAVE_3)

#### noun

In [59]:
to_csv(data=lemmatized_1_noun, file_name="lemmatized-noun", root=SAVE_1)
to_csv(data=lemmatized_2_noun, file_name="lemmatized-noun", root=SAVE_2)
to_csv(data=lemmatized_3_noun, file_name="lemmatized-noun", root=SAVE_3)

#### verb

In [60]:
to_pickle(data=lemmatized_1_verb, file_name="lemmatized-verb", root=SAVE_1)
to_pickle(data=lemmatized_2_verb, file_name="lemmatized-verb", root=SAVE_2)
to_pickle(data=lemmatized_3_verb, file_name="lemmatized-verb", root=SAVE_3)

#### adjective

In [61]:
to_csv(data=lemmatized_1_adjective, file_name="lemmatized-adjective", root=SAVE_1)
to_csv(data=lemmatized_2_adjective, file_name="lemmatized-adjective", root=SAVE_2)
to_csv(data=lemmatized_3_adjective, file_name="lemmatized-adjective", root=SAVE_3)

#### adverb

In [62]:
to_csv(data=lemmatized_1_adverb, file_name="lemmatized-adverb", root=SAVE_1)
to_csv(data=lemmatized_2_adverb, file_name="lemmatized-adverb", root=SAVE_2)
to_csv(data=lemmatized_3_adverb, file_name="lemmatized-adverb", root=SAVE_3)

### 4-3. Tagged data to pickle file

In [63]:
to_pickle(data=tagged_1_, file_name="tagged", root=SAVE_1)
to_pickle(data=tagged_2_, file_name="tagged", root=SAVE_2)
to_pickle(data=tagged_3_, file_name="tagged", root=SAVE_3)

In [64]:
to_csv(data=tagged_1_, file_name="tagged", root=SAVE_1)
to_csv(data=tagged_2_, file_name="tagged", root=SAVE_2)
to_csv(data=tagged_3_, file_name="tagged", root=SAVE_3)