### Import Modules

In [1]:
import torch

import nltk
from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer

import numpy as np
import os, re

# fetch the NLTK corpora used later in the notebook
# (WordNet for the lemmatizer, the stop-word lists for filtering)
for resource in ('wordnet', 'stopwords'):
    nltk.download(resource)

# pick the compute device: the MPS backend (Mac) when available, CPU otherwise
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device('cpu')

[nltk_data] Downloading package wordnet to /Users/kimjin-
[nltk_data]     seong/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/kimjin-
[nltk_data]     seong/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Load Dataset

In [2]:
# read file list
train_dataset_path = './dataset/AP_corpus_one_line_per_sentence'

# os.listdir returns entries in arbitrary order and also lists hidden files
# (e.g. .DS_Store) and sub-directories. Sort for a reproducible document
# order and keep regular, non-hidden files only.
train_dataset_list = sorted(
    name for name in os.listdir(train_dataset_path)
    if not name.startswith('.')
    and os.path.isfile(os.path.join(train_dataset_path, name))
)
# original (misspelled) name, kept so that any cell still using it keeps working
tarin_dataset_list = train_dataset_list

# init: one string per corpus file
text = []

# read files
for file in train_dataset_list:
    file_path = f'{train_dataset_path}/{file}'
    # explicit encoding so the result does not depend on the platform default
    # NOTE(review): assumes the corpus is UTF-8/ASCII - confirm
    with open(file_path, 'r', encoding='utf-8') as f:
        text.append(f.read())

### Text Cleaning

In [3]:
def textCleaning(sentence, is_stem=False):
    """Turn raw text into a list of cleaned word tokens.

    Steps, in order:
      1. lowercase the text
      2. strip URLs and @mentions from the raw string
      3. tokenize with TreebankWordTokenizer
      4. keep only ASCII letters inside each token (drops punctuation, digits)
      5. drop English stop words
      6. lemmatize with WordNetLemmatizer
      7. optionally stem with PorterStemmer
      8. drop tokens of two characters or fewer

    Args:
        sentence (str): Raw text to clean (may contain several sentences).
        is_stem (bool): When True, Porter-stem each token after lemmatization.

    Returns:
        list[str]: Cleaned tokens, in their original order.
    """
    # get lowercase
    sentence = sentence.lower()

    # URLs and @mentions are removed from the raw string, before tokenization:
    # the Treebank tokenizer splits on ':' and '@', so a per-token pattern
    # can no longer recognise them afterwards.
    sentence = re.sub(r"\w+://\S+|http\S+|@[A-Za-z0-9]+", ' ', sentence)

    # tokenization
    tokens = TreebankWordTokenizer().tokenize(sentence)

    # removing punctuation, numbers and any other non-letter character.
    # Done before lemmatization: the tokenizer only detaches the final period
    # of its input, so mid-text tokens such as "dogs." would otherwise reach
    # the lemmatizer with the period attached and be left unchanged.
    tokens = [re.sub(r"[^A-Za-z]", '', token) for token in tokens]

    # removing stopwords (and tokens emptied by the step above).
    # Done before lemmatization/stemming because those steps rewrite some stop
    # words (e.g. Porter stemming turns "this" into "thi"), which would then
    # no longer match the stop-word list.
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token and token not in stop_words]

    # lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # stemming
    if is_stem:
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(token) for token in tokens]

    # removing words less than minimum word length
    processed_sentence = [token for token in tokens if len(token) > 2]

    return processed_sentence

def getPadded(tokens, max_len, pad_token='[PAD]'):
    """Right-pad a token list with a padding token up to ``max_len`` entries.

    Args:
        tokens (list): Token sequence to pad.
        max_len (int): Target length.
        pad_token (str): Token appended as padding. Defaults to '[PAD]'.
            NOTE(review): getVocab registers its padding key as '[pad]'
            (lowercase); pass pad_token='[pad]' if the padded tokens are
            looked up in that vocab.

    Returns:
        list: A new list of length ``max_len`` when ``tokens`` is shorter than
        ``max_len``; otherwise ``tokens`` itself, unchanged (sequences longer
        than ``max_len`` are not truncated).
    """
    missing = max_len - len(tokens)
    if missing > 0:
        tokens = tokens + [pad_token] * missing
    return tokens

In [4]:
# clean every loaded document; text_clean[i] holds the tokens of text[i]
text_clean = []
dataset_len = len(text)

for idx, sentence in enumerate(text):
    text_clean.append(textCleaning(sentence))

    # report progress every 50 documents.
    # The parentheses matter: `idx+1%50` parses as `idx + (1 % 50)`, which is
    # never 0, so no progress line would ever be printed.
    if (idx + 1) % 50 == 0:
        print(f'{idx+1:5d}/{dataset_len:5d} complete')

In [None]:
# get max length (0 when there are no documents)
max_len = max((len(tokens) for tokens in text_clean), default=0)

# padding: bring every token list up to max_len entries
padded_tokens = []

for idx, tokens in enumerate(text_clean):
    padded_tokens.append(getPadded(tokens, max_len))

    # report progress every 50 documents.
    # The parentheses matter: `idx+1%50` parses as `idx + (1 % 50)`, which is
    # never 0, so no progress line would ever be printed.
    if (idx + 1) % 50 == 0:
        print(f'{idx+1:5d}/{dataset_len:5d} complete')

### Create Vocab

In [None]:
def getVocab(docs):
    """Build a term -> index vocabulary with reserved padding/unknown slots.

    The terms come from a TfidfVectorizer fitted on ``docs``. Index 0 is
    reserved for '[pad]' and index 1 for '[unk]'; every learned term keeps its
    relative order but has its index shifted up by two, so no term has to be
    dropped to make room for the special tokens.

    Args:
        docs: Documents in whatever form ``TfidfVectorizer().fit`` accepts.

    Returns:
        dict: Mapping of term to integer index, including '[pad]' (0) and
        '[unk]' (1).
    """
    # create vocab using TF-IDF
    tfidfv = TfidfVectorizer().fit(docs)

    # special tokens take the first indices
    vocab = {'[pad]': 0, '[unk]': 1}

    # shift the learned indices past the special tokens instead of deleting
    # the two terms that originally held index 0 and 1
    offset = len(vocab)
    for term, index in tfidfv.vocabulary_.items():
        vocab[term] = index + offset

    return vocab