### Import Modules

In [20]:
import torch

import nltk
from nltk.tokenize import TreebankWordTokenizer, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer

import numpy as np
import pickle
import os, re
from collections import defaultdict

# download nltk resources
nltk.download('wordnet')
nltk.download('stopwords')

# set device (MAC)
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device('cpu')

[nltk_data] Downloading package wordnet to /Users/kimjin-
[nltk_data]     seong/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/kimjin-
[nltk_data]     seong/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Load Dataset

In [2]:
# read file list
train_dataset_path = './dataset/AP_corpus_one_line_per_sentence'
tarin_dataset_list = os.listdir(train_dataset_path)

# init
text = []

# read files
for file in tarin_dataset_list:
    file_path = f'{train_dataset_path}/{file}'
    with open(file_path, 'r') as f:
        text.append(f.read())

### Text Cleaning

In [3]:
def textCleaning(sentence, is_stem=False):
    # get lowercase
    sentence = sentence.lower()

    # tokenization
    #tokenizer = TreebankWordTokenizer()
    tokenizer = RegexpTokenizer(r'\w+')
    sentence = tokenizer.tokenize(sentence)

    # lemmatization
    lemmatizer = WordNetLemmatizer()
    sentence = [lemmatizer.lemmatize(token) for token in sentence]

    # stemming
    if is_stem:
        stemmer = PorterStemmer()
        sentence = [stemmer.stem(token) for token in sentence]

    patterns = r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?"
    stop_words = set(stopwords.words('english'))

    # removing unicode characters(punctuations)
    sentence = [re.sub(patterns, '', token) for token in sentence if re.sub(patterns, '', token)]

    # removing numbers
    sentence = [re.sub(r'\d+', '', token) for token in sentence if re.sub(r'\d+', '', token)]

    # removing stopwords
    stop_words = set(stopwords.words('english'))
    sentence = [token for token in sentence if token not in stop_words]

    # removing words less than minimum word length
    processed_sentence = [token for token in sentence if len(token)>2]

    # processed_tokens = [token for token in sentence if re.sub(patterns, '', token) and not re.sub(r'\d+', '', token) and token not in stop_words and len(token) > 2]    

    return processed_sentence

def getPadded(tokens, max_len):
    seq_len = len(tokens)
    if seq_len < max_len:
        tokens = tokens + ['[PAD]']*(max_len-seq_len)
    return tokens

In [4]:
text_clean = []
dataset_len = len(text)

for idx, sentence in enumerate(text):
    text_clean.append(textCleaning(sentence, is_stem=True))

    if idx%100==0:
        print(f'{idx+1:5d}/{dataset_len:5d} complete')

    1/ 1049 complete
  101/ 1049 complete
  201/ 1049 complete
  301/ 1049 complete
  401/ 1049 complete
  501/ 1049 complete
  601/ 1049 complete
  701/ 1049 complete
  801/ 1049 complete
  901/ 1049 complete
 1001/ 1049 complete


In [5]:
# get max length
max_len = 0
for tokens in text_clean:
    token_length = len(tokens)

    if token_length > max_len:
        max_len = token_length

# padding
padded_tokens = []

for idx, tokens in enumerate(text_clean):
    padded_tokens.append(getPadded(tokens, max_len))

    if idx%100==0:
        print(f'{idx+1:5d}/{dataset_len:5d} complete')

    1/ 1049 complete
  101/ 1049 complete
  201/ 1049 complete
  301/ 1049 complete
  401/ 1049 complete
  501/ 1049 complete
  601/ 1049 complete
  701/ 1049 complete
  801/ 1049 complete
  901/ 1049 complete
 1001/ 1049 complete


In [6]:
# save files
with open('./dataset/text_list_cleaning.pkl', 'wb') as file:
    pickle.dump(text_clean, file)

with open('./dataset/text_list_cleaning_padded.pkl', 'wb') as file:
    pickle.dump(padded_tokens, file)

# load file
#with open(./dataset/text_list_cleaning.pkl', 'rb') as file:
#    loaded_list = pickle.load(file)

### Create Vocab

In [51]:
# 빈도 계산을 위한 딕셔너리 초기화
word_freq = defaultdict(int)

# 문장을 문자열로 변환 (TfidfVectorizer는 문자열 입력을 받음)
text_clean_str = [' '.join(sequence) for sequence in text_clean]

# TF-IDF Vectorizer 초기화
vectorizer = TfidfVectorizer()

# TF-IDF 값 계산
tfidf_matrix = vectorizer.fit_transform(text_clean_str)

# 각 단어의 TF-IDF 평균값 계산
tfidf_scores = np.mean(tfidf_matrix.toarray(), axis=0)

# 단어와 그에 해당하는 TF-IDF 점수 매핑
vocab_tfidf = {word: score for word, score in zip(vectorizer.get_feature_names_out(), tfidf_scores)}

# TF-IDF 기준 설정
min_tfidf = np.float64(0.00001)
filtered_vocab = [word for word, score in vocab_tfidf.items() if score >= min_tfidf]

# 'PAD'와 'UNK'를 0번과 1번 인덱스에 추가
final_vocab = ['[PAD]', '[UNK]'] + sorted(filtered_vocab)

In [52]:
len(final_vocab)

94440