In [29]:
import pandas as pd
import string
from underthesea import word_tokenize
import re
from concurrent.futures import ThreadPoolExecutor

In [30]:
def load_stopwords(filepath):
    """Read a stopword list (one entry per line) into a set.

    Each line is stripped of surrounding whitespace and blank lines are
    skipped, so a stray trailing space or an empty line in the file cannot
    produce an entry that never matches a token.

    Parameters
    ----------
    filepath : str or path-like
        Location of the UTF-8 text file holding the stopwords.

    Returns
    -------
    set of str
        The distinct, non-empty stopword entries.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    # "utf-8-sig" reads plain UTF-8 unchanged but discards a leading byte-order
    # mark, which would otherwise stay glued to the first stopword.
    with open(filepath, "r", encoding="utf-8-sig") as file:
        entries = (line.strip() for line in file.read().splitlines())
        return {entry for entry in entries if entry}

# Stopword set loaded once when this cell runs; the relative path is resolved
# against the current working directory.
vietnamese_stopwords = load_stopwords("../../data/raw/vietnamese-stopwords.txt")

def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted.

    Characters are removed outright rather than replaced by a space, so
    ``"a,b"`` becomes ``"ab"``. All other characters are left untouched.
    """
    # Mapping a code point to None tells str.translate to drop that character.
    drop_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(drop_table)

def delete_stopwords_and_clean(doc):
    """Tokenize a document and return its cleaned, de-duplicated words.

    Processing steps, applied to each token produced by ``word_tokenize`` on
    the punctuation-stripped text:

    1. lowercase the token;
    2. drop it if it is a stopword, starts with ``'http'`` or contains a digit;
    3. replace any remaining non-word, non-space character with a space;
    4. collapse whitespace runs (tabs, newlines, ...) to a single space;
    5. keep only the first occurrence of each resulting word.

    Parameters
    ----------
    doc : str
        Raw text. Any non-string value (for example ``None`` or the float NaN
        that pandas produces for an empty CSV cell) is treated as an empty
        document instead of raising.

    Returns
    -------
    list of str
        Unique cleaned words in order of first appearance.
    """
    # Missing cells arrive as NaN/None; there is nothing to tokenize.
    if not isinstance(doc, str):
        return []

    seen = set()
    unique_words = []
    for token in word_tokenize(remove_punctuation(doc)):
        word = token.lower()
        if word in vietnamese_stopwords or word.startswith('http'):
            continue
        # Discard any token that contains a digit.
        if re.search(r'\d', word):
            continue
        # Remove special characters that string.punctuation does not cover.
        word = re.sub(r'[^\w\s]', ' ', word).strip()
        if not word:
            continue
        # Normalise '\t', '\n' and other whitespace runs to one space.
        # The word is already stripped and non-empty, so it cannot end up
        # blank here.
        word = re.sub(r'\s+', ' ', word).strip()
        if word not in seen:
            seen.add(word)
            unique_words.append(word)

    return unique_words



In [31]:
def process_train_test(args):
    """Clean the train and test CSVs for one option and save them with a 'Words' column.

    Parameters
    ----------
    args : tuple
        ``(option, get_text)``. ``option`` is the file-name prefix of the
        ``*_train.csv`` / ``*_test.csv`` pair. When ``get_text`` is truthy the
        article body ('Text' folder and column) is processed, otherwise the
        title ('Title' folder and column).

    Both input files are read before any cleaning starts. Each frame gets a
    'Words' column holding the output of ``delete_stopwords_and_clean`` for
    every row, and is then written under ``../../data/cleaned`` without the
    index. Prints "Check" once both files are written.
    """
    option, get_text = args
    column = 'Text' if get_text else 'Title'

    train_df = pd.read_csv(f'../../data/processed/{column}/{option}_train.csv')
    test_df = pd.read_csv(f'../../data/processed/{column}/{option}_test.csv')

    # Tokenize and clean every document of the selected column.
    for frame in (train_df, test_df):
        frame['Words'] = [delete_stopwords_and_clean(doc) for doc in frame[column].values]

    outputs = (
        (train_df, f'../../data/cleaned/{column}/{option}_processed_train.csv'),
        (test_df, f'../../data/cleaned/{column}/{option}_processed_test.csv'),
    )
    for frame, destination in outputs:
        frame.to_csv(destination, index=False)

    print("Check")


In [32]:
options = ['Near', 'Mid', 'Far', 'Potential']
get_text = False  # False -> process titles; True -> process article bodies

# Each option is an independent read/clean/write job, so the jobs are fanned
# out across a thread pool.
with ThreadPoolExecutor() as executor:
    # Executor.map only re-raises a worker's exception when its result is
    # pulled from the returned iterator. Draining the iterator makes a failed
    # job surface here instead of being silently discarded.
    list(executor.map(process_train_test, [(option, get_text) for option in options]))

Check
Check
Check
Check
