In [3]:
from nltk.tokenize import word_tokenize
import re
# from unidecode import unidecode
import pandas as pd
from nltk.corpus import stopwords
# Module-level set of NLTK's English stop words, built once at import time.
stop_words = set(stopwords.words('english'))
from collections import Counter


In [6]:
# CSV file holding the article corpus.
file_path = "data/preprocessed_article_data.csv"
# Read only the 'content' column and convert it to a plain Python list.
data = (
    pd.read_csv(file_path, usecols=['content'])['content']
    .tolist()
)

In [9]:
def preprocess_texts(texts, n=0):
    """
    Lowercase, strip accents, tokenize and remove stop words from each text.

    :param texts: iterable of raw texts (strings)
    :param n: percentage of the vocabulary to drop from each end of the
        frequency ranking: the n% most frequent and the n% least frequent
        distinct words are removed. ``0`` (default) disables this step.
    :return: list with one list of tokens per input text
    """
    # Local stdlib import: replaces the ``unidecode`` call, whose import is
    # commented out at the top of the file and therefore raised NameError.
    import unicodedata

    texts = list(texts)
    if not texts:
        # Nothing to process, so no NLTK resources need to be touched.
        return []

    # Build the stop-word set once instead of calling
    # ``stopwords.words('english')`` for every token and scanning a list.
    english_stopwords = set(stopwords.words('english'))
    non_letters = re.compile(r'[^a-zA-Z\s]')

    processed_texts = []
    for text in texts:
        # Lowercase, then fold accents: NFKD splits "é" into "e" + combining
        # mark, and the ASCII round-trip drops the mark. Characters with no
        # ASCII decomposition (e.g. "ß") are dropped rather than
        # transliterated.
        ascii_text = (
            unicodedata.normalize('NFKD', text.lower())
            .encode('ascii', 'ignore')
            .decode('ascii')
        )
        # Keep letters and whitespace only, then tokenize.
        tokens = word_tokenize(non_letters.sub('', ascii_text))
        # Stop-word removal.
        processed_texts.append(
            [token for token in tokens if token not in english_stopwords]
        )

    # Remove top-n% and bottom-n% words of the vocabulary by frequency.
    if n > 0:
        word_freq = Counter(
            word for sentence in processed_texts for word in sentence
        )
        ranked = [word for word, _ in word_freq.most_common()]
        cutoff = int(n / 100 * len(ranked))
        # Guard against cutoff == 0: ``ranked[-0:]`` would select everything.
        if cutoff > 0:
            dropped = set(ranked[:cutoff]) | set(ranked[-cutoff:])
            processed_texts = [
                [word for word in sentence if word not in dropped]
                for sentence in processed_texts
            ]

    return processed_texts

In [11]:
def filter_word_from_corpus(data, words):
    """
    Filters out specific words from the corpus.
    Parameters:
    - data : The corpus, represented by a list of list of tokens.
    - words : The words to be filtered. Either an iterable of words or a single word.
    Returns:
    - List[List[str]]: Filtered data where specified words have been removed.
      A new list is returned; the input corpus is not modified.
    Example:
    >>> data = [["apple", "orange"], ["apple", "banana"]]
    >>> filter_word_from_corpus(data, "apple")
    [['orange'], ['banana']]
    """
    # Ensure words is a collection, even if a single string is passed
    if isinstance(words, str):
        words = [words]
    # Materialize once so a one-shot iterable (e.g. a generator) is not
    # exhausted by the first membership test.
    words = list(words)
    try:
        # Set lookup keeps each membership test constant-time.
        blocked = set(words)
    except TypeError:
        # Unhashable entries: fall back to the original list-based lookup.
        blocked = words
    # Filter words from data
    return [[token for token in row if token not in blocked] for row in data]