In [None]:
from sklearn.datasets import fetch_20newsgroups

In [None]:
# Fetch the "all" subset restricted to two categories, asking scikit-learn
# to strip headers, footers and quotes from every entry.
data = fetch_20newsgroups(
    subset="all",
    categories=["sci.space", "rec.sport.baseball"],
    remove=("headers", "footers", "quotes"),
)

In [None]:
# Keep only the first ten entries as a small working sample.
sample = data["data"][0:10]
sample  # bare expression: shown as the cell output in the notebook

In [None]:
import re
import nltk
from nltk.corpus import stopwords

def preprocess(text: str, stop_words: set) -> str:
    """
    Clean a raw string for downstream text processing.

    Every character that is not an ASCII letter or an underscore is replaced
    by a space, the result is lowercased and split on whitespace, and any
    token found in `stop_words` is dropped.

    Parameters
    ----------
    text : str
        Raw string to clean.
    stop_words : set
        Words to discard. Tokens are compared after lowercasing, so the
        entries should be lowercase.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (empty string when
        nothing survives).
    """
    # Blank out digits, punctuation and any other non-letter characters
    # (underscores are deliberately kept by the pattern).
    letters_only = re.sub(r'[^a-zA-Z_]', ' ', text)

    kept = []
    for token in letters_only.lower().split():
        if token in stop_words:
            continue
        kept.append(token)
    return " ".join(kept)


def stem(text: str, stemmer) -> str:
    """
    Stem every token of a string.

    The string is tokenized with `nltk.word_tokenize` and each token is
    passed through `stemmer.stem`.

    Parameters
    ----------
    text : str
        String to be stemmed.
    stemmer
        Object exposing a `stem(token)` method that returns the stemmed
        token, e.g. an `nltk.stem.SnowballStemmer` instance.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces.
    """
    stemmed_tokens = []
    for token in nltk.word_tokenize(text):
        stemmed_tokens.append(stemmer.stem(token))
    return " ".join(stemmed_tokens)

In [None]:
# Shared resources, built once and reused for every sample entry.
stop_words = set(stopwords.words("english"))
stemmer = nltk.stem.SnowballStemmer("english")

# Two passes: clean each sample entry first, then stem the cleaned strings.
preprocessed_text = [preprocess(doc, stop_words) for doc in sample]
stemmed_text = [stem(cleaned, stemmer) for cleaned in preprocessed_text]
stemmed_text

In [None]:
import polars as pl

# One row per sample entry, with the raw, cleaned and stemmed text side by side.
df = pl.DataFrame(
    {
        "raw_text": sample,
        "clean_text": preprocessed_text,
        "stemmed_text": stemmed_text,
    }
)
df

In [None]:
from pathlib import Path

# Persist the sample frame as CSV. The target directory is created first so
# the write does not fail when ../data does not exist yet (e.g. on a fresh
# checkout); an already-existing directory is left untouched.
output_path = Path("../data/dummy_data.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.write_csv(output_path)