In [1]:
from sklearn.datasets import fetch_20newsgroups

In [5]:
# Fetch the sci.space and rec.sport.baseball posts (subset="all"),
# with headers, footers and quoted text removed.
data = fetch_20newsgroups(
    subset="all",
    categories=["sci.space", "rec.sport.baseball"],
    remove=("headers", "footers", "quotes"),
)

In [7]:
# Peek at the first ten entries of the loaded text.
data.data[:10]

["\nDo you really have *that* much information on him?  Really?\n\n\nI don't know.  You tell me.  What percentage of players reach or \nexceed their MLE's *in their rookie season*?  We're talking about\n1993, you know.\n\n\nIf that were your purpose, maybe.  Offerman spent 1992 getting \nacclimated, if you will.  The Dodgers as a team paid a big price\nthat season.  Perhaps they will reap the benefits down the road.\nDo you really think they would have done what they did if they\nwere competing for a pennant?\n\n\nFor a stat-head, I'm amazed that you put any credence in spring\ntraining.  Did you notice who he got those 10 (!) hits off of, or\nare you going to tell me that it doesn't make a difference?\n\n\nWait a minute.  I missed something here.  First, forget Keith\nMitchell.  Are you saying that a kid who moves from AA to AAA\nand then does not improve would have been better off making a\ndirect leap to the majors?  If a player does well at AA and then\ndoes not improve at AAA, isn

In [8]:
import re
import nltk
from nltk.corpus import stopwords

def preprocess(text: str, stop_words: set) -> str:
    """
    Normalise raw text into a space-separated string of kept tokens.

    Every character that is not an ASCII letter or an underscore is replaced
    by a space, the result is lowercased and split on whitespace, and any
    token found in `stop_words` is discarded.

    Parameters
    ----------
    text : str
        Raw text to clean.
    stop_words : set
        Tokens to drop. Matching happens after lowercasing, so entries
        should be lowercase (e.g. the English stopwords from nltk).

    Returns
    -------
    str
        The remaining tokens joined by single spaces; empty if none remain.
    """
    # Punctuation and digits become separators, so "don't" yields "don" and "t".
    letters_only = re.sub(r'[^a-zA-Z_]', ' ', text)
    kept = []
    for token in letters_only.lower().split():
        if token in stop_words:
            continue
        kept.append(token)
    return " ".join(kept)


def stem(text: str, stemmer) -> str:
    """
    Reduce every token of a string to its stem.

    Parameters
    ----------
    text : str
        Text to stem; it is tokenized with `nltk.word_tokenize`.
    stemmer
        Object exposing a `stem(token)` method, called once per token
        (presumably an nltk stemmer -- confirm against the caller).

    Returns
    -------
    str
        The stemmed tokens joined by single spaces.
    """
    stemmed_tokens = []
    for token in nltk.word_tokenize(text):
        stemmed_tokens.append(stemmer.stem(token))
    return " ".join(stemmed_tokens)

In [None]:
# English stopwords as a set, for the membership test in `preprocess`.
# Requires the NLTK "stopwords" corpus to be available locally.
stop_words = {*stopwords.words("english")}