### 1_data_cleaning

In [21]:
import pandas as pd

# Load the headlines CSV and parse the first column (YYYYMMDD values) into
# datetimes; values that do not match the format become NaT.
df = pd.read_csv("../abcnews-date-text.csv")
date_col = df.columns[0]
df[date_col] = pd.to_datetime(df[date_col], format="%Y%m%d", errors="coerce")

# Independent copy of the frame as loaded, used later by the stemming pipeline.
df_stemmed = df.copy()

### 2_feature_engineering

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import re

# Normalise the headline column in place: lower-case, delete digit runs,
# collapse whitespace runs to a single space and trim both ends.
text_col = df.columns[1]
df[text_col] = df[text_col].map(
    lambda headline: re.sub(
        r"\s+", " ", re.sub(r"\d+", "", headline.lower())
    ).strip()
)
df

Unnamed: 0,publish_date,headline_text
0,2003-02-19,aba decides against community broadcasting lic...
1,2003-02-19,act fire witnesses must be aware of defamation
2,2003-02-19,a g calls for infrastructure protection summit
3,2003-02-19,air nz staff in aust strike for pay rise
4,2003-02-19,air nz strike to affect australian travellers
...,...,...
1244179,2021-12-31,two aged care residents die as state records ;
1244180,2021-12-31,victoria records ; new cases and seven deaths
1244181,2021-12-31,wa delays adopting new close contact definition
1244182,2021-12-31,western ringtail possums found badly dehydrate...


In [18]:
# Fit a TF-IDF model on the cleaned headline column and build the
# document-term matrix in one step.
vectorizer = TfidfVectorizer(
    token_pattern=r"(?u)\b[a-zA-Z]{2,}\b",  # tokens are letter-only words of 2+ characters
    stop_words="english",                   # scikit-learn's built-in English stop-word list
    ngram_range=(1, 3),                     # unigrams, bigrams and trigrams
    min_df=10,                              # ignore terms found in fewer than 10 headlines
    max_df=0.5,                             # ignore terms found in more than half of the headlines
    max_features=10000,                     # cap the vocabulary at the 10,000 most frequent terms
)
tfidf_matrix = vectorizer.fit_transform(df[text_col])

In [20]:
# Feature names (vocabulary terms) of the fitted vectorizer; print the first
# 100 as a quick sanity check of what the vocabulary looks like.
terms = vectorizer.get_feature_names_out()
print(terms[:100])

['aaco' 'aaron' 'abalone' 'abandon' 'abandoned' 'abandons' 'abares'
 'abattoir' 'abbas' 'abbot' 'abbot point' 'abbott' 'abbott says' 'abbotts'
 'abc' 'abc business' 'abc business news' 'abc entertainment'
 'abc learning' 'abc news' 'abc news breakfast' 'abc news quiz'
 'abc reporter' 'abc sport' 'abc weather' 'abcs' 'abducted' 'abduction'
 'abe' 'abetz' 'able' 'ablett' 'aboard' 'aboriginal'
 'aboriginal community' 'aborigines' 'abortion' 'abs' 'absence' 'abu'
 'abuse' 'abuse claims' 'abuse inquiry' 'abuse victims' 'abused' 'abuses'
 'abusing' 'academic' 'academy' 'accc' 'accept' 'accepts' 'access'
 'accident' 'accidental' 'accidentally' 'accidents' 'accommodation'
 'accord' 'account' 'accounts' 'accreditation' 'accusations' 'accuse'
 'accused' 'accused child' 'accused court' 'accused face'
 'accused killing' 'accuses' 'aceh' 'acid' 'acknowledges' 'acl' 'acquired'
 'acquisition' 'acquitted' 'act' 'act budget' 'act election'
 'act government' 'act govt' 'act police' 'acted' 'acting' 'act

### Improving

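As the output above shows, the vocabulary contains several inflected forms of the same stem (e.g. abandon, abandoned, abandons), each counted as a separate feature.
So next I will apply stemming with NLTK (Natural Language Toolkit) so that these variants collapse into a single term.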

In [29]:
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import re

# Module-level helpers read by stem_analyzer: an English Snowball stemmer and
# scikit-learn's built-in English stop-word collection.
stemmer = SnowballStemmer(language="english")
stop_words = ENGLISH_STOP_WORDS

def stem_analyzer(text, ngram_range=(1, 3)):
    """Tokenize, stop-word-filter and stem a headline, then emit its word n-grams.

    Written to be passed as the callable ``analyzer`` of a ``TfidfVectorizer``.
    Relies on the module-level ``stop_words`` collection and ``stemmer`` object.

    Parameters
    ----------
    text : str
        Raw headline. It is lower-cased and every maximal run of the ASCII
        letters ``a-z`` becomes one token, so digits, punctuation and
        whitespace all act as token separators.
    ngram_range : tuple of (int, int), default=(1, 3)
        Inclusive ``(min_n, max_n)`` sizes of the n-grams to emit. The default
        yields unigrams, bigrams and trigrams.

    Returns
    -------
    list of str
        Stemmed n-grams grouped by size in increasing order (all unigrams,
        then all bigrams, ...), each group in order of appearance; the stems
        inside an n-gram are joined by single spaces.

    Raises
    ------
    ValueError
        If ``ngram_range`` does not satisfy ``1 <= min_n <= max_n``.
    """
    min_n, max_n = ngram_range
    if not 1 <= min_n <= max_n:
        raise ValueError(
            f"ngram_range must satisfy 1 <= min_n <= max_n, got {ngram_range!r}"
        )

    # A single pass extracts the letter-only tokens; anything that is not a-z
    # (digits, punctuation, whitespace) simply separates tokens.
    words = re.findall(r"[a-z]+", text.lower())

    # Stop words are matched against the unstemmed token, then survivors are stemmed.
    stems = [stemmer.stem(word) for word in words if word not in stop_words]

    # n-grams are built after stop-word removal, so an n-gram may join stems
    # that were separated by a removed word in the original text.
    return [
        " ".join(stems[start:start + n])
        for n in range(min_n, max_n + 1)
        for start in range(len(stems) - n + 1)
    ]

In [30]:
# Same TF-IDF settings as before, but tokenization, stop-word removal,
# stemming and n-gram generation are all delegated to stem_analyzer.
text_col = df_stemmed.columns[1]
vectorizer_stemmed = TfidfVectorizer(
    analyzer=stem_analyzer,
    token_pattern=None,   # only consulted when analyzer == "word"; None makes that explicit
    min_df=10,
    max_df=0.5,
    max_features=10000,
)
tfidf_matrix_stemmed = vectorizer_stemmed.fit_transform(df_stemmed[text_col])

In [31]:
# Feature names of the stemmed vectorizer; print the first 100 so they can be
# compared with the unstemmed vocabulary printed earlier.
terms_stemmed = vectorizer_stemmed.get_feature_names_out()
print(terms_stemmed[:100])

['aaco' 'aaron' 'ab' 'abalon' 'abandon' 'abar' 'abattoir' 'abba' 'abbot'
 'abbot point' 'abbott' 'abbott say' 'abc' 'abc busi' 'abc busi news'
 'abc entertain' 'abc learn' 'abc news' 'abc news breakfast'
 'abc news quiz' 'abc radio' 'abc report' 'abc sport' 'abc weather'
 'abduct' 'abe' 'abetz' 'abil' 'abl' 'ablett' 'aboard' 'abolish'
 'aborigin' 'aborigin communiti' 'aborigin elder' 'aborigin land' 'abort'
 'absenc' 'abu' 'abu ghraib' 'abus' 'abus alleg' 'abus case' 'abus charg'
 'abus claim' 'abus inquiri' 'abus report' 'abus royal'
 'abus royal commiss' 'abus survivor' 'abus victim' 'ac' 'academ'
 'academi' 'accc' 'acceler' 'accept' 'access' 'accid' 'accident'
 'accommod' 'accord' 'account' 'accredit' 'accus' 'accus assault'
 'accus child' 'accus court' 'accus deni' 'accus face' 'accus face court'
 'accus kill' 'accus murder' 'accus rape' 'ace' 'aceh' 'achiev' 'acid'
 'acknowledg' 'acl' 'acquir' 'acquisit' 'acquit' 'acquitt' 'act'
 'act budget' 'act elect' 'act govern' 'act govt' 'a