### 1_data_cleaning

In [26]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import re

df = pd.read_csv("../abcnews-date-text.csv")

# change date format: the first column holds YYYYMMDD values;
# anything that does not parse becomes NaT (errors="coerce")
date_col = df.columns[0]
df[date_col] = pd.to_datetime(df[date_col], format="%Y%m%d", errors="coerce")

# check for duplicates: show every copy of a repeated headline, grouped together
is_repeated = df['headline_text'].duplicated(keep=False)
df[is_repeated].sort_values('headline_text').head()

Unnamed: 0,publish_date,headline_text
116298,2004-09-20,10 killed in pakistan bus crash
57967,2003-11-29,10 killed in pakistan bus crash
911080,2014-10-23,110 with barry nicholls
672958,2012-02-17,110 with barry nicholls
748629,2012-12-14,110 with barry nicholls


In [27]:
# remove duplicates: keep only the first occurrence of each headline
df = df.drop_duplicates(subset='headline_text', keep='first')

In [28]:
# independent (deep) copy, so changes to one frame do not show up in the other
df_stemmed = df.copy(deep=True)

### 2_feature_engineering

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
import re

text_col = df.columns[1]


def _normalize_headline(raw):
    """Lower-case a headline, delete digit runs and collapse whitespace."""
    without_digits = re.sub(r"\d+", "", raw.lower())
    return re.sub(r"\s+", " ", without_digits).strip()


df[text_col] = df[text_col].apply(_normalize_headline)
df

Unnamed: 0,publish_date,headline_text
0,2003-02-19,aba decides against community broadcasting lic...
1,2003-02-19,act fire witnesses must be aware of defamation
2,2003-02-19,a g calls for infrastructure protection summit
3,2003-02-19,air nz staff in aust strike for pay rise
4,2003-02-19,air nz strike to affect australian travellers
...,...,...
1244179,2021-12-31,two aged care residents die as state records ;
1244180,2021-12-31,victoria records ; new cases and seven deaths
1244181,2021-12-31,wa delays adopting new close contact definition
1244182,2021-12-31,western ringtail possums found badly dehydrate...


In [18]:
# creating the TF-IDF matrix
tfidf_params = dict(
    max_features=10000,                      # cap on vocabulary size
    min_df=10,                               # term must occur in at least 10 headlines
    max_df=0.5,                              # ...and in at most half of them
    ngram_range=(1, 3),                      # unigrams, bigrams and trigrams
    stop_words="english",
    token_pattern=r"(?u)\b[a-zA-Z]{2,}\b",   # tokens are runs of 2+ ASCII letters
)
vectorizer = TfidfVectorizer(**tfidf_params)
tfidf_matrix = vectorizer.fit_transform(df[text_col])

In [20]:
# peek at the first 100 entries of the fitted vocabulary
terms = vectorizer.get_feature_names_out()
first_terms = terms[:100]
print(first_terms)

['aaco' 'aaron' 'abalone' 'abandon' 'abandoned' 'abandons' 'abares'
 'abattoir' 'abbas' 'abbot' 'abbot point' 'abbott' 'abbott says' 'abbotts'
 'abc' 'abc business' 'abc business news' 'abc entertainment'
 'abc learning' 'abc news' 'abc news breakfast' 'abc news quiz'
 'abc reporter' 'abc sport' 'abc weather' 'abcs' 'abducted' 'abduction'
 'abe' 'abetz' 'able' 'ablett' 'aboard' 'aboriginal'
 'aboriginal community' 'aborigines' 'abortion' 'abs' 'absence' 'abu'
 'abuse' 'abuse claims' 'abuse inquiry' 'abuse victims' 'abused' 'abuses'
 'abusing' 'academic' 'academy' 'accc' 'accept' 'accepts' 'access'
 'accident' 'accidental' 'accidentally' 'accidents' 'accommodation'
 'accord' 'account' 'accounts' 'accreditation' 'accusations' 'accuse'
 'accused' 'accused child' 'accused court' 'accused face'
 'accused killing' 'accuses' 'aceh' 'acid' 'acknowledges' 'acl' 'acquired'
 'acquisition' 'acquitted' 'act' 'act budget' 'act election'
 'act government' 'act govt' 'act police' 'acted' 'acting' 'act

### Improving

As you can see, the vocabulary contains several words that share the same stem: abandon, abandoned, abandons.
So next I will apply stemming with NLTK (Natural Language Toolkit).

In [None]:
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import re

# shared by the custom analyzers below: the English Snowball stemmer and
# scikit-learn's built-in English stop-word set
stemmer, stop_words = SnowballStemmer("english"), ENGLISH_STOP_WORDS

def stem_analyzer(text):
    """Turn one headline into stemmed unigrams, bigrams and trigrams.

    The text is lower-cased, every character that is not a-z or whitespace
    (digits, punctuation, symbols) is turned into a separator, stop words are
    dropped, and the remaining words are stemmed with the module-level
    ``stemmer``. N-grams are built from the stemmed tokens that remain, so a
    bigram/trigram may join words that were separated by a stop word in the
    original headline.

    Args:
        text: a single document (headline) as a string.

    Returns:
        A list of strings: all unigrams, then all bigrams, then all trigrams,
        with n-gram parts joined by a single space.
    """
    # One pass is enough: digits are already covered by [^a-z\s], and
    # str.split() below collapses any run of whitespace on its own.
    text = re.sub(r"[^a-z\s]", " ", text.lower())

    # Stop words are matched on the raw word, before stemming.
    tokens = [stemmer.stem(t) for t in text.split() if t not in stop_words]

    # generate 2-grams and 3-grams
    bigrams = [" ".join(tokens[i:i + 2]) for i in range(len(tokens) - 1)]
    trigrams = [" ".join(tokens[i:i + 3]) for i in range(len(tokens) - 2)]

    return tokens + bigrams + trigrams

In [None]:
# TF-IDF over stemmed tokens: tokenisation, stop-word removal and n-gram
# construction are all delegated to stem_analyzer.
stemmed_tfidf_params = {
    "max_features": 10000,
    "min_df": 10,
    "max_df": 0.5,
    "analyzer": stem_analyzer,
    "token_pattern": None,  # token_pattern is only used when analyzer == "word"
}
vectorizer_stemmed = TfidfVectorizer(**stemmed_tfidf_params)

text_col = df_stemmed.columns[1]
tfidf_matrix_stemmed = vectorizer_stemmed.fit_transform(df_stemmed[text_col])

In [None]:
# first 100 stemmed vocabulary entries, followed by the vocabulary shape
terms_stemmed = vectorizer_stemmed.get_feature_names_out()
print(terms_stemmed[:100], terms_stemmed.shape, sep="\n")

# checked shape without max_features parameter: 91607 features

['aa' 'aaa' 'aaa credit' 'aaa credit rate' 'aaa rate' 'aaco'
 'aaco abattoir' 'aacta' 'aacta award' 'aamer' 'aami' 'aapt' 'aaron'
 'aaron finch' 'aaron mooy' 'aaron pajich' 'aaron pajich murder'
 'aaron payn' 'aaron sandiland' 'aaron wood' 'ab' 'ab data' 'ab figur'
 'ab job' 'ab june' 'ab villier' 'aba' 'abalon' 'abalon diver'
 'abalon farm' 'abalon fish' 'abalon fisher' 'abalon haul'
 'abalon industri' 'abalon poach' 'abalon poacher' 'abalon season'
 'abalon virus' 'abandon' 'abandon babi' 'abandon car' 'abandon hous'
 'abandon plan' 'abandon toddler' 'abar' 'abar crop' 'abar forecast'
 'abar outlook' 'abar predict' 'abat' 'abattoir' 'abattoir close'
 'abattoir closur' 'abattoir get' 'abattoir open' 'abattoir owner'
 'abattoir plan' 'abattoir reopen' 'abattoir worker' 'abb' 'abb grain'
 'abba' 'abbatoir' 'abbey' 'abbey road' 'abbi' 'abbot' 'abbot point'
 'abbot point coal' 'abbot point dredg' 'abbot point expans' 'abbott'
 'abbott accus' 'abbott address' 'abbott announc' 'abbott arriv

'aa' 'ab' 'ab data' 'ab figur' 'ab job' 'ab june' 'ab villier' 'aba'

I think we should remove tokens that are shorter than 3 letters after stemming; they look like random noise.

In [29]:
def stem_analyzer2(text):
    """Turn one headline into stemmed 1/2/3-grams, ignoring very short stems.

    The text is lower-cased, every character that is not a-z or whitespace
    (digits, punctuation, symbols) is turned into a separator, stop words are
    dropped, and the remaining words are stemmed with the module-level
    ``stemmer``. Stems shorter than 3 characters are then discarded. N-grams
    are built from the tokens that survive, so a bigram/trigram may join words
    that were separated by a stop word or a short token in the original
    headline.

    Args:
        text: a single document (headline) as a string.

    Returns:
        A list of strings: all unigrams, then all bigrams, then all trigrams,
        with n-gram parts joined by a single space.
    """
    # One pass is enough: digits are already covered by [^a-z\s], and
    # str.split() below collapses any run of whitespace on its own.
    text = re.sub(r"[^a-z\s]", " ", text.lower())

    # Stop words are matched on the raw word, before stemming.
    stems = (stemmer.stem(t) for t in text.split() if t not in stop_words)

    # remove short tokens consisting of less than 3 characters
    # (length is measured on the stem, not on the original word)
    tokens = [s for s in stems if len(s) >= 3]

    # generate 2-grams and 3-grams
    bigrams = [" ".join(tokens[i:i + 2]) for i in range(len(tokens) - 1)]
    trigrams = [" ".join(tokens[i:i + 3]) for i in range(len(tokens) - 2)]

    return tokens + bigrams + trigrams

In [30]:
# Refit the stemmed TF-IDF, this time with stem_analyzer2 as the analyzer.
stemmed_tfidf_params = {
    "max_features": 10000,
    "min_df": 10,
    "max_df": 0.5,
    "analyzer": stem_analyzer2,
    "token_pattern": None,  # token_pattern is only used when analyzer == "word"
}
vectorizer_stemmed = TfidfVectorizer(**stemmed_tfidf_params)

text_col = df_stemmed.columns[1]
tfidf_matrix_stemmed = vectorizer_stemmed.fit_transform(df_stemmed[text_col])

In [33]:
# first 100 features of the refitted stemmed vocabulary
stemmed_vocab = vectorizer_stemmed.get_feature_names_out()
stemmed_vocab[:100]

array(['aaco', 'aaron', 'abalon', 'abandon', 'abar', 'abattoir', 'abba',
       'abbot', 'abbot point', 'abbott', 'abbott say', 'abc',
       'abc journalist', 'abc learn', 'abc news', 'abc news breakfast',
       'abc news quiz', 'abc radio', 'abc report', 'abduct', 'abe',
       'abetz', 'abil', 'abl', 'ablett', 'aboard', 'abolish', 'aborigin',
       'aborigin communiti', 'aborigin elder', 'aborigin land', 'abort',
       'absenc', 'absolut', 'abu', 'abu ghraib', 'abus', 'abus alleg',
       'abus case', 'abus charg', 'abus claim', 'abus inquiri',
       'abus report', 'abus royal', 'abus royal commiss', 'abus survivor',
       'abus victim', 'academ', 'academi', 'accc', 'acceler', 'accept',
       'access', 'accid', 'accident', 'accommod', 'accord', 'account',
       'accredit', 'accus', 'accus assault', 'accus child', 'accus court',
       'accus deni', 'accus drug', 'accus face', 'accus face court',
       'accus govt', 'accus kill', 'accus murder', 'accus rape',
       'accus tr

Now I think it seems better

After some consideration, and after seeing noise in the clustering, I suspected that I did not have enough dimensions.

So I increased the number of SVD components from 200 to 300: the result was worse.

Now I am thinking that the trigrams may be causing some noise and crowding out other useful features.

In [44]:
# a feature is a trigram when it is made of exactly three space-separated parts
feature_names = vectorizer_stemmed.get_feature_names_out()
trigrams = [name for name in feature_names if len(name.split()) == 3]

len(trigrams)

206

We have only 206 trigrams, so they are few and rare, which makes them unlikely to have much effect on the 10K features.

### 3_Dimensionality Reduction

In [None]:
import pickle
from sklearn.decomposition import TruncatedSVD

# with open("../outputs/tfidf_matrix.pkl", "rb") as f:
#     tfidf_matrix = pickle.load(f)

# with open("../outputs/tfidf_vectorizer.pkl", "rb") as f:
#     vectorizer = pickle.load(f)

I wanted to check how much variance is preserved

In [35]:
# total explained-variance ratio for a few component counts, on the full matrix
component_grid = [50, 100, 200, 300]
for k in component_grid:
    svd_k = TruncatedSVD(n_components=k, random_state=42).fit(tfidf_matrix_stemmed)
    explained_total = svd_k.explained_variance_ratio_.sum()
    print(k, explained_total)

50 0.06598297508201635
100 0.11256041703815221
200 0.18517767080470943
300 0.2439157964019349


But this takes too long to run, so next I'll try larger values of k on a random sample of the rows.

In [38]:
import numpy as np

# Draw a random subset of rows for the larger-k experiments.
n_samples = 250000

# Seeded generator (same seed value as the SVD random_state used in this
# notebook) so that re-running the cell selects the same rows.
rng = np.random.default_rng(42)

n_rows = tfidf_matrix_stemmed.shape[0]
# Sampling without replacement fails if more rows are requested than exist,
# so never ask for more than n_rows.
rows = rng.choice(n_rows, size=min(n_samples, n_rows), replace=False)
tfidf_sample = tfidf_matrix_stemmed[rows]

In [None]:
ks = [100, 200, 300, 400, 500]
results = {}

# same sweep as on the full matrix, but fitted on the sampled rows
for k in ks:
    svd_k = TruncatedSVD(n_components=k, random_state=42).fit(tfidf_sample)
    explained = svd_k.explained_variance_ratio_.sum()
    results[k] = explained
    print(f"k={k}: explained variance = {explained:.4f}")

# Storing in svd_results.txt (one line per k)
report_lines = [
    f"k = {k}: explained variance = {val:.6f}\n" for k, val in results.items()
]
with open("../outputs/svd_results.txt", "w") as f:
    f.writelines(report_lines)

k=100: explained variance = 0.1131
k=200: explained variance = 0.1857
k=300: explained variance = 0.2447
k=400: explained variance = 0.2944
k=500: explained variance = 0.3373


The evaluation on the SAMPLE gives results similar to those on the full matrix, so the sampled estimate is accurate.

However, picking a larger k would make further computations costly.

So we will stick with 200, because 300 adds only about 6 percentage points of explained variance at the cost of 50% more dimensions.

In [41]:
# final reduction of the full stemmed TF-IDF matrix to 200 components
lsa_components = 200
svd = TruncatedSVD(n_components=lsa_components, random_state=42)
lsa = svd.fit_transform(tfidf_matrix_stemmed)

### 4 Clustering_and_evaluation

(1213004, 200)