# Resources:
- https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.cluster import KMeans

from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

from nltk.stem.snowball import SnowballStemmer

import string

In [10]:
# read in data
TEST_FP = "datasets/MINDlarge_train/news.tsv"
SUPABASE_FP = "datasets/supabase_data/data.csv"

DATA_FP = SUPABASE_FP

if DATA_FP == TEST_FP:
    separator = '\t'
else:
    separator = ','

data = pd.read_csv(DATA_FP, sep=separator)

data.head(3)

Unnamed: 0,id,title,abstract
0,262,"Mugshots of the week: Aug. 13-19, 2023",Arrests were made and mugshots were taken thro...
1,568,House Republicans Begin Investigating Willis a...,"The launch of an inquiry into Fani Willis, the..."
2,1355,Wisconsin deer farm infected with fatal brain ...,"A 150-acre deer farm in Washburn County, Wisco..."


In [11]:
if DATA_FP == TEST_FP:
    # add column labels
    data.columns = ['id', 'category', 'subcategory', 'title', 'description', 'url', 'title entities', 'abstract entities']

    # remove data from specific topics
    filter = ~data['category'].isin(['sports'])
    data = data[filter]

elif DATA_FP == SUPABASE_FP:
    # add column labels
    data.columns = ['id', 'title', 'description']

data = data.fillna("")

combined = pd.DataFrame(data["title"] + "; " + data["description"], columns=["text"])
combined['id'] = data['id']
combined.head(3)

Unnamed: 0,text,id
0,"Mugshots of the week: Aug. 13-19, 2023; Arrest...",262
1,House Republicans Begin Investigating Willis a...,568
2,Wisconsin deer farm infected with fatal brain ...,1355


In [12]:
# Stemming
stemmer = SnowballStemmer("english", ignore_stopwords=True)

combined['stemmed'] = combined['text'].apply(
    lambda x:
        ' '.join([
            stemmer.stem(word) for word in x.translate(
                str.maketrans('', '', string.punctuation) # remove punctuation
            ).split(' ')
        ])
)

combined.head(1)

Unnamed: 0,text,id,stemmed
0,"Mugshots of the week: Aug. 13-19, 2023; Arrest...",262,mugshot of the week aug 1319 2023 arrest were ...


In [13]:
# create bag of words
count_vect = CountVectorizer(
    stop_words="english",
    # ngram_range=(1,2) # count words and pairs of words
)
X_train_counts = count_vect.fit_transform(combined['stemmed'])
X_train_counts.shape

(336, 2604)

In [14]:
print(count_vect.vocabulary_.get(u'president'))

None


In [15]:
# tfidf = Term Frequency times Inverse Document Frequency
# removes weighting based on text document length
tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_tfidf.shape

(336, 2604)

In [16]:
# perform lsa (reduce size of dataset)
lsa = make_pipeline(TruncatedSVD(n_components=100), Normalizer(copy=False))
X_lsa = lsa.fit_transform(X_tfidf)
explained_variance = lsa[0].explained_variance_ratio_.sum()

print(f"Explained variance of the SVD step: {explained_variance * 100:.1f}%")

Explained variance of the SVD step: 48.0%


In [30]:
NUM_CLUSTERS = 60

In [31]:
# cluster (k-means)
kmeans = KMeans(
    n_clusters=NUM_CLUSTERS,
    max_iter=100,
    n_init=1,

).fit(X_lsa)
cluster_ids, cluster_sizes = np.unique(kmeans.labels_, return_counts=True)
print(f"Number of elements assigned to each cluster: {cluster_sizes}")

Number of elements assigned to each cluster: [ 6  6 11  9  6  4 11  4  6  8  5 12  8  6  6 11  7  3  4  4  5 10  4  5
  5  6  3  5  4  4  5  5  8  5  4  9  5  8  3  3  4  4  2  7  4  2  4  4
  4  7  3  8  5  2  6  3  5  6  6  7]


In [32]:
# get top words in each cluster
original_space_centroids = lsa[0].inverse_transform(kmeans.cluster_centers_)
order_centroids = original_space_centroids.argsort()[:, ::-1]
terms = count_vect.get_feature_names_out()

for i in range(NUM_CLUSTERS):
    print(f"Cluster {i}: ", end="")
    for ind in order_centroids[i, :10]:
        print(f"{terms[ind]} ", end="")
    print()

Cluster 0: son murder charg mother buckley york attack tennesse gun coupl 
Cluster 1: gop candid debat come presid democraci illustr hand say parti 
Cluster 2: kill polic suspect shot offic texa man 17 alleg k9 
Cluster 3: debat republican presidenti candid issu rival abort 2024 parti clash 
Cluster 4: mcconnel senat mitch kentucki freez second episod leader incid doctor 
Cluster 5: case lawyer trump trial defens white presid team sadow 1931 
Cluster 6: biden presid impeach trump lake mr stori desir economi hunter 
Cluster 7: forc disrupt industri world flight american water chip turn heat 
Cluster 8: cross illeg border famili august soldier direct 91000 sinc step 
Cluster 9: ramaswami say vivek campaign rap voter trail song eminem polit 
Cluster 10: guantánamo court bomb case defend bali tortur cole uss attack 
Cluster 11: year prison sentenc massachusett man abus sexual texa 45 girl 
Cluster 12: campaign way trump politician calendar buckley trail accompani asterisk assess 
Cluster 1

In [33]:
data['group'] = kmeans.labels_

In [34]:
for cluster in range(NUM_CLUSTERS):
    print("-------------- CLUSTER #" + str(cluster))
    print(data.loc[data['group'] == cluster]['title'])

-------------- CLUSTER #0
18     She’s a Republican gun owner. Now she’s pleadi...
36     James L. Buckley, Conservative Senator in Libe...
70     Tennessee hiking couple found safe in Alaska a...
123    NYC hammer attack suspect charged with murder,...
207    LA fire captain's son charged with attempted r...
319    Mother and son grifters who were ‘like a coupl...
Name: title, dtype: object
-------------- CLUSTER #1
108    Most G.O.P. Candidates Say They Will Support T...
112    Ramaswamy and Haley show the GOP’s divergent p...
163    ‘Democracy’ was on the wall at the GOP debate....
241    Tenn. lawmakers refused to act on guns. A GOP ...
244    Is America ready for another impeachment? McCa...
309    Biden fights back against GOP onslaught on edu...
Name: title, dtype: object
-------------- CLUSTER #2
46     California store owner shot dead after alleged...
56     Texas man confesses to killing wife over 'jeal...
62     Marine killed in live-fire training accident a...
65     Kentuc