# Resources:
- https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering

from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

from nltk.stem.snowball import SnowballStemmer

import string

In [2]:
# read in data
TEST_FP = "datasets/MINDlarge_train/news.tsv"
SUPABASE_FP = "datasets/supabase_data/data.csv"

DATA_FP = SUPABASE_FP

if DATA_FP == TEST_FP:
    separator = '\t'
else:
    separator = ','

data = pd.read_csv(DATA_FP, sep=separator)

data.head(3)

Unnamed: 0,id,title,abstract
0,262,"Mugshots of the week: Aug. 13-19, 2023",Arrests were made and mugshots were taken thro...
1,568,House Republicans Begin Investigating Willis a...,"The launch of an inquiry into Fani Willis, the..."
2,1355,Wisconsin deer farm infected with fatal brain ...,"A 150-acre deer farm in Washburn County, Wisco..."


In [3]:
if DATA_FP == TEST_FP:
    # add column labels
    data.columns = ['id', 'category', 'subcategory', 'title', 'description', 'url', 'title entities', 'abstract entities']

    # remove data from specific topics
    filter = ~data['category'].isin(['sports'])
    data = data[filter]
    data = data.iloc[:10000]

elif DATA_FP == SUPABASE_FP:
    # add column labels
    data.columns = ['id', 'title', 'description']

data = data.fillna("")

combined = pd.DataFrame(data["title"] + "; " + data["description"], columns=["text"])
combined['id'] = data['id']
combined.head(3)

Unnamed: 0,text,id
0,"Mugshots of the week: Aug. 13-19, 2023; Arrest...",262
1,House Republicans Begin Investigating Willis a...,568
2,Wisconsin deer farm infected with fatal brain ...,1355


In [4]:
# Stemming
stemmer = SnowballStemmer("english", ignore_stopwords=True)

combined['stemmed'] = combined['text'].apply(
    lambda x:
        ' '.join([
            stemmer.stem(word) for word in x.translate(
                str.maketrans('', '', string.punctuation) # remove punctuation
            ).split(' ')
        ])
)

combined.head(1)

Unnamed: 0,text,id,stemmed
0,"Mugshots of the week: Aug. 13-19, 2023; Arrest...",262,mugshot of the week aug 1319 2023 arrest were ...


In [5]:
# create bag of words
count_vect = CountVectorizer(
    stop_words="english",
    # ngram_range=(1,2) # count words and pairs of words
)
X_train_counts = count_vect.fit_transform(combined['stemmed'])
X_train_counts.shape

(336, 2604)

In [6]:
# tfidf = Term Frequency times Inverse Document Frequency
# removes weighting based on text document length
tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_tfidf.shape

(336, 2604)

In [8]:
# perform lsa (reduce size of dataset)
lsa = make_pipeline(TruncatedSVD(n_components=100), Normalizer(copy=False))
X_lsa = lsa.fit_transform(X_tfidf)
explained_variance = lsa[0].explained_variance_ratio_.sum()

print(f"Explained variance of the SVD step: {explained_variance * 100:.1f}%")

Explained variance of the SVD step: 47.9%


In [9]:
NUM_CLUSTERS = 60
# cluster (k-means)
clusterer = KMeans(
    n_clusters=NUM_CLUSTERS,
    max_iter=100,
    n_init=1,

).fit(X_lsa)
cluster_ids, cluster_sizes = np.unique(clusterer.labels_, return_counts=True)
print(f"Number of elements assigned to each cluster: {cluster_sizes}")

Number of elements assigned to each cluster: [ 4 12  8  4  3  6  7  8  7  9  5  7  5  5  6  7  3  6  7 10  9  6  4 10
  4  5  6 11  6  3  8  4  5  4  5  6  4  2  5  8  5  2  5  5  6  3  6  6
  5  4  4  5  5  5  6  3  3  6  6  2]


In [17]:
# cluster (agglomerative / hierarchical)
clusterer = AgglomerativeClustering(
    n_clusters=None,
    metric="cosine",
    linkage='average',
    distance_threshold=0.7
).fit(X_lsa)
cluster_ids, cluster_sizes = np.unique(clusterer.labels_, return_counts=True)
NUM_CLUSTERS = len(cluster_sizes)
print(f"Number of elements assigned to each cluster: {cluster_sizes}")

Number of elements assigned to each cluster: [ 3  5  2  2  9 11  3  2  4  3  4  3  3  3  4  2  3  2  4  6  2  4  4  2
  2  3  4  3  2  3  3  2  2  8  3  2  4  6  2  4  3  2  2  2  2  2  2  3
  3  5  4  3  2  2  3  2  2  2  2  3  1  2  2  2  3  2  3  7  6  2  3  2
  3  2  3  3  2  2  4  2  2  2  2  2  3  2  3  2  3  1  2  3  4  2  1  4
  2  3  2  2  3  3  2  1  2  1  2  1  3  1  2  1  1  1  1  1  1  1  1  3
  2  1  1  1  1  1]


In [18]:
# get top words in each cluster (k-means only)
original_space_centroids = lsa[0].inverse_transform(clusterer.cluster_centers_)
order_centroids = original_space_centroids.argsort()[:, ::-1]
terms = count_vect.get_feature_names_out()

for i in range(NUM_CLUSTERS):
    print(f"Cluster {i}: ", end="")
    for ind in order_centroids[i, :10]:
        print(f"{terms[ind]} ", end="")
    print()

AttributeError: 'AgglomerativeClustering' object has no attribute 'cluster_centers_'

In [19]:
data['group'] = clusterer.labels_

In [20]:
for cluster in range(NUM_CLUSTERS):
    print("-------------- CLUSTER #" + str(cluster))
    print(data.loc[data['group'] == cluster]['title'])

-------------- CLUSTER #0
11     Trump says he did ‘nothing wrong.’ Even Republ...
97     Vivek Ramaswamy Is Happy to Be Talked About, E...
260      For Politicians, Vacations Can Be a Lot of Work
Name: title, dtype: object
-------------- CLUSTER #1
14     FBI arrests officers who allegedly used police...
53     Atlanta man arrested for impersonating police ...
202    Georgia man arrested after 'Ding Dong Ditch' p...
204    Man who killed, dismembered roommate after fig...
206    Florida dad arrested for 2021 manslaughter of ...
Name: title, dtype: object
-------------- CLUSTER #2
42    Gilgo Beach murder victims forgotten during ye...
55    New York City shark attack victim had 5 surger...
Name: title, dtype: object
-------------- CLUSTER #3
45     Kansas reporter accused of breaking law before...
333    Alleged drunk driver accidentally reports hims...
Name: title, dtype: object
-------------- CLUSTER #4
8      Trump to release taped interview with Tucker C...
26     Trump May Skip S