# Resources:
- https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

In [1]:
import string
from pathlib import Path

import numpy as np
import pandas as pd

from nltk.stem.snowball import SnowballStemmer

from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.cluster import HDBSCAN
from sklearn.cluster import KMeans

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

In [3]:
# read in data
# Two possible inputs: the tab-separated MIND news file and the comma-separated
# Supabase export. Point DATA_FP at the one to cluster.
TEST_FP = "datasets/MINDlarge_train/news.tsv"
SUPABASE_FP = "datasets/supabase_data/data.csv"

DATA_FP = SUPABASE_FP

if DATA_FP == TEST_FP:
    separator = '\t'
    # MIND's news.tsv has no header line; without header=None pandas would
    # silently turn the first article into the column names and drop it.
    header_row = None
else:
    separator = ','
    # Same as the pandas default, i.e. this path is read exactly as before.
    header_row = 'infer'

data = pd.read_csv(DATA_FP, sep=separator, header=header_row)

data.head(3)

Unnamed: 0,id,title,description
0,2681,"Vivek Ramaswamy wasn’t in Model UN, but he’s g...","As the GOP presidential candidates tussle, som..."
1,2713,U.S. Navy destroyer sports 'badass' pirate-ins...,The new Navy warship USS Carl M. Levin feature...
2,2703,Alabama shooting: 2 dead after gunshot victims...,At least two victims died and three others wer...


In [4]:
if DATA_FP == TEST_FP:
    # add column labels
    data.columns = ['id', 'category', 'subcategory', 'title', 'description', 'url', 'title entities', 'abstract entities']

    # remove data from specific topics
    # (the mask is called `keep_rows`, not `filter`, so the builtin filter()
    # stays usable in every later cell)
    keep_rows = ~data['category'].isin(['sports'])
    # keep only the first 10,000 rows that survive the topic filter
    data = data[keep_rows].iloc[:10000]

elif DATA_FP == SUPABASE_FP:
    # add column labels
    data.columns = ['id', 'title', 'description']

# Missing titles/descriptions become empty strings so the concatenation below
# never produces NaN.
data = data.fillna("")

# One text field per article ("title; description") next to its id. Building the
# frame from a dict names both columns explicitly instead of relying on how
# pd.DataFrame treats an unnamed Series together with `columns=`.
combined = pd.DataFrame({
    "text": data["title"] + "; " + data["description"],
    "id": data["id"],
})
combined.head(3)

Unnamed: 0,text,id
0,"Vivek Ramaswamy wasn’t in Model UN, but he’s g...",2681
1,U.S. Navy destroyer sports 'badass' pirate-ins...,2713
2,Alabama shooting: 2 dead after gunshot victims...,2703


In [5]:
# Stemming
# ignore_stopwords=True tells the Snowball stemmer to return stop words unchanged.
stemmer = SnowballStemmer("english", ignore_stopwords=True)

# Translation table that deletes ASCII punctuation; built once, not once per row.
punctuation_table = str.maketrans('', '', string.punctuation)


def stem_text(text):
    """Remove ASCII punctuation from `text`, stem every word, and join the stems with single spaces."""
    # split() without an argument breaks on any run of whitespace, so words
    # separated by tabs, newlines or repeated spaces are stemmed individually
    # instead of reaching the stemmer glued together (or as empty strings).
    words = text.translate(punctuation_table).split()
    return ' '.join(stemmer.stem(word) for word in words)


combined['stemmed'] = combined['text'].apply(stem_text)

combined.head(1)

Unnamed: 0,text,id,stemmed
0,"Vivek Ramaswamy wasn’t in Model UN, but he’s g...",2681,vivek ramaswami wasn't in model un but he got ...


In [6]:
# Bag of words over the stemmed text, with sklearn's built-in English stop-word
# list removed. Only single words are counted; pass ngram_range=(1, 2) to the
# vectorizer to count pairs of words as well.
count_vect = CountVectorizer(stop_words="english")
X_train_counts = count_vect.fit_transform(combined['stemmed'])

# (number of documents, vocabulary size)
X_train_counts.shape

(792, 4323)

In [7]:
# TF-IDF (Term Frequency times Inverse Document Frequency):
# re-weights the raw counts so that document length no longer drives the weights.
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(X_train_counts)
X_tfidf = tfidf_transformer.transform(X_train_counts)

# same shape as the count matrix
X_tfidf.shape

(792, 4323)

In [8]:
# perform lsa (reduce size of dataset): project the tf-idf matrix onto 100 SVD
# components, then normalise each row with the Normalizer step.
# random_state is pinned because TruncatedSVD's default solver is randomized;
# without it the embedding, and every clustering built on it, changes between
# runs of the notebook.
lsa = make_pipeline(
    TruncatedSVD(n_components=100, random_state=42),
    Normalizer(copy=False),
)
X_lsa = lsa.fit_transform(X_tfidf)

# lsa[0] is the fitted TruncatedSVD step of the pipeline.
explained_variance = lsa[0].explained_variance_ratio_.sum()

print(f"Explained variance of the SVD step: {explained_variance * 100:.1f}%")

Explained variance of the SVD step: 31.3%


In [9]:
# Integer ids for the clustering algorithms this notebook can run. The leading
# underscore keeps _DBSCAN / _HDBSCAN distinct from the imported sklearn
# classes DBSCAN / HDBSCAN.
K_MEANS, AGGLO, _DBSCAN, _HDBSCAN = range(4)

# Algorithm selected for this run; each clustering cell checks ALG before running.
ALG = AGGLO

In [46]:
if ALG == K_MEANS:
    NUM_CLUSTERS = 60
    # cluster (k-means)
    # n_init=1 means the result comes from a single random initialisation, so
    # random_state is pinned to make the labels reproducible across runs.
    clusterer = KMeans(
        n_clusters=NUM_CLUSTERS,
        max_iter=100,
        n_init=1,
        random_state=42,
    ).fit(X_lsa)
    # cluster_ids[i] is a label, cluster_sizes[i] the number of rows carrying it.
    cluster_ids, cluster_sizes = np.unique(clusterer.labels_, return_counts=True)
    print(f"Number of elements assigned to each cluster: {cluster_sizes}")

In [10]:
if ALG == AGGLO:
    # Agglomerative (hierarchical) clustering. n_clusters=None together with
    # distance_threshold lets the threshold, not a preset count, decide how
    # many clusters come out.
    agglo_params = dict(
        n_clusters=None,
        metric="cosine",
        linkage='average',
        distance_threshold=0.75,
    )
    clusterer = AgglomerativeClustering(**agglo_params)
    clusterer.fit(X_lsa)

    # cluster_ids[i] is a label, cluster_sizes[i] the number of rows carrying it.
    cluster_ids, cluster_sizes = np.unique(clusterer.labels_, return_counts=True)
    NUM_CLUSTERS = cluster_sizes.size
    print(f"Number of elements assigned to each cluster: {cluster_sizes}")

Number of elements assigned to each cluster: [ 7 10 22  3  8 13  6  8 14  6 21  4 22  8 11  8  5  4  8 15 11 16  6  2
  5  4 11  6  9  3  7  7  7  4  6 10 10  4  3  9 10  9  6  4 15  5  7  9
 10  4  4  3 12  6  5  4  5  7  7  3 10  5 16  4  4 10  7  5  4 12  2  2
 14  3  9  5  5  2  5  2  3  6  3  3  7 12  4  3  6  5  2  2  2  5  2  5
  4  5  3  2  3 15  2  4  5  3  9  3  8  1  3  9  3  7  3  3  2  2  4  2
  4  1  5  3]


In [105]:
if ALG == _DBSCAN:
    # cluster (DBSCAN)
    clusterer = DBSCAN(
        eps=0.6,
        metric='cosine'
    ).fit(X_lsa)
    # cluster_ids / cluster_sizes cover every label, including -1, which DBSCAN
    # assigns to noise points.
    cluster_ids, cluster_sizes = np.unique(clusterer.labels_, return_counts=True)
    # Count only real clusters (labels >= 0). Counting the noise label as well
    # would make range(NUM_CLUSTERS) end on an id that does not exist.
    NUM_CLUSTERS = int((cluster_ids >= 0).sum())
    print(f"Number of elements assigned to each cluster: {cluster_sizes}")
    print(f"Labels for the sizes above (-1 = noise): {cluster_ids}")

Number of elements assigned to each cluster: [156 136   3  12   4   4   7   7   4   3]


In [111]:
if ALG == _HDBSCAN:
    # cluster (HDBSCAN)
    clusterer = HDBSCAN(
        metric='euclidean'
    ).fit(X_lsa)
    # cluster_ids / cluster_sizes cover every label, including the negative
    # labels HDBSCAN uses for points that belong to no cluster (-1 = noise).
    cluster_ids, cluster_sizes = np.unique(clusterer.labels_, return_counts=True)
    # Count only real clusters (labels >= 0). Counting the noise label as well
    # would make range(NUM_CLUSTERS) end on an id that does not exist.
    NUM_CLUSTERS = int((cluster_ids >= 0).sum())
    print(f"Number of elements assigned to each cluster: {cluster_sizes}")
    print(f"Labels for the sizes above (-1 = noise): {cluster_ids}")

Number of elements assigned to each cluster: [251  10   5   7  27   5  13  18]


In [18]:
if ALG == K_MEANS:
    # Top words per cluster. Only k-means exposes cluster_centers_, hence the guard.
    # Map the centroids from LSA space back to term space via the SVD step...
    original_space_centroids = lsa[0].inverse_transform(clusterer.cluster_centers_)
    # ...and rank the term indices of every centroid from largest to smallest weight.
    order_centroids = np.argsort(original_space_centroids)[:, ::-1]
    terms = count_vect.get_feature_names_out()

    for i in range(NUM_CLUSTERS):
        # ten highest-weighted terms, each followed by a single space
        top_terms = "".join(f"{terms[ind]} " for ind in order_centroids[i, :10])
        print(f"Cluster {i}: {top_terms}")

AttributeError: 'AgglomerativeClustering' object has no attribute 'cluster_centers_'

In [11]:
# Attach each article's cluster label as a `group` column. labels_ is a plain
# array, so it is matched to the rows by position.
data = data.assign(group=clusterer.labels_)

In [12]:
# Print the titles in every group. Iterate over the labels that actually occur
# rather than range(NUM_CLUSTERS): DBSCAN/HDBSCAN also emit -1 for noise, so the
# label set is not always 0..N-1, and a plain range would skip the noise group
# and print an empty one in its place.
for cluster in sorted(data['group'].unique()):
    print("-------------- CLUSTER #" + str(cluster))
    # single .loc lookup (row mask + column) instead of chained indexing
    print(data.loc[data['group'] == cluster, 'title'])

-------------- CLUSTER #0
186      Who Is Steve Sadow, Trump’s New Defense Lawyer?
192    Retired Army Lawyer Will Oversee Pentagon’s Wa...
382    Trump lawyers evoke 1931 trial of ‘Scottsboro ...
436    Plea Deal May Be Near for a Bali Bombing Defen...
571    Justice Alito Rejects Calls for Recusal After ...
604    Alito will not recuse in case involving lawyer...
719    Robert S. Bennett, Washington’s Go-to Lawyer i...
Name: title, dtype: object
-------------- CLUSTER #1
5      White House Urges Eligible Immigrants to Apply...
295    Democrats turn on Biden over border crisis, Hu...
521    'Woke' military policies' effect on recruitmen...
551    Squatters ravage Wyoming downtown with stomach...
563    DC begins enforcing curfew to deter youth crim...
575    Border Crisis Comes to Blue Cities After Migra...
633    NYC Mayor Adams warns migrant women and childr...
745    Lauch Faircloth Dies at 95; Senator Targeted D...
746    Dominican Republic Will Close Border With Hait...
784    La

In [13]:
# Write the labelled articles to disk. to_csv raises OSError when the target
# directory is missing (e.g. on a fresh checkout), so create it first.
OUTPUT_FP = Path('grouped_data/data.csv')
OUTPUT_FP.parent.mkdir(parents=True, exist_ok=True)
data.to_csv(OUTPUT_FP, index=False)