In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.cluster import KMeans

from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

In [2]:
# read in data
DATA_FP = "datasets/MINDlarge_train/news.tsv"

data = pd.read_csv(DATA_FP, sep='\t')

In [3]:
# add column labels and combine title and abstract
data.columns = ['id', 'category', 'subcategory', 'title', 'abstract', 'url', 'title entities', 'abstract entities']
data = data.fillna("")
combined = pd.DataFrame(data["title"] + ";" + data["abstract"], columns=["text"])

In [4]:
# create bag of words
count_vect = CountVectorizer(stop_words="english")
X_train_counts = count_vect.fit_transform(combined['text'])
X_train_counts.shape

(101526, 73970)

In [5]:
print(count_vect.vocabulary_.get(u'president'))

52036


In [6]:
# tfidf = Term Frequency times Inverse Document Frequency
# removes weighting based on text document length
tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_tfidf.shape

(101526, 73970)

In [7]:
# perform lsa (reduce size of dataset)
lsa = make_pipeline(TruncatedSVD(n_components=100), Normalizer(copy=False))
X_lsa = lsa.fit_transform(X_tfidf)
explained_variance = lsa[0].explained_variance_ratio_.sum()

print(f"Explained variance of the SVD step: {explained_variance * 100:.1f}%")

Explained variance of the SVD step: 9.7%


In [8]:
NUM_CLUSTERS = 10

In [9]:
# cluster (k-means)
kmeans = KMeans(
    n_clusters=NUM_CLUSTERS
    ,
    max_iter=100,
    n_init=1,
).fit(X_lsa)
cluster_ids, cluster_sizes = np.unique(kmeans.labels_, return_counts=True)
print(f"Number of elements assigned to each cluster: {cluster_sizes}")

Number of elements assigned to each cluster: [13131 26021  5639  6832  5350  3553  4822 22607  7807  5764]


In [10]:
# get top words in each cluster
original_space_centroids = lsa[0].inverse_transform(kmeans.cluster_centers_)
order_centroids = original_space_centroids.argsort()[:, ::-1]
terms = count_vect.get_feature_names_out()

for i in range(NUM_CLUSTERS):
    print(f"Cluster {i}: ", end="")
    for ind in order_centroids[i, :10]:
        print(f"{terms[ind]} ", end="")
    print()

Cluster 0: make just like best people time know food thanksgiving years 
Cluster 1: city says state said million news years 2020 day 2019 
Cluster 2: police man said shooting say arrested woman shot officer officers 
Cluster 3: new york orleans jersey city restaurant says years england just 
Cluster 4: trump president impeachment donald house inquiry ukraine democrats white says 
Cluster 5: school high students student schools football district bus county board 
Cluster 6: year old girl boy missing said years family says million 
Cluster 7: game season team win football coach night sunday vs play 
Cluster 8: county crash california said road near people morning officials according 
Cluster 9: week weather snow forecast cold today rain temperatures 10 winter 
