In [45]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.cluster import KMeans

from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

In [46]:
# read in data
DATA_FP = "datasets/MINDlarge_train/news.tsv"

data = pd.read_csv(DATA_FP, sep='\t')

In [47]:
# add column labels and combine title and abstract
data.columns = ['id', 'category', 'subcategory', 'title', 'abstract', 'url', 'title entities', 'abstract entities']
data = data.fillna("")
combined = pd.DataFrame(data["title"] + ";" + data["abstract"], columns=["text"])

In [48]:
# create bag of words
count_vect = CountVectorizer(stop_words="english")
X_train_counts = count_vect.fit_transform(combined['text'])
X_train_counts.shape

(101526, 73970)

In [49]:
print(count_vect.vocabulary_.get(u'president'))

52036


In [50]:
# tfidf = Term Frequency times Inverse Document Frequency
# removes weighting based on text document length
tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_tfidf.shape

(101526, 73970)

In [None]:
# perform lsa (reduce size of dataset)
lsa = make_pipeline(TruncatedSVD(n_components=100), Normalizer(copy=False))
X_lsa = lsa.fit_transform(X_tfidf)
explained_variance = lsa[0].explained_variance_ratio_.sum()

print(f"Explained variance of the SVD step: {explained_variance * 100:.1f}%")

In [None]:
NUM_CLUSTERS = 10

In [43]:
# cluster (k-means)
kmeans = KMeans(
    n_clusters=NUM_CLUSTERS
    ,
    max_iter=100,
    n_init=1,
).fit(X_lsa)
cluster_ids, cluster_sizes = np.unique(kmeans.labels_, return_counts=True)
print(f"Number of elements assigned to each cluster: {cluster_sizes}")

Number of elements assigned to each cluster: [15815  6591 20386 18446 12362  5368  8084  5728  4013  4733]


In [44]:
# get top words in each cluster
original_space_centroids = lsa[0].inverse_transform(kmeans.cluster_centers_)
order_centroids = original_space_centroids.argsort()[:, ::-1]
terms = count_vect.get_feature_names_out()

for i in range(NUM_CLUSTERS):
    print(f"Cluster {i}: ", end="")
    for ind in order_centroids[i, :10]:
        print(f"{terms[ind]} ", end="")
    print()

Cluster 0: city county said million people years 000 says officials california 
Cluster 1: new york orleans jersey city restaurant 2020 years open england 
Cluster 2: school 2019 world best day 2020 series news high halloween 
Cluster 3: game season team win night coach football nfl play injury 
Cluster 4: make just like know time family people years need thanksgiving 
Cluster 5: trump president impeachment donald house inquiry ukraine democrats white says 
Cluster 6: police man crash said car woman county shooting say killed 
Cluster 7: week weather snow forecast cold today temperatures 10 football rain 
Cluster 8: state penn michigan ohio iowa football florida oklahoma saturday said 
Cluster 9: year old girl boy missing said years family killed charged 
