In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.cluster import KMeans

from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

from nltk.stem.snowball import SnowballStemmer

import string

In [2]:
# read in data
DATA_FP = "datasets/MINDlarge_train/news.tsv"

data = pd.read_csv(DATA_FP, sep='\t')

In [3]:
# add column labels and combine title and abstract
data.columns = ['id', 'category', 'subcategory', 'title', 'abstract', 'url', 'title entities', 'abstract entities']
data = data.fillna("")

# remove data from specific topics
filter = ~data['category'].isin(['sports'])
data = data[filter]

combined = pd.DataFrame(data["title"] + "; " + data["abstract"], columns=["text"])
combined['id'] = data['id']
combined.head(3)

Unnamed: 0,text,id
0,Walmart Slashes Prices on Last-Generation iPad...,N45436
1,50 Worst Habits For Belly Fat; These seemingly...,N23144
2,Dispose of unwanted prescription drugs during ...,N86255


In [4]:
# Stemming
stemmer = SnowballStemmer("english", ignore_stopwords=True)

combined['stemmed'] = combined['text'].apply(
    lambda x:
        ' '.join([
            stemmer.stem(word) for word in x.translate(
                str.maketrans('', '', string.punctuation) # remove punctuation
            ).split(' ')
        ])
)

combined.head(1)

Unnamed: 0,text,id,stemmed
0,Walmart Slashes Prices on Last-Generation iPad...,N45436,walmart slash price on lastgener ipad appl new...


In [5]:
# create bag of words
count_vect = CountVectorizer(
    stop_words="english",
    # ngram_range=(1,2) # count words and pairs of words
)
X_train_counts = count_vect.fit_transform(combined['stemmed'])
X_train_counts.shape

(69506, 60148)

In [6]:
print(count_vect.vocabulary_.get(u'president'))

None


In [7]:
# tfidf = Term Frequency times Inverse Document Frequency
# removes weighting based on text document length
tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_tfidf.shape

(69506, 60148)

In [8]:
# perform lsa (reduce size of dataset)
lsa = make_pipeline(TruncatedSVD(n_components=100), Normalizer(copy=False))
X_lsa = lsa.fit_transform(X_tfidf)
explained_variance = lsa[0].explained_variance_ratio_.sum()

print(f"Explained variance of the SVD step: {explained_variance * 100:.1f}%")

Explained variance of the SVD step: 12.1%


In [9]:
NUM_CLUSTERS = 1000

In [10]:
# cluster (k-means)
kmeans = KMeans(
    n_clusters=NUM_CLUSTERS,
    max_iter=100,
    n_init=1,

).fit(X_lsa)
cluster_ids, cluster_sizes = np.unique(kmeans.labels_, return_counts=True)
print(f"Number of elements assigned to each cluster: {cluster_sizes}")

Number of elements assigned to each cluster: [ 72  27  53  24  10  12  42  38  45  51  29  64  38  36 271  83  78 351
  80  48  54  43  68  30  46  97  87 116  29 136 183 115  43 146  40 203
  30  51  48 106  22 211 154 114  36  24 255  73  66 100  86  69  35  96
  28  57 272 318  11  50  97  43  68  67  86  58 385  51  37  99  34 148
  84  22  56   8  55  42  25  64  29 255  95  84  80  52  86  55  87 118
  50  74  29  59 117 221  74 139  24  20  22  60  59  45  17  25  34  31
  18 211 214 124 126  51  31 108  69 157  55  40  44  23 131  84  62  29
  31  25  41  48  56  71  28  40 241  54  26  15  37  85  63  63  63  38
 120  50  40 162 100  52  77  25  35  85 207  86  82  20  65  58  69  25
  36 105 102  24  86  40  57  67  97  33  94 100  23  64  31  47  82 207
 144  59  21  29 112  40 100 218  73  61 222 133  55  49  22  87  45 103
  23 119  53   4  67  39  85  48  87 163  40  60  74  90  40  68  96  55
 107 182  46  55  82  90  70  15  46 141  35  65 214  47  16  52  15 202
  61  

In [11]:
# get top words in each cluster
original_space_centroids = lsa[0].inverse_transform(kmeans.cluster_centers_)
order_centroids = original_space_centroids.argsort()[:, ::-1]
terms = count_vect.get_feature_names_out()

for i in range(NUM_CLUSTERS):
    print(f"Cluster {i}: ", end="")
    for ind in order_centroids[i, :10]:
        print(f"{terms[ind]} ", end="")
    print()

Cluster 0: weather storm forecast rain wind today tropic expect weekend snow 
Cluster 1: polic year offic said depart say investig retir chief week 
Cluster 2: cancer women help american children breast chang like mexico drug 
Cluster 3: impeach week hear inquiri trump public presid republican democrat ukrain 
Cluster 4: crash nation plane year near hospit new servic involv airport 
Cluster 5: year month video store protest appl cancer chang just time 
Cluster 6: citi tax million council mayor year plan vote kansa counti 
Cluster 7: news trend reader articl local headlin recent link like austin 
Cluster 8: make water use way compani chang like work want help 
Cluster 9: school close high district road student counti lane store closur 
Cluster 10: polic home shot said say shoot street neighborhood offic man 
Cluster 11: restaur open said year food just bar know busi time 
Cluster 12: report appl googl app iphon pixel phone releas new road 
Cluster 13: compani million plan microsoft work

In [12]:
data['group'] = kmeans.labels_

In [13]:
for cluster in range(NUM_CLUSTERS):
    print("-------------- CLUSTER #" + str(cluster))
    print(data.loc[data['group'] == cluster]['title'])

-------------- CLUSTER #0
323      Mostly dry Friday and Saturday before heavy ra...
2693     Central Pennsylvania weather: After rain ends ...
4103     Baseball themed forecast: strong storms Monday...
8200     Sunny skies, cool nights in the forecast; subt...
9431     Thunderstorms with damaging winds, large hail ...
                               ...                        
75873    UK flooding: army to help recovery effort as m...
78361    First Alert Weather: Tracking the Next Cold Fr...
79790    First Alert Weather: Tracking rain, even a few...
89408    Wind Thwarts Weather Balloon Launch, to Meteor...
96048    Join us Dec. 3 for a town hall meeting on the ...
Name: title, Length: 72, dtype: object
-------------- CLUSTER #1
7886     Lt. Col. Philbin, third-ranking officer on R.I...
24018    Chief announces LMPD reorganization plans: 'We...
24994    George Gascón was a progressive DA in progress...
25888    Commissioner addresses reducing Baltimore poli...
26248    Without sales t