In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.cluster import KMeans

from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

from nltk.stem.snowball import SnowballStemmer

import string

In [2]:
# read in data
DATA_FP = "datasets/MINDlarge_train/news.tsv"

data = pd.read_csv(DATA_FP, sep='\t')

In [3]:
# add column labels and combine title and abstract
data.columns = ['id', 'category', 'subcategory', 'title', 'abstract', 'url', 'title entities', 'abstract entities']
data = data.fillna("")
combined = pd.DataFrame(data["title"] + "; " + data["abstract"], columns=["text"])

In [4]:
# Stemming
stemmer = SnowballStemmer("english", ignore_stopwords=True)

combined['stemmed'] = combined['text'].apply(
    lambda x:
        ' '.join([
            stemmer.stem(word) for word in x.translate(
                str.maketrans('', '', string.punctuation) # remove punctuation
            ).split(' ')
        ])
)

combined.head(1)

Unnamed: 0,text,stemmed
0,Walmart Slashes Prices on Last-Generation iPad...,walmart slash price on lastgener ipad appl new...


In [5]:
# create bag of words
count_vect = CountVectorizer(
    stop_words="english",
    # ngram_range=(1,2) # count words and pairs of words
)
X_train_counts = count_vect.fit_transform(combined['stemmed'])
X_train_counts.shape

(101526, 73623)

In [6]:
print(count_vect.vocabulary_.get(u'president'))

None


In [7]:
# tfidf = Term Frequency times Inverse Document Frequency
# removes weighting based on text document length
tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_tfidf.shape

(101526, 73623)

In [8]:
# perform lsa (reduce size of dataset)
lsa = make_pipeline(TruncatedSVD(n_components=100), Normalizer(copy=False))
X_lsa = lsa.fit_transform(X_tfidf)
explained_variance = lsa[0].explained_variance_ratio_.sum()

print(f"Explained variance of the SVD step: {explained_variance * 100:.1f}%")

Explained variance of the SVD step: 11.0%


In [9]:
NUM_CLUSTERS = 100

In [10]:
# cluster (k-means)
kmeans = KMeans(
    n_clusters=NUM_CLUSTERS
    ,
    max_iter=100,
    n_init=1,
).fit(X_lsa)
cluster_ids, cluster_sizes = np.unique(kmeans.labels_, return_counts=True)
print(f"Number of elements assigned to each cluster: {cluster_sizes}")

Number of elements assigned to each cluster: [ 947 1025 1450 1759  952 1216  499  749 1061  832  643  628 1002 1373
 1200  823 1897  903  710 1422 1013  694 1524 1294 1653  593  598  614
 1346 1856  929 2233 1067 1178 1424  652 1064  747  555  591  756 1606
 1412  440 1592 1033  757 1283 2089  859  496 1457  805 1192  790  913
 1262 1546  936  822  636  904  817  405  392  399  972  995 1244 1886
  669  822  590  812  816  751 1157 1313  473 1294 1110  583  785  565
  839  828  700 1312 1058 1009 1234 1035 1610 1142  934  906  548 1224
  836 1159]


In [11]:
# get top words in each cluster
original_space_centroids = lsa[0].inverse_transform(kmeans.cluster_centers_)
order_centroids = original_space_centroids.argsort()[:, ::-1]
terms = count_vect.get_feature_names_out()

for i in range(NUM_CLUSTERS):
    print(f"Cluster {i}: ", end="")
    for ind in order_centroids[i, :10]:
        print(f"{terms[ind]} ", end="")
    print()

Cluster 0: halloween costum candi celebr parti trickortreat treat kid dress famili 
Cluster 1: 2019 octob award novemb season sport year star oct news 
Cluster 2: compani deal appl friday store new googl watch peopl million 
Cluster 3: like just fan star want famili play said thing come 
Cluster 4: close road store lane near street traffic water bridg closur 
Cluster 5: famili kill miss help die children say sign live son 
Cluster 6: event look week ticket time pick check weekend date locat 
Cluster 7: chicago teacher strike bear union deal public school mayor end 
Cluster 8: hous white trump impeach pelosi democrat presid committe speaker vote 
Cluster 9: school footbal high playoff team week game score friday night 
Cluster 10: nation washington seri world secur servic redskin fan cardin dc 
Cluster 11: woman said charg polic car fort worth miss arrest dog 
Cluster 12: time save year chang end daylight come just want event 
Cluster 13: counti sheriff offic deputi said road offici acc