In [10]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.cluster import KMeans

from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

from nltk.stem.snowball import SnowballStemmer

In [29]:
# read in data
DATA_FP = "datasets/MINDlarge_train/news.tsv"

data = pd.read_csv(DATA_FP, sep='\t')

In [30]:
# add column labels and combine title and abstract
data.columns = ['id', 'category', 'subcategory', 'title', 'abstract', 'url', 'title entities', 'abstract entities']
data = data.fillna("")
combined = pd.DataFrame(data["title"] + ";" + data["abstract"], columns=["text"])

In [31]:
# Stemming
stemmer = SnowballStemmer("english", ignore_stopwords=True)

combined['stemmed'] = combined['text'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split(' ')]))

combined.head(1)

Unnamed: 0,text,stemmed
0,Walmart Slashes Prices on Last-Generation iPad...,walmart slash price on last-gener ipads;appl n...


In [46]:
# create bag of words
count_vect = CountVectorizer(
    stop_words="english",
    # ngram_range=(1,2) # count words and pairs of words
)
X_train_counts = count_vect.fit_transform(combined['stemmed'])
X_train_counts.shape

(101526, 70975)

In [40]:
print(count_vect.vocabulary_.get(u'president'))

1046825


In [41]:
# tfidf = Term Frequency times Inverse Document Frequency
# removes weighting based on text document length
tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_tfidf.shape

(101526, 1537240)

In [42]:
# perform lsa (reduce size of dataset)
lsa = make_pipeline(TruncatedSVD(n_components=100), Normalizer(copy=False))
X_lsa = lsa.fit_transform(X_tfidf)
explained_variance = lsa[0].explained_variance_ratio_.sum()

print(f"Explained variance of the SVD step: {explained_variance * 100:.1f}%")

Explained variance of the SVD step: 4.3%


In [43]:
NUM_CLUSTERS = 100

In [44]:
# cluster (k-means)
kmeans = KMeans(
    n_clusters=NUM_CLUSTERS
    ,
    max_iter=100,
    n_init=1,
).fit(X_lsa)
cluster_ids, cluster_sizes = np.unique(kmeans.labels_, return_counts=True)
print(f"Number of elements assigned to each cluster: {cluster_sizes}")

Number of elements assigned to each cluster: [ 595 2468 1342  261 1306  767  624 1892  772  703  959 1077  579 1623
 1348 1178  520 1276 2038 1393  484  523  647 1129  613 1115 1008 1497
 1916 1410  836 1046  408  363  978 1231 1463 2028  791  806  650  955
 1810  391  687 1275  868  945 1505  954 1011  964 1571  891  952  845
 1301 2029  922  698  647 1001 1262  709  498  749 1531  661  814 1148
  656 1135  424  879  797  964 1077 1407  779 1230  339 1347 1274 1536
 1424  815  831 1127 1288  544 1272  389  624  950  503 1657 1013  238
  666 1084]


In [45]:
# get top words in each cluster
original_space_centroids = lsa[0].inverse_transform(kmeans.cluster_centers_)
order_centroids = original_space_centroids.argsort()[:, ::-1]
terms = count_vect.get_feature_names_out()

for i in range(NUM_CLUSTERS):
    print(f"Cluster {i}: ", end="")
    for ind in order_centroids[i, :10]:
        print(f"{terms[ind]} ", end="")
    print()

Cluster 0: sign zodiac zodiac sign said newslett sign newslett free player make deal 
Cluster 1: year old year old girl boy old girl miss said old boy famili 
Cluster 2: play star tour 2020 player year big th wood tiger 
Cluster 3: neighborhood popular neighborhood brows brows spot brows popular popular local local businesses know businesses ratings 
Cluster 4: night monday game monday night friday win point score loss season 
Cluster 5: crash driver car polic hit kill said truck run road 
Cluster 6: trump presid impeach ukrain presid trump investig say biden donald trump donald 
Cluster 7: million year plan compani 000 th peopl announc new report 
Cluster 8: airport flight plane intern intern airport airlin land air make chicago 
Cluster 9: nfl team week kaepernick quarterback colin colin kaepernick nfl week trade bengal 
Cluster 10: health offici said th peopl cancer say flu death help 
Cluster 11: patriot england new england bradi tom tom bradi england patriot new belichick gordon 
