In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.cluster import KMeans

from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

from nltk.stem.snowball import SnowballStemmer

import string

In [2]:
# read in data
DATA_FP = "datasets/MINDlarge_train/news.tsv"

data = pd.read_csv(DATA_FP, sep='\t')

In [3]:
# add column labels and combine title and abstract
data.columns = ['id', 'category', 'subcategory', 'title', 'abstract', 'url', 'title entities', 'abstract entities']
data = data.fillna("")

# remove data from specific topics
filter = ~data['category'].isin(['sports'])
data = data[filter]

combined = pd.DataFrame(data["title"] + "; " + data["abstract"], columns=["text"])

In [4]:
# Stemming
stemmer = SnowballStemmer("english", ignore_stopwords=True)

combined['stemmed'] = combined['text'].apply(
    lambda x:
        ' '.join([
            stemmer.stem(word) for word in x.translate(
                str.maketrans('', '', string.punctuation) # remove punctuation
            ).split(' ')
        ])
)

combined.head(1)

Unnamed: 0,text,stemmed
0,Walmart Slashes Prices on Last-Generation iPad...,walmart slash price on lastgener ipad appl new...


In [5]:
# create bag of words
count_vect = CountVectorizer(
    stop_words="english",
    # ngram_range=(1,2) # count words and pairs of words
)
X_train_counts = count_vect.fit_transform(combined['stemmed'])
X_train_counts.shape

(69506, 60148)

In [6]:
print(count_vect.vocabulary_.get(u'president'))

None


In [7]:
# tfidf = Term Frequency times Inverse Document Frequency
# removes weighting based on text document length
tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_tfidf.shape

(69506, 60148)

In [8]:
# perform lsa (reduce size of dataset)
lsa = make_pipeline(TruncatedSVD(n_components=100), Normalizer(copy=False))
X_lsa = lsa.fit_transform(X_tfidf)
explained_variance = lsa[0].explained_variance_ratio_.sum()

print(f"Explained variance of the SVD step: {explained_variance * 100:.1f}%")

Explained variance of the SVD step: 12.1%


In [9]:
NUM_CLUSTERS = 100

In [10]:
# cluster (k-means)
kmeans = KMeans(
    n_clusters=NUM_CLUSTERS
    ,
    max_iter=100,
    n_init=1,
).fit(X_lsa)
cluster_ids, cluster_sizes = np.unique(kmeans.labels_, return_counts=True)
print(f"Number of elements assigned to each cluster: {cluster_sizes}")

Number of elements assigned to each cluster: [ 775  538  471  615 1294  868  886  427  694  550  609 1477  706  361
  713  916  483  619 1081 1295  212  691  905 1686  762  750  594  399
  702  472  470  560  901  331  874  441  730  841  532  755  544  803
  733  620  951  764  378  540  653  664  355  390  783  399  836  959
 1192 1800  766  841  818  504  712  592 1105  713  528  729  506  955
  488  360  548 1061 1168  612  569 1095  177 1081  309  795  712  784
  348  620  820  622  730  698  261  477  454  346  770  647  327  460
  448  600]


In [11]:
# get top words in each cluster
original_space_centroids = lsa[0].inverse_transform(kmeans.cluster_centers_)
order_centroids = original_space_centroids.argsort()[:, ::-1]
terms = count_vect.get_feature_names_out()

for i in range(NUM_CLUSTERS):
    print(f"Cluster {i}: ", end="")
    for ind in order_centroids[i, :10]:
        print(f"{terms[ind]} ", end="")
    print()

Cluster 0: weather today forecast week wbz novemb rain latest morn histori 
Cluster 1: deal friday black best amazon sale buy offer appl earli 
Cluster 2: 2019 award star peopl music celebr countri best year novemb 
Cluster 3: trump ukrain presid investig impeach giuliani offici biden testifi testimoni 
Cluster 4: new york jersey orlean open work come look appl year 
Cluster 5: hous white trump pelosi presid speaker committe chief impeach meet 
Cluster 6: famili kill mexico american member children live son friend die 
Cluster 7: hotel rock collaps hard orlean construct site build new crane 
Cluster 8: million tax year plan compani near sale florida ohio project 
Cluster 9: florida sign south vote beach storm zodiac million tropic central 
Cluster 10: shoot dead injur shot suspect polic victim kill investig school 
Cluster 11: trump presid donald white ukrain say campaign ralli impeach said 
Cluster 12: power pge outag custom shutoff wind electr util california restor 
Cluster 13: fort