In [None]:
!pip install datasets umap-learn
import pandas as pd, os
from tqdm import tqdm
import numpy as np
import datasets

# Pandas DataFrame
We will read the training text into a pandas DataFrame.

In [None]:
from datasets import load_dataset

# Load the "SetFit/bbc-news" dataset through the `datasets` library.
dataset = load_dataset("SetFit/bbc-news")

# Keep only the raw text of the train split, as a single-column DataFrame.
train_text_df = pd.DataFrame(
    {
        'text': dataset['train']['text'],
    }
)

# Preview the first rows (rendered as the cell output).
train_text_df.head()

# TF-IDF
We will use TF-IDF to convert each text into an embedding vector of length up to 25,000 (the vocabulary is capped at 25,000 terms).

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer settings: drop English stop words, treat term counts as
# present/absent (binary) before IDF weighting, and cap the vocabulary
# at 25,000 terms.
tfidf = TfidfVectorizer(
    max_features=25_000,
    binary=True,
    stop_words='english',
)

# fit_transform returns a sparse matrix; it is converted to a dense array
# because later cells index and average its rows with NumPy.
text_embeddings = (
    tfidf
    .fit_transform(train_text_df['text'])
    .toarray()
)

# UMAP
We will use UMAP (Uniform Manifold Approximation and Projection for Dimension Reduction) to reduce the embedding vectors to two dimensions.

In [12]:
from umap import UMAP

# UMAP is stochastic: without a seed the 2-D layout differs on every run,
# so the clusters and plots built from it would not be reproducible under
# Restart & Run All. Fixing random_state makes the projection repeatable;
# umap-learn gives up multi-threading when a seed is set, so this trades
# some speed for determinism.
umap = UMAP(random_state=42)

# Project the Tfidf vectors down to two dimensions (one row per text).
embed_2d = umap.fit_transform(text_embeddings)

# KMeans
We will use KMeans to find clusters of news articles. These clusters are the article topics!

In [13]:
from sklearn.cluster import KMeans

# KMeans starts from random centroids, so an unseeded run can assign
# different label numbers (and occasionally different clusters) each time.
# random_state makes the assignment repeatable; n_init is set explicitly
# because scikit-learn's default changed between releases (10 -> 'auto'),
# which otherwise changes results and emits a FutureWarning on some versions.
kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)
kmeans.fit(embed_2d)

# Store each text's cluster id next to the text for the cells below.
train_text_df['cluster'] = kmeans.labels_

# Display Topics
We will display the result of UMAP, which reduced the text embeddings to two dimensions.

In [None]:
import matplotlib.pyplot as plt

centers = kmeans.cluster_centers_

# Invert the fitted vocabulary (term -> column index) once, so a column index
# maps back to its term with a dict lookup instead of rebuilding and scanning
# two full lists for every cluster.
index_to_term = {col: term for term, col in tfidf.vocabulary_.items()}

fig, ax = plt.subplots(figsize=(10,10))
ax.scatter(embed_2d[:,0], embed_2d[:,1], s=1, c=kmeans.labels_)
ax.set_title('UMAP Plot of Train Text using Tfidf features\n', size=16)

# Label each cluster center with the term whose mean Tfidf weight is highest
# among the texts assigned to that cluster.
for k in range(len(centers)):
    in_cluster = train_text_df.cluster.values == k
    mean_weights = np.mean(text_embeddings[in_cluster], axis=0)
    top_word = index_to_term[int(np.argmax(mean_weights))]
    # The offsets nudge the label up and to the left of the center point.
    ax.text(centers[k,0]-1, centers[k,1]+0.75, f'{k+1}-{top_word}', size=16)

# Save before showing: with the notebook inline backend, show() finalizes and
# closes the figure, so a savefig() issued afterwards writes a blank image.
fig.savefig('clusters.png')
plt.show()

# Display Example Text
We will display three example texts from each topic, along with the five most important words for each topic.

In [None]:
# Invert the fitted vocabulary (term -> column index) once, so each of the
# top-word lookups below is a dict access rather than a scan over the
# whole vocabulary.
index_to_term = {col: term for term, col in tfidf.vocabulary_.items()}

# Take the cluster count from the fitted model instead of repeating the
# literal 5, so this cell stays correct if n_clusters is changed.
for k in range(kmeans.n_clusters):
    # Mean Tfidf weight of every term over the texts in cluster k.
    mm = np.mean( text_embeddings[train_text_df.cluster.values==k],axis=0 )
    # Column indices of the five largest mean weights, highest first.
    ii = np.argsort(mm)[-5:][::-1]
    top_words = [index_to_term[int(i)] for i in ii]
    print('#'*25)
    print(f'### Topic {k+1}')
    print('### Top 5 Words',top_words)
    print('#'*25)

    cluster_rows = train_text_df.loc[train_text_df.cluster==k]
    # sample() raises if asked for more rows than exist, so cap the request
    # for clusters holding fewer than three texts.
    n_examples = min(3, len(cluster_rows))
    # Select the column by name rather than by position so the lookup does
    # not depend on 'text' being the first column.
    examples = cluster_rows.sample(n_examples, random_state=123)['text']
    for j, txt in enumerate(examples, start=1):
        print('-'*10,f'Example {j}','-'*10)
        print(txt,'\n')