# Example of TF-IDF clustering

## Basic intuition

In [None]:
# Three toy documents, each tokenised into a list of words by splitting on
# whitespace.
a, b, c = (
    text.split()
    for text in (
        "purple is the best city in the forest",
        "there is an art to getting your way and throwing bananas on to the street is not it",
        "it is not often you find soggy bananas on the street",
    )
)

In [None]:
import numpy as np

# Collect the three tokenised documents (a, b, c) into one list of lists;
# tf_idf below iterates over this list to count how many documents contain
# a given word.
docs = [a, b, c]

def tf_idf(word, sentence, corpus=None):
    """Print and return the TF-IDF score of ``word`` within ``sentence``.

    TF is the fraction of tokens in ``sentence`` that equal ``word``.
    IDF is ``log10(N / df)``, where ``N`` is the number of documents in the
    corpus and ``df`` is the number of those documents containing ``word``.

    Args:
        word: The token to score.
        sentence: The tokenised document (a sequence of tokens) to score
            ``word`` in.
        corpus: Iterable of tokenised documents used for the IDF term.
            When omitted, the module-level ``docs`` list is used.

    Returns:
        The unrounded product ``tf * idf``. The value is rounded to four
        decimals only for the printed ``TFIDF=`` line. An empty sentence or
        a word that occurs in no document scores 0 instead of raising
        ``ZeroDivisionError``.
    """
    if corpus is None:
        corpus = docs
    # Materialise once so generators work and len() is available
    corpus = list(corpus)

    # term frequency; an empty sentence contains no tokens, so TF is 0
    tf = sentence.count(word) / len(sentence) if len(sentence) else 0.0
    print(f'TF={tf}')

    # inverse document frequency
    doc_freq = sum(1 for doc in corpus if word in doc)
    # A word found in no document would divide by zero; give it no weight
    idf = np.log10(len(corpus) / doc_freq) if doc_freq else 0.0
    print(f'IDF={idf}')

    score = tf * idf
    print(f'TFIDF={round(score, 4)}')
    return score

In [None]:
# 'forest' occurs in a, and in only one of the three documents, so both the
# TF and the IDF terms are non-zero here.
tf_idf('forest', a)


In [None]:
# 'forest' does not occur in b, so TF (and therefore TF-IDF) is 0.
tf_idf('forest', b)

## 1. Load and process text (for a simplified toy dataset)

In [None]:
# !pip install wikipedia
# import wikipedia

In [None]:
# wikipedia.set_lang("en")
# person = wikipedia.page("steven spielberg")
# print(person.content[:100])

In [None]:
# people = ["michael jordan", "robert lewandowski", "dwight eisenhower", "woodrow wilson", "steven spielberg"]
# dat = [[person, wikipedia.page(person).content] for person in people]

In [None]:
!pip install -q wordcloud
import wordcloud

import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer

from sklearn.metrics.pairwise import cosine_similarity

from sklearn.cluster import KMeans
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.decomposition import PCA
from sklearn.manifold import MDS

# Limit the precision pandas uses when displaying floats.
pd.set_option("display.precision", 4)

# Toy corpus: every record is a [label, sentence] pair.
dat = [
    ['X', 'aaa aaa'],
    ['Y', 'aaa bbb bbb'],
    ['Y', 'aaa bbb bbb ddd'],
    ['Z', 'aaa bbb ccc eee'],
    ['Z', 'aaa bbb ccc eee fff'],
]

df_sentences = pd.DataFrame(data=dat, columns=['Label', 'Sentence'])

labels = df_sentences['Label']
print("labels:", labels, sep="\n")

# Use one cluster per distinct label.
n_clusters = len(np.unique(labels))
print("n_clusters:", n_clusters)

In [None]:
# n_clusters = 3

## 2. Vectorize text to a numeric matrix      

In [None]:
sentences = df_sentences['Sentence'].values.tolist()

# Vectorizer settings, passed to CountVectorizer below as
# vocabulary / min_df / max_df / ngram_range.
vocab = None
min_df = 0.0
max_df = 1.0
ngram_range = (1, 1)

# Build count vectorizer. ngram_range is taken from the setting above so that
# changing it there actually takes effect.
count_vectorizer = CountVectorizer(
    max_df=max_df,
    min_df=min_df,
    vocabulary=vocab,
    ngram_range=ngram_range,
)
cvec = count_vectorizer.fit(sentences)

# Get feature names
feature_names = cvec.get_feature_names_out()

# Get bag-of-words and analyze; toarray() gives a plain ndarray rather than
# the np.matrix that todense() returns.
bag_of_words = cvec.transform(sentences)
df_bag_of_words = pd.DataFrame(bag_of_words.toarray(), columns=feature_names)

# Transform bag_of_words into tf-idf matrix
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(bag_of_words)

# Total occurrences of every word: sum each bag-of-words column over all rows,
# then list the words from most to least frequent.
word_cnts = np.asarray(bag_of_words.sum(axis=0)).ravel().tolist()
df_cnts = pd.DataFrame({'word': feature_names, 'count': word_cnts}).sort_values(
    'count', ascending=False
)

# Mean tf-idf weight of every word across the sentences, sorted from highest
# to lowest, with the raw count attached and the columns put in display order.
weights = np.asarray(tfidf.mean(axis=0)).ravel().tolist()
df_weights = (
    pd.DataFrame({'word': feature_names, 'weight': weights})
    .sort_values('weight', ascending=False)
    .merge(df_cnts, on='word', how='left')
)[['word', 'count', 'weight']]

# Pairwise cosine similarity between all sentences (when the second argument
# is omitted, the rows of tfidf are compared with each other).
cos_sim = cosine_similarity(tfidf)

# Convert similarity into a distance matrix of sentences.
samp_dist = 1.0 - cos_sim

  
# Build the dense tf-idf table (one row per sentence, one column per feature)
# and print a report of everything computed in this cell.
df_tfidf = pd.DataFrame(tfidf.todense(), columns=feature_names)

print("%d dummy sentences:" % len(sentences), sentences, "---", sep="\n")

print(
    "%d feature_names (each feature represents a distinct word):" % len(feature_names),
    feature_names,
    "---",
    sep="\n",
)

print(
    "df_tfidf[%d,%d]:" % (len(sentences), len(feature_names)),
    df_tfidf.to_string(),
    "---",
    sep="\n",
)

print("df_weights:", df_weights, "---", sep="\n")

print(
    "cos_sim[%d,%d] (a square matrix of length and width = len(sentences)):"
    % (len(sentences), len(sentences)),
    cos_sim,
    sep="\n",
)

**Note how the above cosine similarity matrix corresponds to the original bag-of-words representation shown below:**

* Row 0 cosine similarity values are not similar to rows 1 to 4  
* Rows 1 and 2 contain similar values (they are not identical due to the extra word in row 2: 'ddd')
* Rows 3 and 4 contain similar values, with columns 3 and 4 swapped (they are not identical due to the extra word in row 4: 'fff')

## Bag-of-words
The bag-of-words representation will usually be sparser than this one (i.e. lots of zero values) since each sentence contains only a few of the words from the entire corpus

In [None]:
# Print the bag-of-words table under a header giving its dimensions
# (number of sentences x number of features).
print(
    "df_bag_of_words[%d,%d]:" % (len(sentences), len(feature_names)),
    df_bag_of_words,
    sep="\n",
)

## Build a word cloud from the weighted word counts

In [None]:
print(df_weights)

# Map every word to its count; this dict is what gets passed to
# generate_from_frequencies below.
s_word_freq = df_weights.set_index('word')['count']
di_word_freq = s_word_freq.to_dict()

print("---")
print("di_word_freq:")
for word, count in di_word_freq.items():
    print(word, count)

# Render the cloud and show it without axes.
cloud = wordcloud.WordCloud(
    width=900,
    height=500,
).generate_from_frequencies(di_word_freq)
plt.imshow(cloud)
plt.axis('off')
plt.show()

## 3. Dimensionality Reduction using PCA

Before attempting to cluster the data, we will usually want to reduce the dimensionality of the data because this helps to mitigate the problem of overfitting. Note the distinction between the two terms:

* Dimensionality reduction: find the linear combinations of variables that are most 'interesting' in the data. For example, the popular PCA technique finds linear transformations of input features that maximize the variance of the data points along the new axes.

* Clustering: find data points that can be grouped together as separate classes.

In [None]:
# Dimensionality reduction using PCA, reduce the tfidf matrix to just 2 features.
# toarray() returns a plain ndarray (todense() would return an np.matrix), so
# no extra np.array() conversion is needed before handing X to PCA.
X = tfidf.toarray()
print("X before reduction:")
print(X)
print('\n\n')

# Fit the 2-component projection on X, then apply it to the same rows.
pca = PCA(n_components=2)
pca.fit(X)
X_pca = pca.transform(X)

print("X_pca now has just 2 columns:")
print(X_pca)

## 4. Calculate K-means clusters (unsupervised classification)

In [None]:
# Cluster the 2-D PCA coordinates into n_clusters groups; random_state is
# fixed so repeated runs give the same result.
km_model = KMeans(
    n_clusters=n_clusters,
    max_iter=10,
    n_init=2,
    random_state=121,
)
km_model.fit(X_pca)

# One row per cluster center, one column per PCA coordinate.
df_centers = pd.DataFrame(km_model.cluster_centers_, columns=['x', 'y'])

plt.figure(figsize=(4, 4))
plt.suptitle('PCA features colored by class; grey circles show the k-means centers')
# Sentences, colored by the cluster id k-means assigned to them.
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=km_model.labels_, s=50, cmap='jet')
# Cluster centers, drawn as large translucent grey circles.
plt.scatter(df_centers['x'], df_centers['y'], c='grey', s=500, alpha=0.2)

# Write each sentence's own label next to its point, shifted up by dy.
dy = 0.04
for my_label, (px, py) in zip(df_sentences['Label'], X_pca):
    plt.annotate(my_label, (px, py + dy))


In [None]:
# Cluster id assigned to each sentence, followed by the sentences' own labels
# and the cluster center coordinates.
print("km_model.labels_:", km_model.labels_)
print(
    "This corresponds to the sentence labels shown below as follows:",
    df_sentences['Label'].tolist(),
    "---",
    "df_centers:",
    df_centers,
    sep="\n",
)

**Note above how the center coordinates for the k-means model correspond to the original sentences shown below. The dimensions of the centers are: (n_clusters, n_features). Each row corresponds to one of the clusters. We often run PCA as the first step and therefore end up with 2 remaining features (n_features = 2).**

In [None]:
# Show the labelled sentences again so they can be compared with the cluster
# assignments and centers printed above.
print(df_sentences)

## Summary

The following data science techniques were demonstrated in the context of NLP (Natural Language Processing) using Python's scikit-learn library:

* Vectorize text to a numeric matrix using TF-IDF

* Dimensionality Reduction using PCA

* Calculate K-means clusters (unsupervised classification) 