# Let's play with the embeddings here a bit

We will have a look at BERT embeddings here first.

In [None]:
import pickle
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from transformers import DistilBertTokenizerFast, DistilBertModel

Loading the embeddings from file. <br>
The ones I'm working on here are the result of processing only 500 randomly sampled documents, filtering out words that are not nouns, and keeping only vectors that have a similarity below 0.9.

In [None]:
# File names of the three pickled artefacts that are loaded below.
vocab_output_path, embedding_output_path, new_collection_output_path = (
    "vocab_etm",
    "embedding_etm",
    "new_collection_etm",
)

In [None]:
# Loading from binary
# NOTE: pickle can execute arbitrary code while loading, so only point these
# paths at files you produced yourself.
def _read_pickle(path):
    """Open ``path`` in binary mode and return the unpickled object."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


idx2word = _read_pickle(vocab_output_path)
embedding = _read_pickle(embedding_output_path)
new_token_ids = _read_pickle(new_collection_output_path)

In [None]:
# Group the positions of idx2word's values by word, producing a list of
# (word, [positions]) pairs with positions in ascending order.
# This is done in a single pass; the previous version rescanned (and
# re-materialised) every value once per distinct word, which is quadratic.
# Words now appear in first-occurrence order instead of arbitrary set order.
positions_by_word = defaultdict(list)
for position, word in enumerate(idx2word.values()):
    positions_by_word[word].append(position)
word2idxs = list(positions_by_word.items())

In [None]:
# Tabulate the (word, positions) pairs: one row per distinct word.
column_names = ["word", "indices"]
word2idxs_df = pd.DataFrame(data=word2idxs, columns=column_names)

In [None]:
# Preview the first ten rows of the word -> indices table.
word2idxs_df.head(10)

In [None]:
# Count how many positions are recorded for each word and keep that count
# next to the index list.
occurrence_counts = word2idxs_df["indices"].apply(len)
word2idxs_df["num_occurrences"] = occurrence_counts
word2idxs_df.head()

In [None]:
# Order the words from most to fewest occurrences and show the top 20.
word2idxs_sorted = word2idxs_df.sort_values("num_occurrences", ascending=False)
word2idxs_sorted.head(20)

In [None]:
# Show the 20 words at the bottom of the ranking (fewest occurrences).
word2idxs_sorted.tail(20)

In [None]:
# Inspect the row at position 297 of the ranking (positional, not by label).
# NOTE(review): 297 is a hard-coded position; this raises IndexError if the
# table has fewer than 298 rows - confirm it still points at the intended word.
word2idxs_sorted.iloc[297]

### And since we all like it, let's add some visualisation to it! 

In [None]:
# Distribution of the number of occurrences per word: a plain histogram on
# top and a normalised cumulative histogram underneath.
# One 2x1 grid is created and each histogram is drawn on its own axes.
# The previous version created a 1x2 grid and then drew into a separate 2x1
# plt.subplot grid on top of it, leaving the first set of axes unused.
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(5, 3))
word2idxs_df["num_occurrences"].hist(
    ax=axes[0], bins=30, color='steelblue', edgecolor='black', linewidth=1.0,
    xlabelsize=8, ylabelsize=8, grid=False,
)
word2idxs_df["num_occurrences"].hist(
    ax=axes[1], bins=100, density=True, cumulative=True, linewidth=1.0,
)
# rect deliberately reaches past the figure bounds (1.2 > 1), as before.
plt.tight_layout(rect=(0, 0, 1.2, 1.2))

In [None]:
from sklearn.datasets import fetch_mldata
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

Let's select one of the highly contextualised words to visualise.

In [None]:
# all the embeddings for the word 'data'
is_data = word2idxs_df["word"] == "data"
data_positions = word2idxs_df.loc[is_data, "indices"].iloc[0]
data_emb = embedding[data_positions, :].numpy()

In [None]:
# One row per stored vector of 'data'; the extra "word" column carries the
# integer label 0 used to tell the words apart once the frames are stacked.
data_emb_df = pd.DataFrame(data_emb).assign(word=0)
data_emb_df.head()  # each row is a different "meaning" of data

In [None]:
# all the embeddings for the word 'system', 'model', 'wave', 'user', 'light'
# The lookup and the labelled-frame construction were copy-pasted once per
# word; they now live in two small helpers so each word is a single call.
def _embeddings_for(word):
    """Return the rows of ``embedding`` recorded for ``word`` as a NumPy array.

    Raises IndexError when ``word`` has no row in ``word2idxs_df``.
    """
    positions = word2idxs_df.loc[word2idxs_df["word"] == word, "indices"].iloc[0]
    return embedding[positions, :].numpy()


def _labelled_frame(vectors, label):
    """Wrap ``vectors`` in a DataFrame and tag every row with integer ``label``."""
    frame = pd.DataFrame(vectors)
    frame["word"] = label
    return frame


system_emb = _embeddings_for("system")
model_emb = _embeddings_for("model")
wave_emb = _embeddings_for("wave")
user_emb = _embeddings_for("user")
light_emb = _embeddings_for("light")
# and their respective dataframes (the "word" column is the integer class label)
system_emb_df = _labelled_frame(system_emb, 1)
model_emb_df = _labelled_frame(model_emb, 2)
wave_emb_df = _labelled_frame(wave_emb, 3)
user_emb_df = _labelled_frame(user_emb, 4)
light_emb_df = _labelled_frame(light_emb, 5)

In [None]:
# now stacking all of them
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames the
# same way and, like append, keeps each frame's original row index.
frequent_words_df = pd.concat(
    [data_emb_df, system_emb_df, model_emb_df, wave_emb_df, user_emb_df, light_emb_df]
)

In [None]:
# Display the stacked frame with any NaN-containing rows removed.
# The result is not assigned, so frequent_words_df itself is left unchanged.
frequent_words_df.dropna()

In [None]:
# first PCA reduction
# Fit on the embedding dimensions only. The integer "word" label is an
# arbitrary class code, not a feature; feeding it to PCA (as was done before)
# lets the label itself shape the components. Any pca-* columns left from an
# earlier run of the notebook are excluded for the same reason.
non_feature_columns = ["word", "pca-one", "pca-two", "pca-three"]
embedding_features = frequent_words_df.drop(columns=non_feature_columns, errors="ignore")
pca = PCA(n_components=3)
pca_result = pca.fit_transform(embedding_features.dropna().values)

In [None]:
# Report the share of variance captured by each of the three components.
variance_ratios = pca.explained_variance_ratio_
print(f'Explained variation per principal component: {variance_ratios}')

In [None]:
# Store each principal component as its own column of the stacked frame.
# NOTE(review): this needs pca_result to have one row per row of
# frequent_words_df, which only holds if dropna() removed nothing - confirm.
for component, column in enumerate(('pca-one', 'pca-two', 'pca-three')):
    frequent_words_df[column] = pca_result[:, component]

In [None]:
# 3D scatter of the three principal components, coloured by word label.
# Passing projection= to Figure.gca() was deprecated in Matplotlib 3.4 and
# later removed; add_subplot is the supported way to get 3D axes.
ax = plt.figure(figsize=(16,10)).add_subplot(111, projection='3d')
scatter = ax.scatter(
    xs=frequent_words_df["pca-one"],
    ys=frequent_words_df["pca-two"],
    zs=frequent_words_df["pca-three"],
    c = frequent_words_df["word"],
    cmap='rainbow'
)
ax.set_xlabel('pca-one')
ax.set_ylabel('pca-two')
ax.set_zlabel('pca-three')
# Integer label -> word, matching the codes written into the "word" column.
# The legend previously listed only three names for six classes, so the
# handles for data/system/model were shown as wave/user/light.
label_names = {0: 'data', 1: 'system', 2: 'model', 3: 'wave', 4: 'user', 5: 'light'}
legend_handles = scatter.legend_elements()[0]
legend_labels = [label_names[code] for code in sorted(frequent_words_df["word"].unique())]
ax.legend(handles=legend_handles, labels=legend_labels)

plt.show()

In [None]:
# Same 3D scatter of the three principal components, coloured by word label.
# Passing projection= to Figure.gca() was deprecated in Matplotlib 3.4 and
# later removed; add_subplot is the supported way to get 3D axes.
ax = plt.figure(figsize=(16,10)).add_subplot(111, projection='3d')
scatter = ax.scatter(
    xs=frequent_words_df["pca-one"],
    ys=frequent_words_df["pca-two"],
    zs=frequent_words_df["pca-three"],
    c = frequent_words_df["word"],
    cmap='rainbow'
)
ax.set_xlabel('pca-one')
ax.set_ylabel('pca-two')
ax.set_zlabel('pca-three')
# Integer label -> word, matching the codes written into the "word" column.
# The legend previously listed only three names for six classes, so the
# handles for data/system/model were shown as wave/user/light.
label_names = {0: 'data', 1: 'system', 2: 'model', 3: 'wave', 4: 'user', 5: 'light'}
legend_handles = scatter.legend_elements()[0]
legend_labels = [label_names[code] for code in sorted(frequent_words_df["word"].unique())]
ax.legend(handles=legend_handles, labels=legend_labels)

plt.show()

In [None]:
# plotting only the less frequent words
# NOTE(review): 619 is a hard-coded row offset, presumably the first row after
# the data/system/model frames - confirm it matches the current data.
less_frequent_df = frequent_words_df[619:]
# Passing projection= to Figure.gca() was deprecated in Matplotlib 3.4 and
# later removed; add_subplot is the supported way to get 3D axes.
ax = plt.figure(figsize=(16,10)).add_subplot(111, projection='3d')
scatter = ax.scatter(
    xs=less_frequent_df["pca-one"],
    ys=less_frequent_df["pca-two"],
    zs=less_frequent_df["pca-three"],
    c = less_frequent_df["word"],
    cmap='rainbow'
)
ax.set_xlabel('pca-one')
ax.set_ylabel('pca-two')
ax.set_zlabel('pca-three')
# Derive the legend text from the labels actually present in the slice, so the
# names stay correct even if the offset above does not cut exactly at 'wave'.
label_names = {0: 'data', 1: 'system', 2: 'model', 3: 'wave', 4: 'user', 5: 'light'}
legend_handles = scatter.legend_elements()[0]
legend_labels = [label_names[code] for code in sorted(less_frequent_df["word"].unique())]
ax.legend(handles=legend_handles, labels=legend_labels)

plt.show()