# Explore & Visualise Results

### Settings

#### Libraries

In [None]:
from collections import Counter
import scipy.io 
import matplotlib.pyplot as plt 
import numpy as np
import os
from os.path import join
import pandas as pd
import pickle 
from sklearn.manifold import TSNE
from wordcloud import WordCloud

#### Set Options

In [None]:
# Configure pandas console display: 200-character width, up to 100 columns.
for option_name, option_value in (('display.width', 200), ('display.max_columns', 100)):
    pd.set_option(option_name, option_value)

# Close the current figure (if any) and switch matplotlib to interactive mode.
plt.close()
plt.interactive(True)

In [None]:
# Toggle: True mounts Google Drive via google.colab, False uses the local project path.
use_colab = True

#### Directories

In [None]:
# Make the project root the working directory so the relative paths used below resolve.
if use_colab:
    # Colab: mount Google Drive first, then change into the project folder on it.
    from google.colab import drive
    drive.mount('/content/drive', force_remount= True)
    # IPython magic: only valid inside a notebook / IPython session.
    %cd '/content/drive/My Drive/Thesis/Topic-Modeling/'
else:
    # Local run: machine-specific absolute path.
    os.chdir('/Users/M/Google_Drive/Thesis/Topic-Modeling')

In [None]:
# Input/output locations, relative to the current working directory.
data_dir_final = 'Data/Technology-Data/processed/final/'
# NOTE: despite its name this is the embeddings *file*; it is opened directly when the vectors are loaded.
emb_dir = 'Data/Embeddings/Word2Vec/Word2Vec_200.txt'
results_dir = 'Results/'
# Prefix shared by the result files of one model run (e.g. version + '_alpha.mat').
version = 'V15/DETM_V15_Exec_17-12-2020_09h17m'

### Explorative WordClouds

In [None]:
# One row per pseudo-text; the first CSV column becomes the row index.
pseudotexts = pd.read_csv(
    'Data/Technology-Data/processed/preprocessed/pseudotext_wordClouds.csv',
    sep=",",
    index_col=0,
)

In [None]:
# Draw one word cloud per selected row (index labels 0, 4 and 9) side by side
# and save the combined figure.
fig = plt.figure(figsize=(25,6))
ax = []

chosen_rows = [row for label, row in pseudotexts.iterrows() if label in [0,4,9]]
for i, row in enumerate(chosen_rows, start=1):
    panel = fig.add_subplot(1,3,i)
    ax.append(panel)
    # Negative y puts the title (the row's 'time' value) underneath the image.
    panel.set_title(row['time'], fontsize=20, y=-0.15)
    wc = WordCloud(background_color='white', width=3200, height=1800, collocations=False,contour_color='black').generate(row['words'])
    panel.imshow(wc, interpolation='bilinear')
    panel.axis('off')
fig.savefig('Results/wordClouds.png')

### Get Data and Results

#### Get Times

In [None]:
# Load the pickled list of time-slice labels; T is the number of time slices.
# NOTE: pickle.load must only be used on trusted files (here: the project's own output).
timestamps_path = data_dir_final + 'grouped_years/min_df_50/timestamps.pkl'
with open(timestamps_path, 'rb') as f:
    timelist = pickle.load(f)
T = len(timelist)
print('timelist: ', timelist)

#### Get Vocab and Embeddings

In [None]:
from utils import get_data

In [None]:
# Folder with the preprocessed corpus (the same folder timestamps.pkl is read from).
data_file = data_dir_final + 'grouped_years/min_df_50'
# get_data comes from the project's utils module; presumably it returns the vocabulary
# plus the full/train/validation/test splits — confirm against utils.get_data.
vocab, full, train, valid, test = get_data(data_file)
vocab_size = len(vocab)

In [None]:
# Build the (vocab_size x emb_size) embedding matrix: row i holds the pre-trained
# vector of vocab[i]; words missing from the embeddings file get a random vector
# and are collected in words_not_found.
emb_size = 200  # dimensionality of the vectors stored in the embeddings file
vocab_size = len(vocab)
vocab_lookup = set(vocab)  # O(1) membership test instead of scanning the list for every line
vectors = {}
with open(emb_dir, 'rb') as f:
    for l in f:
        line = l.decode().split()
        # Skip blank lines and anything that is not "<word> <emb_size numbers>"
        # (e.g. the "<count> <dim>" header some word2vec text exports start with).
        if len(line) != emb_size + 1:
            continue
        word = line[0]
        if word in vocab_lookup:
            # np.float was removed in NumPy 1.24; the builtin float maps to the same dtype.
            vectors[word] = np.array(line[1:]).astype(float)

embeddings = np.zeros((vocab_size, emb_size))
words_not_found = []
for i, word in enumerate(vocab):
    try:
        embeddings[i] = vectors[word]
    except KeyError:
        # The fallback must match the row width; the former size=(150,) could not
        # be assigned to a 200-dimensional row.
        embeddings[i] = np.random.normal(scale=0.6, size=(emb_size,))
        words_not_found.append(word)

#### Get Topic Vectors (Alpha)

In [None]:
# Load the topic vectors stored under the 'values' key of the run's _alpha.mat file.
alpha_file = join(results_dir, version + '_alpha.mat')
alpha_mat = scipy.io.loadmat(alpha_file)
alpha = alpha_mat['values']
print('alpha (topics,times,emb-dim): ', alpha.shape)

As expected, the correlation between a topic's vectors is higher for consecutive time slices (8 vs. 9) than for distant ones (0 vs. 9):

In [None]:
# Correlation-coefficient matrix between the vectors of topic 6 at time indices 0 and 9
# (alpha is indexed topic, time, embedding dimension).
np.corrcoef(alpha[6,0,:],alpha[6,9,:])

In [None]:
# Same comparison for topic 6 at the adjacent time indices 8 and 9.
np.corrcoef(alpha[6,8,:],alpha[6,9,:])

#### Get Topic Words (Beta)

In [None]:
# Load the topic-word values stored under the 'values' key of the run's _beta.mat file.
beta_file = join(results_dir, version + '_beta.mat')
beta_mat = scipy.io.loadmat(beta_file)
beta = beta_mat['values']
print('beta (topics,times,vocab): ', beta.shape)

### Explore Topics (Topic Labels, Beta)

In [None]:
# Read one label per line; the 0-based line number is the topic id.
with open(results_dir + version + '_topic_labels.txt', 'rb') as f:
    raw_labels = f.read().splitlines()
topic_labels = {t_id: raw.decode() for t_id, raw in enumerate(raw_labels)}

In [None]:
# Print, for every topic and every time slice, the `num_words` vocabulary words
# with the largest beta values, largest first.
num_words = 10
num_topics = beta.shape[0]
times = range(len(timelist))
for k, topic_beta in enumerate(beta):
    print('\n')
    print('Topic {}: "{}"'.format(k,topic_labels[k]))
    for t in times:
        # argsort is ascending: keep the last `num_words` indices, then reverse them.
        best_first = topic_beta[t].argsort()[-num_words:][::-1]
        topic_words = [vocab[token] for token in best_first]
        print('...{}: {}'.format(timelist[t], topic_words))

### Coverage (Topic Proportions - Theta)

#### Get Topic Proportions (Theta) and Topic Proportion Averages for Each Time Slice
Both are computed over the entire dataset.

In [None]:
# Load the per-document topic proportions stored under the 'values' key of the run's _theta.mat file.
theta_file = join(results_dir, version + '_theta.mat')
theta_mat = scipy.io.loadmat(theta_file)
theta = theta_mat['values']
print('topic proportions ("theta") full dataset (docs,topics): ', theta.shape)

In [None]:
# Average topic proportions per time slice (rows indexed by the 'time' column).
props = pd.read_csv(results_dir + version + '_theta_avg.csv', index_col='time')

# Append each topic's label to its column name, pairing columns and labels in order.
labelled_columns = []
for column, label in zip(props.columns, topic_labels.values()):
    labelled_columns.append(" ".join((column, label)))
props.columns = labelled_columns
print('avg topic proportions (times,topics): ', props.shape)

#### WordCloud for tech-related Topics

In [None]:
# Column-wise mean of props: one average proportion per topic column, taken over all rows (time slices).
props_avg = props.mean(axis=0)

In [None]:
# Word-cloud weights for the tech-related topics:
# topic label -> average proportion scaled by 10000 and truncated to an int.
# exclude non-tech topics for this visualisation
non_tech_topics = {2,7,9,20,26,30,39,40,45,52,53,54,57,58,65,69}
topics_pseudotext = {}
for i in range(num_topics):
    if i in non_tech_topics:
        continue
    # Look the average up by column label. props_avg is labelled by column name,
    # so a bare integer key relies on pandas' deprecated positional fallback and
    # would return the wrong topic once props_avg has been sorted in place.
    topics_pseudotext[topic_labels[i]] = int(props_avg.loc[props.columns[i]]*10000)

In [None]:
# Build the topic word cloud from the label -> weight mapping, show it and save it.
# The former stand-alone wc.to_image() call was dropped: its result was discarded,
# and imshow/to_file render the image themselves.
from wordcloud import WordCloud
wc = WordCloud(background_color='white',width=7000, height=2000).generate_from_frequencies(topics_pseudotext)
plt.close()
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()
# The trailing semicolon keeps the notebook from echoing to_file's return value.
wc.to_file('Results/wordCloud_topics.png');

#### Coverage Ranking

In [None]:
# Rank topics by average proportion, largest first (sorts props_avg in place).
props_avg.sort_values(ascending=False, inplace=True)
# Column names start with 'Topic-<id>'; recover the integer id from the first token.
topic_ranking = [int(name.split()[0].replace('Topic-','')) for name in props_avg.index]
print('Top 25 Topics:')
ranked_lines = [str(t) + ' ' + topic_labels[t].replace('\n','') for t in topic_ranking]
print(np.array(ranked_lines[:25]))

#### Topic Proportions over Time for each Topic

In [None]:
# Plot the proportions over time in groups of `group_size` topics per figure.
# Stepping with range() also plots a final, shorter group when num_topics is not
# a multiple of group_size (the former `while end <= num_topics` skipped it).
group_size = 5
for beg in range(0, num_topics, group_size):
    end = min(beg + group_size, num_topics)
    group = props.iloc[:,beg:end]
    ax = group.plot()
    # Legend is placed outside the axes, to the right of the plot.
    ax.legend(list(group.columns), frameon=True, loc='upper right', bbox_to_anchor=(1.7, 1))
    # Label every second time slice.
    ax.set_xticks(np.arange(T)[1::2])
    ax.set_xticklabels(timelist[1::2])
    ax.set_xlabel('time')
    ax.set_ylabel('proportions')
    ax.set_title('Topics {} - {}'.format(beg,end-1), fontsize=12)

#### Topic Proportions over Time for Selected Topics

In [None]:
# One figure per group of selected topics (column positions in props), saved as
# Results/Topic_Popularities_<group index>.png.
topics_pop = [[31,34,38],[44,60,73]]
for idx, topics in enumerate(topics_pop):
    popularity_plot = props.iloc[:,topics].plot(figsize=(8,5),style=['g-.','b--','r-','k:'],linewidth=2, marker='o', markersize=4)
    # Index with the flat list: props.columns[[topics]] is 2-D indexing, which
    # pandas >= 2.0 rejects and which never produced one legend entry per line.
    popularity_plot.legend(list(props.columns[topics]), frameon=True, fontsize=14)
    # Label every third time slice.
    popularity_plot.set_xticks(np.arange(T)[0::3])
    popularity_plot.set_xticklabels(timelist[0::3], fontsize=14)
    popularity_plot.set_xlabel('time', fontsize=14, labelpad=5)
    popularity_plot.set_ylabel('average topic proportions', fontsize=14, labelpad=5)
    plt.savefig('Results/Topic_Popularities_{}.png'.format(idx))

### Word Use Evolution (Beta) for Selected Topics

In [None]:
# Words whose evolution is plotted per topic: key = topic index (first axis of beta),
# value = vocabulary words to track. Each word is looked up with vocab.index, so it
# must be present in vocab; the plotting cell defines 4 line styles, so at most
# 4 words per topic.
topic_words = {11:['business','world','social_media'],
               34:['children','Facebook','social_media'],
               72:['mobile','4G','5G'],
               44:['VoIP','Skype','Siri','Alexa'],
               31:['artificial_intelligence','AI','chess','bots'],
               32:['banking','Bitcoin','blockchain','cryptocurrency'],
               61:['self-driving','Uber','safety'],
               73:['Nasa','moon','SpaceX','Elon_Musk'],
               38:['CD','MP3','iPod','streaming'],
               46:['e-commerce','Kindle','delivery','Fire'],
               12:['facial_recognition','digital','3D','smartphone'],
               68:['laser','3D_printing','paper','objects'],
               }

In [None]:
# 4x3 grid with one panel per entry of topic_words; each panel shows the beta value
# of the selected words within that topic over all time slices.
fig, axes = plt.subplots(nrows=4, ncols=3, figsize=(18, 20), dpi=80, facecolor='w', edgecolor='k')
axes = list(axes.flatten())

# Per-word line styling, indexed by the word's position within its topic's list.
colors = ['#41ab5d','#08589e','#ca0020','#ff7f00']
linestyles = ['-','--','-.',':']
linewidths = [1.8,2,2,2.5]

for idx, (a, words) in enumerate(topic_words.items()):
    ax = axes[idx]
    # Resolve all vocabulary positions first (raises ValueError for unknown words).
    tokens = [vocab.index(w) for w in words]
    for i, (word, token) in enumerate(zip(words, tokens)):
        ax.plot(range(T), beta[a, :, token], label=word.replace('_',' '), linestyle=linestyles[i], lw=linewidths[i], color=colors[i], marker='o', markersize=5)
    ax.legend(frameon=True, loc='best', fontsize=12)
    # Label every third time slice.
    ax.set_xticks(np.arange(T)[0::3])
    ax.set_xticklabels(timelist[0::3],fontsize=12)
    ax.set_title('Topic {} - {}'.format(a, topic_labels[a]), fontsize=14)
fig.subplots_adjust(hspace=0.25)
plt.savefig('Results/Topic_Evolutions.png')

### Explore Topic Vectors (Alpha) in Embedding Space

In [None]:
def nearest_neighbors_from_vector(vector, embeddings, vocab, num_words):
    """Return the words whose embeddings are most cosine-similar to `vector`.

    Args:
        vector: Query vector of length `emb_dim`; a column vector of shape
            `(emb_dim, 1)` also works because intermediate results are squeezed.
        embeddings: 2-D array with one embedding per row, aligned with `vocab`.
        vocab: Sequence of words; `vocab[i]` belongs to `embeddings[i]`.
        num_words: Number of nearest neighbours to return.

    Returns:
        A tuple `(nearest_neighbors, ranks)`: the `num_words` most similar
        words (most similar first) and the cosine similarity of every row of
        `embeddings` to `vector`. A zero-norm row (or a zero `vector`) has an
        undefined similarity; it is reported as NaN in `ranks` and ranked last.
    """
    dots = embeddings.dot(vector).squeeze()
    squared_norms = vector.T.dot(vector).squeeze() * np.sum(embeddings**2, 1)
    # 0/0 for zero-norm rows yields NaN on purpose; silence the warning only here.
    with np.errstate(divide='ignore', invalid='ignore'):
        ranks = dots / np.sqrt(squared_norms)

    # argsort places NaN last, so after reversing NaN rows would be listed first;
    # sort a copy in which undefined similarities count as the worst match.
    sortable = np.where(np.isnan(ranks), -np.inf, ranks)
    most_similar = sortable.argsort()[::-1][:num_words]
    nearest_neighbors = [vocab[idx] for idx in most_similar]
    return nearest_neighbors, ranks

In [None]:
def plot_alpha(alpha, selected_topics, time_slice, perplexity):
    """Scatter-plot a 2-D t-SNE projection of selected topic vectors at one time slice.

    Args:
        alpha: Array indexed as `alpha[topic, time, :]`, giving one vector per
            topic and time slice.
        selected_topics: Topic indices to include; each is also looked up in
            the module-level `topic_labels` for its annotation.
        time_slice: Time index whose topic vectors are projected.
        perplexity: Perplexity passed to `TSNE`.
    """
    embs = np.array([alpha[topic,time_slice,:] for topic in selected_topics])
    annotation = ['Topic-{} {}'.format(topic,topic_labels[topic]) for topic in selected_topics]

    #https://www.kaggle.com/jeffd23/visualizing-word-vectors-with-t-sne
    tsne_model = TSNE(perplexity=perplexity, n_components=2, init='pca', n_iter=250, random_state=1)
    tsne_values = tsne_model.fit_transform(embs.tolist())

    plt.close()
    plt.figure(figsize=(8, 8))
    # One black dot per topic, annotated with its 'Topic-<id> <label>' text.
    for (x, y), text in zip(tsne_values, annotation):
        plt.scatter(x, y, color='black')
        plt.axis('off')
        plt.annotate(text,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     color='black',
                     ha='right',
                     va='bottom')
    plt.show()

In [None]:
# Project the vectors of the selected topics at time index 9 (perplexity 2).
sel_topics = [11,34,72,44,31,32,61,73,38,46,12,68]
plot_alpha(alpha, selected_topics=sel_topics, time_slice=9, perplexity=2)

In [None]:
def plot_selected_topic_tsne(alpha, selected_times, selected_topic, vocab, num_neighbors, embeddings, perplexity):
    """Plot one topic's vectors at selected times together with its nearest words (t-SNE).

    Args:
        alpha: Array indexed as `alpha[topic, time, :]`.
        selected_times: Time indices at which the topic vector itself is plotted.
        selected_topic: Index of the topic to visualise.
        vocab: Word list aligned with the rows of `embeddings`; must support `.index`.
        num_neighbors: Number of nearest words collected per time slice.
        embeddings: 2-D array of word embeddings, one row per `vocab` entry.
        perplexity: Perplexity passed to `TSNE`.
    """
    n_times = len(alpha[selected_topic,:,:])

    # Topic vectors at the requested times, in ascending time order.
    chosen_times = [t for t in range(n_times) if t in selected_times]
    embs = [alpha[selected_topic,t,:] for t in chosen_times]
    annotation = ['Topic-{}_Time-{}'.format(selected_topic,t) for t in chosen_times]

    # Nearest words of the topic vector at every time slice, each word added once.
    for t in range(n_times):
        neighbours, _ = nearest_neighbors_from_vector(alpha[selected_topic,t,:], embeddings, vocab, num_neighbors)
        for word in neighbours:
            if word in annotation:
                continue
            annotation.append(word)
            embs.append(embeddings[vocab.index(word)])
    embs = np.array(embs)

    tsne_model = TSNE(perplexity=perplexity, n_components=2, init='pca', n_iter=250, random_state=1)
    tsne_values = tsne_model.fit_transform(embs.tolist())

    plt.close()
    plt.figure(figsize=(5, 5))
    for (x, y), text in zip(tsne_values, annotation):
        plt.scatter(x, y, color='black')
        plt.annotate(text,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     color='black',
                     ha='right',
                     va='bottom')
        plt.axis('off')
    plt.show()

In [None]:
# Topic 72: its vectors at time indices 0 and 9, plus the single nearest vocabulary
# word (num_neighbors=1) of the topic vector at every time slice.
plot_selected_topic_tsne(alpha=alpha, selected_times=[0,9], selected_topic = 72, vocab=vocab, num_neighbors=1, embeddings=embeddings, perplexity=10)