# MHS-Word2Vec Dashboard

In [13]:
import os
import re, json, warnings, gensim
import pandas as pd
import numpy as np

# Primary visualizations
import matplotlib.pyplot as plt
import matplotlib
from matplotlib.patches import Patch
from matplotlib.lines import Line2D
import seaborn as sns

# PCA visualization
from scipy.spatial.distance import cosine
from sklearn.metrics import pairwise
from sklearn.manifold import MDS, TSNE
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA

# Import (Jupyter) Dash -- App Functionality
import dash
from dash.dependencies import Input, Output, State
import dash_table
import dash_core_components as dcc
import dash_html_components as html
from jupyter_dash import JupyterDash

# Ignore simple warnings.
warnings.simplefilter('ignore', DeprecationWarning)

# Declare directory location to shorten filepaths later.
# The MHS_W2V_DIR environment variable overrides the location so the notebook can run
# on another machine without editing this cell; the original path stays the default.
# os.path.join(..., "") guarantees a trailing separator, so `abs_dir + 'relative/path'`
# keeps working whether or not the override ends with one.
abs_dir = os.path.join(os.environ.get("MHS_W2V_DIR", "/Users/quinn.wi/Documents/"), "")

# Load model (word2vec text format; load_word2vec_format defaults to binary=False).
model = gensim.models.KeyedVectors.load_word2vec_format(abs_dir + 'Data/Output/WordVectors/jqa_w2v.txt')

## Functions

In [4]:
%%time

# https://www.kaggle.com/pierremegret/gensim-word2vec-tutorial
def tsnescatterplot(model, word, list_names):
    """
    Plot in seaborn the results from the t-SNE dimensionality reduction algorithm of the vectors of a query word,
    its list of most similar words, and a list of words.

    Parameters
    ----------
    model :
        Word-vector model that supports ``model[list_of_words]`` and
        ``model.most_similar([word])`` (e.g. the gensim ``KeyedVectors`` loaded above).
    word : str
        Query word; drawn in red.
    list_names : iterable
        Extra words, drawn in green. Each item may be a plain string or a
        ``(word, score)`` pair such as those returned by ``model.most_similar``.
        Items that are already plotted (the query word or one of its nearest
        neighbours) are skipped so no point or label is drawn twice.

    Returns
    -------
    matplotlib.figure.Figure
        The figure holding the annotated scatter plot. In a notebook, end the
        calling line with ``;`` to avoid the figure being displayed twice.

    Notes
    -----
    Whatever the model raises for an out-of-vocabulary word (``KeyError`` for
    gensim) propagates to the caller.
    """

    # Nearest neighbours of the query word (the model's default number of results); drawn in blue.
    neighbours = [entry[0] for entry in model.most_similar([word])]

    # Accept bare words as well as (word, score) pairs.
    extras = [item if isinstance(item, str) else item[0] for item in list_names]

    word_labels = [word]
    color_list = ['red']
    seen = {word}
    for group, colour in ((neighbours, 'blue'), (extras, 'green')):
        for label in group:
            if label in seen:
                continue
            seen.add(label)
            word_labels.append(label)
            color_list.append(colour)

    # One lookup for every vector; the embedding size comes from the model itself
    # instead of being hard-coded.
    arrays = np.asarray(model[word_labels], dtype='f')
    n_samples, n_features = arrays.shape

    # Reduce the dimensionality with PCA before t-SNE. PCA cannot return more
    # components than min(n_samples, n_features), so clamp the requested 41.
    reduc = PCA(n_components=min(41, n_samples, n_features)).fit_transform(arrays)

    # Global NumPy print setting inherited from the original implementation; it does
    # not affect the plot and is kept only so the function's side effects are unchanged.
    np.set_printoptions(suppress=True)

    # Finds t-SNE coordinates for 2 dimensions. The perplexity has to stay below the
    # number of samples, so clamp it when only a few words are plotted.
    Y = TSNE(n_components=2,
             random_state=0,
             perplexity=min(15, n_samples - 1)).fit_transform(reduc)

    # Sets everything up to plot
    df = pd.DataFrame({'x': Y[:, 0],
                       'y': Y[:, 1],
                       'words': word_labels,
                       'color': color_list})

    fig, ax = plt.subplots()

    # Basic plot, drawn on the axes created above rather than on pyplot's current axes.
    sns.regplot(data=df,
                x="x",
                y="y",
                fit_reg=False,
                marker="o",
                scatter_kws={'s': 40,
                             'facecolors': df['color']},
                ax=ax)

    # The t-SNE axes carry no interpretable units, so hide ticks and axis labels.
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_xlabel("")
    ax.set_ylabel("")

    # Label every point with its (title-cased) word.
    for x_pos, y_pos, label in zip(df['x'], df['y'], df['words']):
        ax.text(x_pos,
                y_pos,
                ' ' + label.title(),
                horizontalalignment='center',
                verticalalignment='bottom',
                size='small',
                color='gray',
                weight='normal')

    # Pad the limits so labels near the border are not clipped.
    ax.set_xlim(Y[:, 0].min() - 50, Y[:, 0].max() + 50)
    ax.set_ylim(Y[:, 1].min() - 50, Y[:, 1].max() + 50)

    ax.set_title('t-SNE visualization for {}'.format(word.title()))

    return fig

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 7.87 µs


## App

In [15]:
%%time

# App configurations
app = JupyterDash(__name__)
app.config.suppress_callback_exceptions = True

# Plot configurations.
sns.set_style("whitegrid", {'axes.grid' : False})
font = {'family' : 'serif',
        'weight' : 'normal',
        'size'   : 18}
matplotlib.rc('font', **font)
palette = sns.color_palette("Set1", 4)
# NOTE(review): this opens an empty figure that nothing in this notebook draws on
# (it is what produces the "<Figure size 1800x864 with 0 Axes>" output). If the goal
# was a default size, plt.rcParams['figure.figsize'] is the usual way -- confirm intent.
plt.figure(figsize=(25, 12))


# Layout.
app.layout = html.Div(
    className = 'wrapper',
    children = [

        # app-header
        html.Header(
            className = "app-header",
            children = [
                html.Div('Word2Vec Dashboard', className = "app-header--title")
            ]),

        # content-wrapper
        html.Div(
            className = 'content-wrapper',
            children = [
                # debounce=True reports the value only on Enter / loss of focus, so the
                # PCA + t-SNE plot is not recomputed for every partially typed word.
                dcc.Input(id = 'text', type = 'text', placeholder = 'work', debounce = True),
                dcc.Graph(id = 'text-plot')
            ])
])


###########################

######### Callbacks #######

###########################


def _mpl_scatter_to_plotly(fig):
    """Translate the annotated scatter drawn by ``tsnescatterplot`` into a Plotly figure dict."""
    ax = fig.axes[0]

    # tsnescatterplot writes one text label per point, at the point's coordinates,
    # so the labels carry everything needed to rebuild the scatter.
    xs, ys, labels = [], [], []
    for note in ax.texts:
        x_pos, y_pos = note.get_position()
        xs.append(float(x_pos))
        ys.append(float(y_pos))
        labels.append(note.get_text().strip())

    # Re-use the Matplotlib marker colours when there is exactly one per label;
    # otherwise let Plotly pick its default colour.
    marker = {'size': 8}
    if ax.collections:
        faces = ax.collections[0].get_facecolors()
        if len(faces) == len(labels):
            marker['color'] = [matplotlib.colors.to_hex(rgba) for rgba in faces]

    return {
        'data': [{
            'type': 'scatter',
            'mode': 'markers+text',
            'x': xs,
            'y': ys,
            'text': labels,
            'textposition': 'top center',
            'hoverinfo': 'text',
            'marker': marker,
        }],
        'layout': {
            'title': {'text': ax.get_title()},
            'xaxis': {'visible': False},
            'yaxis': {'visible': False},
        },
    }


@app.callback(
    # dcc.Graph renders its `figure` property and dcc.Input exposes the typed text
    # as `value`; the component ids match the layout defined above.
    Output('text-plot', 'figure'),
    Input('text', 'value'),
)
def update_textPlot(text_value):
    """
    Redraw the t-SNE plot for the word typed into the text box.

    Parameters
    ----------
    text_value : str or None
        Current content of the ``text`` input (``None`` until the user types).

    Returns
    -------
    dict
        Plotly figure description for the ``text-plot`` graph.

    Raises
    ------
    dash.exceptions.PreventUpdate
        When the input is empty or the word is not in the model's vocabulary,
        which leaves the current plot untouched instead of raising in the callback.
    """
    query = (text_value or '').strip()
    if not query or query not in model:
        raise dash.exceptions.PreventUpdate

    fig = tsnescatterplot(model, query, model.most_similar([query], topn = 25))
    if fig is None:
        # tsnescatterplot may only draw on pyplot's current figure without returning it.
        fig = plt.gcf()

    try:
        return _mpl_scatter_to_plotly(fig)
    finally:
        # Each callback creates a new Matplotlib figure; close it so they do not
        # accumulate in the long-running server process.
        plt.close(fig)


if __name__ == "__main__":
    # Start the Dash server with debug=True. No `mode` is passed, so JupyterDash uses
    # its default; the recorded output shows the app served at http://127.0.0.1:8050/.
    # The commented-out call below is the variant that renders the app inside the notebook.
#     app.run_server(mode = 'inline', debug = True) # mode = 'inline' for JupyterDash
    app.run_server(debug = True)

Dash app running on http://127.0.0.1:8050/
CPU times: user 60 ms, sys: 56.1 ms, total: 116 ms
Wall time: 328 ms


<Figure size 1800x864 with 0 Axes>