# MHS-Word2Vec Dashboard

In [1]:
import re, json, warnings, pickle, itertools, operator, gensim
import pandas as pd
import numpy as np

# Primary visualizations
import matplotlib.pyplot as plt
import matplotlib
from matplotlib.patches import Patch
from matplotlib.lines import Line2D
import seaborn as sns
import plotly.express as px

# PCA visualization
from scipy.spatial.distance import cosine
from sklearn.metrics import pairwise
from sklearn.manifold import MDS, TSNE
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA

# Import (Jupyter) Dash -- App Functionality
import dash, dash_table
from dash.dependencies import Input, Output, State
from dash.exceptions import PreventUpdate
import dash_core_components as dcc
import dash_bootstrap_components as dbc
import dash_html_components as html
from jupyter_dash import JupyterDash

# Ignore simple warnings.
warnings.simplefilter('ignore')

# Declare directory location to shorten filepaths later.
# NOTE: this is a machine-specific absolute path. It must keep its trailing
# slash because the paths below are built by plain string concatenation.
abs_dir = "/Users/quinn.wi/Documents/"

# Load the word vectors (word2vec text format) as gensim KeyedVectors.
model = gensim.models.KeyedVectors.load_word2vec_format(abs_dir + 'Data/Output/WordVectors/jqa_w2v.txt')

# Load the precomputed t-SNE coordinates. construct_graph() reads the
# 'words', 'x', 'y' and 'z' columns of this frame.
# (No leading slash here: abs_dir already ends with one.)
tsne_data = pd.read_csv(abs_dir + 'Data/Output/WordVectors/jqa_w2v_tsne-coordinates.csv', sep = ',')

## Functions

In [2]:
%%time


def zip_longest_no_fill(a, b):
    """Zip two iterables to the length of the longer one, without padding.

    Works like ``itertools.zip_longest`` except that positions where one
    iterable has run out are dropped instead of being filled, so the tuples
    produced past the end of the shorter iterable have a single element.

    Parameters
    ----------
    a, b : iterable
        The iterables to interleave.

    Yields
    ------
    tuple
        ``(a_i, b_i)`` while both iterables have items, then ``(a_i,)`` or
        ``(b_i,)`` for the remainder of the longer one.
    """
    # A private sentinel marks the padding. Comparing against the literal ''
    # by identity depends on string interning (and raises a SyntaxWarning),
    # and it would also strip genuine empty strings from the inputs.
    missing = object()
    for pair in itertools.zip_longest(a, b, fillvalue = missing):
        yield tuple(item for item in pair if item is not missing)


def construct_graph(data, word, topn):
    """Build a 3-D scatter plot of a word and its nearest neighbours.

    Parameters
    ----------
    data : pandas.DataFrame
        Coordinate table with the columns 'words', 'x', 'y' and 'z'.
    word : str
        The query word. It is plotted in a different colour from its
        neighbours.
    topn : int
        Number of neighbours requested from ``model.most_similar``.

    Returns
    -------
    plotly figure
        The figure produced by ``px.scatter_3d``.

    Raises
    ------
    KeyError
        Propagated from ``model.most_similar`` when ``word`` is not in the
        model vocabulary (callers in this notebook catch it).
    """
    neighbours = [neighbour for neighbour, _ in model.most_similar([word], topn = topn)]

    # Copy the filtered rows so that adding the colour column below does not
    # write into a slice of the caller's frame (SettingWithCopyWarning).
    dff = data[data.words.isin(neighbours + [word])].copy()

    # Neighbours in blue, the query word highlighted in light teal.
    dff['color'] = np.where(dff['words'] != word, '#37718E', '#AEF3E7')

    # 'identity' makes Plotly use the hex codes in the 'color' column as the
    # actual marker colours instead of treating them as category labels.
    fig = px.scatter_3d(dff, x = 'x', y = 'y', z = 'z',
                        text = 'words',
                        title = f't-SNE Cluster of "{word}" in dJQA.',
                        color = 'color',
                        color_discrete_map = 'identity')

    return fig

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.15 µs


## App

In [3]:
%%time

# Dash application object, styled with the Bootstrap "DARKLY" theme and a
# responsive viewport meta tag.
app = JupyterDash(
    __name__,
    external_stylesheets = [dbc.themes.DARKLY],
    meta_tags = [
        {"name": "viewport", "content": "width=device-width, initial-scale=1"},
    ],
)

# Do not raise at start-up for callbacks whose components are not in the
# initial layout.
app.config.suppress_callback_exceptions = True

# Options offered by the two analogy dropdowns. Only '+' and '-' are exposed;
# '*', '/' and an empty "None" choice are intentionally left out.
analogy_dropdown = [{'label': symbol, 'value': symbol} for symbol in ('+', '-')]

# Maps an operator symbol to the function that applies it to two vectors.
ops = {
    '+': operator.add,
    '-': operator.sub,
    '*': operator.mul,
    '/': operator.truediv,
}

# Layout: header, an "exploration" section (word input + neighbour-count
# slider driving a 3-D plot and a similarity table) and an "analogies"
# section (three word inputs joined by two operator dropdowns driving a table).
app.layout = html.Div(
    className = 'app-body',
    children = [

        # App header.
        html.Header(
            className="app-header",
            children = [
                html.Div('Word2Vec Dashboard', className = "app-header--title")
            ]),

        html.H1('Word2Vec Exploration'),
        html.P('Explanation.'),

        # Input, slider, and similarity description.
        dbc.Row(
            children = [
                dbc.Col(children = [
                    dbc.Input(id = 'text', type = 'text', value = 'work', debounce = True),

                    dcc.ConfirmDialog(id = 'explore-alert', 
                                      message = 'EXPLORE ALERT: Sorry, a word you\'ve entered does not appear in the corpus.'),

                    # Number of neighbours to show. The marks range stops at
                    # 36 so that the maximum value (35) also gets a label.
                    dcc.Slider(id = 'slider', min = 5, max = 35, step = 1, value = 15,
                               marks = {str(i): str(i) for i in range(5, 36, 5)}),
                ]),

                dbc.Col(children = [
                    html.P('Description of "similarity" value.')
                ])

            ]),

        # t-SNE cluster & data table.
        dbc.Row(children = [

            dbc.Col(dcc.Graph(id = 'text_plot')),
            dbc.Col(dash_table.DataTable(id = 'cosine-table'))
        ]),

        html.H1('Word2Vec Analogies'),
        html.P('Explanation'),

        # Analogies input: word, operator, word, operator, word.
        dbc.Row(
            children = [

                dbc.Col(children = [
                    dbc.Input(id = 'text2', type = 'text', value = 'work', debounce = True)
                ]),

                dbc.Col(children = [
                    dcc.Dropdown(id = 'dropdown1', options = analogy_dropdown, value = '+')
                ]),

                dbc.Col(children = [
                    dbc.Input(id = 'text3', type = 'text', value = 'congress', debounce = True)
                ]),

                dbc.Col(children = [
                    dcc.Dropdown(id = 'dropdown2', options = analogy_dropdown, value = '+')
                ]),

                dbc.Col(children = [
                    dbc.Input(id = 'text4', type = 'text', value = 'states', debounce = True)
                ]),

                dcc.ConfirmDialog(id = 'analogy-alert', 
                                  message = 'ANALOGIES ERROR: Sorry, a word you\'ve entered does not appear in the corpus.'),
            ]),

        # Analogies data table.
        dbc.Row(children = [
                     dash_table.DataTable(id = 'analogy-table')
                 ], 
                justify = 'center'),
])


###########################
######### Callbacks #######

###########################

@app.callback(Output('explore-alert', 'displayed'),
              Input('text', 'value')
)
def raise_alert(text):
    """Decide whether the 'explore-alert' dialog should be displayed.

    Parameters
    ----------
    text : str or None
        Current value of the exploration text input.

    Returns
    -------
    bool
        True when ``text`` is not a key of ``model.key_to_index`` (the dialog
        is shown), False when it is.
    """
    # Test membership rather than the truthiness of the looked-up index: the
    # word stored at index 0 is in the vocabulary but its index is falsy,
    # which previously triggered the alert for a valid word.
    return text not in model.key_to_index

        
# Update data table & graph.
@app.callback(
    [Output('text_plot', 'figure'), Output('cosine-table', 'data'), Output('cosine-table', 'columns')],
    [Input('text', 'value'), Input('slider', 'value')]
)
def update_dataTable_graph(text, slider):
    """Refresh the 3-D plot and the similarity table for the entered word.

    Parameters
    ----------
    text : str or None
        Current value of the exploration text input.
    slider : int
        Number of nearest neighbours to show.

    Returns
    -------
    tuple
        ``(figure, table_rows, table_columns)``. When the input is empty or
        the word is not in the model vocabulary, all three are
        ``dash.no_update`` so the previous plot and table stay on screen.
    """
    unchanged = (dash.no_update, dash.no_update, dash.no_update)

    if not text:
        return unchanged

    # Only an unknown word is an expected failure here; any other exception
    # is allowed to surface instead of being silently discarded.
    try:
        graph = construct_graph(tsne_data, text, slider)
        sims = model.most_similar([text], topn = slider)
    except KeyError:
        return unchanged

    cos_df = pd.DataFrame(sims, columns = ['word', 'similarity'])
    cos_df['similarity'] = cos_df['similarity'].round(3)
    cols = [{'name': i, 'id': i} for i in cos_df.columns]

    # 'records' is the supported orient name; the old 'rows' alias is
    # rejected by recent pandas releases.
    return graph, cos_df.to_dict('records'), cols


# Update analogy table.
@app.callback(
    [Output('analogy-table', 'data'), Output('analogy-table', 'columns')],
    [Input('text2', 'value'), Input('dropdown1', 'value'),
     Input('text3', 'value'), Input('dropdown2', 'value'), Input('text4', 'value')]
)
def update_analogies(text2, dropdown1, text3, dropdown2, text4):
    """Refresh the analogy table from three words and two operators.

    The operators are applied strictly left to right:
    ``(text2 <dropdown1> text3) <dropdown2> text4``. The resulting vector is
    passed to ``model.similar_by_vector``.

    Parameters
    ----------
    text2, text3, text4 : str or None
        Current values of the three analogy text inputs.
    dropdown1, dropdown2 : str or None
        Operator symbols; each must be a key of ``ops``.

    Returns
    -------
    tuple
        ``(table_rows, table_columns)``. Both are ``dash.no_update`` when a
        word is empty or missing from the vocabulary, or when an operator is
        not a key of ``ops`` (for example a cleared dropdown).
    """
    # Different results from model.similar_by_vector & model.most_similar
    # https://stackoverflow.com/questions/50275623/difference-between-most-similar-and-similar-by-vector-in-gensim-word2vec

    unchanged = (dash.no_update, dash.no_update)

    if not all((text2, text3, text4)):
        return unchanged

    # Every lookup below (operator table and model vocabulary) signals an
    # unknown key with KeyError.
    try:
        first_op, second_op = ops[dropdown1], ops[dropdown2]
        combined = second_op(first_op(model[text2], model[text3]), model[text4])
    except KeyError:
        return unchanged

    vector = model.similar_by_vector(combined)

    vector_results = pd.DataFrame(vector, columns = ['word', 'similarity'])
    vector_results['similarity'] = vector_results['similarity'].round(3)

    cols = [{'name': i, 'id': i} for i in vector_results.columns]

    # 'records' is the supported orient name; the old 'rows' alias is
    # rejected by recent pandas releases.
    return vector_results.to_dict('records'), cols


# Analogies/KeyError Alert.
@app.callback(Output('analogy-alert', 'displayed'),
              [Input('text2', 'value'), Input('text3', 'value'), Input('text4', 'value')]
)
def raise_alert(text2, text3, text4):
    """Decide whether the 'analogy-alert' dialog should be displayed.

    Parameters
    ----------
    text2, text3, text4 : str or None
        Current values of the three analogy text inputs.

    Returns
    -------
    bool
        True when at least one of the three values is not a key of
        ``model.key_to_index`` (the dialog is shown), False otherwise.
    """
    # The third input is 'text4'; it was previously wired to 'text3' a second
    # time, so the last word was never validated.
    # Membership is tested rather than the truthiness of the looked-up index,
    # because the word stored at index 0 has a falsy index.
    return any(word not in model.key_to_index for word in (text2, text3, text4))
        
        

if __name__ == "__main__":
    # Start the Dash server with debug tooling enabled.
    # Alternative for rendering inside the notebook (JupyterDash):
    #     app.run_server(mode = 'inline', debug = True)
    app.run_server(debug = True)

Dash app running on http://127.0.0.1:8050/
CPU times: user 30.6 ms, sys: 13.3 ms, total: 43.9 ms
Wall time: 53.5 ms


In [4]:
# Ad-hoc check of vector arithmetic outside the dashboard.
# NOTE: `*` binds tighter than `+`, so this evaluates
# work + (congress * states). The update_analogies callback instead applies
# its two operators strictly left to right, so the two are not equivalent.
# The multiplication is presumably element-wise on the word vectors - confirm.
model.similar_by_vector(model["work"] + model["congress"] * model["states"])

[('work', 0.8534066677093506),
 ('library', 0.5782566666603088),
 ('tomb', 0.5727782249450684),
 ('convenient', 0.5504840612411499),
 ('researches', 0.5455988049507141),
 ('study', 0.5386787056922913),
 ('show', 0.5370733141899109),
 ('materials', 0.537021815776825),
 ('pediment', 0.5339873433113098),
 ('allotted', 0.5324952602386475)]