# MHS - Topic Model Dashboard

In [1]:
import re, json, warnings, pickle, nltk
import pandas as pd
import numpy as np
from scipy import stats

# Primary visualizations
import matplotlib.pyplot as plt
import matplotlib
from matplotlib.patches import Patch
from matplotlib.lines import Line2D
import seaborn as sns
import plotly.express as px

# Import NLTK packages.
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

# Import sklearn packages.
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Import LDA visualizer.
import pyLDAvis, pyLDAvis.sklearn
pyLDAvis.enable_notebook()

# Import (Jupyter) Dash -- App Functionality
import dash, dash_table
from dash.dependencies import Input, Output, State
from dash.exceptions import PreventUpdate
import dash_core_components as dcc
import dash_bootstrap_components as dbc
import dash_html_components as html
from jupyter_dash import JupyterDash

# Suppress DeprecationWarning messages for the rest of the session.
warnings.simplefilter(action='ignore', category=DeprecationWarning)

# NLTK's English stop-word list, extended with the extra token 'mr'.
stop_words = stopwords.words("english") + ['mr']

# Root directory prepended to every data path below.
# NOTE(review): this is a machine-specific absolute path; the notebook will
# only run as-is where this directory exists.
abs_dir = "/Users/quinn.wi/Documents/"


# Topic table; rows containing any NA value are dropped
# (per the original note: entries without a named person).
topics = pd.read_csv(abs_dir + 'Data/Output/TopicModels/jqa_topics-40.csv').dropna()

# Keyword table shown in the dashboard's "Topic Keywords" tab.
topic_keywords = pd.read_csv(abs_dir + 'Data/Output/TopicModels/jqa_topics-40-keywords.csv')

# Presumably the topic correlation table (inferred from the file name) -- TODO confirm.
topic_corr = pd.read_csv(abs_dir + 'Data/Output/TopicModels/jqa_topics-40_corr.csv')

# Locations of the saved pyLDAvis exports (JSON and HTML).
ldaVis_json = abs_dir + 'Data/Output/TopicModels/jqa_topics-40_pyLDAvis.json'
ldaVis_html = abs_dir + 'Data/Output/TopicModels/jqa_topics-40_pyLDAvis.html'

In [2]:
%%time

# Reshape the topic columns to long form (one row per entry/topic pair),
# derive the year from 'date', then average 'proportion' per (year, topic).
trends = (
    topics
    .melt(
        id_vars=['file', 'entry', 'date', 'subject'],
        var_name='topic',
        value_name='proportion',
    )
    .assign(year=lambda long_form: pd.to_datetime(long_form['date']).dt.year)
    .groupby(['year', 'topic'])['proportion']
    .mean()
    .reset_index()
)


# NOTE: the pyLDAvis topic order does not match the topic order of the keywords...


# TODO: Dropdown menu to select topics.
# TODO: Draw the trends of the selected topics on a graph.
# TODO: Populate a table with the selected topics and their keywords.


# # Plot graph.
# sns.set(rc = {"figure.figsize": (12, 6)})
# sns.set_style("white")

# ax = sns.lineplot(data = trends, x = 'year', y = 'proportion', hue = 'topic')

CPU times: user 40.9 ms, sys: 5.66 ms, total: 46.5 ms
Wall time: 45.5 ms


## Functions

## App

In [3]:
%%time

# Create the Dash app with the DARKLY Bootstrap theme and a responsive
# viewport meta tag.
app = JupyterDash(
    __name__,
    external_stylesheets=[dbc.themes.DARKLY],
    meta_tags=[
        {"name": "viewport", "content": "width=device-width, initial-scale=1"},
    ],
)

# Allow callbacks to be registered for component ids that are not present
# in the initial layout.
app.config.suppress_callback_exceptions = True


# Load the saved pyLDAvis JSON export.
# NOTE(review): ldaVis_graph is currently referenced only by a commented-out
# dcc.Graph in the layout.
with open(ldaVis_json) as f:
    ldaVis_graph = json.load(f)


# Page layout: a title, a placeholder description, and one tab per view.
app.layout = html.Div(
    className = 'app-body',
    children = [

        html.H1('Topic Models'),
        html.P('Description'),

        # Tabs.
        dbc.Tabs(id = 'dash-tabs', children = [

            # Embeds the saved pyLDAvis HTML page through the app's asset URL.
            # NOTE(review): the file must be available in the app's assets
            # folder, and no explicit width/height is set on the iframe -- confirm.
            dbc.Tab(label = 'pyLDAvis',
                    children = [
                        html.Div(className = 'container', children = [
                            html.Iframe(src = app.get_asset_url('jqa_topics-40_pyLDAvis.html')),
                        ]),
                        # dcc.Graph(id = 'ldaVis_graph', figure = ldaVis_graph)
            ]),

            # Keyword table: one DataTable column per column of topic_keywords.
            dbc.Tab(label = 'Topic Keywords', children = [
                dash_table.DataTable(id = 'keywords',
                                     columns = [{'name': i, 'id': i} for i in topic_keywords.columns],
                                     # 'records' yields a list of {column: value} dicts, one per row.
                                     # The previous orient 'rows' only worked through deprecated
                                     # short-name matching and raises ValueError on pandas >= 2.0.
                                     data = topic_keywords.to_dict('records'))
            ]),

            # Placeholder tabs with no content yet.
            dbc.Tab(label = 'Topic Trends', children = []),

            dbc.Tab(label = 'lda-corr', children = [])
        ]),
    ]
)



###########################
######### Callbacks #######

###########################

# @app.callback(Output('tabs-content', 'children'), 
#               Input ('dash-tabs', 'value'))
# def render_content(tab):
    



# Start the Dash server when this code is executed as the main module.
# Alternative for rendering inside the notebook with JupyterDash:
#     app.run_server(mode = 'inline', debug = True)
if __name__ == "__main__":
    app.run_server(debug=True)

Dash app running on http://127.0.0.1:8050/
CPU times: user 40.1 ms, sys: 12.3 ms, total: 52.4 ms
Wall time: 63.7 ms
