Run gensim LDA on news, dimension reduce with Sklearn's TSNE, visualize in Bokeh with a date range slider.

In [1]:
'''Imports and configuration'''

import logging
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
logging.root.level = logging.INFO #this can be more or less verbose

import pandas as pd
import numpy as np
import time
# import seaborn as sns
import string
import multiprocessing
import os.path

import gensim

#Parameters
NUM_DATA = 2000 #number of data points
RETRAIN_MODEL = True
RERUN_TSNE = True

load_path = "./datasets" #path to load data
save_path = "./finra/" #path to save data

#Preprocessing parameters
LOW_THRESH = 2 #lower cutoff
HI_THRESH = 0.2 #upper cutoff

#LDA parameters
NUM_TOPICS = 10
PASSES = 5
MODEL_NAME = "lda_finra"

#TSNE parameters
MODE = "tsne"
NUM_EXAMPLE = NUM_DATA #if you want to plot fewer points
TSNE_RESULTS_NAME = "tsne_finra"
DIMENSIONS = 2

#plot parameters
MAKE_PLOT = True
START_DATE="1/1/1980"
END_DATE="1/1/2015"
JUMP=1

INFO : 'pattern' package found; tag filters are available for English


In [2]:
%%time
'''Load data. Make sure you set the right path!'''

# directory = os.path.dirname(save_path)
# if not os.path.exists(directory):
#     os.makedirs(directory)
# print(os.path.exists(load_path))

# news = pd.read_csv(load_path + "uci-news-aggregator.csv", engine='python')
import os

news = pd.read_csv(load_path + '/brokercheck_firms.csv')

# news = pd.DataFrame(columns=['id','TITLE'])

# for i, filename in enumerate(os.listdir(load_path)):
#     if filename.endswith(".txt"): 
        
# #         print(filename[:-4])
#         with open(load_path + '/' + filename, 'r') as f:
#             news.loc[i] = [int(filename[:-4]), f.readlines()]

# news.reset_index(inplace=True)
# news.to_csv('finra_drp')

print(news.shape)
# Display some columns
print(news.head())

(27232, 9)
                          firm     crd  \
0  FORGE FINANCIAL GROUP, INC.  100020   
1  FORGE FINANCIAL GROUP, INC.  100020   
2  FORGE FINANCIAL GROUP, INC.  100020   
3  FORGE FINANCIAL GROUP, INC.  100020   
4  FORGE FINANCIAL GROUP, INC.  100020   

                                             address date_initiated  \
0  301 YAMATO ROAD, SUITE 4160, BOCA RATON, FL  3...     05/10/2011   
1  301 YAMATO ROAD, SUITE 4160, BOCA RATON, FL  3...     09/09/2005   
2  301 YAMATO ROAD, SUITE 4160, BOCA RATON, FL  3...     10/06/2004   
3  301 YAMATO ROAD, SUITE 4160, BOCA RATON, FL  3...     12/31/2002   
4  301 YAMATO ROAD, SUITE 4160, BOCA RATON, FL  3...     11/01/2002   

  resolution_date                                        allegations  \
0      06/01/2011  RESPONDENT FORGE FINANCIAL GROUP, INC. FAILED ...   
1      11/08/2005  TRADE REPORT VIOLATIONS MRD200342689,MRD200444...   
2      10/06/2004  NASD CONDUCT RULE 2110 AND 3010, MARKETPLACE R...   
3      03/28/2003  NA

In [3]:
# '''randomize the data! Also get a smaller sample'''

# news = news.reindex(np.random.permutation(news.index))
# # print(news.head())
# news = news.head(NUM_DATA).reset_index(drop=True)
# print(news.head())


In [4]:
#convert unix time to human-readable
# from datetime import datetime

# ts = news.TIMESTAMP[1]//1000
# print(ts)

# # ts = 1394470372

# print(datetime.utcfromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S'))
# print(datetime.utcfromtimestamp(ts).strftime('%Y-%m-%d'))

# news['DATETIME'] = news.TIMESTAMP.apply(lambda ts: datetime.utcfromtimestamp(ts//1000).strftime('%Y-%m-%d'))
# print(news.DATETIME)

# # news.to_csv("uci-news-aggregator.csv")

Let's train our LDA model. We only need to do this once, afterwards we can save it to disk. We preprocess, construct a corpus and dictionary. Parameters are 20 topics, multicore. Afterwards we do a sanity check using pyLDAvis.

In [5]:
'''#custom fast tweet cleaner. Removes stopwords, stems, removes urls'''

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re

TEXT_CLEANING_RE = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"
stop_words = stopwords.words("english")
stemmer = SnowballStemmer("english")

def preprocess(text, stem=False):
    # Remove link, user and special characters
    text = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    tokens = []
    for token in text.split():
        if token not in stop_words:
            if stem:
                tokens.append(stemmer.stem(token))
            else:
                tokens.append(token)
#     return " ".join(tokens)
    return tokens


In [6]:
%%time

'''Clean text.'''
text = news.allegations
clean_text = text.apply(lambda x: preprocess(x))
clean_text.head()

Wall time: 32.1 s


In [7]:
%%time
'''Prepare dictionary and corpus. Also need mapping of original to cleaned text, for reverse indexing.'''

from gensim import corpora
import numpy
import random

#Apply tokenizer
# clean_text = tweets['text'].apply(tokenize)

# dictionary is a mapping of words to id. Filter out words that are too common/too rare
id2word = corpora.Dictionary(clean_text)
id2word.filter_extremes(no_below=LOW_THRESH, no_above=0.2)
print(id2word)

#corpus is a list of bag-of-words vectors for every document. Format is (word_id, word_freq)
corpus = [id2word.doc2bow(txt) for txt in clean_text]

#persist the dictionary and corpus
# id2word.save(save_path + 'tweets_news.dictionary')
# id2word = gensim.corpora.Dictionary.load('./data/brexit.dictionary')

# gensim.corpora.MmCorpus.serialize('./data/lda_corpus.mm', corpus)
#corpus = gensim.corpora.MmCorpus('./data/lda_corpus.mm')


INFO : adding document #0 to Dictionary(0 unique tokens: [])
INFO : adding document #10000 to Dictionary(11281 unique tokens: [u'taddonio', u'concessions', u'mdbi', u'yellow', u'trasnaction']...)
INFO : adding document #20000 to Dictionary(14944 unique tokens: [u'2011026502901', u'marketplacce', u'taddonio', u'concessions', u'ienatsch']...)
INFO : built Dictionary(16288 unique tokens: [u'2011026502901', u'marketplacce', u'taddonio', u'concessions', u'ienatsch']...) from 27232 documents (total 1364061 corpus positions)
INFO : discarding 7321 tokens: [(u'failed', 8124), (u'related', 7754), (u'affirmitive', 1), (u'mrd200342689', 1), (u'mrd200444715', 1), (u'mrd200444754', 1), (u'rule', 5835), (u'207reports', 1), (u'firm', 6841), (u'pewrcent', 1)]...
INFO : keeping 8967 tokens which were in no less than 2 and no more than 5446 (=20.0%) documents
INFO : resulting dictionary: Dictionary(8967 unique tokens: [u'mdbi', u'yellow', u'four', u'stonehurst', u'aegis']...)


Dictionary(8967 unique tokens: [u'mdbi', u'yellow', u'four', u'stonehurst', u'aegis']...)
Wall time: 2.57 s


In [8]:
'''print a sample of BOW documents. Careful inspection here is key to a good model.'''

text = text.reset_index(drop=True)

#given a corpus doc, prints the words, and original document
def print_doc(doc_index):
    print([(id2word[id], freq) for id, freq in corpus[doc_index]])
    print(text[doc_index], "\n")

# for i in range(10):
#     print_doc(i)

In [9]:
# %%time

'''Run LDA on BOW corpus and dictionary, or load one if it already exists.'''

if os.path.exists(save_path + 'lda_news.model') and not RETRAIN_MODEL:
    lda_model = gensim.models.LdaModel.load(save_path + 'lda_news.model')
    print("Loaded existing model")
else: 
    # Use a seed for reproducible topics
    seed = 42
#     numpy.random.seed(seed)
#     random.seed(seed)

    '''The LDA model. Parameters include number of topics, number of passes, alpha, eta, how often it updates... 
    This model updates once every 10K documents, and passes over the corpus 50 times. 
    The multicore version is faster, but can't do updated in an online manner 
    For an unknown number of topics, use HDP.'''
#     lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=40, 
#                                             random_state=seed, update_every=1, passes=1)
    lda_model = gensim.models.ldamulticore.LdaMulticore(corpus=corpus, id2word=id2word, num_topics=NUM_TOPICS, 
                                                        random_state=seed, passes=5)
    lda_model.save(save_path + 'lda_news.model')


INFO : using symmetric alpha at 0.1
INFO : using symmetric eta at 0.1
INFO : using serial LDA version on this node
INFO : running online LDA training, 10 topics, 5 passes over the supplied corpus of 27232 documents, updating every 22000 documents, evaluating every ~27232 documents, iterating 50x with a convergence threshold of 0.001000
INFO : training LDA model using 11 processes
INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #2000/27232, outstanding queue size 1
INFO : PROGRESS: pass 0, dispatched chunk #1 = documents up to #4000/27232, outstanding queue size 2
INFO : PROGRESS: pass 0, dispatched chunk #2 = documents up to #6000/27232, outstanding queue size 3
INFO : PROGRESS: pass 0, dispatched chunk #3 = documents up to #8000/27232, outstanding queue size 4
INFO : PROGRESS: pass 0, dispatched chunk #4 = documents up to #10000/27232, outstanding queue size 5
INFO : PROGRESS: pass 0, dispatched chunk #5 = documents up to #12000/27232, outstanding queue size 6
INFO : PR

INFO : PROGRESS: pass 2, dispatched chunk #2 = documents up to #6000/27232, outstanding queue size 3
INFO : PROGRESS: pass 2, dispatched chunk #3 = documents up to #8000/27232, outstanding queue size 4
INFO : PROGRESS: pass 2, dispatched chunk #4 = documents up to #10000/27232, outstanding queue size 5
INFO : PROGRESS: pass 2, dispatched chunk #5 = documents up to #12000/27232, outstanding queue size 6
INFO : PROGRESS: pass 2, dispatched chunk #6 = documents up to #14000/27232, outstanding queue size 7
INFO : PROGRESS: pass 2, dispatched chunk #7 = documents up to #16000/27232, outstanding queue size 8
INFO : PROGRESS: pass 2, dispatched chunk #8 = documents up to #18000/27232, outstanding queue size 9
INFO : PROGRESS: pass 2, dispatched chunk #9 = documents up to #20000/27232, outstanding queue size 10
INFO : PROGRESS: pass 2, dispatched chunk #10 = documents up to #22000/27232, outstanding queue size 11
INFO : PROGRESS: pass 2, dispatched chunk #11 = documents up to #24000/27232, out

INFO : PROGRESS: pass 4, dispatched chunk #8 = documents up to #18000/27232, outstanding queue size 9
INFO : PROGRESS: pass 4, dispatched chunk #9 = documents up to #20000/27232, outstanding queue size 10
INFO : PROGRESS: pass 4, dispatched chunk #10 = documents up to #22000/27232, outstanding queue size 11
INFO : PROGRESS: pass 4, dispatched chunk #11 = documents up to #24000/27232, outstanding queue size 12
INFO : PROGRESS: pass 4, dispatched chunk #12 = documents up to #26000/27232, outstanding queue size 13
INFO : PROGRESS: pass 4, dispatched chunk #13 = documents up to #27232/27232, outstanding queue size 14
INFO : merging changes from 22000 documents into a model of 27232 documents
INFO : topic #3 (0.100): 0.074*"use" + 0.056*"involved" + 0.052*"controversy" + 0.017*"executions" + 0.012*"execute" + 0.012*"suitability" + 0.011*"misrepresentation" + 0.011*"negligence" + 0.010*"trading" + 0.009*"market"
INFO : topic #5 (0.100): 0.018*"research" + 0.018*"alleged" + 0.017*"state" + 0.

In [10]:
'''Inspect topics'''

# from pprint import pprint

# # pprint(lda_model.print_topics())
# lda_model.print_topics()

'Inspect topics'

In [11]:
# %%time

'''visualize model with pyLDAvis, using PCA. TSNE is also possible, but probabilistic.
This step is also quite resource intensive - my personal 6 cores can't take it :(.'''

import pyLDAvis.gensim as gensimvis
import pyLDAvis

# pyLDAvis.enable_notebook()

print("displaying...")
vis_data = gensimvis.prepare(lda_model, corpus, id2word)
pyLDAvis.display(vis_data)

displaying...


RuntimeError: module compiled against API version 0xc but this version of numpy is 0xa

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


Next we feed the LDA vectors into TSNE to reduce the number of dimensions to 2. There are lots of parameters to tweak here. They will likely differ significantly depending on the size of the dataset.

In [12]:

n_topics = 20
threshold = 0
n_top_words = 5
n_iter = 500

num_example = NUM_EXAMPLE
print(lda_model[corpus][0])

#convert gensim corpus to numpy array
X_topics = gensim.matutils.corpus2dense(lda_model[corpus], 20)
X_topics = np.transpose(X_topics)
print(X_topics.shape)

_idx = np.amax(X_topics, axis=1) > threshold  # idx of news that > threshold
_topics = X_topics[_idx]

if MODE == "tsne":
    from sklearn.manifold import TSNE

    # t-SNE: 50 -> 2D, returns a numpy array
    if RERUN_TSNE:
        start = time.time()
        tsne_model = TSNE(n_components=DIMENSIONS, verbose=1, random_state=0, init='pca')
        tsne_lda = tsne_model.fit_transform(_topics[:num_example])
        print('TSNE took {} seconds'.format(time.time()-start))

        #save TSNE results
        np.save(save_path + TSNE_RESULTS_NAME + '.npy', tsne_lda)

    else:
        tsne_lda = np.load(save_path + TSNE_RESULTS_NAME + '.npy')
        print("Loaded TSNE matrix")
elif MODE == "pca":
    from sklearn.decomposition import PCA
    
    pca = PCA(n_components=2)
    tsne_lda = pca.fit(_topics).transform(_topics)
    
    print('explained variance ratio (first two components): %s'
      % str(pca.explained_variance_ratio_))
    

[(1, 0.3862457), (8, 0.57738334)]
(27232L, 20L)
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 2000 samples in 0.002s...
[t-SNE] Computed neighbors for 2000 samples in 0.111s...
[t-SNE] Computed conditional probabilities for sample 1000 / 2000
[t-SNE] Computed conditional probabilities for sample 2000 / 2000
[t-SNE] Mean sigma: 0.000002
[t-SNE] KL divergence after 250 iterations with early exaggeration: 60.180687
[t-SNE] KL divergence after 1000 iterations: 0.461614
TSNE took 11.5770001411 seconds


Finally we visualize using Bokeh. We use a scatterplot, add a legend, and add a date range slider.

In [13]:
import bokeh.plotting as bp
from bokeh.plotting import save
from bokeh.models import HoverTool
from bokeh.models import ColumnDataSource
from bokeh.io import output_notebook

# find the most probable topic for each headline
_lda_keys = []
for i in range(_topics.shape[0]):
    _lda_keys += _topics[i].argmax(),
_lda_keys = _lda_keys[:NUM_EXAMPLE]

print(tsne_lda.shape)
keys = np.array([_lda_keys])
print(keys.shape)
concat = np.concatenate((tsne_lda, keys.T), axis=1)
print(concat.shape)
np.save(save_path + 'topic_matrix.npy', concat) # this is useful for other plotting libraries

# # show topics and their top words
topic_summaries = []
# # vocab = cvectorizer.get_feature_names()

for i in range(NUM_TOPICS):
    topic_words = [str(x[0]) for x in lda_model.show_topic(i, topn=5)]
    topic_summaries.append(str(i) + ': ' + ' '.join(topic_words))
topic_summaries = np.asarray(topic_summaries, dtype=str)

# 20 colors
colormap = np.array([
"#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78", "#2ca02c",
"#98df8a", "#d62728", "#ff9896", "#9467bd", "#c5b0d5",
"#8c564b", "#c49c94", "#e377c2", "#f7b6d2", "#7f7f7f",
"#c7c7c7", "#bcbd22", "#dbdb8d", "#17becf", "#9edae5"
])

# plot
print("plotting...")
# print(num_example)
# print(news[:5])
# print(_lda_keys[:5])

title = "[News Aggregator] t-SNE visualization of LDA model trained on {} news, " \
        "{} topics, {} iter ({} data " \
        "points and top {} words)".format(
X_topics.shape[0], n_topics, threshold, n_iter, num_example, n_top_words)

print(topic_summaries)

(2000L, 2L)
(1L, 2000L)
(2000L, 3L)
plotting...
['0: 1 exchange rules nasd section' '1: act 1 section 2 exchange'
 '2: nasd rules 2110 finra alleged'
 '3: use involved controversy executions execute'
 '4: order system rules oats reports'
 '5: state alleged research certain sale'
 '6: misrepresentation negligence suitability contract breach'
 '7: findings supervisory without procedures sales'
 '8: transactions report trace g alleged'
 '9: exchange nyse violated procedures act']


In [14]:
output_notebook()

In [26]:
from datetime import date, datetime


#tsne_lda - point coordinates
#_lda_keys - list of topic key per news
#content is for hover-over

#create a base dataframe, parse dates
news_df = news.dropna() # pd.Dataframe(columns=['headline', 'x', 'y', 'color', 'topic_key', 'date'])
news_df = news_df[:NUM_EXAMPLE]
# news_df = news[['TITLE', 'DATETIME', 'TIMESTAMP']] # pd.Dataframe(columns=['headline', 'x', 'y', 'color', 'topic_key', 'date'])

news_df['DATETIME'] = news_df['date_initiated'].apply(lambda x : datetime.strptime(str(x), "%m/%d/%Y"))
news_df['x'] = tsne_lda[:, 0]
news_df['y'] = tsne_lda[:, 1]
news_df['fill_colors'] = colormap[_lda_keys]
news_df['topic_key'] = _lda_keys
news_df['topic_words'] = topic_summaries[_lda_keys]

#save a dataframe containing everything, useful for visualizing
news_df.to_csv(save_path + 'news_df.csv')

# print(type(news_df['DATETIME'][0]))
# print(news.DATETIME.tail())
print(news_df.head())
print(news_df.tail())

#function for making the dataset filtered by time range
def make_dataset(range_start, range_end):
    
#     assert range_start < range_end, "Start must be less than end!"
    
    news_df_copy = news_df.copy()
    news_df_copy = news_df_copy.loc[(news_df_copy['DATETIME'] > range_start) & (news_df_copy['DATETIME'] < range_end)]
    
    x_values = news_df_copy['x']
    y_values = news_df_copy['y']
    fill_colors = news_df_copy['fill_colors']
    
    firms = news_df_copy['firm']
    crds = news_df_copy['crd']
    contents = news_df_copy['allegations']
    topic_keys = news_df_copy['topic_key']
    dates = news_df_copy['DATETIME'].apply(lambda x: str(x))
    topic_words = news_df_copy['topic_words']
    
    data={
        "x_values": x_values,
        "y_values": y_values,
        "fill_color": fill_colors,
        "firm": firms,
        "crd": crds,
        "content": contents,
        
        "topic_key": topic_keys,
        "topic_words": topic_words,
        "date": dates
        }
    
    return ColumnDataSource(data)

def make_plot(src):
    
    plot_lda = bp.figure(plot_width=1000, plot_height=800,
                    title=title,
                    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
                    x_axis_type=None, y_axis_type=None, min_border=1)
    
    plot_lda.scatter(x="x_values", y="y_values",
                    marker="circle",
                    size=15,
                    line_color=None,
                    fill_color="fill_color",
                    fill_alpha=0.5,
                    legend="topic_words",
                    source=src
                    )
    return plot_lda

start=date(1980, 1, 1)
end=date(2015, 1, 1)

src = make_dataset(start, end)
plot_lda = make_plot(src)
# hover tools
hover = plot_lda.select(dict(type=HoverTool))
hover.tooltips = [("firm", "@firm"),
                  ("crd", "@crd"),
                    ("content", "@content"),
                  ("topic", "@topic_key"),
                 ("date", "@date")]

bp.show(plot_lda)

                                firm     crd  \
1        FORGE FINANCIAL GROUP, INC.  100020   
2        FORGE FINANCIAL GROUP, INC.  100020   
3        FORGE FINANCIAL GROUP, INC.  100020   
5  MULTIPLE FINANCIAL SERVICES, INC.  100100   
6                 WALL STREET ACCESS   10012   

                                             address date_initiated  \
1  301 YAMATO ROAD, SUITE 4160, BOCA RATON, FL  3...     09/09/2005   
2  301 YAMATO ROAD, SUITE 4160, BOCA RATON, FL  3...     10/06/2004   
3  301 YAMATO ROAD, SUITE 4160, BOCA RATON, FL  3...     12/31/2002   
5  15615 ALTON PARKWAY, SUITE 450, IRVINE, CA  92618     01/27/2012   
6   17 BATTERY PLACE 11TH FLOOR, NEW YORK, NY  10004     07/11/2014   

  resolution_date                                        allegations  \
1      11/08/2005  TRADE REPORT VIOLATIONS MRD200342689,MRD200444...   
2      10/06/2004  NASD CONDUCT RULE 2110 AND 3010, MARKETPLACE R...   
3      03/28/2003  NASD CONDUCT RULES 2110 AND 3010, AND NASD MAR...

'datetime.date' is coerced to a datetime. In the future pandas will
not coerce, and a TypeError will be raised. To retain the current
behavior, convert the 'datetime.date' to a datetime with
'pd.Timestamp'.


In [27]:
# news_df['date_initiated']

The chart works, so now it's time to add some interactivity to it! We need to create a slider widget, an event handler linked to the widget, and a callback function that modifies make_dataset() and updates the source.

Bokeh is interesting in how it works, because it essentially converts everything to Javascript, except for the interactive parts. So either we have to write the event handler and callback function in JS (which is annoying for manipulating a Pandas dataframe), I chose to instead make a Bokeh server. 

Jupyter notebook has a unique way of interacting with the server, called ipywidgets. Essentially they allow directly tweaking the parameters inside our update function - pretty nifty, but alas, they don't have native support for dates. That's when Stack Overflow came to the rescue!

In [28]:
'''Add widget, event handler and callback function'''

from datetime import date

from bokeh.io import output_file, show, push_notebook
from bokeh.layouts import widgetbox
from bokeh.models.widgets import RangeSlider, DateRangeSlider
from bokeh.models import CustomJS

start=date(1980, 1, 1)
end=date(2015, 1, 1)

#update function, only works on server
def update(range_):
    
    range_start = range_[0]
    range_end = range_[1]
    
    # Create new ColumnDataSource
    new_src = make_dataset(range_start, range_end)
    
    # Update the data on the plot
    src.data.update(new_src.data)
    push_notebook()

#create widget
range_select = DateRangeSlider(title="Date Range: ", start=date(1980, 1, 1), end=date(2015, 1, 1), value=(start, end), step=1)
# range_select.on_change('value', update)

# show(widgetbox(range_select))

In [29]:
'''Join all the elements using a layout'''

from bokeh.layouts import column, row, WidgetBox
from bokeh.models import Panel
from bokeh.models.widgets import Tabs

controls = WidgetBox(range_select)
layout = column(controls, plot_lda)
# tab = Panel(child=layout, title = 'Visualization')
# tabs = Tabs(tabs=[tab])
show(layout, notebook_handle=True)

In [31]:
import ipywidgets as widgets
from ipywidgets import interact

# import pandas as pd
# from datetime import datetime

start_date = datetime(1980, 1, 1)
end_date = datetime(2015, 1, 1)

dates = pd.date_range(start_date, end_date, freq='D')

options = [(date.strftime(' %d %b %Y '), date) for date in dates]
index = (0, len(options)-1)

selection_range_slider = widgets.SelectionRangeSlider(
    options=options,
    index=index,
    description='Dates',
    orientation='horizontal',
    layout={'width': '500px'}
)

# selection_range_slider
interact(update, range_=selection_range_slider)

aW50ZXJhY3RpdmUoY2hpbGRyZW49KFNlbGVjdGlvblJhbmdlU2xpZGVyKGRlc2NyaXB0aW9uPXUnRGF0ZXMnLCBpbmRleD0oMCwgMTI3ODQpLCBsYXlvdXQ9TGF5b3V0KHdpZHRoPXUnNTAwcHjigKY=


<function __main__.update>

So that's the basic functionality. After improving the quality of LDA and TSNE, the next useful step would probably be to try a different visualization method, like PCA. I suspect the TSNE output right now is a little too neat. There will probably be a lot more overlap, so then I'll want to enable muting parts of the legend. I also really want to make it 3D, but Bokeh doesn't have that support yet. Plotly does, and surprisingly Matplotlib too!


In [20]:
# from mpl_toolkits import mplot3d
# import matplotlib.pyplot as plt

# xdata = tsne_lda[:,0]
# ydata = tsne_lda[:,1]
# zdata = tsne_lda[:,2]
# colors = news_df['fill_colors']

In [21]:
# %matplotlib inline

# fig = plt.figure()
# ax = plt.axes(projection='3d')

# ax.scatter3D(xdata, ydata, zdata, c=colors)

# plt.show()