In [2]:
%matplotlib inline
import scattertext as st
import re, io
from pprint import pprint
import pandas as pd
import numpy as np
import spacy.en
import os, pkgutil, json, urllib
from urllib.request import urlopen
from IPython.display import IFrame
from IPython.core.display import display, HTML
from scattertext import CorpusFromPandas, produce_scattertext_explorer
# Inject CSS that widens the notebook's `.container` element to 98% of the page,
# presumably so the 1200px-wide IFrames embedded further down fit without scrolling.
display(HTML("<style>.container { width:98% !important; }</style>"))

In [3]:
# Instantiate spaCy's English pipeline once; the same `nlp` object is later handed to
# CorpusFromPandas and used to split speeches into sentences.
nlp = spacy.en.English()

# Grab the 2012 political convention data set and preview it

In [4]:
# Load the 2012 convention sample data shipped with scattertext.
# The preview in the next cell shows `party`, `speaker` and `text` fields per row.
convention_df = st.SampleCorpora.ConventionData2012.get_data()

In [5]:
# Peek at the first record to see which fields each speech carries.
convention_df.iloc[0]

party                                               democrat
speaker                                         BARACK OBAMA
text       Thank you. Thank you. Thank you. Thank you so ...
Name: 0, dtype: object

In [6]:
def _total_words(party_rows):
    """Sum the whitespace-delimited token counts over one party's speeches."""
    tokens_per_speech = party_rows.text.apply(lambda speech: len(speech.split()))
    return tokens_per_speech.sum()


# Group once by party and reuse the grouping for both summaries.
speeches_by_party = convention_df.groupby('party')

print("Document Count")
print(speeches_by_party['text'].count())
print("Word Count")
# Left as the cell's final expression so the notebook renders the per-party totals.
speeches_by_party.apply(_total_words)

Document Count
party
democrat      123
republican     66
Name: text, dtype: int64
Word Count


party
democrat      76871
republican    58144
dtype: int64

# Turn it into a Scattertext corpus, and have spaCy parse it.

In [7]:
# Configure the builder first, then build: `party` supplies each document's
# category, `text` its raw contents, and the shared spaCy pipeline does the parsing.
corpus_builder = CorpusFromPandas(convention_df,
                                  category_col='party',
                                  text_col='text',
                                  nlp=nlp)
corpus = corpus_builder.build()

# Scattertext has built-in functions for finding characteristic words in the corpus


In [8]:
# Score every term against the background corpus, then show the first ten
# index entries in the order the scorer returns them.
background_scores = corpus.get_scaled_f_scores_vs_background()
list(background_scores.index[:10])

['obama',
 'romney',
 'barack',
 'mitt',
 'obamacare',
 'biden',
 'romneys',
 'hardworking',
 'bailouts',
 'autoworkers']

In [9]:
from gensim.models import doc2vec

In [10]:
# Build one TaggedDocument per sentence of every speech.
# Each sentence is tagged with its speech's entry in corpus._y (presumably the
# integer category index -- TODO confirm). To tag with category names instead,
# index np.array(corpus.get_categories()) with corpus._y.
# Words are the lower-cased tokens of the sentence with punctuation dropped.
docs = [
    doc2vec.TaggedDocument(
        words=[token.lower_ for token in sentence if not token.is_punct],
        tags=[category_id],
    )
    for category_id, speech in zip(corpus._y, corpus.get_texts())
    for sentence in nlp(speech).sents
]

In [11]:
# Hyper-parameters adapted from:
# https://github.com/scnakandala/twitch-gender/blob/68b50daffe5943f25524367a8f0e69ca5a4d70b7/exps/20160825_channel_classification/channel_all_chat_analysis.ipynb
# Per the gensim docs, dm=0 selects the distributed bag-of-words (PV-DBOW) variant and
# dbow_words=1 additionally trains word vectors, which the similarity query below relies on.
# NOTE(review): no seed is set and workers=4, so results will vary between runs.
d2v = doc2vec.Doc2Vec(min_count=5, window=5, size=100, sample=1e-5, negative=5, workers=4, dm=0, dbow_words=1)
# build_vocab() scans the corpus itself, so the separate scan_vocab() pass that used to
# precede it only repeated the same work and has been removed.
d2v.build_vocab(docs)
# Left as the final expression so the notebook displays train()'s return value.
d2v.train(docs, total_examples=len(docs), epochs=100)

2500823

In [12]:
# Ask the trained word vectors for the 20 terms ranked closest to "jobs"
# by most_similar_cosmul, then pretty-print the (term, score) pairs.
jobs_neighbors = d2v.wv.most_similar_cosmul(['jobs'], topn=20)
pprint(jobs_neighbors)

[('created', 0.9921624660491943),
 ('top', 0.9915152788162231),
 ('provides', 0.9909766912460327),
 ('invest', 0.9909660816192627),
 ('reduce', 0.9904212355613708),
 ('month', 0.9903976321220398),
 ('lower', 0.9902100563049316),
 ('commission', 0.9900608658790588),
 ('puts', 0.9899851083755493),
 ('math', 0.9897427558898926),
 ('creating', 0.9890012741088867),
 ('secretary', 0.988820493221283),
 ('hole', 0.988700807094574),
 ('improve', 0.9878247976303101),
 ('manufacturing', 0.9875862002372742),
 ('2,000', 0.987096905708313),
 ('helped', 0.9869441390037537),
 ('reform', 0.9868896007537842),
 ('clinton', 0.9868168830871582),
 ('wealthiest', 0.9868075847625732)]


# And it has some functions for finding terms which are associated with specific categories
Please see http://www.slideshare.net/JasonKessler/turning-unstructured-content-into-kernels-of-ideas/58 for an explanation of Scaled F-Score.

In [13]:
# Start from the raw per-category term frequencies and attach one
# scaled F-score column per party.
term_freq_df = corpus.get_term_freq_df()
term_freq_df['Republican Score'] = corpus.get_scaled_f_scores('republican')
term_freq_df['Democratic Score'] = corpus.get_scaled_f_scores('democrat')

# Report the ten highest-scoring terms for each party, Democrats first.
for heading, score_column in (("Top 10 Democratic terms", 'Democratic Score'),
                              ("Top 10 Republican terms", 'Republican Score')):
    print(heading)
    ranked_terms = term_freq_df.sort_values(by=score_column, ascending=False)
    print(list(ranked_terms.index[:10]))

Top 10 Democratic terms
['auto', 'america forward', 'insurance companies', 'auto industry', 'pell', 'last week', 'pell grants', "women 's", 'platform', 'coverage']
Top 10 Republican terms
['big government', "n't build", 'the constitution', 'mitt was', 'turned around', 'he wanted', 'of mitt', 'hands that', '16 trillion', 'in florida']


# Make and visualize chart, scale based on raw frequency.
### - A word used 10 times by Republicans will be at position 10 on the x-axis
### - This isn't very useful.  Everything but the most frequent terms is squished into the lower-left corner
### - The corner-distance scores are largely stopwords

In [14]:
# Render the explorer with raw-frequency scaling (st.Scalers.scale) and attach
# each speech's speaker as per-document metadata.
html = produce_scattertext_explorer(corpus,
                                    category='democrat',
                                    category_name='Democratic',
                                    not_category_name='Republican',
                                    width_in_pixels=1000,
                                    transform=st.Scalers.scale,
                                    metadata=convention_df['speaker'])
file_name = 'Conventions2012ScattertextScale.html'
# Write through a context manager so the file is flushed and closed
# before the IFrame below asks the browser to load it.
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
IFrame(src=file_name, width=1200, height=700)


## Using log scales seems to help a bit, but blank space and stop words still dominate the graph
### The characteristic terms look much more informative

In [16]:
# Render the explorer with log-scaled, standardized frequencies
# (st.Scalers.log_scale_standardize). No per-document metadata is passed here.
html = st.produce_scattertext_explorer(corpus,
                                       category='democrat',
                                       category_name='Democratic',
                                       not_category_name='Republican',
                                       width_in_pixels=1000,
                                       transform=st.Scalers.log_scale_standardize)
file_name = 'Conventions2012ScattertextLog.html'
# Write through a context manager so the file is flushed and closed
# before the IFrame below asks the browser to load it.
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
IFrame(src=file_name, width=1200, height=700)


# Rank terms by frequency percentiles instead of raw frequencies.
### A term at the middle of the x-axis will be mentioned by Republicans at the median frequency.
### This nicely distributes terms throughout the space
### But, terms occurring with the same frequencies in both classes are stacked atop each other.
### Can't mouseover points not at top of stack.

In [21]:
# Render the explorer positioning terms by frequency percentile
# (st.Scalers.percentile), with each speech's speaker as metadata.
html = produce_scattertext_explorer(corpus,
                                    category='democrat',
                                    category_name='Democratic',
                                    not_category_name='Republican',
                                    width_in_pixels=1000,
                                    transform=st.Scalers.percentile,
                                    metadata=convention_df['speaker'])
file_name = 'Conventions2012ScattertextRankData.html'
# Write through a context manager so the file is flushed and closed
# before the IFrame below asks the browser to load it.
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
IFrame(src=file_name, width=1200, height=700)


# One solution is to randomly jitter each point
## Points don't leave enough space for many labels
## Top terms are largely a result of jitter

In [22]:
# Same percentile layout as the previous chart, but with jitter=0.1 passed
# so that points sharing identical coordinates are spread apart.
html = produce_scattertext_explorer(corpus,
                                    category='democrat',
                                    category_name='Democratic',
                                    not_category_name='Republican',
                                    width_in_pixels=1000,
                                    jitter=0.1,
                                    transform=st.Scalers.percentile,
                                    metadata=convention_df['speaker'])
file_name = 'Conventions2012ScattertextRankDataJitter.html'
# Write through a context manager so the file is flushed and closed
# before the IFrame below asks the browser to load it.
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
IFrame(src=file_name, width=1200, height=700)


# The preferred solution is to fall back to alphabetic order among equally frequent terms
## Lets you mouseover all points
## Leaves a bit of room for labels
## Top points may be slightly distorted

In [23]:
# No `transform` is passed, so the library's default term positioning is used.
# Term significance is computed with LogOddsRatioUninformativeDirichletPrior.
html = produce_scattertext_explorer(corpus,
                                    category='democrat',
                                    category_name='Democratic',
                                    not_category_name='Republican',
                                    width_in_pixels=1000,
                                    metadata=convention_df['speaker'],
                                    term_significance=st.LogOddsRatioUninformativeDirichletPrior())
file_name = 'Conventions2012ScattertextRankDefault.html'
# Write through a context manager so the file is flushed and closed
# before the IFrame below asks the browser to load it.
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
IFrame(src=file_name, width=1200, height=700)

In [25]:
from scattertext import word_similarity_explorer

# Render an explorer centred on the target term "jobs". Terms seen fewer than
# 5 times are dropped via minimum_term_frequency; `pmi_filter_thresold` is
# the keyword's spelling as passed to the library, so it is left untouched.
html = word_similarity_explorer(corpus,
                                category='democrat',
                                category_name='Democratic',
                                not_category_name='Republican',
                                target_term='jobs',
                                minimum_term_frequency=5,
                                pmi_filter_thresold=4,
                                width_in_pixels=1000,
                                metadata=convention_df['speaker'])
file_name = 'demo_similarity.html'
# Write through a context manager so the file is flushed and closed
# before the IFrame below asks the browser to load it.
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
IFrame(src=file_name, width=1200, height=700)