In [1]:
%matplotlib inline
import scattertext as st
import re, io
from pprint import pprint
import pandas as pd
import numpy as np
import spacy.en
import os, pkgutil, json, urllib
from urllib.request import urlopen
from IPython.display import IFrame
from IPython.core.display import display, HTML
from scattertext import CorpusFromPandas, produce_scattertext_explorer
from scattertext.FastButCrapNLP import fast_but_crap_nlp
# Inject a CSS rule that sets elements with the `.container` class to 98% width
# (presumably to widen the notebook page for the large embedded plots — confirm
# against the Jupyter front end in use).
display(HTML("<style>.container { width:98% !important; }</style>"))



In [2]:
# Instantiate the spaCy English pipeline once; it is handed to the corpus
# builder via `nlp=` when the Scattertext corpus is constructed.
# NOTE(review): the `spacy.en` module path comes from early spaCy releases —
# confirm the installed spaCy version still provides it.
nlp = spacy.en.English()

# Grab the 2012 political convention data set and preview it

In [3]:
# Load Scattertext's ConventionData2012 sample corpus; the row previewed in the
# next cell shows `party`, `speaker` and `text` fields.
convention_df = st.SampleCorpora.ConventionData2012.get_data()

In [12]:
# Preview the first row (positional index 0) to see which fields a record carries.
convention_df.iloc[0]

party                                               democrat
speaker                                         BARACK OBAMA
text       Thank you. Thank you. Thank you. Thank you so ...
Name: 0, dtype: object

# Turn it into a Scattertext corpus, parsing the data with spaCy in the process.

In [5]:
# Configure the corpus builder: the `party` column supplies each document's
# category, the `text` column supplies its raw text, and `nlp` is the parser
# passed in for processing that text.
corpus_builder = CorpusFromPandas(
    convention_df,
    category_col='party',
    text_col='text',
    nlp=nlp,
)
corpus = corpus_builder.build()

# Scattertext has some built-in functions for letting us find characteristic words in the corpus

In [14]:
# NOTE: this rebinds `pprint` from the function imported in the first cell to
# the `pprint` module; later cells call `pprint.pprint(...)` and rely on it.
import pprint

print("Characteristic terms of the corpus (as compared to a sample of English on the Web)")
# Take the first ten index entries of the scores-vs-background result.
characteristic_terms = corpus.get_scaled_f_scores_vs_background().index[:10]
pprint.pprint(list(characteristic_terms))

Characteristic terms of the corpus (as compared to a sample of English on the Web)
['obama',
 'romney',
 'barack',
 'mitt',
 'obamacare',
 'biden',
 'romneys',
 'hardworking',
 'bailouts',
 'autoworkers']


# And it has some functions for finding terms which are associated with specific categories
* Please see http://www.slideshare.net/JasonKessler/turning-unstructured-content-into-kernels-of-ideas/58 for an explanation of Scaled F-Score.

In [17]:
# Start from the corpus term-frequency table and attach one scaled F-score
# column per party.
term_freq_df = corpus.get_term_freq_df()
term_freq_df['Republican Score'] = corpus.get_scaled_f_scores('republican')
term_freq_df['Democratic Score'] = corpus.get_scaled_f_scores('democrat')

# Rank by each score column (highest first) and show the ten leading terms.
print("Top 10 Democratic terms")
democratic_ranking = term_freq_df.sort_values(by='Democratic Score', ascending=False)
pprint.pprint(list(democratic_ranking.index[:10]))

print("Top 10 Republican terms")
republican_ranking = term_freq_df.sort_values(by='Republican Score', ascending=False)
pprint.pprint(list(republican_ranking.index[:10]))

Top 10 Democratic terms
['auto',
 'america forward',
 'auto industry',
 'insurance companies',
 'pell',
 'last week',
 'pell grants',
 "women 's",
 'platform',
 'millionaires']
Top 10 Republican terms
['big government',
 "n't build",
 'mitt was',
 'the constitution',
 'he wanted',
 'hands that',
 'of mitt',
 '16 trillion',
 'turned around',
 'in florida']


# Make and visualize chart, scale based on raw frequency.
* A word used three times by Republicans will be at position 3 on the x-axis
* This isn't very useful.  Everything but the most frequent terms is left in the lower-left corner
* The corner-distance scores are largely stopwords

In [8]:
# Build the explorer HTML using the raw-frequency scaler (`st.Scalers.scale`),
# comparing the 'democrat' category against the rest; the speaker column is
# passed along as `metadata`.
html = produce_scattertext_explorer(corpus,
                                    category='democrat',
                                    category_name='Democratic',
                                    not_category_name='Republican',
                                    width_in_pixels=1000,
                                    transform=st.Scalers.scale,
                                    metadata=convention_df['speaker'])
file_name = 'Conventions2012ScattertextScale.html'
# Write through a context manager so the handle is flushed and closed
# deterministically before the IFrame below points the browser at the file.
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
# Last expression of the cell: embeds the saved page in the notebook output.
IFrame(src=file_name, width=1200, height=700)


# Rank terms by frequency percentiles instead of raw frequencies.
* A term at the middle of the x-axis will be mentioned by Republicans at the median frequency.
* This nicely distributes terms throughout the space
* But terms occurring with the same frequencies in both classes are stacked atop each other.
* Can't mouseover points not at top of stack.

In [9]:
# Build the explorer HTML using the percentile scaler (`st.Scalers.percentile`),
# comparing the 'democrat' category against the rest; the speaker column is
# passed along as `metadata`.
html = produce_scattertext_explorer(corpus,
                                    category='democrat',
                                    category_name='Democratic',
                                    not_category_name='Republican',
                                    width_in_pixels=1000,
                                    transform=st.Scalers.percentile,
                                    metadata=convention_df['speaker'])
file_name = 'Conventions2012ScattertextRankData.html'
# Write through a context manager so the handle is flushed and closed
# deterministically before the IFrame below points the browser at the file.
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
# Last expression of the cell: embeds the saved page in the notebook output.
IFrame(src=file_name, width=1200, height=700)


# One solution is to randomly jitter each point
* Points don't leave enough space for many labels
* Top terms are largely a result of jitter

In [10]:
# Build the explorer HTML using the percentile scaler plus `jitter=0.1`,
# comparing the 'democrat' category against the rest; the speaker column is
# passed along as `metadata`.
html = produce_scattertext_explorer(corpus,
                                    category='democrat',
                                    category_name='Democratic',
                                    not_category_name='Republican',
                                    width_in_pixels=1000,
                                    jitter=0.1,
                                    transform=st.Scalers.percentile,
                                    metadata=convention_df['speaker'])
file_name = 'Conventions2012ScattertextRankDataJitter.html'
# Write through a context manager so the handle is flushed and closed
# deterministically before the IFrame below points the browser at the file.
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
# Last expression of the cell: embeds the saved page in the notebook output.
IFrame(src=file_name, width=1200, height=700)


# The preferred solution is to fall back to lexicographic order among equally frequent terms
* Lets you mouseover all points
* Leaves a bit of room for labels
* Top points may be slightly distorted

In [11]:
# Build the explorer HTML with no explicit `transform` or `jitter`, i.e. using
# the library's default term placement, comparing the 'democrat' category
# against the rest; the speaker column is passed along as `metadata`.
html = produce_scattertext_explorer(corpus,
                                    category='democrat',
                                    category_name='Democratic',
                                    not_category_name='Republican',
                                    width_in_pixels=1000,
                                    metadata=convention_df['speaker'])
file_name = 'Conventions2012ScattertextRankDefault.html'
# Write through a context manager so the handle is flushed and closed
# deterministically before the IFrame below points the browser at the file.
with open(file_name, 'wb') as html_file:
    html_file.write(html.encode('utf-8'))
# Last expression of the cell: embeds the saved page in the notebook output.
IFrame(src=file_name, width=1200, height=700)