# Topic-specific term associations through word representations
## How do Democrats and Republicans talk different about jobs

https://github.com/JasonKessler/scattertext

Cite as:
Jason S. Kessler. Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ. Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL): System Demonstrations. 2017.

Link to preprint: https://arxiv.org/abs/1703.00565

`
@article{kessler2017scattertext,
  author    = {Kessler, Jason S.},
  title     = {Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ},
  booktitle = {Proceedings of ACL-2017 System Demonstrations},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
}
`

In [1]:
%matplotlib inline
import scattertext as st
from gensim.models import word2vec
import re, io, itertools
from pprint import pprint
import pandas as pd
import numpy as np
import spacy.en
import os, pkgutil, json, urllib
from urllib.request import urlopen
from IPython.display import IFrame
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:98% !important; }</style>"))

## Load the 2012 Conventions Dataset

In [2]:
import scattertext as st

In [3]:
nlp = spacy.en.English()
convention_df = st.SampleCorpora.ConventionData2012.get_data()
convention_df['parsed'] = convention_df.text.apply(nlp)
corpus = st.CorpusFromParsedDocuments(convention_df, category_col='party', parsed_col='parsed').build()

## Use Gensim to run Word2Vec on the corpus.
### Word2Vec encodes each word in a dense K-dimensional vector space
### Cosine distances between terms vectors correspond to semantic similarity 

In [4]:
def get_line_sentences_for_word2vec(corpus):
    return itertools.chain(*[[[t.lower_ for t in sent if not t.is_punct] 
                              for sent in doc.sents]
                             for doc in corpus.get_parsed_docs()])

In [11]:
model = word2vec.Word2Vec(size=100, alpha=0.025, window=5, min_count=5, 
                          max_vocab_size=None, sample=0, seed=1, workers=1, min_alpha=0.0001, 
                          sg=1, hs=1, negative=0, cbow_mean=0, iter=1, null_word=0,
                          trim_rule=None, sorted_vocab=1)
model.scan_vocab(get_line_sentences_for_word2vec(corpus))
model.build_vocab(get_line_sentences_for_word2vec(corpus))
model.train(get_line_sentences_for_word2vec(corpus), 
            total_examples=model.corpus_count, 
            epochs=200)
model.wv.most_similar('taxes')

[('spending', 0.8147529363632202),
 ('stamps', 0.8113826513290405),
 ('jobs', 0.7794510126113892),
 ('lower', 0.742141842842102),
 ('cut', 0.7089222073554993),
 ('benefits', 0.6803330183029175),
 ('governments', 0.6802129745483398),
 ('per', 0.6799307465553284),
 ('costs', 0.6778221726417542),
 ('students', 0.6717503070831299)]

## Draw the Scattertext by only coloring points that have are associated with a category (p < 0.05 via log-odds w/ prior)
### The top Democratic and Republican terms are raked by their similarity to "jobs"
#### Only the terms associated to a category are considered. 
### On the far right, the most similar terms, regardless of category association, are listed.

In [14]:
target_term = 'jobs'

html = st.word_similarity_explorer_gensim(corpus,
                                          category='democrat',
                                          category_name='Democratic',
                                          not_category_name='Republican',
                                          target_term=target_term,
                                          minimum_term_frequency=5,
                                          pmi_filter_thresold=4,
                                          width_in_pixels=1000,
                                          word2vec=model,
                                          metadata=convention_df['speaker'])
file_name = 'demo_similarity_gensim.html'
open(file_name, 'wb').write(html.encode('utf-8'))
IFrame(src=file_name, width = 1200, height=700)

## Instead of using vectors trained on the Corpus, we can use the spaCy-provided word vectors trained on the Common Crawl Corpus.
### These are trained on a lot more data, but aren't specific to the corpus

In [15]:
html = st.word_similarity_explorer(corpus,
                                   category='democrat',
                                   category_name='Democratic',
                                   not_category_name='Republican',
                                   target_term='jobs',
                                   minimum_term_frequency=5,
                                   pmi_filter_thresold=4,
                                   width_in_pixels=1000,
                                   metadata=convention_df['speaker'])
file_name = 'demo_similarity.html'
open(file_name, 'wb').write(html.encode('utf-8'))
IFrame(src=file_name, width = 1200, height=700)