In [1]:
import scattertext as st
import re
from pprint import pprint
import pandas as pd
import spacy.en
from IPython.display import IFrame
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:98% !important; }</style>"))



In [2]:
nlp = spacy.en.English()

# Parse debates and create plotting interface
The function returns a Pandas data frame consisting of two columns, speaker and statement.  Speaker is the name of the speaker, given in all caps, and statement is the speech made during a particular turn.  

In [3]:
def debate_transcript_to_dataframe(fn, speakers):
    lines = open(fn).read().split('\n')
    cur_speaker = None
    speaker_start_re = re.compile(r'^([(]?[A-Z][A-Z][A-Z]+):?(.+)$')
    transcript = []
    cur_statement = ''
    cur_speaker = None
    for line in lines:
        match = speaker_start_re.match(line)
        if match:
            if match.group(1).startswith('('):
                continue
            if cur_speaker is not None:
                transcript.append({'speaker': cur_speaker, 'statement': cur_statement})
            cur_speaker = match.group(1).strip()
            cur_statement = match.group(2).strip() + '\n'
            for other_name in speakers:
                if other_name+':' in cur_statement:
                    cur_statement, other_statement = cur_statement.split(other_name)
                    transcript.append({'speaker': cur_speaker, 'statement': cur_statement.strip()})
                    transcript.append({'speaker': other_name, 'statement': other_statement.strip()})   
        else:
            cur_statement += line 
    df = pd.DataFrame(transcript)
    return df

## Read debates into Pandas data frames

In [4]:
parties = {'QUIJANO':'Moderator', 
           'KAINE':'Democratic', 
           'PENCE':'Republican', 
           'HOLT':'Moderator', 
           'CLINTON':'Democratic', 
           'TRUMP':'Republican',
           'COOPER':'Moderator',
           'RADDATZ':'Moderator',
           'WALLACE':'Moderator'}

debate_dfs = {}
for info in [
    {'debate': '1st', 'fn': 'presidential-debate-2016-09-26.txt', 'participants': ['TRUMP','CLINTON','HOLT']},
    {'debate': 'VP', 'fn': 'vp-debate-2016-10-04.txt', 'participants': ['PENCE','KAINE','QUIJANO']},
    {'debate': '2nd', 'fn': 'debate-2016-10-09-rush.txt', 'participants': ['TRUMP','CLINTON','COOPER','RADDATZ']},
    {'debate': '3rd', 'fn': 'debate-2016-10-19.txt', 'participants': ['TRUMP','CLINTON','WALLACE']}]:
    cur_df = debate_transcript_to_dataframe(info['fn'], info['participants'])
    cur_df['debate'] = info['debate']
    cur_df['party'] = cur_df['speaker'].apply(lambda x: parties[x])
    cur_df['speaker and debate']=cur_df['speaker'].apply(lambda x: x + ' ' + info['debate'])
    debate_dfs[info['debate']] = cur_df   
df_all = pd.concat(debate_dfs.values())
df_all.iloc[:2]

Unnamed: 0,speaker,statement,debate,party,speaker and debate
0,QUIJANO,Good evening. From Longwood University in Farm...,VP,Moderator,QUIJANO VP
1,QUIJANO,"I'm Elaine Quijano, anchor at CBSN, and corres...",VP,Moderator,QUIJANO VP


## Function to draw scatter plot in notebook. 
Creates a chart from text in a data frame, `df`.  The `category` and `other_category` parameters are the names of the columns we'll compare.  The `category_col` is the column in `df` that contains document categories, and contains `category` and `other_category`.  For example, if `category` is "TRUMP", then `category_col` would be "speaker". `extra` is append to the file name of the html file produced. 

`scores` is an array that 

The function returns an iFrame containing containing the HTML visualization, and as a side-effect writes the visualization to an html file, named `category.lower() + '-' + other_category.lower() + extra + '.html'`.

In [24]:
def draw_corpus(df, corpus, category, other_category, category_col, extra='', scores=None, singleScoreMode=False, minimum_term_frequency=2):
    html = st.produce_scattertext_explorer(corpus, 
                                           category=category, 
                                           category_name=category.lower() +' Term', 
                                           not_category_name=other_category.lower() + ' Term',
                                           pmi_filter_thresold=2,
                                           minimum_term_frequency=minimum_term_frequency,
                                           metadata=df['speaker and debate'],
                                           scores=scores,
                                           width_in_pixels=1000,
                                           singleScoreMode=singleScoreMode)
    file_name = category.lower() + '-' + other_category.lower() + extra + '.html'
    open(file_name, 'wb').write(html.encode('utf-8'))
    return IFrame(src=file_name, width = 1200, height=1000)

def draw_plot(df, category, other_category, category_col, extra=''):
    # Scattertext can only do a one column vs. all analysis.  We're excluding any other speakrs
    category_vs_other_df = df[(df[category_col] == category) | (df[category_col] == other_category)]
    corpus = st.CorpusFromPandas(category_vs_other_df, 
                                 category_col = category_col, 
                                 text_col = 'statement',
                                 nlp = nlp).build()
    return draw_corpus(category_vs_other_df,  corpus, category, other_category, category_col, extra=extra)

# Find the top words used by the candidates in the 3rd debate

In [6]:

category, other_category, category_col = 'CLINTON', 'TRUMP', 'speaker'
debate_3 = st.CorpusFromPandas(data_frame = debate_dfs['3rd'][( debate_dfs['3rd'][category_col] == category) 
                                                              | ( debate_dfs['3rd'][category_col] == other_category)], 
                               category_col = category_col, 
                               text_col = 'statement',
                               nlp = nlp).build()

term_df = debate_3.get_term_freq_df()
term_df['Trump'] = debate_3.get_scaled_f_scores('TRUMP')
term_df['Clinton'] = debate_3.get_scaled_f_scores('CLINTON')

print('Trump top terms')
print(term_df.sort_values(by='Trump', ascending=False).iloc[:20].index)
print('Clinton top terms')
print(term_df.sort_values(by='Clinton', ascending=False).iloc[:20].index)

Trump top terms
Index(['hillary', 'bad', 'she wants', 'you have', 'the border', 'justices',
       'and she', 'a disaster', 'no idea', 'strong', 'she 's', 'outsmarted',
       'signed', 'percent', 'deals', 'start', 'pay up', 'the baby', 'clinton',
       'strong borders'],
      dtype='object', name='term')
Clinton top terms
Index(['women', 'kind of', 'against', 'that is', 'work', 'stand', 'most',
       'undocumented', 'also', 'guns', 'rights', 'stand up', 'the kind',
       'the debt', 'health', 'security', 'roe', 'v. wade', 'american',
       'million'],
      dtype='object', name='term')


# Plot Clinton vs. Trump word use

In [8]:
draw_plot(df_all, 'CLINTON', 'TRUMP', 'speaker')

In [8]:
draw_plot(df_all, 'KAINE', 'PENCE', 'speaker')

In [53]:
draw_plot(df_all, 'Democratic', 'Republican', 'party')

# Visualize LDA topic model of the debates

## First, create a corpus of all the 2016 debates

In [9]:
corpus = st.CorpusFromPandas(df_all, 
                             category_col = 'party', 
                             text_col = 'statement',
                             nlp = nlp).build()

## Filter out bigrams and stopwords from the corpus, making a new one called `corpus_uni_stop`

In [26]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

# remove bigrams and stopwords
terms_to_ignore = [term for term 
                   in corpus._term_idx_store._i2val
                   if ' ' in term or "'" in term or term in ENGLISH_STOP_WORDS]
corpus_uni_stop = corpus.remove_terms(terms_to_ignore)


## Train two, party-specifc topic models and one general model

In [96]:
lda_models = {}
for party in ['Republican', 'Democratic', 'General']:
    #subset the term-document matrix to only speech from one paraty or aanother
    if party != 'General':
        X = corpus_uni_stop._X[corpus_uni_stop._y == corpus_uni_stop.get_categories().index(party),:]
    else:
        X = corpus_uni_stop._X
    lda_models[party] = (LatentDirichletAllocation(n_topics=20, 
                                                   max_iter=60,
                                                   learning_method='online',
                                                   learning_offset=50.,
                                                   random_state=0)
                         .fit(X))

In [97]:
def top_words_in_topic(model, corpus, topic_idx, n_top_words):
    return [corpus._term_idx_store.getval(i) for i 
            in model.components_[topic_idx].argsort()[:-n_top_words - 1:-1]]

def print_first_five_topics(model):
    for topic_idx, topic in list(enumerate(model.components_))[:5]:
        print("Topic #%d:" % topic_idx)
        print(', '.join(top_words_in_topic(model, corpus_uni_stop, topic_idx, 10)))
        
    print()
print("First Five General Topics")
print_first_five_topics(lda_models['General'])
print("First Five Republican Topics")
print_first_five_topics(lda_models['Republican'])
print("First Five Democratic Topics")
print_first_five_topics(lda_models['Democratic'])


First Five General Topics
Topic #0:
sunny, scranton, fort, wayne, roll, struggle, economy, know, honestly, different
Topic #1:
excuse, birth, read, certificate, blumenthal, fact, book, press, involve, mcclatchy
Topic #2:
defend, governor, pence, mate, strong, running, sean, hannity, believe, role
Topic #3:
oh, yes, puppet, ca, fine, explain, humbling, enormously, debates, commission
Topic #4:
gun, community, reform, police, check, comprehensive, violence, mayor, safe, background

First Five Republican Topics
Topic #0:
absolutely, quote, doubt, true, inaccurate, respectful, suspend, program, bet, yeah
Topic #1:
puppet, excuse, just, border, tax, trump, return, senator, use, nation
Topic #2:
record, recovery, economic, depression, slow, right, great, general, afraid, check
Topic #3:
certificate, birth, blumenthal, hard, mcclatchy, satisfied, reporter, cnn, patti, doyle
Topic #4:
comprehensive, reform, immigration, offend, fbi, path, party, stay, debt, choice

First Five Democratic Topics

In [95]:
import numpy as np
for i, topic in enumerate(lda_models['Democratic'].components_):
    scores = np.sort(scores)[::-1]
#    print(scores[:5])
#    print(np.percentile(scores, .99))

In [98]:
topic_idx = 10
print('Top terms in Dem topic %s' % topic_idx, 
      top_words_in_topic(lda_models['Democratic'], corpus_uni_stop, topic_idx, 10))
scores = lda_models['Democratic'].components_[topic_idx]
scores = np.percentile(np.sort(scores)[::-1], .95)
scores = (scores > 0.4).astype(np.float) * scores
#print(scores)
draw_corpus(clinton_trump_df, 
            corpus_uni_stop, 
            'Democratic', 
            'Republican', 
            'party', 
            extra='_dem_topic_%s'%(topic_idx), 
            scores = scores,
            minimum_term_frequency=1,
            singleScoreMode=True)

Top terms in Dem topic 10 ['opinion', 'police', 'safe', 'community', 'governor', 'virginia', 'morning', 'model', 'collapse', 'work']


In [53]:
topic_idx = 5
print('Top terms in Rep topic %s' % topic_idx, 
      top_words_in_topic(lda_models['Republican'], corpus_uni_stop, topic_idx, 10))
draw_corpus(clinton_trump_df, 
            corpus_uni_stop, 
            'Democratic', 
            'Republican', 
            'party', 
            extra='_republican_topic_%s'%(topic_idx), 
            scores=lda_models['Republican'].components_[topic_idx],
            minimum_term_frequency=1,
            singleScoreMode=True)

Top terms in Rep topic 5 ['interrupt', 'let', 'sentence', 'finish', 'ugh', 'right', 'grade', 'school', 'tax', 'raise']


## Find most characteristic terms used by Trump and Clinton

In [24]:
term_df = corpus.get_term_freq_df()

term_df['Trump'] = corpus.get_scaled_f_scores('TRUMP')
term_df['Clinton'] = corpus.get_scaled_f_scores('CLINTON')
print('Trump top terms')
print(term_df.sort_values(by='Trump', ascending=False).iloc[:20].index)
print('Clinton top terms')
print(term_df.sort_values(by='Clinton', ascending=False).iloc[:20].index)

Trump top terms
Index(['secretary clinton', 'clinton', 'tell you', 'leaving', 'agree',
       'should have', 'but you', 'politicians', 'wrong', 'tremendous', 'i 'll',
       'the war', 'you were', 'the way', 'she does', 'i agree', 'our jobs',
       'i just', 'very very', 'chicago'],
      dtype='object', name='term')
Clinton top terms
Index(['need to', ''ve got', 'information', 'that he', 'justice', 'proposed',
       'national', 'about what', 'people who', 'everyone', 'us to', 'part',
       'incomes', 'we also', 'criminal justice', 'hope', 'both', 'plans',
       'future', 'at home'],
      dtype='object', name='term')


# Comare Clinton and Trump's 1st debate

In [25]:
draw_plot(df, 'CLINTON', 'TRUMP', 'speaker', '1st')


# Comare Clinton and Trump's 2nd debate

In [12]:
draw_plot(df_2nd, 'CLINTON', 'TRUMP', 'speaker', '2nd')

# Compare Trump's 1st and 2nd Debates

In [13]:
draw_plot(df_all, 'TRUMP 1st', 'TRUMP 2nd', 'speaker and debate')

# Compare Clinton's 1st and 2nd Debates

In [14]:
draw_plot(df_all, 'CLINTON 1st', 'CLINTON 2nd', 'speaker and debate')

# Compare Clinton to Kaine

In [15]:
draw_plot(df_all, 'CLINTON', 'KAINE', 'speaker')

# Compare Trump to Pence

In [16]:
draw_plot(df_all, 'TRUMP', 'PENCE', 'speaker')

# The VP Debate

In [17]:
draw_plot(df, 'KAINE', 'PENCE', 'speaker')

# Compare Democrats to Republicans

In [18]:
draw_plot(df_all, 'Democratic', 'Republican', 'party')

# Compare the 1st to the 2nd debate

In [19]:
draw_plot(df_all, '1st', '2nd', 'debate')

In [20]:
draw_plot(df_all, '3rd', '1st', 'debate')