In [1]:
import scattertext as st
import re
from pprint import pprint
import pandas as pd
import spacy.en
from IPython.display import IFrame



In [2]:
nlp = spacy.en.English()

# Parse debates and create plotting interface

In [3]:
def debate_transcript_to_dataframe(fn, speakers):
    lines = open(fn).read().split('\n')
    cur_speaker = None
    speaker_start_re = re.compile(r'^([(]?[A-Z][A-Z][A-Z]+):?(.+)$')
    transcript = []
    cur_statement = ''
    cur_speaker = None
    for line in lines:
        match = speaker_start_re.match(line)
        if match:
            if match.group(1).startswith('('):
                continue
            if cur_speaker is not None:
                transcript.append({'speaker': cur_speaker, 'statement': cur_statement})
            cur_speaker = match.group(1).strip()
            cur_statement = match.group(2).strip() + '\n'
            for other_name in speakers:
                if other_name+':' in cur_statement:
                    cur_statement, other_statement = cur_statement.split(other_name)
                    transcript.append({'speaker': cur_speaker, 'statement': cur_statement.strip()})
                    transcript.append({'speaker': other_name, 'statement': other_statement.strip()})   
        else:
            cur_statement += line 
    df = pd.DataFrame(transcript)
    return df

In [4]:
parties = {'QUIJANO':'Moderator', 
           'KAINE':'Democratic', 
           'PENCE':'Republican', 
           'HOLT':'Moderator', 
           'CLINTON':'Democratic', 
           'TRUMP':'Republican',
           'COOPER':'Moderator',
           'RADDATZ':'Moderator',
           'WALLACE':'Moderator'}
df_1st = debate_transcript_to_dataframe('presidential-debate-2016-09-26.txt', ['TRUMP','CLINTON','HOLT'])
df_vp = debate_transcript_to_dataframe('vp-debate-2016-10-04.txt', ['PENCE','KAINE','QUIJANO'])
df_1st['debate'] = '1st'
df_vp['debate'] = 'VP'
df = pd.concat([df_1st, df_vp])
df['party'] = df['speaker'].apply(lambda x: parties[x])
df['speaker and debate']=df['speaker'].apply(lambda x: x + ' 1st')
df_2nd = debate_transcript_to_dataframe('debate-2016-10-09-rush.txt', ['TRUMP','CLINTON','COOPER','RADDATZ'])
df_2nd['speaker and debate']=df_2nd['speaker'].apply(lambda x: x + ' 2nd')
df_2nd['party'] = df_2nd['speaker'].apply(lambda x: parties[x])
df_2nd['debate'] = '2nd'
df_3rd = debate_transcript_to_dataframe('debate-2016-10-19.txt', ['TRUMP','CLINTON','WALLACE'])
df_3rd['speaker and debate']=df_3rd['speaker'].apply(lambda x: x + ' 3rd')
df_3rd['party'] = df_3rd['speaker'].apply(lambda x: parties[x])
df_3rd['debate'] = '3rd'
df_all = pd.concat([df, df_2nd, df_3rd])


In [5]:
def draw_plot(df, category, other_category, category_col, extra=''):
    corpus = st.CorpusFromPandas(data_frame = df[(df[category_col] == category) 
                                                              | (df[category_col] == other_category)], 
                                              category_col = category_col, 
                                              text_col = 'statement',
                                              nlp = nlp).build()
    html = st.produce_scattertext_html(corpus, 
                                       category=category, 
                                       category_name=category.lower() +' Term', 
                                       not_category_name=other_category.lower() + ' Term',
                                       protocol='https',
                                       pmi_filter_thresold=2,
                                       minimum_term_frequency=2,
                                       width=1000)
    file_name = category.lower() + '-' + other_category.lower() + extra + '.html'
    open(file_name, 'wb').write(html.encode('utf-8'))
    return IFrame(src=file_name, width = 1200, height=1000)

In [6]:
category, other_category, category_col = 'CLINTON', 'TRUMP', 'speaker'
corpus = st.CorpusFromPandas(data_frame = df[(df[category_col] == category) 
                                                              | (df[category_col] == other_category)], 
                                              category_col = category_col, 
                                              text_col = 'statement',
                                              nlp = nlp).build()

In [7]:
category, other_category, category_col = 'CLINTON', 'TRUMP', 'speaker'
debate_3 = st.CorpusFromPandas(data_frame = df_3rd[(df_3rd[category_col] == category) 
                                                              | (df_3rd[category_col] == other_category)], 
                                              category_col = category_col, 
                                              text_col = 'statement',
                                              nlp = nlp).build()

# Find the top words used by the candidates in the 3rd debate

In [8]:
term_df = debate_3.get_term_freq_df()

term_df['Trump'] = debate_3.get_scaled_f_scores('TRUMP')
term_df['Clinton'] = debate_3.get_scaled_f_scores('CLINTON')
print('Trump top terms')
print(term_df.sort_values(by='Trump', ascending=False).iloc[:20].index)
print('Clinton top terms')
print(term_df.sort_values(by='Clinton', ascending=False).iloc[:20].index)

Trump top terms
Index(['hillary', 'bad', 'she wants', 'you have', 'the border', 'and she',
       'justices', 'signed', 'no idea', 'percent', 'deals', 'strong',
       'a disaster', 'outsmarted', 'she 's', '$ 6', '6', 'have no', 'the baby',
       'baby'],
      dtype='object', name='term')
Clinton top terms
Index(['women', 'kind of', 'against', 'that is', 'stand', 'work',
       'undocumented', 'also', 'most', 'stand up', 'the debt', 'rights',
       'guns', 'the kind', 'immigrants', 'million', 'against it', 'security',
       'v. wade', 'opportunities'],
      dtype='object', name='term')


In [9]:
draw_plot(df_3rd, 'CLINTON', 'TRUMP', 'speaker', 'Third Debate')


## Find most characteristic terms used by Trump and Clinton

In [10]:
term_df = corpus.get_term_freq_df()

term_df['Trump'] = corpus.get_scaled_f_scores('TRUMP')
term_df['Clinton'] = corpus.get_scaled_f_scores('CLINTON')
print('Trump top terms')
print(term_df.sort_values(by='Trump', ascending=False).iloc[:20].index)
print('Clinton top terms')
print(term_df.sort_values(by='Clinton', ascending=False).iloc[:20].index)

Trump top terms
Index(['secretary clinton', 'clinton', 'tell you', 'leaving', 'agree',
       'should have', 'i 'll', 'you were', 'politicians', 'the war', 'wrong',
       'but you', 'tremendous', 'the way', 'our jobs', 'i just', 'she does',
       'very very', 'chicago', 'i agree'],
      dtype='object', name='term')
Clinton top terms
Index(['need to', ''ve got', 'that he', 'information', 'proposed', 'justice',
       'national', 'everyone', 'people who', 'us to', 'about what', 'part',
       'at home', 'not just', 'hope', 'incomes', 'kinds', 'the debt',
       'criminal', 'looked at'],
      dtype='object', name='term')


# Comare Clinton and Trump's 1st debate

In [11]:
draw_plot(df, 'CLINTON', 'TRUMP', 'speaker', '1st')


# Comare Clinton and Trump's 2nd debate

In [12]:
draw_plot(df_2nd, 'CLINTON', 'TRUMP', 'speaker', '2nd')

# Compare Trump's 1st and 2nd Debates

In [13]:
draw_plot(df_all, 'TRUMP 1st', 'TRUMP 2nd', 'speaker and debate')

# Compare Clinton's 1st and 2nd Debates

In [14]:
draw_plot(df_all, 'CLINTON 1st', 'CLINTON 2nd', 'speaker and debate')

# Compare Clinton to Kaine

In [15]:
draw_plot(df_all, 'CLINTON', 'KAINE', 'speaker')

# Compare Trump to Pence

In [16]:
draw_plot(df_all, 'TRUMP', 'PENCE', 'speaker')

# The VP Debate

In [17]:
draw_plot(df, 'KAINE', 'PENCE', 'speaker')

# Compare Democrats to Republicans

In [18]:
draw_plot(df_all, 'Democratic', 'Republican', 'party')

# Compare the 1st to the 2nd debate

In [19]:
draw_plot(df_all, '1st', '2nd', 'debate')

In [20]:
draw_plot(df_all, '3rd', '1st', 'debate')