In [1]:
import scattertext as st
import re
from pprint import pprint
import pandas as pd
import spacy.en
from IPython.display import IFrame

In [2]:
nlp = spacy.en.English()

In [3]:
def debate_transcript_to_dataframe(fn, speakers):
    lines = open(fn).read().split('\n')
    cur_speaker = None
    speaker_start_re = re.compile(r'^([(]?[A-Z][A-Z][A-Z]+):?(.+)$')
    transcript = []
    cur_statement = ''
    cur_speaker = None
    for line in lines:
        match = speaker_start_re.match(line)
        if match:
            if match.group(1).startswith('('):
                continue
            if cur_speaker is not None:
                transcript.append({'speaker': cur_speaker, 'statement': cur_statement})
            cur_speaker = match.group(1).strip()
            cur_statement = match.group(2).strip() + '\n'
            for other_name in speakers:
                if other_name+':' in cur_statement:
                    cur_statement, other_statement = cur_statement.split(other_name)
                    transcript.append({'speaker': cur_speaker, 'statement': cur_statement.strip()})
                    transcript.append({'speaker': other_name, 'statement': other_statement.strip()})   
        else:
            cur_statement += line 
    df = pd.DataFrame(transcript)
    return df

In [56]:
parties = {'QUIJANO':'Moderator', 
           'KAINE':'Democratic', 
           'PENCE':'Republican', 
           'HOLT':'Moderator', 
           'CLINTON':'Democratic', 
           'TRUMP':'Republican',
           'COOPER':'Moderator',
           'RADDATZ':'Moderator'}
df = pd.concat([
        debate_transcript_to_dataframe('presidential-debate-2016-09-26.txt', ['TRUMP','CLINTON','HOLT']),
        debate_transcript_to_dataframe('vp-debate-2016-10-04.txt', ['PENCE','KAINE','QUIJANO'])
])
df['party'] = df['speaker'].apply(lambda x: parties[x])
df['speaker and debate']=df['speaker'].apply(lambda x: x + ' 1st')

In [70]:
def draw_plot(df, category, other_category, category_col, extra=''):
    corpus = st.CorpusFromPandas(data_frame = df[(df[category_col] == category) 
                                                              | (df[category_col] == other_category)], 
                                              category_col = category_col, 
                                              text_col = 'statement',
                                              nlp = nlp).build()
    html = st.produce_scattertext_html(corpus, 
                                       category=category, 
                                       category_name=category.lower() +' Term', 
                                       not_category_name=other_category.lower() + ' Term',
                                       protocol='https',
                                       pmi_filter_thresold=2,
                                       minimum_term_frequency=2)
    file_name = category.lower() + '-' + other_category.lower() + extra + '.html'
    open(file_name, 'wb').write(html.encode('utf-8'))
    return IFrame(src=file_name, width = 1000, height=1000)

In [24]:
category, other_category, category_col = 'CLINTON', 'TRUMP', 'speaker'
corpus = st.CorpusFromPandas(data_frame = df[(df[category_col] == category) 
                                                              | (df[category_col] == other_category)], 
                                              category_col = category_col, 
                                              text_col = 'statement',
                                              nlp = nlp).build()

In [36]:
term_df = corpus.get_term_freq_df()

term_df['Trump'] = corpus.get_scaled_f_scores('TRUMP')
term_df['Clinton'] = corpus.get_scaled_f_scores('CLINTON')
print(term_df.sort_values(by='Trump', ascending=False).iloc[:20].index)
print(term_df.sort_values(by='Clinton', ascending=False).iloc[:20].index)

Index(['clinton', 'secretary clinton', 'tell you', 'leaving', 'agree',
       'but you', 'i 'll', 'you were', 'politicians', 'the war', 'tremendous',
       'wrong', 'should have', 'the way', 'chicago', 'i just', 'very very',
       'she does', 'our jobs', 'i agree'],
      dtype='object', name='term')
Index(['need to', ''ve got', 'that he', 'information', 'proposed', 'justice',
       'everyone', 'about what', 'national', 'people who', 'us to', 'part',
       'plans', 'too many', 'looked at', 'criminal justice', 'incomes', 'hope',
       'not just', 'the debt'],
      dtype='object', name='term')


In [59]:
df_2nd = pd.concat([
        debate_transcript_to_dataframe('debate-2016-10-09-rush.txt', ['TRUMP','CLINTON','COOPER','RADDATZ'])
])
df_2nd['speaker and debate']=df_2nd['speaker'].apply(lambda x: x + ' 2nd')
df_2nd['party'] = df_2nd['speaker'].apply(lambda x: parties[x])

draw_plot(df_2nd, 'CLINTON', 'TRUMP', 'speaker', '2nd')


In [64]:
df_all = pd.concat([df, df_2nd.dropna()])


In [67]:
df_all

Unnamed: 0,party,speaker,speaker and debate,statement
0,Moderator,HOLT,HOLT 1st,Good evening from Hofstra University in Hempst...
1,Democratic,CLINTON,CLINTON 1st,"How are you, Donald?\n"
2,Moderator,HOLT,HOLT 1st,"Good luck to you.\nWell, I don't expect us to ..."
3,Democratic,CLINTON,CLINTON 1st,"Well, thank you, Lester, and thanks to Hofstra..."
4,Democratic,CLINTON,CLINTON 1st,I also want to see more companies do profit-sh...
5,Moderator,HOLT,HOLT 1st,"Secretary Clinton, thank you.\nMr. Trump, the ..."
6,Republican,TRUMP,TRUMP 1st,"Thank you, Lester. Our jobs are fleeing the co..."
7,Republican,TRUMP,TRUMP 1st,"We cannot let it happen. Under my plan, I'll b..."
8,Moderator,HOLT,HOLT 1st,"Secretary Clinton, would you like to respond?\n"
9,Democratic,CLINTON,CLINTON 1st,"Well, I think that trade is an important issue..."


In [72]:
draw_plot(df, 'CLINTON 1st', 'TRUMP 1st', 'speaker and debate')

In [76]:
draw_plot(df_2nd, 'CLINTON 2nd', 'TRUMP 2nd', 'speaker and debate')

In [69]:
draw_plot(df_all, 'TRUMP 1st', 'TRUMP 2nd', 'speaker and debate')

In [73]:
draw_plot(df_all, 'CLINTON 1st', 'CLINTON 2nd', 'speaker and debate')

In [7]:
draw_plot(df, 'CLINTON', 'KAINE', 'speaker')

In [21]:
draw_plot(df, 'TRUMP', 'PENCE', 'speaker')

In [22]:
draw_plot(df, 'KAINE', 'PENCE', 'speaker')

In [23]:
draw_plot(df, 'Democratic', 'Republican', 'party')