In [1]:
import scattertext as st
import re
from pprint import pprint
import pandas as pd
import spacy.en
from IPython.display import IFrame

In [2]:
# Instantiate spaCy's English pipeline; draw_plot hands this object to
# st.CorpusFromPandas as its `nlp` argument.
nlp = spacy.en.English()

In [3]:
def debate_transcript_to_dataframe(fn, speakers):
    """Parse a debate transcript file into one DataFrame row per speaking turn.

    A line opens a new turn when it starts with three or more capital letters
    (optionally followed by ``:``); the capitals become the speaker label and
    the rest of the line starts the statement.  Lines that start with ``(``
    followed by capitals, e.g. ``(APPLAUSE)``, are dropped.  Every other line
    is a continuation of the current turn and is joined to it with a newline.

    If the opening line of a turn contains ``NAME:`` for any ``NAME`` in
    ``speakers`` (an interruption transcribed on the same line), the text is
    cut at each such marker and every piece becomes its own turn; following
    continuation lines belong to the last speaker on that line.

    Parameters
    ----------
    fn : str
        Path of the transcript text file.
    speakers : iterable of str
        Speaker labels that may appear mid-line as ``NAME:``.

    Returns
    -------
    pandas.DataFrame
        Columns ``speaker`` and ``statement`` (whitespace-stripped), in
        transcript order.  The columns exist even when no turn was found.
    """
    # NOTE(review): because the colon is optional, a continuation line that
    # happens to start with an acronym (three or more capitals) is also read
    # as a new speaker -- confirm the transcripts never contain such lines.
    speaker_start_re = re.compile(r'^([(]?[A-Z][A-Z][A-Z]+):?(.+)$')

    # Matches "NAME:" for any known speaker; the capturing group makes
    # re.split() keep the names between the text pieces.
    escaped_names = [re.escape(name) for name in speakers]
    inline_speaker_re = None
    if escaped_names:
        inline_speaker_re = re.compile(r'\b(' + '|'.join(escaped_names) + r'):')

    # Read everything up front so the handle is closed before parsing starts.
    with open(fn) as transcript_file:
        lines = transcript_file.read().split('\n')

    turns = []  # [speaker, [statement lines]] pairs, in transcript order
    for line in lines:
        match = speaker_start_re.match(line)
        if match is None:
            # Continuation line; text before the first speaker is ignored.
            if turns:
                turns[-1][1].append(line)
            continue
        if match.group(1).startswith('('):
            # Stage direction such as "(APPLAUSE)": not part of any statement.
            continue

        first_line = match.group(2)
        if inline_speaker_re is None:
            pieces = [first_line]
        else:
            # -> [own text, NAME, text, NAME, text, ...]
            pieces = inline_speaker_re.split(first_line)

        turns.append([match.group(1), [pieces[0].strip()]])
        for other_name, other_text in zip(pieces[1::2], pieces[2::2]):
            turns.append([other_name, [other_text.strip()]])

    # Building the rows only now guarantees the final turn is included.
    transcript = [{'speaker': speaker, 'statement': '\n'.join(text_lines).strip()}
                  for speaker, text_lines in turns]
    return pd.DataFrame(transcript, columns=['speaker', 'statement'])

In [4]:
# Party affiliation (or moderator role) for each speaker label used below.
parties = {
    'QUIJANO': 'Moderator',
    'KAINE': 'Democratic',
    'PENCE': 'Republican',
    'HOLT': 'Moderator',
    'CLINTON': 'Democratic',
    'TRUMP': 'Republican',
}

# Parse both debate transcripts and stack their rows into a single frame.
df = pd.concat([
    debate_transcript_to_dataframe(transcript_path, transcript_speakers)
    for transcript_path, transcript_speakers in [
        ('presidential-debate-2016-09-26.txt', ['TRUMP', 'CLINTON', 'HOLT']),
        ('vp-debate-2016-10-04.txt', ['PENCE', 'KAINE', 'QUIJANO']),
    ]
])

# Plain dict indexing: a speaker label missing from `parties` raises KeyError.
df['party'] = [parties[speaker_name] for speaker_name in df['speaker']]

In [8]:
def draw_plot(df, category, other_category, category_col):
    """Build a scattertext plot contrasting two categories and embed it.

    Rows of ``df`` whose ``category_col`` value is ``category`` or
    ``other_category`` are turned into a scattertext corpus from their
    ``statement`` text, using the module-level ``nlp`` pipeline.  The
    generated HTML is written, UTF-8 encoded, to
    ``<category>-<other_category>.html`` (both lower-cased, relative path).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``category_col`` and a ``statement`` column.
    category : str
        Category value passed to scattertext as the plot's ``category``.
    other_category : str
        Category value it is contrasted with.
    category_col : str
        Name of the column holding the category values.

    Returns
    -------
    IPython.display.IFrame
        A 1000x1000 frame whose ``src`` is the HTML file just written.
    """
    selected_rows = df[df[category_col].isin([category, other_category])]
    corpus = st.CorpusFromPandas(data_frame=selected_rows,
                                 category_col=category_col,
                                 text_col='statement',
                                 nlp=nlp).build()
    # NOTE(review): `pmi_filter_thresold` (sic) is kept exactly as spelled in
    # the original call; presumably it matches the keyword of the installed
    # scattertext version -- verify before renaming.
    html = st.produce_scattertext_html(corpus,
                                       category=category,
                                       category_name=category.title() + ' Term',
                                       not_category_name=other_category.title() + ' Term',
                                       protocol='https',
                                       pmi_filter_thresold=2,
                                       minimum_term_frequency=2)
    file_name = category.lower() + '-' + other_category.lower() + '.html'
    # Close the handle explicitly so the file is fully flushed to disk before
    # the returned IFrame is rendered and loads it.
    with open(file_name, 'wb') as html_file:
        html_file.write(html.encode('utf-8'))
    return IFrame(src=file_name, width=1000, height=1000)

In [9]:
# Clinton vs. Trump by speaker; writes clinton-trump.html and shows it inline.
draw_plot(df, 'CLINTON', 'TRUMP', 'speaker')

In [7]:
# Clinton vs. Kaine by speaker; writes clinton-kaine.html and shows it inline.
draw_plot(df, 'CLINTON', 'KAINE', 'speaker')

In [21]:
# Trump vs. Pence by speaker; writes trump-pence.html and shows it inline.
draw_plot(df, 'TRUMP', 'PENCE', 'speaker')

In [22]:
# Kaine vs. Pence by speaker; writes kaine-pence.html and shows it inline.
draw_plot(df, 'KAINE', 'PENCE', 'speaker')

In [23]:
# Democratic vs. Republican using the party column; writes
# democratic-republican.html and shows it inline.
draw_plot(df, 'Democratic', 'Republican', 'party')