In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import scattertext as st
import spacy
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import HTML, IFrame, display

# Inject a CSS rule that sets the width of `.container` elements to 98%, so
# the wide charts embedded below have more horizontal room.
display(HTML("<style>.container { width:98% !important; }</style>"))

# Scrape and parse the debate transcript from rev.com

In [2]:
url = 'https://www.rev.com/blog/transcripts/donald-trump-joe-biden-1st-presidential-debate-transcript-2020'

# Fetch the transcript page. Use a timeout so a stalled connection cannot hang
# the notebook, and fail loudly on an HTTP error instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# All transcript paragraphs are read from this container.
transcript = soup.find('div', attrs={'class': 'fl-callout-content'})
if transcript is None:
    raise ValueError(
        "Could not find the 'fl-callout-content' element; the page layout may have changed."
    )

data = []
for p in transcript.find_all('p'):
    raw = p.text.strip()
    if raw == '':
        continue
    # The first line of a paragraph is parsed as "<speaker>: (<timestamp>)";
    # every following line is the spoken text.
    header, _, content = raw.partition('\n')
    speaker, colon, stamp = header.partition(':')
    if not colon:
        # No "speaker: (time)" header on this paragraph, so there is nothing
        # to attribute it to; skip it rather than crash on the missing field.
        continue
    # Drop the first and last characters (the surrounding parentheses).
    timestamp = stamp.strip()[1:-1]
    if len(timestamp) == 5:
        # Five-character stamps ("MM:SS") get an hours field so that all
        # values share the "HH:MM:SS" shape.
        timestamp = '00:' + timestamp
    data.append({'Speaker': speaker, 'Time': timestamp, 'Text': content})

if not data:
    raise ValueError('No transcript paragraphs were parsed from the page.')

# Load the spaCy English pipeline once. The 'en' shortcut is kept for older
# installs; spaCy v3 removed shortcut links, so fall back to the package name.
try:
    nlp = spacy.load('en')
except OSError:
    nlp = spacy.load('en_core_web_sm')

# Speakers are identified by the first letter of the speaker label; labels
# starting with any other letter (or empty labels) map to None.
speaker_by_initial = {'P': 'Trump', 'V': 'Biden', 'C': 'Wallace'}
df = pd.DataFrame(data).assign(
    Name=lambda frame: frame['Speaker'].apply(lambda label: speaker_by_initial.get(label[:1])),
    Parse=lambda frame: frame['Text'].apply(nlp)
)

# Comparison of lemmas used by President Trump and Vice President Biden
The chart below plots lemmas (base word forms) used by the candidates. The higher a lemma is on the plot, the more 
it was used by Vice President Biden, and the further to the right, the more it was used by President Trump. Click 
a word or search the chart to see how it was used in context.

We can see that Vice President Biden uses the word "vote", particularly in the imperative mood, much more than 
President Trump. He speaks in terms of truth, using forms of the words "fact", "true", and "lie" much more than President
Trump. He also speaks much more about the Coronavirus and COVID than the president.

President Trump refers to the Vice President as "Joe" disproportionately, while Vice President Biden only used "Donald"
once. Instead, Vice President Biden simply refers to President Trump as "he". Perhaps the former Vice President is 
following the old political rule of never naming one's opponent.

In [3]:
# Build a corpus from the Biden and Trump rows only, using lemmas as features
# (use_lemmas=True), then reduce it to the stoplisted unigram corpus.
corpus = st.CorpusFromParsedDocuments(
    df[df.Name.isin(['Biden','Trump'])],
    category_col='Name',
    parsed_col='Parse',
    feats_from_spacy_doc=st.FeatsFromSpacyDoc(use_lemmas=True)
).build().get_stoplisted_unigram_corpus()

# Fetch the corpus' document frame once; it supplies the per-document
# "speaker: time" label passed as metadata below.
corpus_df = corpus.get_df()

html = st.produce_scattertext_explorer(
    corpus,
    category='Biden',
    not_category_name='Trump',
    minimum_term_frequency=0, 
    pmi_threshold_coefficient=0,
    width_in_pixels=1000, 
    metadata=corpus_df['Speaker'] + ': ' + corpus_df['Time'],
    transform=st.Scalers.dense_rank,
    use_full_doc=True
)

# Write the chart (prefixed with a title) as UTF-8 bytes. The context manager
# guarantees the file is flushed and closed before the IFrame loads it.
fn = 'bidenvstrump.html'
with open(fn, 'wb') as out_file:
    out_file.write(('<h2>Lemmas used by Biden and Trump in the First Debate</h2>' + html).encode('utf-8'))
IFrame(src=fn, width = 1300, height=700)

# Comparison of words used by President Trump and Vice President Biden
Filling in the gaps from the above example, we can see Vice President Biden uses "he" and especially "his" far more 
than President Trump. 

President Trump uses the word "I" far more than the former VP, by about a 2:1 ratio, which psycholinguists (Kacewicz et al., 2014) have found
to be a mark of lower status.

Kacewicz, E., Pennebaker, J. W., Davis, M., Jeon, M., & Graesser, A. C. (2014). Pronoun use reflects standings in social hierarchies. Journal of Language and Social Psychology, 33(2), 125–143. https://doi.org/10.1177/0261927X13502654

In [4]:
# Build a corpus from the Biden and Trump rows only. No lemma feature
# extractor is passed and the unigram corpus is not stoplisted, so this
# chart compares the words as parsed rather than their lemmas.
corpus = st.CorpusFromParsedDocuments(
    df[df.Name.isin(['Biden','Trump'])],
    category_col='Name',
    parsed_col='Parse',
).build().get_unigram_corpus()

# Fetch the corpus' document frame once; it supplies the per-document
# "speaker: time" label passed as metadata below.
corpus_df = corpus.get_df()

html = st.produce_scattertext_explorer(
    corpus,
    category='Biden',
    not_category_name='Trump',
    minimum_term_frequency=0, 
    pmi_threshold_coefficient=0,
    width_in_pixels=1000, 
    metadata=corpus_df['Speaker'] + ': ' + corpus_df['Time'],
    transform=st.Scalers.dense_rank,
    use_full_doc=True
)

# Give this chart its own output file. Reusing the `fn` left over from the
# lemma cell would overwrite that cell's HTML with this chart and make the
# cell depend on state defined elsewhere in the notebook.
fn = 'bidenvstrumpwords.html'
with open(fn, 'wb') as out_file:
    out_file.write(('<h2>Words used by Biden and Trump in the First Debate</h2>' + html).encode('utf-8'))
IFrame(src=fn, width = 1300, height=700)

In [5]:
# Build a lemma-based, stoplisted unigram corpus from every row of `df`
# (both candidates and the moderator), categorised by the `Name` column.
corpus = st.CorpusFromParsedDocuments(
    df,
    category_col='Name',
    parsed_col='Parse',
    feats_from_spacy_doc=st.FeatsFromSpacyDoc(use_lemmas=True)    
).build().get_stoplisted_unigram_corpus()

# Semiotic square contrasting Biden (a) with Trump (b), with Wallace as the
# neutral category. Only the four corner labels are given text; the side
# labels are set to empty strings.
semiotic_square = st.SemioticSquare(
    corpus,
    category_a='Biden',
    category_b='Trump',
    neutral_categories=['Wallace'],
    scorer=st.RankDifference(),
    labels={'not_a_and_not_b': 'Wallace',
            'a_and_b': 'Candidates',
            'a_and_not_b': 'Biden',
            'b_and_not_a': 'Trump',
            'a':'',
            'b':'',
            'not_a':'',
            'not_b':''}
)

# NOTE(review): metadata is taken from `df` rather than `corpus.get_df()`;
# this assumes the corpus keeps one document per row of `df`, in order — confirm.
html = st.produce_semiotic_square_explorer(semiotic_square,
                                           category_name='Biden',
                                           not_category_name='Trump',
                                           x_label='Biden-Trump',
                                           y_label='Candidate-Moderator',
                                           num_terms_semiotic_square=5,
                                           neutral_category_name='Wallace',
                                           metadata=df['Speaker'] + ' ' + df['Time'])

# Write the chart as UTF-8 bytes. The context manager guarantees the file is
# flushed and closed before the IFrame loads it.
fn = 'demo_semiotic.html'
with open(fn, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
IFrame(src=fn, width = 1300, height=700)

In [6]:
# Import only the function that is used; a star import would add every public
# name from nbdev.export to the notebook namespace.
from nbdev.export import notebook2script

# Run nbdev's notebook export; the cell output lists each converted notebook.
notebook2script()

Converted 00_core.ipynb.
Converted First Debate.ipynb.
Converted index.ipynb.
