Import libraries.


In [1]:
import pandas as p
import numpy as n
import string
import re
from sklearn.feature_extraction.text import CountVectorizer


Create user-defined functions.

extract_speeches('speaker')
Extracts all of a speaker's speeches as a single string.

clean_text(text)
Make text lowercase, remove text in square brackets, remove punctuation, and remove words containing numbers.


In [2]:
def extract_speeches(speaker):
    return data0.loc[data.speaker.eq(speaker)].speech.str.cat(sep = " ")

def clean_text(text):
    text = text.lower()
    text = re.sub(
    "\[.*?\]",
    "",
    text)
    text = re.sub(
    "[%s]" % re.escape(string.punctuation),
    "",
    text)
    text = re.sub(
    "[“”’]",
    "",
    text)
    text = re.sub(
    "\w*\d\w*",
    "",
    text)
    return text
    

Import data.


In [3]:
data = p.read_csv("data/data.csv", encoding = "cp1252")

Print out column names and sample of data.


In [4]:
print(data.columns)
print(data.head)

Index(['date', 'debate_name', 'debate_section', 'speaker', 'speech',
       'speaking_time_seconds'],
      dtype='object')
<bound method NDFrame.head of             date                                        debate_name  \
0     02-25-2020  South Carolina Democratic Debate Transcript: F...   
1     02-25-2020  South Carolina Democratic Debate Transcript: F...   
2     02-25-2020  South Carolina Democratic Debate Transcript: F...   
3     02-25-2020  South Carolina Democratic Debate Transcript: F...   
4     02-25-2020  South Carolina Democratic Debate Transcript: F...   
5     02-25-2020  South Carolina Democratic Debate Transcript: F...   
6     02-25-2020  South Carolina Democratic Debate Transcript: F...   
7     02-25-2020  South Carolina Democratic Debate Transcript: F...   
8     02-25-2020  South Carolina Democratic Debate Transcript: F...   
9     02-25-2020  South Carolina Democratic Debate Transcript: F...   
10    02-25-2020  South Carolina Democratic Debate Transcript: F.

Print out a list of all speakers.


In [5]:
n.unique(data.speaker)

array(['A. Cooper', 'Abby Phillips', 'Adam Sexton', 'Amna Nawaz',
       'Amy Klobuchar', 'Amy Walter', 'Anderson Cooper',
       'Andrea Mitchell', 'Andrew Yang', 'Announcer', 'Ashley Parker',
       'Audience', 'B. Pfannenstiel', 'Bennett', 'Bernie Sanders',
       'Beto O’Rourke', 'Bill Whitaker', 'Bill de Blasio', 'Brianne P.',
       'Chuck Todd', 'Cory Booker', 'Crowd', 'Dana Bash', 'David Muir',
       'Devin Dwyer', 'Diana', 'Don Lemon', 'Elizabeth Warren',
       'Eric Stalwell', 'Eric Swalwell', 'Erin Burnett', 'Female',
       'Gayle King', 'George S.', 'Hallie Jackson', 'Helen',
       'J. Hickenlooper', 'Jake Tapper', 'Jay Inslee', 'Joe Biden',
       'John Delaney', 'John H.', 'John Hickenloop', 'John King',
       'Jon Ralston', 'Jorge Ramos', 'Jose D.B.', 'Judy Woodruff',
       'Julian Castro', 'Kamala Harris', 'Kirseten Gillibrand',
       'Kirsten Gillibrand', 'Kristen Welker', 'Lester Holt',
       'Linsey Davis', 'Major Garrett', 'Male', 'Marc Lacey',
       'Marga

Filter the data set to include only selected speakers.


In [6]:
speakers = {
        'Amy Klobuchar', 'Andrew Yang', 'Bernie Sanders', 'Beto O’Rourke', 'Cory Booker', 'Elizabeth Warren',
        'Joe Biden', 'Julian Castro', 'Kamala Harris', 'Marianne Williamson', 'Michael Bloomberg',
        'Pete Buttigieg', 'Tom Steyer', 'Tulsi Gabbard'
    }

data0 = data.loc[
    data.speaker.isin(speakers)
]
print(data.shape)
print(data0.shape)

(5911, 6)
(3086, 6)


Test the extract_speeches function.


In [7]:
extract_speeches('Beto O’Rourke')

'You know, I think about everyone who’s ever served this country in uniform. We have two examples here on the stage tonight, Mayor Buttigieg and Congresswoman Gabbard. Those who have willingly sacrificed their lives to defend this country and our constitution. We are the inheritors of their service and their sacrifice and we have a responsibility to be fearless in the face of this President’s criminality and his lawlessness. The fact that as a candidate for the highest office in the land, he invited the participation, the invasion of a foreign power in our democracy. As president, he lied to investigators, obstructed justice, fired James Comey, head of the F.B.I., tried to fire Mueller, head of the investigation. Then invited President Zelensky to involve himself in our politics, as well as China in exchange for favorable trade terms in an upcoming trade deal. If you do not hold them to account, if there is not justice, not only have we failed this moment, our constitution and our coun

Create a new data frame with only the selected speaker's names
and, for each speaker, a single string containing all of his/her speeches.


In [8]:
data1 = p.DataFrame({
    'speaker': list(speakers)
})
data1['speeches'] = ""
data1['speaking_time'] = 0
for speaker in speakers:
    data1.loc[data1.speaker.eq(speaker), 'speeches'] = extract_speeches(speaker)
    data1.loc[data1.speaker.eq(speaker), 'speaking_time'] = data.loc[data.speaker.eq(speaker)].speaking_time_seconds.sum()
data1


Unnamed: 0,speaker,speeches,speaking_time
0,Amy Klobuchar,"Yes, and I think that what we need to do inste...",7703.0
1,Andrew Yang,"First, let me say America, it’s great to be ba...",3149.0
2,Marianne Williamson,Thank you. In 1776 our founders brought forth ...,827.0
3,Tulsi Gabbard,That our democratic party unfortunately is not...,1690.0
4,Bernie Sanders,"Well, you’re right, the economy is doing reall...",9389.0
5,Pete Buttigieg,We know what the President … what Russia wants...,8413.0
6,Kamala Harris,"Well first of all, we have a criminal living i...",3999.0
7,Michael Bloomberg,Senator- I think that Donald Trump thinks it w...,1613.0
8,Cory Booker,"Well, first of all, I think we all agree that ...",3068.0
9,Joe Biden,"We talk about progressive, let’s talk about be...",9310.0


Apply the first text cleaning function to the candidates' speeches.


In [12]:
data2 = data1
data2.speeches = data2.speeches.apply(clean_text)
data2.to_pickle('corpus.pkl')
print(data2)
print(data2.speeches[0])
data2.to_pickle('data.pkl')


                speaker                                           speeches  \
0         Amy Klobuchar  yes and i think that what we need to do instea...   
1           Andrew Yang  first let me say america its great to be back ...   
2   Marianne Williamson  thank you in  our founders brought forth on th...   
3         Tulsi Gabbard  that our democratic party unfortunately is not...   
4        Bernie Sanders  well youre right the economy is doing really g...   
5        Pete Buttigieg  we know what the president … what russia wants...   
6         Kamala Harris  well first of all we have a criminal living in...   
7     Michael Bloomberg  senator i think that donald trump thinks it wo...   
8           Cory Booker  well first of all i think we all agree that we...   
9             Joe Biden  we talk about progressive lets talk about bein...   
10        Beto O’Rourke  you know i think about everyone whos ever serv...   
11     Elizabeth Warren  look the way i see this is that bernie 

In [10]:
cv = CountVectorizer(stop_words = 'english')
data2_cv = cv.fit_transform(data2.speeches)
data2_dtm = p.DataFrame(
    data2_cv.toarray(),
    columns = cv.get_feature_names()
)
data2_dtm.index = data2.speaker
data2_dtm.to_pickle('dtm.pkl')
data2_dtm


Unnamed: 0_level_0,aa,aapi,aaron,abandoned,abandoning,abated,abc,aberration,abhorrent,abide,...,youth,youtube,youve,zealand,zelensky,zero,zeroed,zip,zone,zuckerberg
speaker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Amy Klobuchar,1,0,0,0,0,0,0,0,0,0,...,0,0,10,0,0,0,0,0,0,0
Andrew Yang,0,0,0,0,0,0,0,0,0,0,...,0,0,5,0,0,11,0,1,0,1
Marianne Williamson,0,0,0,0,0,0,0,0,0,0,...,0,0,0,2,0,0,0,0,0,0
Tulsi Gabbard,0,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
Bernie Sanders,0,0,0,0,0,0,1,0,0,0,...,1,2,6,0,0,3,0,1,0,0
Pete Buttigieg,0,0,0,1,0,0,0,1,0,0,...,0,0,6,1,0,5,0,0,2,0
Kamala Harris,0,0,0,1,0,0,0,0,0,0,...,0,0,3,0,0,0,0,0,0,0
Michael Bloomberg,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
Cory Booker,0,0,0,0,1,0,0,0,0,0,...,0,0,3,0,0,1,0,0,0,0
Joe Biden,0,0,0,0,0,1,0,1,1,0,...,0,0,8,0,0,2,1,1,3,0


In [11]:
data2_dtm.transpose()

speaker,Amy Klobuchar,Andrew Yang,Marianne Williamson,Tulsi Gabbard,Bernie Sanders,Pete Buttigieg,Kamala Harris,Michael Bloomberg,Cory Booker,Joe Biden,Beto O’Rourke,Elizabeth Warren,Julian Castro,Tom Steyer
aa,1,0,0,0,0,0,0,0,0,0,0,0,0,0
aapi,0,0,0,0,0,0,0,0,0,0,0,0,0,1
aaron,0,0,0,0,0,0,0,0,0,0,0,0,1,0
abandoned,0,0,0,0,0,1,1,0,0,0,0,0,1,1
abandoning,0,0,0,0,0,0,0,0,1,0,0,0,0,0
abated,0,0,0,0,0,0,0,0,0,1,0,0,0,0
abc,0,0,0,0,1,0,0,0,0,0,0,0,0,0
aberration,0,0,0,0,0,1,0,0,0,1,0,0,0,0
abhorrent,0,0,0,1,0,0,0,0,0,1,0,0,0,0
abide,0,0,0,0,0,0,0,0,0,0,0,1,0,0
