In [1]:
# Jesse Galef
# July 2016


In [2]:
import pandas as pd

In [3]:
from bs4 import BeautifulSoup
import urllib

In [4]:
import re as re

In [5]:
# C-SPAN person ids (passed as the personid[] URL parameter by id_to_df):
# 20967 - Trump
# 19027 - Clinton
# 994 - Sanders
# 4776 - Jeb Bush
# 47822 - Ben Carson
# 1007174 - Chris Christie
# 1019953 - Ted Cruz
# 1620 - John Kasich
# 24776 - Huckabee
# 9265241 - Rand Paul
# 87599 - Marco Rubio

# seriesid[]=91 restricts the search to videos carrying C-SPAN's "Campaign 2016" tag


# Automate scraping

In [6]:
def is_person(name, name_yes, name_no):
    """Decide whether a speaker label refers to the person being searched for.

    name:     the speaker label to test (a string).
    name_yes: list of strings; at least one must occur in ``name``.
    name_no:  list of strings; none of them may occur in ``name``.

    Matching is a case-insensitive substring test. Returns True only when
    some ``name_yes`` entry matches and no ``name_no`` entry does.
    """
    wanted_hits = [token.lower() in name.lower() for token in name_yes]
    if not any(wanted_hits):
        # nothing we are looking for appears in the label
        return False

    blocked_hits = [token.lower() in name.lower() for token in name_no]
    return not any(blocked_hits)

def _fetch_soup(url):
    """Download ``url`` and parse the response body with BeautifulSoup's lxml parser.

    The HTTP response is closed once its body has been read and parsed, and
    also when reading or parsing raises, so a long scrape does not leak one
    open connection per request.
    """
    from contextlib import closing
    try:
        from urllib.request import urlopen  # Python 3 location
    except ImportError:
        from urllib import urlopen  # Python 2, which this notebook was written for
    with closing(urlopen(url)) as response:
        return BeautifulSoup(response.read(), 'lxml')


def _clean_transcript_text(text, truncated):
    """Tidy the raw text of one transcript block.

    text:      text of a 'short_transcript' paragraph.
    truncated: True when the cell carried the 'hidden-full-transcript-ellipses'
               marker; the last three characters (the ellipsis) are then dropped.

    Returns the text with a leading "NAME:" prefix removed (when a colon occurs
    in the first 40 characters), [bracketed] and (parenthesised) annotations
    removed, and newlines turned into spaces.
    """
    colon_pos = text[0:40].find(':')
    if colon_pos != -1:
        # Found a colon in the first 40 characters of this block, remove the name
        text = text[colon_pos + 1:]
    if truncated:
        text = text[:-3]  # remove ellipses
    text = re.sub(r'\[[^\]]*\]', '', text)  # remove things like [applause] and [cheers]
    text = re.sub(r'\([^\)]*\)', '', text)  # remove things like (music)
                                            # Although unfortunately doesn't remove the lyrics themselves...
    return text.replace('\n', ' ')


def _read_transcript(transcript_page, name_yes, name_no):
    """Pull speaker names and the target person's words out of a parsed transcript page.

    Returns a ``(names, full_text)`` tuple: ``names`` holds the lowercased label
    of every cell that has an identified speaker, and ``full_text`` is the
    cleaned text of every cell whose speaker satisfies ``is_person``, each piece
    preceded by a single space ('' when nothing matched).
    """
    names = []
    pieces = []
    for row in transcript_page.find_all('tr'):
        for cell in row.find_all('td'):
            speaker = cell.find('strong')
            if speaker is None:
                continue  # no identified speaker in this cell
            names.append(speaker.text.lower())

            if not is_person(speaker.text, name_yes, name_no):
                continue
            text_cell = cell.find('p', class_='short_transcript')
            if text_cell is None:
                continue  # There's the occasional empty cell even with a speaker
            truncated = cell.find('span', class_='hidden-full-transcript-ellipses') is not None
            pieces.append(_clean_transcript_text(text_cell.text, truncated))
    # Joined once at the end instead of growing a string piece by piece.
    return names, ''.join(' ' + piece for piece in pieces)


def id_to_df(person_id, person_name, name_yes, name_no):
    """Scrape C-SPAN's "Campaign 2016"-tagged videos for one person into a DataFrame.

    person_id:   C-SPAN person id, inserted into the search URL.
    person_name: label stored in the 'speaker' column of every row.
    name_yes:    list of strings, any of which marks a speaker label as this person.
    name_no:     list of strings, any of which rules a speaker label out.
                 (Both are passed straight to ``is_person``.)

    Returns a pandas DataFrame with one row per video found and the columns
    'date', 'link', 'names', 'speaker', 'text' and 'title'. 'names' lists every
    identified speaker in the transcript; 'text' is what the matched person
    said and is '' when nothing was attributed to them (rows are not filtered
    here). Progress is printed to stdout. Performs one HTTP request per result
    page plus one per video.
    """
    pid = str(person_id)
    url = (
        'http://www.c-span.org/search/?sdate=01/01/2015&edate=12/31/2016&searchtype=Videos&sort=Most+Recent+Airing&text=1&seriesid[]=91&personid[]='
        + pid
        + '&show100=&sdate=01/01/2015&edate=12/31/2016&searchtype=Videos&sort=Most+Recent+Airing&text=0&seriesid[]=91&personid[]='
        + pid
        + '&ajax&page='
    )
    addon_url = "&action=getTranscript&transcriptType=cc&"

    video_page = _fetch_soup(url)
    video_list = list(video_page.find_all('li', class_='onevid'))
    print(len(video_list))

    # While there are more videos to load, add the li tags from the next page
    page = 1
    while video_page.find(id='loadmore') is not None:
        page += 1
        video_page = _fetch_soup(url + str(page))
        more_videos = video_page.find_all('li', class_='onevid')
        if not more_videos:
            # A page that still offers "load more" but returns no videos would
            # otherwise keep this loop requesting pages forever.
            break
        video_list.extend(more_videos)
    print(len(video_list))

    info = []  # list of dicts that will be compiled into a dataframe
    total = len(video_list)
    for count, video in enumerate(video_list):
        print('video  %s  of  %s' % (count, total))
        date = video.find('time')['datetime']
        link = video.find('a', class_='title')['href']
        if link.startswith('//'):
            # C-SPAN's links sometimes come back protocol-relative ('//www. ...')
            link = 'http:' + link
        title = video.find('h3').text

        names, full_text = _read_transcript(_fetch_soup(link + addon_url), name_yes, name_no)
        info.append({
            'title': title,
            'link': link,
            'date': date,
            'names': names,
            'text': full_text,
            'speaker': person_name,
        })

    # Name the columns explicitly (alphabetical order) so that a search with no
    # results still yields a frame with a 'text' column for callers to filter on.
    return pd.DataFrame(info, columns=['date', 'link', 'names', 'speaker', 'text', 'title'])
    
    

In [7]:
# One entry per candidate to scrape:
#   pid      - C-SPAN person id
#   name     - label used for the result frame and output file
#   name_yes - substrings that identify the candidate in a speaker label
#   name_no  - substrings that rule a speaker label out (e.g. relatives)
# Uncomment an entry to include that candidate in the run.
candidates = [
    dict(pid=19027, name='Clinton',
         name_yes=['hillary', 'clinton'],
         name_no=['bill', 'william', 'chelsea']),
    dict(pid=20967, name='Trump',
         name_yes=['donald', 'trump'],
         name_no=['eric', 'melania', 'ivanka', 'jr']),
    # dict(pid=994, name='Sanders', name_yes=['bernie', 'bernard', 'sanders'], name_no=['jane', 'levi']),
    # dict(pid=4776, name='Bush', name_yes=['jeb', 'bush'], name_no=['george', 'barbara']),
    # dict(pid=47822, name='Carson', name_yes=['ben', 'carson'], name_no=['candy', 'jr']),
    # dict(pid=1007174, name='Christie', name_yes=['chris', 'christie'], name_no=['patrick', 'andrew', 'sarah', 'bridget']),
    # dict(pid=1019953, name='Cruz', name_yes=['ted', 'cruz'], name_no=['heidi', 'rafael']),
    # dict(pid=1620, name='Kasich', name_yes=['kasich'], name_no=['karen', 'emma', 'reese']),
    # dict(pid=87599, name='Rubio', name_yes=['marco', 'rubio'], name_no=['jeanette']),
]

In [8]:
import time
# Scrape every configured candidate and report how long each scrape took.
# cand_df maps candidate name -> DataFrame of that candidate's transcripts.
t0 = time.time()
cand_df = {}
for candidate in candidates:
    scraped = id_to_df(
        candidate['pid'],
        candidate['name'],
        candidate['name_yes'],
        candidate['name_no'],
    )
    cand_df[candidate['name']] = scraped
    # keep only the videos in which some text was attributed to the candidate
    cand_df[candidate['name']] = scraped[scraped.text != '']
    print('%s %s' % (candidate['name'], time.time() - t0))
    t0 = time.time()  # restart the clock for the next candidate


100
147
video  0  of  147
video  1  of  147
video  2  of  147
video  3  of  147
video  4  of  147
video  5  of  147
video  6  of  147
video  7  of  147
video  8  of  147
video  9  of  147
video  10  of  147
video  11  of  147
video  12  of  147
video  13  of  147
video  14  of  147
video  15  of  147
video  16  of  147
video  17  of  147
video  18  of  147
video  19  of  147
video  20  of  147
video  21  of  147
video  22  of  147
video  23  of  147
video  24  of  147
video  25  of  147
video  26  of  147
video  27  of  147
video  28  of  147
video  29  of  147
video  30  of  147
video  31  of  147
video  32  of  147
video  33  of  147
video  34  of  147
video  35  of  147
video  36  of  147
video  37  of  147
video  38  of  147
video  39  of  147
video  40  of  147
video  41  of  147
video  42  of  147
video  43  of  147
video  44  of  147
video  45  of  147
video  46  of  147
video  47  of  147
video  48  of  147
video  49  of  147
video  50  of  147
video  51  of  147
video  52  of 

In [9]:
# Write each candidate's transcripts to "<name>_transcript_df.csv" in the
# current working directory.
for cand in cand_df:
    # Encoding is pinned to UTF-8: transcript text is scraped from the web and
    # pandas' default CSV encoding on Python 2 is ASCII, which fails on any
    # non-ASCII character.
    cand_df[cand].to_csv(cand + "_transcript_df.csv", encoding='utf-8')