# Crawling State of the Union Adresses from millercenter.org website

In [1]:
# Loading packages

import requests
from bs4 import BeautifulSoup
import os
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import re
import pandas as pd
import numpy as np

## Checking robots.txt

In [2]:
url = "https://millercenter.org/robots.txt"
r = requests.get(url)
robots = BeautifulSoup(r.text, 'html5lib')

print(robots)

<html><head></head><body>#
# robots.txt
#
# This file is to prevent the crawling and indexing of certain parts
# of your site by web crawlers and spiders run by sites like Yahoo!
# and Google. By telling these "robots" where not to go on your site,
# you save bandwidth and server resources.
#
# This file will be ignored unless it is at the root of your host:
# Used:    http://example.com/robots.txt
# Ignored: http://example.com/site/robots.txt
#
# For more information about the robots.txt standard, see:
# http://www.robotstxt.org/robotstxt.html

User-agent: *
# CSS, JS, Images
Allow: /core/*.css$
Allow: /core/*.css?
Allow: /core/*.js$
Allow: /core/*.js?
Allow: /core/*.gif
Allow: /core/*.jpg
Allow: /core/*.jpeg
Allow: /core/*.png
Allow: /core/*.svg
Allow: /profiles/*.css$
Allow: /profiles/*.css?
Allow: /profiles/*.js$
Allow: /profiles/*.js?
Allow: /profiles/*.gif
Allow: /profiles/*.jpg
Allow: /profiles/*.jpeg
Allow: /profiles/*.png
Allow: /profiles/*.svg
# Directories
Disallow: /core/
D

In [3]:
# It is not prohibited to crawl data from /the-presidency/presidential-speeches/
# Therefore we are fine to continue!

In [4]:
# creating directory for storing speeches

cwd =  os.getcwd()
if not os.path.exists(str(cwd) + "/speeches"):
    os.mkdir(str(cwd) + "/speeches")


In [5]:
# setting default download folder for chromedriver
# (do not forget to place the chromedriver in the working directory if not yet done)

chromeOptions = webdriver.ChromeOptions()
preferences = {"download.default_directory" : os.path.join(str(cwd),"/speeches")}
chromeOptions.add_experimental_option("prefs",preferences)

In [6]:
# connecting to browser

browser = webdriver.Chrome(executable_path = os.path.join(os.getcwd(), "chromedriver"), options=chromeOptions) 

In [7]:
# opening the https://millercenter.org/the-presidency/presidential-speeches website in chrome

browser.get("https://millercenter.org/the-presidency/presidential-speeches")

In [8]:
# This website is dynamic (as are the individual presidents sites). The further we scroll down 
# the more links to speeches are shown.

In [9]:
pause = 4 # had some interruptions with less than 3 seconds, with 3 and 4 s I made it to the bottom of the page

# Scrolling step by step automatically down, until the end of the page is reached
# Get scroll height
last_height = browser.execute_script("return document.body.scrollHeight")

while True:
    # Scroll down to bottom of the page
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # Wait to load page
    time.sleep(pause)

    # Calculate new scroll height and compare with previous scroll height
    new_height = browser.execute_script("return document.body.scrollHeight")
    
    if new_height == last_height:
        # Getting the last version of the html code of the site after scrolling finished
        html = browser.page_source
        soup = BeautifulSoup(html, 'html5lib')
        links = [link.get('href') for link in soup.find_all('a')]
        break
        
    last_height = new_height

In [10]:
# Getting all links out of the soup and deleting None's
links = [link.get('href') for link in soup.find_all('a')]
links = list(filter(None, links)) 

In [124]:
# Filtering for 'state-union-address' as part if the link and adding to stateoftheunion list

stateoftheunion = []

for l in links:
    if 'state-union-address' in l or "inaugural-address" in l: 
        stateoftheunion.append("https://millercenter.org"+l)


In [125]:
stateoftheunion

['https://millercenter.org/the-presidency/presidential-speeches/february-5-2019-state-union-address',
 'https://millercenter.org/the-presidency/presidential-speeches/january-30-2018-state-union-address',
 'https://millercenter.org/the-presidency/presidential-speeches/january-20-2017-inaugural-address',
 'https://millercenter.org/the-presidency/presidential-speeches/january-12-2016-2016-state-union-address',
 'https://millercenter.org/the-presidency/presidential-speeches/january-20-2015-2015-state-union-address',
 'https://millercenter.org/the-presidency/presidential-speeches/january-28-2014-2014-state-union-address',
 'https://millercenter.org/the-presidency/presidential-speeches/february-13-2013-2013-state-union-address',
 'https://millercenter.org/the-presidency/presidential-speeches/january-21-2013-second-inaugural-address',
 'https://millercenter.org/the-presidency/presidential-speeches/january-24-2012-2012-state-union-address',
 'https://millercenter.org/the-presidency/presidentia

In [109]:
# Question: why are some years missing? e.g. 2001, 1993, 1981 ? First year of new presidency no sotu speech?

In [110]:
# Next step: crawling / looping over all links and getting the speech text, storing it with some meaningful name
# presidents name + date.txt?

In [129]:
data = []

for link in stateoftheunion:

    r = requests.get(link)
    soup = BeautifulSoup(r.content, "html5lib")

    name = soup.find('p',attrs={"class":"president-name"})
    name = name.get_text("president-name")
    
    date = soup.find('p',attrs={"class":"episode-date"})
    date = date.get_text("episode-date")
    
    # if clause since transcript website is different for Hoover
    
    diff = ["Herbert Hoover", "Calvin Coolidge", "Warren G. Harding", "Abraham Lincoln", "William McKinley",
           "Theodore Roosevelt", "William Taft", "Woodrow Wilson", "William McKinley", "Ulysses S. Grant",
           "Rutherford B. Hayes", "James A. Garfield", "Grover Cleveland", "Benjamin Harrison",
           "Zachary Taylor", "Franklin Pierce", "James K. Polk", "William Harrison", "Martin Van Buren", 
           "Andrew Jackson", "John Quincy Adams", "James Monroe", "James Madison", "Thomas Jefferson",
           "John Adams", "George Washington", "Calvin Coolidge", "James Buchanan", "Franklin Pierce"]
    
    if name in diff:
        speech = soup.find('div',attrs={"class":"view-transcript"})
        speech = speech.get_text("view-transcript")
    else: 
        speech = soup.find('div',attrs={"class":"transcript-inner"})
        speech = speech.get_text("transcript-inner")
    
    if 'state-union-address' in link: type = "State of the Union" 
    else: type = "Inaugural Address"

    data.append([name, date, speech, type])
    
    print(name+" "+ date)

df = pd.DataFrame(data, columns = ['Name', 'Date', "Speech", "Type"])

Donald Trump February 05, 2019
Donald Trump January 30, 2018
Donald Trump January 20, 2017
Barack Obama January 12, 2016
Barack Obama January 20, 2015
Barack Obama January 28, 2014
Barack Obama February 13, 2013
Barack Obama January 21, 2013
Barack Obama January 24, 2012
Barack Obama January 25, 2011
Barack Obama January 27, 2010
Barack Obama January 20, 2009
George W. Bush January 28, 2008
George W. Bush January 23, 2007
George W. Bush January 31, 2006
George W. Bush February 02, 2005
George W. Bush January 20, 2005
George W. Bush January 20, 2004
George W. Bush January 28, 2003
George W. Bush January 29, 2002
George W. Bush January 20, 2001
Bill Clinton January 27, 2000
Bill Clinton January 19, 1999
Bill Clinton January 27, 1998
Bill Clinton February 04, 1997
Bill Clinton January 23, 1996
Bill Clinton January 24, 1995
Bill Clinton January 25, 1994
George H. W. Bush January 28, 1992
George H. W. Bush January 29, 1991
George H. W. Bush January 31, 1990
George H. W. Bush January 20, 198

In [139]:
# Adding political party affiliation to df

party_affil = {
    "Democrat":['Andrew Jackson', 'Martin Van Buren',
                'James Knox Polk','Franklin Pierce',
                'James Buchanan','Grover Cleveland',
               'Woodrow Wilson','Franklin D. Roosevelt',
       'Harry S. Truman','John F. Kennedy',
       'Lyndon B. Johnson','Jimmy Carter', 'Bill Clinton', 'Barack Obama'],
    "Republican":['Abraham Lincoln', 'Ulysses S. Grant',
       'Rutherford B. Hayes', 'James A. Garfield','Benjamin Harrison', 'William McKinley', 'Theodore Roosevelt',
       'William Howard Taft','Warren G. Harding', 'Calvin Coolidge', 'Herbert Hoover', 'Dwight D. Eisenhower', 
        'Richard M. Nixon', 'Ronald Reagan', 'George H. W. Bush', 'George W. Bush','Donald Trump', 'Gerald Ford']
    }

def get_party_affil(name):
    for party,names in party_affil.items():
        if name in names:
            return party
        
df["Party"] = df["Name"].apply(lambda x : get_party_affil(x))

In [140]:
df

Unnamed: 0,Name,Date,Speech,Type,Party
0,Donald Trump,"February 05, 2019",\n transcript-innerTranscripttranscript-i...,State of the Union,Republican
1,Donald Trump,"January 30, 2018",\n transcript-innerTranscripttranscript-i...,State of the Union,Republican
2,Donald Trump,"January 20, 2017",\n transcript-innerTranscripttranscript-i...,Inaugural Address,Republican
3,Barack Obama,"January 12, 2016",\n transcript-innerTranscripttranscript-i...,State of the Union,Democrat
4,Barack Obama,"January 20, 2015",\n transcript-innerTranscripttranscript-i...,State of the Union,Democrat
5,Barack Obama,"January 28, 2014",\n transcript-innerTranscripttranscript-i...,State of the Union,Democrat
6,Barack Obama,"February 13, 2013",\n transcript-innerTranscripttranscript-i...,State of the Union,Democrat
7,Barack Obama,"January 21, 2013",\n transcript-innerTranscripttranscript-i...,Inaugural Address,Democrat
8,Barack Obama,"January 24, 2012",\n transcript-innerTranscripttranscript-i...,State of the Union,Democrat
9,Barack Obama,"January 25, 2011",\n transcript-innerTranscripttranscript-i...,State of the Union,Democrat


In [134]:
# Save df to disk

df.to_csv("speeches/data.csv")