# Crawling State of the Union Addresses from the millercenter.org website

In [1]:
# Loading packages

import requests
from bs4 import BeautifulSoup
import os
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import re
import pandas as pd
import numpy as np

## Checking robots.txt

In [2]:
# Fetch the site's robots.txt and print it so the crawl rules can be inspected
url = "https://millercenter.org/robots.txt"
# A timeout keeps the cell from hanging indefinitely if the server never answers
r = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx so an error page is not mistaken for the robots rules
r.raise_for_status()
# NOTE: robots.txt is plain text; parsing it with html5lib wraps it in
# <html><head></head><body>, which is why the printed output starts with those tags
robots = BeautifulSoup(r.text, 'html5lib')

print(robots)

<html><head></head><body>#
# robots.txt
#
# This file is to prevent the crawling and indexing of certain parts
# of your site by web crawlers and spiders run by sites like Yahoo!
# and Google. By telling these "robots" where not to go on your site,
# you save bandwidth and server resources.
#
# This file will be ignored unless it is at the root of your host:
# Used:    http://example.com/robots.txt
# Ignored: http://example.com/site/robots.txt
#
# For more information about the robots.txt standard, see:
# http://www.robotstxt.org/robotstxt.html

User-agent: *
# CSS, JS, Images
Allow: /core/*.css$
Allow: /core/*.css?
Allow: /core/*.js$
Allow: /core/*.js?
Allow: /core/*.gif
Allow: /core/*.jpg
Allow: /core/*.jpeg
Allow: /core/*.png
Allow: /core/*.svg
Allow: /profiles/*.css$
Allow: /profiles/*.css?
Allow: /profiles/*.js$
Allow: /profiles/*.js?
Allow: /profiles/*.gif
Allow: /profiles/*.jpg
Allow: /profiles/*.jpeg
Allow: /profiles/*.png
Allow: /profiles/*.svg
# Directories
Disallow: /core/
D

In [3]:
# It is not prohibited to crawl data from /the-presidency/presidential-speeches/
# Therefore we are fine to continue!

In [4]:
# Create the directory in which the speeches will be stored.
# `cwd` stays a notebook-level name because the download-folder cell reads it.

cwd = os.getcwd()
speeches_dir = os.path.join(cwd, "speeches")
# exist_ok=True makes the cell safe to re-run and removes the gap between
# "check whether it exists" and "create it"
os.makedirs(speeches_dir, exist_ok=True)


In [5]:
# Set the default download folder for chromedriver to the local "speeches" directory
# (do not forget to place the chromedriver in the working directory if not yet done)

chromeOptions = webdriver.ChromeOptions()
# The joined component must be relative: os.path.join drops the preceding
# components when a later one is absolute, so "/speeches" would ignore cwd
# and point at the filesystem root instead of the working directory.
preferences = {"download.default_directory": os.path.join(str(cwd), "speeches")}
chromeOptions.add_experimental_option("prefs", preferences)

In [6]:
# Start Chrome through the chromedriver binary located in the working directory

chromedriver_path = os.path.join(os.getcwd(), "chromedriver")
browser = webdriver.Chrome(executable_path=chromedriver_path, options=chromeOptions)

In [7]:
# Load the presidential-speeches overview page of millercenter.org in Chrome

speeches_url = "https://millercenter.org/the-presidency/presidential-speeches"
browser.get(speeches_url)

In [8]:
# This website is dynamic (as are the individual presidents' pages): the further we scroll down,
# the more links to speeches are shown.

In [9]:
# Seconds to wait after each scroll; shorter waits (< 3 s) led to interruptions,
# with 3 and 4 s the bottom of the page was reached
pause = 4

# Keep scrolling to the bottom until the page height stops growing,
# i.e. until no further content is loaded.
last_height = browser.execute_script("return document.body.scrollHeight")

while True:
    # Jump to the current bottom of the page
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # Give the page time to load more entries
    time.sleep(pause)

    # Unchanged height after scrolling means the end of the page was reached
    new_height = browser.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break

    last_height = new_height

# Parse the final state of the page once scrolling has finished
html = browser.page_source
soup = BeautifulSoup(html, 'html5lib')
links = [anchor.get('href') for anchor in soup.find_all('a')]

In [10]:
# Collect the href of every anchor in the soup, keeping only non-empty values
# (anchors without an href yield None and are dropped)
hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
links = [href for href in hrefs if href]

In [14]:
# Keep only the links that contain 'state-union-address' and prefix each
# with the site root to build the stateoftheunion list of full URLs

stateoftheunion = [
    "https://millercenter.org" + href
    for href in links
    if 'state-union-address' in href
]


In [15]:
# Display the collected State of the Union URLs as the cell output
stateoftheunion

['https://millercenter.org/the-presidency/presidential-speeches/february-5-2019-state-union-address',
 'https://millercenter.org/the-presidency/presidential-speeches/january-30-2018-state-union-address',
 'https://millercenter.org/the-presidency/presidential-speeches/january-12-2016-2016-state-union-address',
 'https://millercenter.org/the-presidency/presidential-speeches/january-20-2015-2015-state-union-address',
 'https://millercenter.org/the-presidency/presidential-speeches/january-28-2014-2014-state-union-address',
 'https://millercenter.org/the-presidency/presidential-speeches/february-13-2013-2013-state-union-address',
 'https://millercenter.org/the-presidency/presidential-speeches/january-24-2012-2012-state-union-address',
 'https://millercenter.org/the-presidency/presidential-speeches/january-25-2011-2011-state-union-address',
 'https://millercenter.org/the-presidency/presidential-speeches/january-27-2010-2010-state-union-address',
 'https://millercenter.org/the-presidency/pres

In [None]:
# Question: why are some years missing (e.g. 2001, 1993, 1981)? Is there no SOTU speech in the first year of a new presidency?

In [None]:
# Next step: crawling / looping over all links and getting the speech text, storing it under some meaningful name
# (e.g. president's name + date.txt?)