In [62]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from requests import get
from requests.exceptions import RequestException
from sklearn.feature_extraction.text import CountVectorizer

In [63]:

url = "https://www.presidency.ucsb.edu/documents/presidential-documents-archive-guidebook/annual-messages-congress-the-state-the-union"

# Download the index page. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops us from parsing an HTTP error
# page as if it were the real index.
response = get(url, timeout=30)
response.raise_for_status()

# Find the first instance of a table on the page (this will simplify work for us)
soup = BeautifulSoup(response.text, 'html.parser')
table = soup.find('table')
if table is None:
    # soup.find returns None when nothing matches; fail here with a clear message
    # rather than with a confusing error on first use of `table`.
    raise ValueError(f"No <table> element found at {url}")

# Create an empty list to keep all the links
links = []

# Alternative approach, kept for reference: look through each row of the table
# and take only the FIRST link of the row; rows without any link are skipped.

#trs = table.find_all('tr')
#for tr in trs:
#    try:
#        link = tr.find('a')['href'] # Finds the first link in a row
#        links.append(link) # Appends that link to the links list.
#    except:
#        pass



In [64]:
# Instead of taking only the first link of each row, collect EVERY link found in
# every row of the table. This solves a more general version of the homework.

# Build the row list and start from an empty link list inside this cell, so that
# it only depends on `table` and can be re-run without duplicating links.
trs = table.find_all('tr')
links = []

for tr in trs:
    # href=True keeps only the <a> tags that actually carry an href attribute,
    # so no try/except is needed: rows without links simply contribute nothing.
    for a in tr.find_all('a', href=True):
        links.append(a['href'])  # Appends that link to the links list.

links

['https://www.presidency.ucsb.edu/documents/address-before-joint-session-the-congress-3',
 'https://www.presidency.ucsb.edu/documents/address-before-joint-session-the-congress-the-state-the-union-28',
 'https://www.presidency.ucsb.edu/ws/index.php?pid=123408',
 'https://www.presidency.ucsb.edu/documents/address-before-joint-session-the-congress-the-state-the-union-25',
 'https://www.presidency.ucsb.edu/documents/address-before-joint-session-the-congress-the-state-the-union-26',
 'https://www.presidency.ucsb.edu/documents/address-before-joint-session-the-congress-the-state-the-union-27',
 'https://www.presidency.ucsb.edu/ws/index.php?pid=102826',
 'https://www.presidency.ucsb.edu/ws/index.php?pid=104596',
 'https://www.presidency.ucsb.edu/ws/index.php?pid=108031',
 'https://www.presidency.ucsb.edu/ws/index.php?pid=111174',
 'https://www.presidency.ucsb.edu/ws/index.php?pid=85753',
 'https://www.presidency.ucsb.edu/ws/index.php?pid=87433',
 'https://www.presidency.ucsb.edu/ws/index.php?p

In [65]:
# Note on the "first link in a row" approach: it needs try: and except:,
# because tr.find('a') returns None for rows that don't have any links,
# and None['href'] raises a TypeError.
# Uncomment the code below to see this for yourself.

#for tr in table.find_all('tr'):
#    link = tr.find('a')['href'] # Finds the first link in a row
#    links.append(link) # Appends that link to the links list.


In [66]:
# `links` now holds every link we need. Write them to an Excel workbook
# in case they are needed again later.

pd.DataFrame(data=links).to_excel(excel_writer="links.xlsx")


In [None]:
# THIS CHUNK WILL TAKE A LONG TIME TO RUN BECAUSE WE ARE SCRAPING MANY PAGES.
# GET IT RUNNING AND GO GET SOME COFFEE WITH CHOCOLATE CHIP COOKIES
# DID I EVER MENTION THE WAY TO BE MY FRIEND IS TO BRING ME CHOCOLATE CHIP COOKIES?


# We can now go through individual links and extract the information we want.
# If we examine the pages of each speech, we will notice each speech is stored inside
# the following tag "<div class="field-docs-content"> </div>".
# That's what we can grab.

# We can also grab the date stored inside the following tag:
#  <div class="field-docs-start-date-time"></div>

# Finally, to make our life even easier, let's grab the name of the President.
# It's in the tag "<div class="field-title"></div>"

# I will also grab the title of the speech ("<div class="field-ds-doc-title"></div>")
# just in case I need to use it, since the titles differed somewhat.


# We start with creating empty containers

names = []
dates = []
speeches = []
titles = []
skipped_links = []  # links that could not be downloaded, kept for inspection


def _field_text(page, css_class):
    """Return the stripped text of the first <div class=css_class> in `page`.

    Returns None when the page has no such div, so a single oddly formatted
    page records a missing value instead of aborting the whole scrape.
    """
    node = page.find("div", class_=css_class)
    if node is None:
        return None
    return node.get_text(strip=True)


for link in links:
    try:
        # The timeout prevents one stalled request from blocking the whole run;
        # raise_for_status() turns HTTP error responses into exceptions.
        response = get(link, timeout=30)
        response.raise_for_status()
    except RequestException as error:
        # Some entries are not regular speech pages (the original note mentions
        # Nixon's 1973 entry, which is a footnote). Skip them: falling through
        # would re-parse the PREVIOUS page's response and store a duplicate row.
        print(f"Skipping {link!r}: {error}")
        skipped_links.append(link)
        continue
    soup = BeautifulSoup(response.text, 'html.parser')
    name = _field_text(soup, "field-title")
    date = _field_text(soup, "field-docs-start-date-time")
    title = _field_text(soup, "field-ds-doc-title")
    speech = _field_text(soup, "field-docs-content")
    # Append to all four lists together so they stay the same length.
    names.append(name)
    dates.append(date)
    titles.append(title)
    speeches.append(speech)

    
    

In [None]:
# Combine the four scraped lists into one table (one row per speech) and show it.
df = pd.DataFrame(
    dict(name=names, date=dates, title=titles, speech=speeches)
)
display(df)

In [None]:
# Write the assembled speeches table to an Excel workbook.
df.to_excel(excel_writer="speeches.xlsx")
