In [1]:
#---------------------------------
#
#           SETTINGS
#
#---------------------------------
# Date range of the speeches to collect. Both ends are inclusive (the filter
# further down uses >= s_date and <= e_date).
# Each value must be written as "<day> <abbreviated month name> <year>",
# because dateTime() parses it with the format "%d %b %Y".
# NOTE: the index scan below starts at the hard-coded text "18 Jan 2009",
# so rows that precede that text in the page are never examined, whatever
# s_date is set to.
s_date = "20 Jan 2009"
e_date = "20 Jan 2010"

#---------------------------------
import pandas as pd
import requests 
from bs4 import BeautifulSoup, element
import time
import urllib3
import re

#Disable warnings
# Every request in this script is sent with verify=False (no TLS certificate
# check). This call turns off urllib3's warnings globally -- presumably to
# keep the per-request insecure-connection warning out of the output.
urllib3.disable_warnings()

#Format date
def dateTime (d):
    """Parse a date string such as "20 Jan 2009" into a time.struct_time.

    d must be "<day> <abbreviated month name> <year>" (format "%d %b %Y").
    The returned struct_time values compare chronologically with <, <=, ...
    time.strptime raises ValueError when d does not match the format.
    """
    dayMonthYear = "%d %b %Y"
    parsed = time.strptime(d, dayMonthYear)
    return parsed

#Initialize
# Parallel lists: entry i of every list describes the same speech. They
# become the columns of the output table, so they must stay the same length.
dateList = []
titleList = []
linkList = []
locationList = []
speechContent = []
count = 1 #ONLY FOR TRACING THE PROCESS

#Fetch Data
print("Loading the page...")
url = 'https://www.americanrhetoric.com/barackobamaspeeches.htm'
# verify=False skips TLS certificate verification. The timeout keeps the
# script from hanging forever if the server stops answering.
response = requests.get(url, verify=False, timeout=60)
# Stop here with the HTTP status instead of failing later on an error page.
response.raise_for_status()
print("Got the page, searching speeches...")
# NOTE(review): no parser is named, so BeautifulSoup picks the best one that
# is installed; the resulting tree can differ between environments.
doc = BeautifulSoup(response.text)


#Find the speeches dated between s_date and e_date (both inclusive)
# The scan starts at the text node "18 Jan 2009" and visits every later
# element with bgcolor '#E2EBFE'; the text of each is expected to be a date.
anchor = doc.find(text = "18 Jan 2009")
if anchor is None:
    raise RuntimeError('Could not find the text "18 Jan 2009" in ' + url
                       + '; the page layout may have changed.')

# Parse the range bounds once instead of on every row.
firstDay = dateTime(s_date)
lastDay = dateTime(e_date)

for dateCell in anchor.find_all_next(bgcolor = '#E2EBFE'):
    dateStr = dateCell.text.strip()
    try:
        speechDay = dateTime(dateStr)
    except ValueError:
        # The cell text is not a "%d %b %Y" date: skip this row only.
        print("Skipping a row whose date could not be read: " + repr(dateStr))
        continue
    if not (firstDay <= speechDay <= lastDay):
        continue

    #Find the title and link: the first <a> in the node two siblings after
    #the date cell.
    try:
        title = dateCell.next_sibling.next_sibling.find('a')
        titleText = title.text.strip()
        link = title.get('href')
    except AttributeError:
        # A sibling is missing or holds no <a>.
        link = None
    if not link:
        print("Skipping the row dated " + dateStr + ": no speech link found.")
        continue

    # Append to all three lists together so they can never get out of step.
    dateList.append(dateStr)
    titleList.append(titleText)
    linkList.append(link)
print("Got the speeches' list, scraping the speech...")

#Fetch the context

#Speech pages come in two layouts:
#1. The speech is wrapped in a div with class "entry-body".
#2. There is no such div and the content sits directly in a td.
#
#Clean-up applied to the speech container:
#1. Remove every element with align="center".
#2. Remove everything that follows the first hr tag.
#3. Remove all images.
#4. Remove all empty p tags.
#5. Remove the elements that wrap links to PDF files.

#Stored when a value cannot be extracted, so that locationList and
#speechContent always receive exactly one entry per link and all the result
#lists can still be combined into one table.
notAvailable = "N/A"

#Matches an href containing a literal ".pdf" (the dot is escaped on purpose).
pdfHref = re.compile(r"\.pdf")


def dropTag(tag):
    """Decompose tag unless an earlier removal already destroyed it.

    decompose() is the BeautifulSoup method that removes a tag from the tree
    and destroys it together with its contents. Removing a parent destroys
    its children too, and those children may still be waiting in a result
    list that was collected beforehand.
    """
    if not getattr(tag, "decomposed", False):
        tag.decompose()


def extractLocation(page):
    """Return the place named in the page's header line, or "N/A".

    The header is the first element with size="1" inside the first <table>,
    e.g. "delivered 12 February 2009, Lincoln Memorial, Washington, D.C.".
    Everything after the first ", " is taken as the location. "N/A" is
    returned when the header is missing or contains no ", ".
    """
    table = page.find("table")
    header = table.find(size="1") if table is not None else None
    if header is None:
        return notAvailable
    text = header.text.replace("\r\n", "").strip()
    # partition() splits at the first ", "; separator is "" when there is none.
    _, separator, place = text.partition(", ")
    return place if separator else notAvailable


def findSpeechBody(page):
    """Return the element that holds the speech, or None if there is none.

    One "entry-body" element: that element. Several: the second one in
    document order. None at all: the first <td> of the page (layout 2).
    """
    bodies = page.find_all(class_="entry-body")
    if len(bodies) == 1:
        return bodies[0]
    if bodies:
        return bodies[1]
    return page.find("td")


def cleanSpeech(body):
    """Strip the non-speech markup from body in place and return its text.

    The text is the concatenation of all remaining <p> elements, with
    "\\r\\n" sequences and tab characters removed.
    """
    #Delete useless parts
    for centered in body.find_all(align="center"):
        dropTag(centered)

    #Find <hr>, and remove it together with everything that follows it
    rule = body.find("hr")
    if rule is not None:
        for trailing in rule.find_all_next():
            dropTag(trailing)
        dropTag(rule)

    #Find <img>, and remove
    for image in body.find_all("img"):
        dropTag(image)

    #Remove empty <p>
    for paragraph in body.find_all("p"):
        if not paragraph.text.strip():
            dropTag(paragraph)

    #Remove links that point to ".pdf" files, with their wrapping element
    for pdfLink in body.find_all(href=pdfHref):
        wrapper = pdfLink.parent
        if wrapper is None:
            continue  # already removed together with an earlier wrapper
        # Never destroy the speech container itself: if the link sits
        # directly inside it, remove only the link.
        dropTag(pdfLink if wrapper is body else wrapper)

    #Format the speech
    text = "".join(paragraph.text for paragraph in body.find_all("p"))
    return text.replace('\r\n', '').replace('\t', '')


for count, link in enumerate(linkList, start=1):
    url = "https://www.americanrhetoric.com/" + link
    try:
        # verify=False skips TLS certificate verification. The timeout keeps
        # one unresponsive page from stalling the whole run.
        response = requests.get(url, verify=False, timeout=60)
        response.raise_for_status()
    except requests.RequestException as error:
        # Record placeholders and carry on with the remaining speeches.
        print("Something goes wrong with fetching " + url)
        print(error)
        locationList.append(notAvailable)
        speechContent.append(notAvailable)
    else:
        context = BeautifulSoup(response.text)
        locationList.append(extractLocation(context))

        speech = findSpeechBody(context)
        if speech is None:
            print("Couldn't find the speech in the page.")
            print('Something goes wrong with fetching ' + url)
            speechContent.append(notAvailable)
        else:
            speechContent.append(cleanSpeech(speech))

    print("{:.0%}".format(count/len(linkList))) #Output the process

print("Finished.")
fileName = input('\nPlease input your CSV file name...\n')

#Column name -> column values. The insertion order here is the column order
#of the CSV file.
total = {
    'Date': dateList,
    'Title': titleList,
    'Link': linkList,
    'Location': locationList,
    'Speech': speechContent,
}

#Output CSV
#The file is named "<user input>(<s_date> to <e_date>).csv"; the row index
#is not written.
csvName = "".join([fileName, "(", s_date, " to ", e_date, ").csv"])
df = pd.DataFrame(total)
df.to_csv(csvName, index=False)

print('DONE.')

Loading the page...
Got the page, searching speeches...
Got the speeches' list, scraping the speech...
Delivered 20 January 2009
2%
First Broadcast 24 January 2009, Washington, D.C.
Washington, D.C.
4%
delivered 26 January 2009, White House, Washington, D.C.
White House, Washington, D.C.
6%
delivered 4 February 2009, Grand Foyer, The White House
Grand Foyer, The White House
8%
delivered 9 February 2009
10%
delivered 12 February 2009, Lincoln Memorial, Washington, D.C.
Lincoln Memorial, Washington, D.C.
12%
delivered 12 February 2009, Crowne Plaza Hotel, Springfield, IL
Crowne Plaza Hotel, Springfield, IL
14%
delivered 24 February 2009
16%
delivered 27 February 2009, Camp Lejeune, North Carolina
Camp Lejeune, North Carolina
18%
delivered 10 March 2009
20%
delivered 2 April 2009, Excel Center, London, England
Excel Center, London, England
22%
delivered 3 April 2009, Rhenus Sports Arena
Rhenus Sports Arena
24%
delivered 5 April 2009, Czech Republic
Czech Republic
25%
delivered 6 April 200