In [1]:
import sys
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
from tqdm import tqdm
import time

In [2]:
# First site to parse: https://www.americanrhetoric.com/speeches/

In [3]:
def list_to_file(python_list, path):
    with open(path, 'w') as f:
        f.write('\n'.join(python_list))
        
def remove_dupes_file(file_path, new_file_path):
    with open(file_path, 'r') as f:
        lines = f.read().splitlines()
        unique_lines = set(lines)
    with open(new_file_path, 'w') as g:
        g.write('\n'.join(unique_lines))

In [38]:
def get_links(url):
    '''Extracts all links on a URL. Returns only speeches links
    (what we want) plus all links to make sure'''
    # Need 'User-Agent' to scrape americanrhetoric
    html_page = requests.get(url, headers={'User-Agent':'Mozilla/5.0'}).text
    soup = BeautifulSoup(html_page)
    links = []
    all_links = []
    for aref in soup.findAll('a'):
        link = aref.get('href')
        if link is not None and link[:9] == 'speeches/':
            links.append(link)
        all_links.append(link)
    print('{} contains {} total links and {} speech links'.format(url, len(soup.findAll('a')), len(links)))
    return links, all_links

In [49]:
# From the americanrethoric.com website
rethoric_links = []
sumall = []
sections = ['a-f', 'g-l', 'm-r', 's-z']
for section in sections:
    url = 'https://www.americanrhetoric.com/speechbank{}.htm'.format(section)
    links, all_links = get_links(url)
    print("{} links in section '{}'".format(len(links), url))
    _ = [rethoric_links.append(link) for link in links if link is not None]
    sumall.extend(all_links)
print("Total Non-Unique Speech Links: {}".format(len(rethoric_links)))

https://www.americanrhetoric.com/speechbanka-f.htm contains 679 total links and 448 speech links
448 links in section 'https://www.americanrhetoric.com/speechbanka-f.htm'
https://www.americanrhetoric.com/speechbankg-l.htm contains 565 total links and 432 speech links
432 links in section 'https://www.americanrhetoric.com/speechbankg-l.htm'
https://www.americanrhetoric.com/speechbankm-r.htm contains 569 total links and 421 speech links
421 links in section 'https://www.americanrhetoric.com/speechbankm-r.htm'
https://www.americanrhetoric.com/speechbanks-z.htm contains 516 total links and 255 speech links
255 links in section 'https://www.americanrhetoric.com/speechbanks-z.htm'
Total Non-Unique Speech Links: 1556


In [50]:
rethoric_links = sorted(list(set(rethoric_links)))
print("Total Unique Speech Links: {}".format(len(rethoric_links)))

Total Unique Speech Links: 1303


In [51]:
list_to_file(rethoric_links, "american_rethoric_links.txt")

In [52]:
rethoric_links

['speeches/1is2manycampusassaultpsa.htm',
 'speeches/9thearlspencerdianaeulogy.htm',
 'speeches/OsloII/hosnimubarakosloII.htm',
 'speeches/OsloII/husseinbintalalosloII.htm',
 'speeches/OsloII/mahmoudabbasoslo2.htm',
 'speeches/OsloII/shimonperesosloII.htm',
 'speeches/OsloII/warrenchristopherosloII.htm',
 'speeches/OsloII/wjclintonosloII.htm',
 'speeches/OsloII/yitzhakrabinosloII.htm',
 'speeches/PDFFiles/George%20W.%20Bush%20-%20Selected%20Speeches.pdf',
 'speeches/aaronswartzf2cconference.htm',
 'speeches/abbott&costellowhosonfirst.htm',
 'speeches/abrahamlincolncooperunionaddress.htm',
 'speeches/abrahamlincolnhousedivided.htm',
 'speeches/abrahamlincolninauguraladdress.htm',
 'speeches/abrahamlincolnmissouricompromiserepeal.htm',
 'speeches/abrahamlincolnsecondinauguraladdress.htm',
 'speeches/abrahamribicoff1968dnc.htm',
 'speeches/abrahamsinkovcryptologicfieldinterview.htm',
 'speeches/adamschiffcongressrussianactivemeasures.htm',
 'speeches/adamschiffyoumightthinkitsok.htm',
 's

In [53]:
remove_dupes_file('list_of_presidents.txt', 'unique_list_of_presidents.txt')

In [78]:
def extract_president_names(file_path):
    '''Gets individual names and last names as list from text file'''
    with open(file_path, 'r') as f:
        names = f.read().splitlines()
        names_unique = set(names)
        print("Total politician full names: {}. Unique: {}".format(len(names), len(names_unique)))
        split_names = []
        _ = [split_names.extend(name.split(' ')) for name in names_unique]
        president_names = [item for item in split_names if len(item) > 2]
        print("Total splitted names to search: {}".format(len(president_names)))
        return president_names
              

In [79]:
democratic_names = extract_president_names('democratic_politicians.txt')
republican_names = extract_president_names('republican_politicians.txt')

Total politician full names: 17. Unique: 16
Total splitted names to search: 16
Total politician full names: 14. Unique: 13
Total splitted names to search: 13


In [80]:
# Removin 3 letter names and shared names
democratic_names = [i for i in democratic_names if len(i) > 3]
republican_names = [i for i in republican_names if len(i) > 3]
democratic_names = list(set(democratic_names).difference(republican_names))
republican_names = list(set(republican_names).difference(democratic_names))

In [81]:
democratic_names

['Mondale',
 'Gore',
 'Biden',
 'McGovern',
 'Barack',
 'Johnson',
 'Lyndon',
 'Clinton',
 'Ford',
 'Kerry',
 'Carter',
 'Humphrey',
 'Obama',
 'Kennedy',
 'Dukakis',
 'Hillary']

In [83]:
def extract_matching_urls_amrethoric(politician_names, links):
    '''Extract links with politician name in them, from the list
    of links in the americanrethoric site. Returns a dictionary 
    with the names as keys, and a flat list with all links'''
    presidential_urls = dict()
    for name in politician_names:
        # Extract all links that aren't None and that match the lowercased name and that end in htm
        matching = [link for link in rethoric_links if link is not None and name.lower() in link.lower() and link[-4:]== '.htm']
        presidential_urls[name] = matching
    list_of_links = []
    _ = [list_of_links.extend(val) for val in presidential_urls.values()]
    unique_links = set(list_of_links)
    print("Unique links: {}, Matching links: {}, Total links: {}".format(
        len(unique_links), len(list_of_links), len(links)))
    return unique_links, list_of_links, presidential_urls

In [84]:
dem_links_list = extract_matching_urls_amrethoric(democratic_names, rethoric_links)[0]
rep_links_list = extract_matching_urls_amrethoric(republican_names, rethoric_links)[0]

Unique links: 132, Matching links: 155, Total links: 1303
Unique links: 137, Matching links: 138, Total links: 1303


In [85]:
list(dem_links_list)[:5]

['speeches/algorenobellecture.htm',
 'speeches/dalebumpersdefenseofclinton.htm',
 'speeches/ashcarternewamericancenter.htm',
 'speeches/johnkerrytranspacifictradepactboeing.htm',
 'speeches/convention2008/wjclinton2008dnc.htm']

In [86]:
# At this point we need to check manually the list of speeches to scrape,
# remove what's not a political speech or the right politician, and read again
list_to_file(dem_links_list, "democratic_speeches_list.txt")
list_to_file(rep_links_list, "republican_speeches_list.txt")

# Need to remove manually from file

In [87]:
def join_one_speech(speech_paragraphs):
    '''Extracts and processes the text from a single speech/page'''
    speech_joined = []
    for paragraph in speech_paragraphs:
        clean_paragraph = paragraph.get_text(strip=True).replace('\r\n\t\t\t', '')
        speech_joined.append(clean_paragraph)
    speech_joined = ' '.join(speech_joined)
    return speech_joined

In [92]:
def process_links_list(link_list):
    '''Takes a list of URLS with speeches and returns the clean speeches (from americanrethoric)'''
    root_url = 'https://www.americanrhetoric.com/'
    all_speeches = []
    for link in tqdm(link_list):
        url = root_url + link
        html_page = requests.get(url).text
        soup = BeautifulSoup(html_page, from_encoding='cp1252')
        paragraphs = soup.find_all('p', style="line-height: 130%")
        speech_clean = join_one_speech(paragraphs)
        if speech_clean:
            all_speeches.append(speech_clean)
    time.sleep(3)
    print("Finished list of links. {} speeches saved of {} links processed".format(
                    len(all_speeches), len(link_list)))
    return all_speeches

In [93]:
americanrethoric_democrat = process_links_list(dem_links_list)
americanrethoric_republican = process_links_list(rep_links_list)

100%|██████████| 132/132 [00:14<00:00,  9.29it/s]
  1%|          | 1/137 [00:00<00:19,  7.09it/s]

Finished list of links. 0 speeches saved of 132 links processed


100%|██████████| 137/137 [00:14<00:00,  9.35it/s]


Finished list of links. 0 speeches saved of 137 links processed


In [94]:
americanrethoric_democrat

[]

In [None]:
# Once we're inside the page, check which html tag has the title of the speech
# See if we can separate Autor from Title
# Then, classify politician as Democrat or Republican depending based on my list

In [22]:
print('Democratic speeches number of words: {:,}'.format(len('\n'.join(americanrethoric_democrat).split())))
print('Republican speeches number of words: {:,}'.format(len('\n'.join(americanrethoric_republican).split())))

Democratic speeches number of words: 280,763
Republican speeches number of words: 303,904


In [21]:
americanrethoric_democrat # x92, x93, x94 .decode('unciode-escape').encode('ascii','utf8', erros=Ignore)

['Thank you. Thank you. Thank you. Thank you, Dr. Falwell. Thank you, faculty, families and friends, and thank you Liberty University Class of 2006 for your welcome and your kind invitation to give this year\x92s commencement address. I want to join in the chorus of congratulations to the Class of 2006. This is a day to bask in praise. You\x92ve earned it. You have succeeded in a demanding course of instruction. Life seems full of promise as is always the case when a passage in life is marked by significant accomplishment. Today, it might seem as if the world attends you. But spare a moment for those who have truly attended you so well for so long, and whose pride in your accomplishments is even greater than your own -- your parents. When the world was looking elsewhere your parents\' attention was one of life\x92s certainties. So, as I commend you, I offer equal praise to your parents for the sacrifices they made for you, for their confidence in you and their love. More than any other

**NOTES**

- I need to check for the most important politicians (Obama, Trump...) as they seem to have a separate page on the americanrethoric site
- We should show a graph with how much data is from each individual person. Ex: is likely that there will be more Obama speeches than anybody eslse, so classifying as democrat will be skewed by that - almost like classified as Obama-like.

In [None]:
# It's a possibility, we can check and potentially build different models
# Create EDA graphs based on politician "weight"

**QUESTIONS**
1. Which format should I save the data in? DataFrame? Which structure - columns and index names?
2. Issue with encoding - seems to be not using the right encoding 'cp1252', but also seems like the apostrophes are escaped in the final string?
- Difference between bytes and string types in Python 3. 
- How to use encode and decode
3. How to calculate how much data we have? Number of speeches is misleading as they might be very different in length. Number of characters? Number of words?

In [None]:
# every row should be one speech
# df Columns = 'Speech', 'President', 'Affiliation'
# df Row = one single speech

In [None]:
# Build dataframe
# EDA: speeches per politician, per category > Is that enough?
# Next: prepocessing speeches > lower casing,  contraction expansion, removing characters, stopwords, lemmatization

In [None]:
# Would like to have 2 data groups to compare: recent speeches 
# (Trump, Obama, Biden, Hillary, etc) and historic speeches (Nixon, Kennedy, etc)

In [None]:
# Speeches types: Rally, addressing the nation, 