In [304]:
import sys
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
from tqdm import tqdm
import time

In [None]:
# First site to parse: https://www.americanrhetoric.com/speeches/

In [142]:
def list_to_file(python_list, path):
    with open(path, 'w') as f:
        f.write('\n'.join(python_list))
        
def remove_dupes_file(file_path, new_file_path):
    with open(file_path, 'r') as f:
        lines = f.read().splitlines()
        unique_lines = set(lines)
    with open(new_file_path, 'w') as g:
        g.write('\n'.join(unique_lines))

In [263]:
def get_links(url):
    '''Extracts all links on a URL'''
    html_page = requests.get(url).text
    soup = BeautifulSoup(html_page)
    links = []
    for link in soup.findAll('a'):
        links.append(link.get('href'))
    return links

In [109]:
# From the americanrethoric.com website
rethoric_links = []
sections = ['a-f', 'g-l', 'm-r', 's-z']
for section in sections:
    url = 'https://www.americanrhetoric.com/speechbank{}.htm'.format(section)
    links = get_links(url)
    print("{} links in section '{}'".format(len(links), section))
    [rethoric_links.append(link) for link in links if link is not None]
print("Total links: {}".format(len(rethoric_links)))


669 links in section 'a-f'
554 links in section 'g-l'
560 links in section 'm-r'
509 links in section 's-z'
Total links: 2245


In [110]:
list_to_file(rethoric_links, "american_rethoric_links.txt")

In [143]:
remove_dupes_file('list_of_presidents.txt', 'unique_list_of_presidents.txt')

In [147]:
def extract_president_names(file_path):
    '''Gets individual names and last names as list from text file'''
    with open(file_path, 'r') as f:
        names = f.read().splitlines()
        names_unique = set(names)
        print("Total politician full names: {}. Unique: {}".format(len(names), len(names_unique)))
        split_names = []
        _ = [split_names.extend(name.split(' ')) for name in names_unique]
        president_names = [item for item in split_names if len(item) > 2]
        print("Total splitted names to search: {}".format(len(president_names)))
        return president_names
              

In [148]:
democratic_names = extract_president_names('democratic_politicians.txt')
republican_names = extract_president_names('republican_politicians.txt')

Total politician full names: 14. Unique: 14
Total splitted names to search: 27
Total politician full names: 14. Unique: 14
Total splitted names to search: 28


In [164]:
def extract_matching_urls_amrethoric(politician_names, links):
    '''Extract links with politician name in them, from the list
    of links in the americanrethoric site. Returns a dictionary 
    with the names as keys, and a flat list with all links'''
    presidential_urls = dict()
    for name in politician_names:
        # Extract all links that aren't None and that match the lowercased name and that end in htm
        matching = [link for link in rethoric_links if link is not None and name.lower() in link.lower() and link[-4:]== '.htm']
        presidential_urls[name] = matching
    list_of_links = []
    _ = [list_of_links.extend(val) for val in presidential_urls.values()]
    unique_links = set(list_of_links)
    print("Unique links: {}, Matching links: {}, Total links: {}".format(
        len(unique_links), len(list_of_links), len(links)))
    return unique_links, list_of_links, presidential_urls

In [165]:
dem_links_list = extract_matching_urls_amrethoric(democratic_names, rethoric_links)[0]
rep_links_list = extract_matching_urls_amrethoric(republican_names, rethoric_links)[0]

Unique links: 217, Matching links: 379, Total links: 2245
Unique links: 243, Matching links: 445, Total links: 2245


In [168]:
list(dem_links_list)[:5]

['speeches/johnmccainlibertycommencement.htm',
 'speeches/johnmccainmunichsecurityconference2017.htm',
 'speeches/philknightjoepaternomemorial.htm',
 'speeches/wjclintonmemphis.htm',
 'speeches/brianmulroneyeulogyforgeorgehwbush.htm']

In [286]:
def join_one_speech(speech_paragraphs):
    '''Extracts and processes the text from a single speech/page'''
    speech_joined = []
    for paragraph in speech_paragraphs:
        clean_paragraph = paragraph.get_text(strip=True).replace('\r\n\t\t\t', '')
        speech_joined.append(clean_paragraph)
    speech_joined = ' '.join(speech_joined)
    return speech_joined

In [311]:
def process_links_list(link_list):
    '''Takes a list of URLS with speeches and returns the clean speeches (from americanrethoric)'''
    root_url = 'https://www.americanrhetoric.com/'
    all_speeches = []
    for link in tqdm(link_list):
        url = root_url + link
        html_page = requests.get(url).text
        soup = BeautifulSoup(html_page, from_encoding='cp1252')
        paragraphs = soup.find_all('p', style="line-height: 130%")
        speech_clean = join_one_speech(paragraphs)
        if speech_clean:
            all_speeches.append(speech_clean)
    time.sleep(3)
    print("Finished list of links. {} speeches saved of {} links processed".format(
                    len(all_speeches), len(link_list)))
    return all_speeches

In [312]:
americanrethoric_democrat = process_links_list(dem_links_list)
americanrethoric_republican = process_links_list(rep_links_list)

100%|██████████| 217/217 [00:50<00:00,  4.31it/s]
  0%|          | 0/243 [00:00<?, ?it/s]

Finished list of links. 106 speeches saved of 217 links processed


100%|██████████| 243/243 [00:59<00:00,  4.07it/s]


Finished list of links. 121 speeches saved of 243 links processed


In [303]:
print('Democratic speeches number of words: {:,}'.format(len('\n'.join(americanrethoric_democrat).split())))
print('Democratic speeches number of words: {:,}'.format(len('\n'.join(americanrethoric_republican).split())))

Democratic speeches number of words: 280,763
Democratic speeches number of words: 303,904


**NOTES**

- We should show a graph with how much data is from each individual person. Ex: is likely that there will be more Obama speeches than anybody eslse, so classifying as democrat will be skewed by that - almost like classified as Obama-like.

**QUESTIONS**
1. Which format should I save the data in? DataFrame? Which structure - columns and index names?
2. Issue with encoding - seems to be not using the right encoding 'cp1252', but also seems like the apostrophes are escaped in the final string?
- Difference between bytes and string types in Python 3. 
- How to use encode and decode
3. How to calculate how much data we have? Number of speeches is misleading as they might be very different in length. Number of characters? Number of words?

In [None]:
# Below is attempt to parse the rev.com site. Will try further ahead.

In [None]:
sys.path.append('/Users/josemontoro/data/personal/springboard/repos/PySitemap')
import crawler

In [7]:
url = 'https://www.rev.com/blog/transcripts/'

crawl = crawler.Crawler(url)

In [None]:
links = crawl.start()

In [None]:
# Below is attempt to parse the Millcenter.org site. Will try further ahead.

president_ids = [39, 43, 8396, 40, 35, 42, 38, 34, 37,41]
presidents = ['Reagan', 'Obama', 'Trump', 'H.W.Bush', 'Lyndon', 'Bush', 'Carter', 'Kennedy', 'Ford', 'Clinton']

# URL structure = https://millercenter.org/the-presidency/presidential-speeches?field_president_target_id[43]=43


for president_id in president_ids:
    url = 'https://millercenter.org/the-presidency/presidential-speeches?field_president_target_id[{}]={}'.format(
    president_id, president_id)
    links = get_links(url)
    break
links