In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import requests 
from bs4 import BeautifulSoup as soup

header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:124.0) Gecko/20100101 Firefox/124.0'} 


# Getting The Dataframe Setup

In [2]:
url = 'https://www.presidency.ucsb.edu/documents/app-categories/spoken-addresses-and-remarks/presidential/inaugural-addresses?items_per_page=60'
raw = requests.get(url,headers=header)


In [3]:
bsObj = soup(raw.content,'html.parser') # Parse the html
links = bsObj.find_all('div', class_="field-title") 
names = bsObj.find_all('div', class_="col-sm-4 margin-top")




In [5]:
president_names = [name.p.a.text for name in names]

speech_links = [title.find('a')['href'] for title in links]



In [6]:
# add second list
url = 'https://www.presidency.ucsb.edu/documents/app-categories/spoken-addresses-and-remarks/presidential/inaugural-addresses?items_per_page=60&page=1'
raw = requests.get(url,headers=header)

In [7]:
bsObj = soup(raw.content,'html.parser') 
links = bsObj.find_all('div', class_="field-title") 
names = bsObj.find_all('div', class_="col-sm-4 margin-top")




In [8]:
president_names.extend([name.p.a.text for name in names])

speech_links.extend([title.find('a')['href'] for title in links])


In [9]:
data = {'Name': president_names,
        'Link': speech_links}

# Create DataFrame
df = pd.DataFrame(data)

df


Unnamed: 0,Name,Link
0,Donald J. Trump (2nd Term),/documents/inaugural-address-54
1,"Joseph R. Biden, Jr.",/documents/inaugural-address-53
2,Donald J. Trump (1st Term),/documents/inaugural-address-14
3,Barack Obama,/documents/inaugural-address-15
4,Barack Obama,/documents/inaugural-address-5
...,...,...
58,Thomas Jefferson,/documents/inaugural-address-20
59,Thomas Jefferson,/documents/inaugural-address-19
60,John Adams,/documents/inaugural-address-18
61,George Washington,/documents/inaugural-address-17


# Each President Speech

In [10]:
base_url = 'https://www.presidency.ucsb.edu/'


In [13]:
# Method to search

unifying_words = ["together", "unity", "united", "one nation", "one people", "common good", "shared purpose", "cooperation", "reconciliation", "harmony", "bridge divides", "come together", "join hands", "heal", "bond", "mutual respect", "compromise", "bipartisan", "consensus", "all americans", "fellow citizens", "national spirit", "collective effort", "civic spirit", "shared prosperity", "common destiny", "national renewal", "rebuilding trust", "strength in diversity", "enduring values", "our shared history", "bridging differences", "fostering dialogue", "shared responsibility", "working side by side", "democratic principles", "collective resilience", "moral courage", "upholding our traditions", "healing divisions", "inclusive leadership", "neighborly love", "faith in each other", "building for future generations", "honoring our past", "a more perfect union", "promise of America", "shared sacrifice", "rising together", "responsibility to one another", "mutual understanding", "forging a new path", "hand in hand", "strengthening our democracy", "joining forces", "seeking harmony", "one future", "unity of purpose", "guiding principles", "national stability", "rekindling hope"]

polarizing_words = ["silent majority", "real americans", "true patriots", "taking back", "saving", "making America great again", "ideological battle", "radical", "corrupt", "enemy", "betrayal", "stolen", "attack", "disgrace", "destroy", "defeat", "overthrow", "rigged", "illegitimate", "danger", "threat", "invasion", "catastrophe", "collapse", "disaster", "crime wave", "carnage", "poisoned", "crisis", "war on", "taking away", "crushing", "oppressors", "false narratives", "treasonous", "swamp", "deep state", "elite class", "failed policies", "tyranny", "deception", "conspiracy", "subversion", "internal sabotage", "recklessness", "failure", "incompetence", "betrayal of trust", "disgraceful actions", "puppet masters", "dark forces", "the great betrayal", "traitorous", "anti-American", "illegitimate rulers", "selling out our country", "globalist agenda", "failed leadership", "the great deception", "two-faced politicians", "hypocrites", "reckoning day", "stolen future", "collapsing system", "deep-rooted rot", "false leaders", "bureaucratic swamp", "anti-democratic forces", "dismantling our freedoms", "forced submission", "creeping tyranny", "puppet class", "those who seek to control us", "abandoned values", "destabilization", "surrendering sovereignty", "political fraud", "rotten to the core", "weak leadership", "orchestrated chaos", "national decline", "ceding our power", "hidden agenda", "unwavering resolve", "history is on our side", "holding the line", "a turning point", "staying vigilant", "rising tide", "the people's mandate", "course correction", "true defenders", "restoring what was lost", "holding power accountable", "demanding justice", "aliens", "criminals", "rapists"]


def get_unifying_words(text):
    total = 0
    for phrase in unifying_words:
        count = text.count(phrase)
        total += count
            
    return total

def get_polarizing_words(text):
    total = 0
    for phrase in polarizing_words:
        count = text.count(phrase)
        total += count
            
    return total

def get_total_word_count(text):
    total = 0
    words = text.strip().split(' ') # Split the text into words
    #print(words)
    for word in words:
        if word != '':
            total += 1
    return total



In [14]:
# This takes several seconds to run 
unifying_words_count = []
polarizing_words_count = []
total_word_count = []

for end in df['Link'].values.tolist():
    url = base_url + end
    # print(url)
    raw = requests.get(url,headers=header)
    bsObj = soup(raw.content,'html.parser') 
    
    speech = bsObj.find_all(class_="field-docs-content") 
    
    #replace all new lines and get rid of empty spaces and make sure every thing is lower case before putting it into a string
    speech_text = ("".join([s.get_text().replace('\n', ' ') for s in speech])).lower() 
    speech_text = speech_text.replace("(applause.)", "") # gets rid of applause pauses
    unifying_words_count.append(get_unifying_words(speech_text))
    polarizing_words_count.append(get_polarizing_words(speech_text))
    total_word_count.append(get_total_word_count(speech_text))
    #print(speech_text)
    #print(get_unifying_words(speech_text))
    #print(get_polarizing_words(speech_text))
    #df

In [15]:
df['Unifying Words Count'] = unifying_words_count 
df['Polarizing Words Count'] = polarizing_words_count 
df['Total Words Count']  = total_word_count 


In [16]:
df

Unnamed: 0,Name,Link,Unifying Words Count,Polarizing Words Count,Total Words Count
0,Donald J. Trump (2nd Term),/documents/inaugural-address-54,20,23,2905
1,"Joseph R. Biden, Jr.",/documents/inaugural-address-53,31,6,2532
2,Donald J. Trump (1st Term),/documents/inaugural-address-14,12,4,1455
3,Barack Obama,/documents/inaugural-address-15,15,5,2090
4,Barack Obama,/documents/inaugural-address-5,13,15,2391
...,...,...,...,...,...
58,Thomas Jefferson,/documents/inaugural-address-20,4,3,2157
59,Thomas Jefferson,/documents/inaugural-address-19,2,1,1717
60,John Adams,/documents/inaugural-address-18,4,7,2319
61,George Washington,/documents/inaugural-address-17,2,0,135


# Adding Additional Columns 

### Adding Political Party Affiliation

In [17]:
# Create dictionary with each president's party
party_mapping = {
    'Donald J. Trump (2nd Term)': 'Republican',
    'Joseph R. Biden, Jr.': 'Democrat',
    'Donald J. Trump (1st Term)': 'Republican',
    'Barack Obama': 'Democrat',
    'George W. Bush': 'Republican',
    'William J. Clinton': 'Democrat',
    'George Bush': 'Republican',
    'Ronald Reagan': 'Republican',
    'Jimmy Carter': 'Democrat',
    'Richard Nixon': 'Republican',
    'Lyndon B. Johnson': 'Democrat',
    'John F. Kennedy': 'Democrat',
    'Dwight D. Eisenhower': 'Republican',
    'Harry S Truman': 'Democrat',
    'Franklin D. Roosevelt': 'Democrat',
    'Herbert Hoover': 'Republican',
    'Calvin Coolidge': 'Republican',
    'Warren G. Harding': 'Republican',
    'Woodrow Wilson': 'Democrat',
    'William Howard Taft': 'Republican',
    'Theodore Roosevelt': 'Republican',
    'William McKinley': 'Republican',
    'Grover Cleveland': 'Democrat',
    'Benjamin Harrison': 'Republican',
    'Chester A. Arthur': 'Republican',
    'James A. Garfield': 'Republican',
    'Rutherford B. Hayes': 'Republican',
    'Ulysses S. Grant': 'Republican',
    'Andrew Johnson': 'Democrat (Union)',
    'Abraham Lincoln': 'Republican',
    'James Buchanan': 'Democrat',
    'Franklin Pierce': 'Democrat',
    'Zachary Taylor': 'Whig',
    'James K. Polk': 'Democrat',
    'John Tyler': 'Whig (later Unaffiliated)',
    'William Henry Harrison': 'Whig',
    'Martin van Buren': 'Democrat',
    'Andrew Jackson': 'Democrat',
    'John Quincy Adams': 'Democratic-Republican',
    'James Monroe': 'Democratic-Republican',
    'James Madison': 'Democratic-Republican',
    'Thomas Jefferson': 'Democratic-Republican',
    'John Adams': 'Federalist',
    'George Washington': 'No formal party'
}

# Map political party onto president 
df["Political Party"] = df["Name"].map(party_mapping)
df.head(3)

Unnamed: 0,Name,Link,Unifying Words Count,Polarizing Words Count,Total Words Count,Political Party
0,Donald J. Trump (2nd Term),/documents/inaugural-address-54,20,23,2905,Republican
1,"Joseph R. Biden, Jr.",/documents/inaugural-address-53,31,6,2532,Democrat
2,Donald J. Trump (1st Term),/documents/inaugural-address-14,12,4,1455,Republican


### Combining "Democrat" and "Democrat (Union)" values in Political Party 
These parties are essentially the same and combining them will simplify our analysis.

In [18]:
df["Political Party"] = df["Political Party"].replace("Democrat (Union)", "Democrat")

### Adding Overall Word Count 

In [19]:
df["Overall Language"] = df["Unifying Words Count"] > df["Polarizing Words Count"]
df["Overall Language"] = df["Overall Language"].replace({True: "unifying", False: "polarizing"})


df.head(3)


Unnamed: 0,Name,Link,Unifying Words Count,Polarizing Words Count,Total Words Count,Political Party,Overall Language
0,Donald J. Trump (2nd Term),/documents/inaugural-address-54,20,23,2905,Republican,polarizing
1,"Joseph R. Biden, Jr.",/documents/inaugural-address-53,31,6,2532,Democrat,unifying
2,Donald J. Trump (1st Term),/documents/inaugural-address-14,12,4,1455,Republican,unifying


### Adding Unifying/Polarizing Word Count Ratios

In [20]:
# Calculate ratios
df["Unifying Words Ratio"] = round(df["Unifying Words Count"] / df["Total Words Count"], 5)
df["Polarizing Words Ratio"] = round(df["Polarizing Words Count"] / df["Total Words Count"], 5)
df.head(3)

Unnamed: 0,Name,Link,Unifying Words Count,Polarizing Words Count,Total Words Count,Political Party,Overall Language,Unifying Words Ratio,Polarizing Words Ratio
0,Donald J. Trump (2nd Term),/documents/inaugural-address-54,20,23,2905,Republican,polarizing,0.00688,0.00792
1,"Joseph R. Biden, Jr.",/documents/inaugural-address-53,31,6,2532,Democrat,unifying,0.01224,0.00237
2,Donald J. Trump (1st Term),/documents/inaugural-address-14,12,4,1455,Republican,unifying,0.00825,0.00275


### Adding Overall Ratio 

Overall Ratio = Unifying Words Ratio / Polarizing Words Ratio 
So, a higher overall ratio means the president used more unifying language. 
Specifically, overall ratio > 1 means more unifying than polarizing language, and overall ratio <= 1 means more polarizing than unifying language. 
We calculate the overall ratio from the unifying and polarizing ratios rather than just dividing unifying or polarizing words by total words for each observation because this accounts for some presidents using more words in their speeches than others. 

In [22]:
df["Overall Ratio"] = round(df["Unifying Words Ratio"] / df["Polarizing Words Ratio"], 4)
df.head(3)


Unnamed: 0,Name,Link,Unifying Words Count,Polarizing Words Count,Total Words Count,Political Party,Overall Language,Unifying Words Ratio,Polarizing Words Ratio,Overall Ratio
0,Donald J. Trump (2nd Term),/documents/inaugural-address-54,20,23,2905,Republican,polarizing,0.00688,0.00792,0.8687
1,"Joseph R. Biden, Jr.",/documents/inaugural-address-53,31,6,2532,Democrat,unifying,0.01224,0.00237,5.1646
2,Donald J. Trump (1st Term),/documents/inaugural-address-14,12,4,1455,Republican,unifying,0.00825,0.00275,3.0


# Check for Missing Values

In [26]:
# Check for missing values 
df.isna().sum()

Name                      0
Link                      0
Unifying Words Count      0
Polarizing Words Count    0
Total Words Count         0
Political Party           0
Overall Language          0
Unifying Words Ratio      0
Polarizing Words Ratio    0
Overall Ratio             1
dtype: int64

In [25]:
# View row with missing value
missing_row = df["Overall Ratio"].isna()
df[missing_row]

Unnamed: 0,Name,Link,Unifying Words Count,Polarizing Words Count,Total Words Count,Political Party,Overall Language,Unifying Words Ratio,Polarizing Words Ratio,Overall Ratio
41,Andrew Johnson,/documents/address-upon-assuming-the-office-pr...,0,0,360,Democrat,polarizing,0.0,0.0,


In [27]:
# Overall Ratio is missing for this row because no polarizing or unifying words were found, so we'll replace this with 0. 
df["Overall Ratio"] = df["Overall Ratio"].fillna(0)

In [28]:
# Check that there are no more missing values
df.isna().sum()

Name                      0
Link                      0
Unifying Words Count      0
Polarizing Words Count    0
Total Words Count         0
Political Party           0
Overall Language          0
Unifying Words Ratio      0
Polarizing Words Ratio    0
Overall Ratio             0
dtype: int64

# Drop Columns 

In [29]:
# Drop Link column
df.drop(["Link"], axis=1, inplace=True)
df.head(3)

Unnamed: 0,Name,Unifying Words Count,Polarizing Words Count,Total Words Count,Political Party,Overall Language,Unifying Words Ratio,Polarizing Words Ratio,Overall Ratio
0,Donald J. Trump (2nd Term),20,23,2905,Republican,polarizing,0.00688,0.00792,0.8687
1,"Joseph R. Biden, Jr.",31,6,2532,Democrat,unifying,0.01224,0.00237,5.1646
2,Donald J. Trump (1st Term),12,4,1455,Republican,unifying,0.00825,0.00275,3.0


# Export Data 

In [30]:
df.to_csv("./speech_data.csv", index=False)