## Collecting new dataset (speeches by Donald Trump and Joe Biden)
### Method: Miller Center API

In [1]:
# Import necessary libraries
import pandas as pd
import requests

### Collecting Joe Biden's speech transcripts

In [2]:
# Request the first page of Joe Biden's speeches from the Miller Center API.
# The endpoint is queried with POST and answers with JSON.
endpoint = "https://api.millercenter.org/speeches?filter_president=Joe+Biden"
r = requests.post(url=endpoint, timeout=30)  # a timeout keeps a stalled connection from hanging the kernel
r.raise_for_status()  # surface HTTP errors here instead of failing later on a missing 'Items' key
data = r.json()
items = data['Items']

# A 'LastEvaluatedKey' in the response means more items remain. Its 'president'
# and 'doc_name' values are sent back as the 'continue_president' and
# 'continue_doc_name' parameters to fetch the next page, until the key is absent.
while 'LastEvaluatedKey' in data:
    last_key = data['LastEvaluatedKey']
    r = requests.post(
        url=endpoint,
        params={
            "continue_president": last_key['president'],
            "continue_doc_name": last_key['doc_name'],
        },
        timeout=30,
    )
    r.raise_for_status()
    data = r.json()
    items += data['Items']

# Keep only the title, president, date, and transcript of each speech item.
speeches = [
    {
        'Title': item['title'],
        'President': item['president'],
        'Date': item['date'],
        'Transcript': item['transcript'],
    }
    for item in items
]

# Build the DataFrame; naming the columns explicitly keeps them present even
# when the API returned no items, so the following cells still find 'Date'.
df = pd.DataFrame(speeches, columns=['Title', 'President', 'Date', 'Transcript'])

In [4]:
# Standardize the dates collected from the API.
# `parse()` from the dateutil library converts each date string into a
# datetime object, and `strftime('%Y-%m-%d')` then renders it as a four-digit
# year, two-digit month, and two-digit day.

from dateutil.parser import parse

df['Date'] = df['Date'].map(lambda raw_date: parse(raw_date).strftime('%Y-%m-%d'))

In [6]:
# Write the Biden speeches to disk; index=False leaves the row index out of the CSV.
biden_csv_name = 'Biden_speeches.csv'
df.to_csv(biden_csv_name, index=False)

### Collecting Donald Trump's speech transcripts

In [7]:
# Request the first page of Donald Trump's speeches from the Miller Center API.
# The endpoint is queried with POST and answers with JSON.
endpoint = "https://api.millercenter.org/speeches?filter_president=Donald+Trump"
r = requests.post(url=endpoint, timeout=30)  # a timeout keeps a stalled connection from hanging the kernel
r.raise_for_status()  # surface HTTP errors here instead of failing later on a missing 'Items' key
data = r.json()
items = data['Items']

# A 'LastEvaluatedKey' in the response means more items remain. Its 'president'
# and 'doc_name' values are sent back as the 'continue_president' and
# 'continue_doc_name' parameters to fetch the next page, until the key is absent.
while 'LastEvaluatedKey' in data:
    last_key = data['LastEvaluatedKey']
    r = requests.post(
        url=endpoint,
        params={
            "continue_president": last_key['president'],
            "continue_doc_name": last_key['doc_name'],
        },
        timeout=30,
    )
    r.raise_for_status()
    data = r.json()
    items += data['Items']

# Keep only the title, president, date, and transcript of each speech item.
speeches = [
    {
        'Title': item['title'],
        'President': item['president'],
        'Date': item['date'],
        'Transcript': item['transcript'],
    }
    for item in items
]

# Build the DataFrame; naming the columns explicitly keeps them present even
# when the API returned no items, so the following cells still find 'Date'.
df = pd.DataFrame(speeches, columns=['Title', 'President', 'Date', 'Transcript'])

In [9]:
# Rewrite every date string as YYYY-MM-DD (parse the string, then format it).
df['Date'] = df['Date'].map(lambda date_text: parse(date_text).strftime('%Y-%m-%d'))

In [10]:
# Write the Trump speeches to disk; index=False leaves the row index out of the CSV.
trump_csv_name = 'Trump_speeches.csv'
df.to_csv(trump_csv_name, index=False)

## Combining pre-existing dataset and new datasets

In [26]:
# Load the pre-existing dataset (data0) and the two datasets saved above
# (data1: Biden, data2: Trump), in that order.
data0, data1, data2 = (
    pd.read_csv(csv_name)
    for csv_name in (
        'presidential_speeches.csv',
        'Biden_speeches.csv',
        'Trump_speeches.csv',
    )
)

In [27]:
# Formatting the new datasets: rename their columns with one shared mapping.
# Keys that are not present in a frame are ignored by `rename`; the CSVs saved
# above already use 'Transcript', so the 'Speech' entry has no effect on them.
column_renames = {'Title': 'Speech Title', 'Speech': 'Transcript'}
data1 = data1.rename(columns=column_renames)
data2 = data2.rename(columns=column_renames)

In [28]:
# Tag every row of the Biden dataset with its party.
data1 = data1.assign(Party="Democratic")

In [29]:
# Tag every row of the Trump dataset with its party.
data2 = data2.assign(Party="Republican")

In [30]:
# Remove the 'Summary' and 'URL' columns from the pre-existing dataset.
data0 = data0.drop(columns=['Summary', 'URL'])

In [31]:
# Keep only rows of the pre-existing dataset whose President is not
# 'Donald Trump'. Presumably this avoids duplicating the Trump speeches that
# are added from data2 — TODO confirm.
# NOTE(review): 'Joe Biden' rows are not filtered the same way; confirm
# presidential_speeches.csv contains none.
not_trump = data0['President'] != 'Donald Trump'
data0 = data0[not_trump]

In [32]:
# Stack the three datasets into one frame with a fresh 0..n-1 index.
frames_to_combine = [data0, data1, data2]
combined_data = pd.concat(frames_to_combine, ignore_index=True)

In [34]:
# Drop rows that exactly repeat an earlier row (the first occurrence is kept).
is_repeat = combined_data.duplicated()
combined_data = combined_data[~is_repeat]

In [43]:
# Order the combined dataset by the 'Date' column, largest value first.
sorted_data = combined_data.sort_values(by='Date', ascending=False)

In [55]:
# cleaning the transcripts
import re
def clean_text(text):
    """Normalize a transcript into a single line with single spaces.

    Parameters
    ----------
    text : str or null-like
        The raw transcript. Anything ``pd.isnull`` reports as null (NaN, None)
        is treated as a missing transcript.

    Returns
    -------
    str
        An empty string for a missing transcript; otherwise the text with
        every run of whitespace (spaces, tabs, newlines, carriage returns)
        collapsed to one space and leading/trailing whitespace removed.
    """
    if pd.isnull(text):  # Check if value is NaN
        return ''  # Convert NaN to empty string

    # Line breaks must turn into a space rather than be deleted: deleting them
    # fuses the last word of one line to the first word of the next
    # ("end.\nNext" -> "end.Next"). `\s` already matches '\n' and '\r', so one
    # substitution handles line breaks and repeated spaces together.
    return re.sub(r'\s+', ' ', text).strip()

# Run clean_text over every transcript and store the result back in the column.
sorted_data['Transcript'] = sorted_data['Transcript'].map(clean_text)

In [57]:
# Save the combined, date-sorted dataset as a CSV file (row index omitted).
combined_csv_name = 'combine_president_speeches.csv'
sorted_data.to_csv(combined_csv_name, index=False)

In [58]:
# Display the combined dataset (sorted by 'Date', descending) as the cell output.
sorted_data

Unnamed: 0,Date,President,Party,Speech Title,Transcript
981,2024-01-05,Joe Biden,Democratic,"January 5, 2024: Speech on the Third Anniversa...","THE PRESIDENT: Thank you, thank you, thank you..."
992,2023-10-20,Joe Biden,Democratic,"October 20, 2023: Remarks on the US Response i...","Good evening, my fellow Americans. We’re facin..."
977,2023-02-22,Joe Biden,Democratic,"February 21, 2023: Remarks on the One-Year Ann...","THE PRESIDENT: Hello, Poland! One of our great..."
979,2023-02-08,Joe Biden,Democratic,"February 7, 2023: State of the Union Address",Mr. Speaker. Madam Vice President. Our First L...
994,2022-09-21,Joe Biden,Democratic,"September 21, 2022: Speech before the 77th Ses...","Thank you. Mr. President, Mr. Secretary-Genera..."
...,...,...,...,...,...
4,1790-12-29,George Washington,Unaffiliated,Talk to the Chiefs and Counselors of the Senec...,"I the President of the United States, by my ow..."
3,1790-12-08,George Washington,Unaffiliated,Second Annual Message to Congress,Fellow citizens of the Senate and House of Rep...
2,1790-01-08,George Washington,Unaffiliated,First Annual Message to Congress,Fellow Citizens of the Senate and House of Rep...
1,1789-10-03,George Washington,Unaffiliated,Thanksgiving Proclamation,Whereas it is the duty of all Nations to ackno...


## Grouping transcripts by presidents and creating the TXT files.

In [59]:
import os

# Group the combined, cleaned dataset by president. `sorted_data` is used
# rather than `df`: `df` was last assigned in the Donald Trump collection
# cells, so grouping it would only ever produce a single president's file.
grouped_data = sorted_data.groupby("President")

# Create the "corpus" folder if it doesn't exist
folder_name = "corpus"
os.makedirs(folder_name, exist_ok=True)

for president, group in grouped_data:
    # Concatenate the transcript values for each president
    concatenated_text = " ".join(group["Transcript"].values)

    # Write the concatenated text into a text file with the president's name
    file_name = f"{president}.txt"
    file_path = os.path.join(folder_name, file_name)
    # Explicit UTF-8: the transcripts contain non-ASCII characters (e.g. ’),
    # which a platform-default encoding may fail to write.
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(concatenated_text)