In [10]:
%pip install requests beautifulsoup4 pandas

Note: you may need to restart the kernel to use updated packages.


Context: This script initializes the Natural Language Processing (NLP) workflow within the 02b_cleaning_speeches.ipynb notebook. Its primary objective is to construct a text corpus comprising the nomination acceptance speeches of Democratic and Republican presidential candidates from 2000 to 2024.

In [11]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# ==========================================
# STEP 1: INITIALIZATION OF SCRAPING TARGETS
# ==========================================
# Define a list of dictionaries containing metadata and target URLs for presidential nomination acceptance speeches.
# Source: The American Presidency Project (UCSB).
urls_to_scrape = [
    {"year": 2024, "party": "Democrat", "candidate": "Harris", "url": "https://www.presidency.ucsb.edu/documents/address-accepting-the-democratic-presidential-nomination-chicago-illinois"},
    {"year": 2024, "party": "Republican", "candidate": "Trump", "url": "https://www.presidency.ucsb.edu/documents/address-accepting-the-presidential-nomination-the-republican-national-convention-milwaukee"},
    {"year": 2020, "party": "Democrat", "candidate": "Biden", "url": "https://www.presidency.ucsb.edu/documents/address-accepting-the-democratic-presidential-nomination-wilmington-delaware"},
    {"year": 2020, "party": "Republican", "candidate": "Trump", "url": "https://www.presidency.ucsb.edu/documents/address-accepting-the-republican-presidential-nomination-2"},
    {"year": 2016, "party": "Democrat", "candidate": "Clinton", "url": "https://www.presidency.ucsb.edu/documents/address-accepting-the-presidential-nomination-the-democratic-national-convention"},
    {"year": 2016, "party": "Republican", "candidate": "Trump", "url": "https://www.presidency.ucsb.edu/documents/address-accepting-the-presidential-nomination-the-republican-national-convention-cleveland"},
    {"year": 2012, "party": "Democrat", "candidate": "Obama", "url": "https://www.presidency.ucsb.edu/documents/remarks-accepting-the-presidential-nomination-the-democratic-national-convention-charlotte"},
    {"year": 2012, "party": "Republican", "candidate": "Romney", "url": "https://www.presidency.ucsb.edu/documents/address-accepting-the-presidential-nomination-the-republican-national-convention-tampa"},
    {"year": 2008, "party": "Democrat", "candidate": "Obama", "url": "https://www.presidency.ucsb.edu/documents/address-accepting-the-presidential-nomination-the-democratic-national-convention-denver"},
    {"year": 2008, "party": "Republican", "candidate": "McCaine", "url": "https://www.presidency.ucsb.edu/documents/address-accepting-the-presidential-nomination-the-republican-national-convention-saint"},
    {"year": 2004, "party": "Democrat", "candidate": "Kerry", "url": "https://www.presidency.ucsb.edu/documents/address-accepting-the-presidential-nomination-the-democratic-national-convention-boston"},
    {"year": 2004, "party": "Republican", "candidate": "Bush", "url": "https://www.presidency.ucsb.edu/documents/remarks-accepting-the-presidential-nomination-the-republican-national-convention-new-york"},
    {"year": 2000, "party": "Democrat", "candidate": "Gore", "url": "https://www.presidency.ucsb.edu/documents/address-accepting-the-presidential-nomination-the-democratic-national-convention-los"},
    {"year": 2000, "party": "Republican", "candidate": "Bush", "url": "https://www.presidency.ucsb.edu/documents/address-accepting-the-presidential-nomination-the-republican-national-convention-0"},
]

speech_data = []

# ==========================================
# STEP 2: WEB SCRAPING LOOP
# ==========================================
# Iterate through each candidate's URL to retrieve the raw HTML content.
for item in urls_to_scrape:
    print(f"Retrieving data for: {item['candidate']} ({item['year']})...")
    
    try:
        # Send an HTTP GET request to the target URL.
        response = requests.get(item['url'])
        
        # Validate the HTTP status code (200 OK) before proceeding.
        if response.status_code == 200:
            # Parse the HTML content using BeautifulSoup.
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # DOM Traversal: Locate the specific 'div' container holding the speech transcript.
            # The class 'field-docs-content' is specific to the structure of The American Presidency Project website.
            content_div = soup.find('div', class_='field-docs-content')
            
            if content_div:
                # Text Extraction: Retrieve text content while stripping HTML tags and excess whitespace.
                text = content_div.get_text(separator=' ', strip=True)
                
                # Append the structured data to the list.
                speech_data.append({
                    "year": item['year'],
                    "party": item['party'],
                    "candidate": item['candidate'],
                    "text": text
                })
            else:
                print(f"ERROR: No text content found for {item['candidate']}")
        else:
            print(f"ERROR: Broken link or server error for {item['candidate']}")
            
        # Rate Limiting: Pause execution for 1 second to respect the server's request policies.
        time.sleep(1)
        
    except Exception as e:
        print(f"Critical Error: {e}")

# ==========================================
# STEP 3: DATAFRAME CREATION
# ==========================================
# Convert the list of dictionaries into a pandas DataFrame for analysis.
df_speeches = pd.DataFrame(speech_data)

# Validation: Display the first few rows to verify data integrity.
print("Scraping complete.")
df_speeches.head()

Retrieving data for: Harris (2024)...
Retrieving data for: Trump (2024)...
Retrieving data for: Biden (2020)...
Retrieving data for: Trump (2020)...
Retrieving data for: Clinton (2016)...
Retrieving data for: Trump (2016)...
Retrieving data for: Obama (2012)...
Retrieving data for: Romney (2012)...
Retrieving data for: Obama (2008)...
Retrieving data for: McCaine (2008)...
Retrieving data for: Kerry (2004)...
Retrieving data for: Bush (2004)...
Retrieving data for: Gore (2000)...
Retrieving data for: Bush (2000)...
Scraping complete.


Unnamed: 0,year,party,candidate,text
0,2024,Democrat,Harris,The Vice President: Good evening! [ Laughs. ] ...
1,2024,Republican,Trump,"Thank you very much. Thank you very, very much..."
2,2020,Democrat,Biden,"Good evening. Ella Baker, a giant of the civil..."
3,2020,Republican,Trump,Thank you very much. Thank you very much. Than...
4,2016,Democrat,Clinton,"Thank you all very, very much! Thank you for t..."


Context: This segment of the 02b_cleaning_speeches.ipynb notebook implements the text preprocessing phase, which is fundamental to ensuring the validity of subsequent Natural Language Processing (NLP) tasks.

In [12]:
import re

def super_cleaning(text):
    """
    Performs rigorous text preprocessing to isolate the spoken content 
    from transcript metadata and non-verbal annotations.
    """
    # 1. Remove non-verbal annotations enclosed in square brackets 
    # (e.g., [Applause], [Laughter]) using regex.
    text = re.sub(r'\[.*?\]', '', text)
    
    # 2. Remove content enclosed in parentheses.
    # This often includes editor notes or additional non-verbal cues.
    text = re.sub(r'\(.*?\)', '', text)
    
    # 3. Remove speaker attribution prefixes if present at the start.
    # Checks the first 50 characters for a colon (e.g., "The Vice President: ...").
    if ":" in text[:50]: 
        text = text.split(":", 1)[1]
    
    # 4. Whitespace Normalization:
    # Replace newline characters with spaces to ensure continuity.
    text = text.replace('\n', ' ')
    # Collapse multiple whitespace characters into a single space and trim leading/trailing spaces.
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

# Apply the cleaning function to the raw text column to generate a standardized dataset.
df_speeches['clean_text'] = df_speeches['text'].apply(super_cleaning)

# Validation: Compare a sample of the raw vs. cleaned text (first 150 characters).
print("RAW   :", df_speeches['text'].iloc[0][:150])
print("CLEAN :", df_speeches['clean_text'].iloc[0][:150])

# Export the processed corpus to a CSV file for NLP analysis.
df_speeches.to_csv("president_speeches_clean.csv", index=False)
print("\n‚úÖ File 'president_speeches_clean.csv' successfully saved!")

RAW   : The Vice President: Good evening! [ Laughs. ] [ Applause. ] Audience: Kamala! Kamala! Kamala! The Vice President: California. [ Laughs. ] [ Applause. 
CLEAN : Good evening! Audience: Kamala! Kamala! Kamala! The Vice President: California. Good evening, everyone. Good evening. Good evening. Oh, my goodness. G

‚úÖ File 'president_speeches_clean.csv' successfully saved!


Context: This code block represents a refinement stage within 02b_cleaning_speeches.ipynb. Having identified that the initial cleaning pass left residual artifacts (such as "Audience:" or moderator interventions), this script applies a more rigorous cleaning protocol to the president_speeches_clean.csv dataset.

In [14]:
import pandas as pd
import re
from pathlib import Path
# ==========================================
# STEP 1: LOAD INTERMEDIATE DATA
# ==========================================

# 1. Configuration des chemins
current_dir = Path.cwd()
PROJECT_ROOT = current_dir.parent
DATA_DIR = PROJECT_ROOT / 'data'
PROCESSED_DIR = DATA_DIR / 'processed'

# 2. D√©finir le chemin exact du fichier √† charger
input_filename = "president_speeches_clean.csv"
load_path = PROCESSED_DIR / input_filename

# 3. Charger le dataset
# On v√©rifie d'abord que le fichier existe pour √©viter le crash brutal
if load_path.exists():
    df = pd.read_csv(load_path)
    print(f"‚úÖ Fichier charg√© depuis : {load_path}")
    print(f"Dimensions : {df.shape}")
else:
    print(f"‚ùå ERREUR : Le fichier est introuvable ici : {load_path}")
    print("Avez-vous bien ex√©cut√© l'√©tape d'export pr√©c√©dente ?")

# ==========================================
# STEP 2: DEFINE ADVANCED CLEANING FUNCTION
# ==========================================
def ultimate_cleaning(text):
    """
    Performs a secondary, granular cleaning pass to remove specific speaker 
    attributions and residual transcriptional artifacts.
    """
    # Validation: Ensure the input is a string; otherwise return an empty string.
    if not isinstance(text, str): return ""
    
    # Remove content enclosed in brackets [] and parentheses ().
    # This targets non-verbal annotations such as applause or laughter.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'\(.*?\)', '', text)
    
    # Remove specific speaker prefixes (e.g., "Audience:", "The Vice President:").
    # This regex removes phrases resembling "Title:" at the beginning of a segment
    # to ensure the text strictly reflects the candidate's speech.
    text = re.sub(r'\b(Audience|The Vice President|The President|Hon\.|Mr\.|Ms\.|Mrs\.)\s*:', '', text, flags=re.IGNORECASE)
    
    # Standardize whitespace:
    # Replace newlines with spaces and collapse multiple spaces into a single instance.
    text = text.replace('\n', ' ')
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

# ==========================================
# STEP 3: APPLY CLEANING
# ==========================================
print("Cleaning process initiated...")
df['clean_text'] = df['text'].apply(ultimate_cleaning)

# ==========================================
# STEP 4: FEATURE SELECTION
# ==========================================
# Select only the relevant metadata and the cleaned text column.
# The raw text is discarded to optimize the dataset for NLP analysis.
df_final = df[['year', 'party', 'candidate', 'clean_text']].copy()

# Rename 'clean_text' to 'text' to standardize the column schema.
df_final.rename(columns={'clean_text': 'text'}, inplace=True)


# ==========================================
# STEP 5: SAVE FINAL DATASET (Speeches)
# ==========================================

# 1. Configuration des chemins
current_dir = Path.cwd()
PROJECT_ROOT = current_dir.parent
DATA_DIR = PROJECT_ROOT / 'data'
PROCESSED_DIR = DATA_DIR / 'processed'

# 2. S√©curit√© : Cr√©ation du dossier s'il n'existe pas
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

# 3. D√©finition du chemin complet
output_filename = "president_speeches_clean.csv"
save_path_speeches = PROCESSED_DIR / output_filename



print(f"File '{output_filename}' successfully updated.")
print(f"üìç Saved to: {save_path_speeches}")
print("The dataset now contains only the cleaned text version.")

print("\nPreview (first 100 characters):")
# J'ai gard√© votre slice [:100] qui est plus informative que 50
print(df_final['text'].iloc[0][:100])

‚úÖ Fichier charg√© depuis : /Users/jessicabourdouxhe/Desktop/Master 1/Data/Projet /elections-nlp-project/data/processed/president_speeches_clean.csv
Dimensions : (14, 4)
Cleaning process initiated...
File 'president_speeches_clean.csv' successfully updated.
üìç Saved to: /Users/jessicabourdouxhe/Desktop/Master 1/Data/Projet /elections-nlp-project/data/processed/president_speeches_clean.csv
The dataset now contains only the cleaned text version.

Preview (first 100 characters):
Good evening! Kamala! Kamala! Kamala! California. Good evening, everyone. Good evening. Good evening
