In [None]:
import os
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm  

SAVE_PATH= "fed_speeches.csv"

if os.path.exists(SAVE_PATH):
    all_years = pd.read_csv(SAVE_PATH)
    print("Fichier existant trouv√©, reprise du scraping...")
else:
    all_years = pd.DataFrame(columns=['date', 'title', 'speaker', 'text'])

# Ann√©es √† scraper
years = range(2020, 2025)
dataframes = []

def get_speech_text(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        content_divs = soup.select("div.col-xs-12.col-sm-8.col-md-8")

        if not content_divs:
            return "N/A"

        for div in content_divs:
            if "heading" not in div.get("class", []): 
                paragraphs = div.find_all("p")
                text = "\n".join([p.get_text(strip=True) for p in paragraphs])
                return text if text else "N/A"

    except requests.RequestException as e:
        print(f"Erreur r√©seau pour {url}: {e}")
    except Exception as e:
        print(f"Erreur lors de l'extraction du texte pour {url}: {e}")

    return "N/A"

for year in tqdm(years, desc="Scraping des discours"):
    try:
        url = f'https://www.federalreserve.gov/newsevents/speech/{year}-speeches.htm'
        print(f"üì° Scraping de {url}")

        response = requests.get(url, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        events = soup.select(".eventlist__event")
        print(f"üìä Nombre de discours trouv√©s en {year} : {len(events)}")

        data = []
        for speech in events:
            try:
                link = speech.find('a', href=True)

                # V√©rification du lien
                if not link:
                    print("Pas de lien trouv√©, passage au discours suivant.")
                    continue

                speech_page = 'https://www.federalreserve.gov' + link['href']
                print(f"üîó Lien trouv√© : {speech_page}") 
                
                response_speech = requests.get(speech_page, timeout=10)
                response_speech.raise_for_status()

                soup_speech = BeautifulSoup(response_speech.text, 'html.parser')

                # Extraction des √©l√©ments
                date = soup_speech.select_one("p.article__time")
                title = soup_speech.select_one("h3.title")
                speaker = soup_speech.select_one("p.speaker")

                date = date.get_text(strip=True) if date else "N/A"
                title = title.get_text(strip=True) if title else "N/A"
                speaker = speaker.get_text(strip=True) if speaker else "N/A"

                speech_text = get_speech_text(speech_page)

                data.append({
                    'date': date,
                    'title': title,
                    'speaker': speaker,
                    'text': speech_text
                })
            except Exception as e:
                print(f"Erreur d'extraction d'un discours en {year}: {e}")

        if data:
            speeches_one_year = pd.DataFrame(data)
            dataframes.append(speeches_one_year)

        time.sleep(2)

    except requests.RequestException as e:
        print(f"Erreur r√©seau pour {year}: {e}")
    except Exception as e:
        print(f"Erreur g√©n√©rale pour {year}: {e}")

# Concat√©nation et suppression des doublons
if dataframes:
    all_years = pd.concat([all_years] + dataframes, ignore_index=True).drop_duplicates()

# Sauvegarde dans un fichier CSV
all_years.to_csv(SAVE_PATH, index=False)
print("‚úÖ Scraping termin√© et sauvegard√© !")

print(all_years.head())

Scraping des discours:   0%|          | 0/5 [00:00<?, ?it/s]

üì° Scraping de https://www.federalreserve.gov/newsevents/speech/2020-speeches.htm
üìä Nombre de discours trouv√©s en 2020 : 53
üîó Lien trouv√© : https://www.federalreserve.gov/newsevents/speech/brainard20201218a.htm
üîó Lien trouv√© : https://www.federalreserve.gov/newsevents/speech/brainard20201217a.htm
üîó Lien trouv√© : https://www.federalreserve.gov/newsevents/speech/quarles20201211a.htm
üîó Lien trouv√© : https://www.federalreserve.gov/newsevents/speech/bowman20201204a.htm
üîó Lien trouv√© : https://www.federalreserve.gov/newsevents/speech/brainard20201201a.htm
üîó Lien trouv√© : https://www.federalreserve.gov/newsevents/speech/bowman20201119a.htm
üîó Lien trouv√© : https://www.federalreserve.gov/newsevents/speech/brainard20201117a.htm
üîó Lien trouv√© : https://www.federalreserve.gov/newsevents/speech/clarida20201116a.htm
üîó Lien trouv√© : https://www.federalreserve.gov/newsevents/speech/brainard20201110a.htm
üîó Lien trouv√© : https://www.federalreserve.gov/newsev

Scraping des discours:  20%|‚ñà‚ñà        | 1/5 [00:58<03:52, 58.18s/it]

üì° Scraping de https://www.federalreserve.gov/newsevents/speech/2021-speeches.htm
üìä Nombre de discours trouv√©s en 2021 : 68
üîó Lien trouv√© : https://www.federalreserve.gov/newsevents/speech/waller20211217a.htm
üîó Lien trouv√© : https://www.federalreserve.gov/newsevents/speech/quarles20211202a.htm
üîó Lien trouv√© : https://www.federalreserve.gov/newsevents/speech/clarida20211130a.htm
üîó Lien trouv√© : https://www.federalreserve.gov/newsevents/speech/bowman20211129a.htm
üîó Lien trouv√© : https://www.federalreserve.gov/newsevents/speech/powell20211129a.htm
üîó Lien trouv√© : https://www.federalreserve.gov/newsevents/speech/clarida20211119a.htm
üîó Lien trouv√© : https://www.federalreserve.gov/newsevents/speech/waller20211119a.htm
üîó Lien trouv√© : https://www.federalreserve.gov/newsevents/speech/waller20211117a.htm
üîó Lien trouv√© : https://www.federalreserve.gov/newsevents/speech/powell20211109a.htm
üîó Lien trouv√© : https://www.federalreserve.gov/newsevents/spee

Scraping des discours:  40%|‚ñà‚ñà‚ñà‚ñà      | 2/5 [02:17<03:31, 70.38s/it]

üì° Scraping de https://www.federalreserve.gov/newsevents/speech/2022-speeches.htm
üìä Nombre de discours trouv√©s en 2022 : 46
üîó Lien trouv√© : https://www.federalreserve.gov/newsevents/speech/barr20221201a.htm
üîó Lien trouv√© : https://www.federalreserve.gov/newsevents/speech/powell20221130a.htm
üîó Lien trouv√© : https://www.federalreserve.gov/newsevents/speech/cook20221130a.htm
üîó Lien trouv√© : https://www.federalreserve.gov/newsevents/speech/brainard20221128a.htm
üîó Lien trouv√© : https://www.federalreserve.gov/newsevents/speech/jefferson20221117a.htm
üîó Lien trouv√© : https://www.federalreserve.gov/newsevents/speech/bowman20221117a.htm
üîó Lien trouv√© : https://www.federalreserve.gov/newsevents/speech/waller20221116a.htm
üîó Lien trouv√© : https://www.federalreserve.gov/newsevents/speech/bowman20221020a.htm
üîó Lien trouv√© : https://www.federalreserve.gov/newsevents/speech/waller20221014a.htm
üîó Lien trouv√© : https://www.federalreserve.gov/newsevents/speech

Scraping des discours:  60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 3/5 [02:43<01:40, 50.46s/it]

üì° Scraping de https://www.federalreserve.gov/newsevents/speech/2023-speeches.htm
üìä Nombre de discours trouv√©s en 2023 : 95
üîó Lien trouv√© : https://www.federalreserve.gov/newsevents/speech/bowman20231205a.htm
üîó Lien trouv√© : https://www.federalreserve.gov/newsevents/speech/powell20231201a.htm
üîó Lien trouv√© : https://www.federalreserve.gov/newsevents/speech/barr20231201a.htm
üîó Lien trouv√© : https://www.federalreserve.gov/newsevents/speech/barr20231128a.htm
üîó Lien trouv√© : https://www.federalreserve.gov/newsevents/speech/bowman20231128a.htm
üîó Lien trouv√© : https://www.federalreserve.gov/newsevents/speech/waller20231128a.htm
üîó Lien trouv√© : https://www.federalreserve.gov/newsevents/speech/cook20231116a.htm
üîó Lien trouv√© : https://www.federalreserve.gov/newsevents/speech/barr20231116a.htm
üîó Lien trouv√© : https://www.federalreserve.gov/newsevents/speech/jefferson20231114a.htm
üîó Lien trouv√© : https://www.federalreserve.gov/newsevents/speech/cook2

Scraping des discours:  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 4/5 [03:38<00:52, 52.30s/it]

üì° Scraping de https://www.federalreserve.gov/newsevents/speech/2024-speeches.htm
üìä Nombre de discours trouv√©s en 2024 : 105
üîó Lien trouv√© : https://www.federalreserve.gov/newsevents/speech/kugler20241203a.htm
üîó Lien trouv√© : https://www.federalreserve.gov/newsevents/speech/waller20241202a.htm
üîó Lien trouv√© : https://www.federalreserve.gov/newsevents/speech/bowman20241122a.htm
üîó Lien trouv√© : https://www.federalreserve.gov/newsevents/speech/bowman20241120a.htm
üîó Lien trouv√© : https://www.federalreserve.gov/newsevents/speech/cook20241120a.htm
üîó Lien trouv√© : https://www.federalreserve.gov/newsevents/speech/powell20241114a.htm
üîó Lien trouv√© : https://www.federalreserve.gov/newsevents/speech/kugler20241114a.htm
üîó Lien trouv√© : https://www.federalreserve.gov/newsevents/speech/waller20241112a.htm
üîó Lien trouv√© : https://www.federalreserve.gov/newsevents/speech/bowman20241023a.htm
üîó Lien trouv√© : https://www.federalreserve.gov/newsevents/speech/w

Scraping des discours: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [04:39<00:00, 55.84s/it]


‚úÖ Scraping termin√© et sauvegard√© !
                date                                              title  \
0  December 18, 2020  Strengthening the Financial System to Meet the...   
1  December 17, 2020  Modernizing and Strengthening CRA Regulations:...   
2  December 11, 2020  The Eye of Providence: Thoughts on the Evoluti...   
3  December 04, 2020  Technology and the Regulatory Agenda for Commu...   
4  December 01, 2020  Modernizing and Strengthening CRA Regulations:...   

                                        speaker  \
0                        Governor Lael Brainard   
1                        Governor Lael Brainard   
2  Vice Chair for Supervision Randal K. Quarles   
3                   Governor Michelle W. Bowman   
4                        Governor Lael Brainard   

                                                text  
0  I want to thank the Center for American Progre...  
1  Good afternoon and thank you for inviting me t...  
2  Torrential thanks to our partners H

## Concat√©nation des donn√©es

In [1]:
import pandas as pd

In [12]:
path_1996_2020 = r"C:\Users\Lia\OneDrive - Universit√© Paris 1 Panth√©on-Sorbonne\Bureau\MASTER 2\S2\NLP\DATA\fed_speeches_1996_2020.csv"
df_1996_2020 = pd.read_csv(path_1996_2020)

In [23]:
df_1996_2020.shape

(1456, 9)

In [13]:
path_2020_2024 = r"C:\Users\Lia\OneDrive - Universit√© Paris 1 Panth√©on-Sorbonne\Bureau\MASTER 2\S2\NLP\DATA\fed_speeches.csv"
df_2020_2024 = pd.read_csv(path_2020_2024)

In [24]:
df_2020_2024.shape

(367, 4)

In [None]:
mask_valid_dates = df_1996_2020['date'].notna()

# 2. On convertit uniquement celles-ci (et laisse les autres √† NaT apr√®s parsing)
df_1996_2020.loc[mask_valid_dates, 'date'] = pd.to_datetime(
    df_1996_2020.loc[mask_valid_dates, 'date'].round().astype(int).astype(str),
    format='%Y%m%d',
    errors='coerce'
)

['1996-12-19 00:00:00', '1996-12-06 00:00:00', '1996-12-05 00:00:00',
 '1996-12-03 00:00:00', '1996-11-25 00:00:00', '1996-11-21 00:00:00',
 '1996-11-18 00:00:00', '1996-10-31 00:00:00', '1996-10-24 00:00:00',
 '1996-10-16 00:00:00',
 ...
 '2020-02-21 00:00:00', '2020-02-21 00:00:00', '2020-02-11 00:00:00',
 '2020-02-10 00:00:00', '2020-02-06 00:00:00', '2020-02-05 00:00:00',
 '2020-01-17 00:00:00', '2020-01-16 00:00:00', '2020-01-09 00:00:00',
 '2020-01-08 00:00:00']
Length: 1455, dtype: datetime64[ns]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df_1996_2020.loc[mask_valid_dates, 'date'] = pd.to_datetime(


In [16]:
# Pour df_2020_2024 : convertir les dates au format datetime depuis texte (ex: "December 18, 2020")
df_2020_2024['date'] = pd.to_datetime(df_2020_2024['date'], format='%B %d, %Y', errors='coerce')

In [19]:
# S√©lection des colonnes communes
df_1996_2020_clean = df_1996_2020[['date', 'title', 'speaker', 'text']]
df_2020_2024_clean = df_2020_2024[['date', 'title', 'speaker', 'text']]

# Concat√©nation
df_all = pd.concat([df_1996_2020_clean, df_2020_2024_clean], ignore_index=True)

# Tri par date (optionnel mais pratique)
df_all = df_all.sort_values(by='date').reset_index(drop=True)

In [None]:
output_path = r"C:\Users\Lia\OneDrive - Universit√© Paris 1 Panth√©on-Sorbonne\Bureau\MASTER 2\S2\NLP\DATA\fed_speeches_1996_2024.csv"
df_all.to_csv(output_path, index=False, encoding='utf-8')