In [2]:
# imports
# data collection
import sys
import time
from datetime import datetime
from urllib.parse import urljoin

import feedparser
import newspaper
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
# FED Data Collection
# Walks the Federal Reserve press-release RSS feed, downloads each linked page
# with newspaper3k and keeps the entries for which article text was extracted.

# Set a higher recursion limit for newspaper3k when scraping many articles.
# This is intended to prevent recursion errors on complex web pages.
sys.setrecursionlimit(2000)

rss_url = "https://www.federalreserve.gov/feeds/press_all.xml"
max_articles = 50  # Limit the number of articles to scrape for testing

feed = feedparser.parse(rss_url)
articles_data = []

# Iterate through entries, limiting the number of articles for a manageable run
for i, entry in enumerate(feed.entries):
    if i >= max_articles:
        print(f"Reached max_articles limit of {max_articles}.")
        break

    # Resolved up front so the error message below can never fail on a
    # malformed entry that lacks a link.
    link = getattr(entry, 'link', 'N/A')

    try:
        # 1. Extract link and basic metadata from the RSS entry
        link = entry.link
        title = entry.title

        # feedparser exposes published_parsed as a UTC time tuple. Build the
        # datetime directly from it: routing it through time.mktime() would
        # reinterpret it as local time and can shift the value under DST.
        published_date = datetime(*entry.published_parsed[:6]).isoformat()

        # 2. Use newspaper3k to get the full article text
        article = newspaper.Article(link)
        article.download()
        article.parse()

        # Only record (and report success for) articles whose text was
        # actually extracted.
        if article.text:
            articles_data.append({
                'source': 'FED',
                'title': title,
                'link': link,
                'date': published_date,
                'full_text': article.text
            })
            print(f"[{i+1}/{max_articles}] Successfully scraped: {title[:70]}...")
        else:
            print(f"[{i+1}/{max_articles}] No text extracted from {link}. Skipping.")

    except Exception as e:
        # Skip if an article link is broken or parsing fails
        print(f"[{i+1}/{max_articles}] Error scraping article at {link}: {e}. Skipping.")

# Passing the column list keeps the frame's schema stable even when nothing
# was scraped, so the column selection below cannot raise KeyError.
fed_df = pd.DataFrame(
    articles_data,
    columns=['source', 'title', 'link', 'date', 'full_text'],
)
print(fed_df[['title', 'date', 'full_text']].head())

In [None]:
# ECB Data Collection
# Raise the interpreter's recursion limit to 2000; intended to cope with
# deeply nested HTML structures while scraping.
sys.setrecursionlimit(2000)

# URL of the ECB combined RSS feed (press releases, speeches, etc.).
ECB_COMBINED_RSS = "https://www.ecb.europa.eu/rss/press.html"

def scrape_full_articles_from_ecb(rss_url, max_articles=20, request_delay=0.5):
    """
    Scrape the full text of articles listed in the ECB combined RSS feed.

    The feed is parsed with feedparser; PDF links and Governing Council
    decision notices (links containing ``/govcdec/``) are skipped, and every
    remaining link is downloaded and parsed with newspaper3k.

    Args:
        rss_url: URL of the RSS feed to parse.
        max_articles: Maximum number of feed entries to examine. Entries that
            are filtered out or fail to scrape still count toward this limit.
        request_delay: Seconds to pause after each download. Values <= 0
            disable the pause.

    Returns:
        pandas.DataFrame with the columns ``source``, ``title``, ``link``,
        ``date`` (naive ISO-8601 string built from the feed's UTC time tuple)
        and ``full_text``. One row per article whose extracted text is longer
        than 200 characters. The columns are present even when no article was
        scraped, so callers can always select them.
    """
    columns = ['source', 'title', 'link', 'date', 'full_text']
    print(f"--- Starting scrape for ECB Combined RSS: {rss_url} ---")

    feed = feedparser.parse(rss_url)
    articles_data = []

    for i, entry in enumerate(feed.entries):
        if i >= max_articles:
            print(f"Reached max_articles limit of {max_articles}.")
            break

        # Read defensively: one malformed entry must not abort the whole run.
        link = getattr(entry, 'link', None)
        if not link:
            print(f"[{i+1}/{max_articles}] Entry has no link. Skipping.")
            continue

        # Filtering step:
        # 1. Skip PDF links (case-insensitive) - they are not HTML articles.
        # 2. Skip Governing Council decisions, which are often short notices.
        if link.lower().endswith('.pdf') or '/govcdec/' in link:
            skipped_title = getattr(entry, 'title', '')
            print(f"[{i+1}/{max_articles}] Skipping PDF or Decision link: {skipped_title[:40]}...")
            continue

        try:
            title = entry.title

            # published_parsed is a UTC time tuple; build the datetime from it
            # directly instead of round-tripping through local time with
            # time.mktime(), which can shift the value under DST.
            published_date = datetime(*entry.published_parsed[:6]).isoformat()

            article = newspaper.Article(link)
            article.download()

            # Short pause between requests.
            if request_delay > 0:
                time.sleep(request_delay)

            article.parse()

            # Ensure the extracted text is substantial (over 200 characters)
            if article.text and len(article.text) > 200:
                articles_data.append({
                    'source': 'ECB',
                    'title': title,
                    'link': link,
                    'date': published_date,
                    'full_text': article.text
                })
                print(f"[{i+1}/{max_articles}] Successfully scraped: {title[:70]}...")
            else:
                print(f"[{i+1}/{max_articles}] Extracted text too short at {link}. Skipping.")

        except Exception as e:
            # Catch exceptions like ReadTimeout or ParsingError
            print(f"[{i+1}/{max_articles}] Error scraping article at {link}: {e}. Skipping.")

    return pd.DataFrame(articles_data, columns=columns)

# --- EXECUTION ---
ecb_df = scrape_full_articles_from_ecb(ECB_COMBINED_RSS, max_articles=15)

print("\n--- Verified ECB Scrape Complete ---")
print(f"Total articles scraped: {len(ecb_df)} (after filtering)")

# Guard the preview: if nothing was scraped there is nothing to show, and
# selecting columns from a column-less empty frame would raise KeyError.
if ecb_df.empty:
    print("\nNo ECB articles were scraped; nothing to preview.")
else:
    print("\nDataFrame Head:")
    print(ecb_df[['title', 'date', 'full_text']].head())

In [4]:
# Fraser Data Collection

def _parse_timeline_event(row, base_url):
    """Turn one timeline ``event-row`` element into a flat record dict."""

    def text_or_na(element):
        # Missing elements are reported as 'N/A' rather than raising.
        return element.text.strip() if element is not None else 'N/A'

    # Date and Source/Title: <h2 class="list-item">
    header_text = text_or_na(row.find('h2', class_='list-item'))

    # Description/Summary: <p class="list-item">
    summary = text_or_na(row.find('p', class_='list-item'))

    # Associated Link: <ul><li><a href="..." class="list-item">
    link_element = row.find('a', class_='list-item')
    link_title = text_or_na(link_element)

    # .get() tolerates anchors without an href; indexing would raise KeyError
    # and the caller's broad handler would then discard every event.
    link_url = link_element.get('href') if link_element is not None else None
    if not link_url:
        link_url = 'N/A'
    elif link_url.startswith('/'):
        # Resolve against the page URL so both site-relative ('/path') and
        # protocol-relative ('//host/path') links become absolute.
        link_url = urljoin(base_url, link_url)

    # The header reads "<date> | <source>"; without a '|' it is all date.
    date, separator, source = header_text.partition('|')

    return {
        'Date': date.strip(),
        'Source': source.strip() if separator else 'N/A',
        'Summary': summary,
        'Associated Link Title': link_title,
        'Associated Link URL': link_url
    }


def scrape_financial_crisis_timeline(url, timeout=30):
    """
    Scrape date, source, summary and link info from the timeline page.

    Args:
        url: Address of the timeline page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook.

    Returns:
        A list of dicts with the keys ``Date``, ``Source``, ``Summary``,
        ``Associated Link Title`` and ``Associated Link URL``; fields missing
        on the page are set to ``'N/A'``. Returns an empty list when the
        request fails, the timeline container is not found, or any other
        error occurs (the error is printed, not raised).
    """
    try:
        # 1. Download the HTML content
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)

        # 2. Parse the HTML
        soup = BeautifulSoup(response.content, 'html.parser')

        # 3. Find the main container element (id="list-container")
        timeline_container = soup.find('div', id='list-container')
        if timeline_container is None:
            print("Error: Main timeline container not found.")
            return []

        # 4. Each event lives in a <div class="row event-row ...">
        event_rows = timeline_container.find_all('div', class_='event-row')

        # 5. Extract the data points of every row
        return [_parse_timeline_event(row, url) for row in event_rows]

    except requests.exceptions.RequestException as e:
        print(f"An error occurred during the request: {e}")
        return []
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return []

# --- Execution ---
URL = "https://fraser.stlouisfed.org/timeline/financial-crisis"
fraser_timeline_data = scrape_financial_crisis_timeline(URL)

# Report the outcome: either a failure notice or a preview of the first
# five scraped events rendered as a markdown table.
if not fraser_timeline_data:
    print("Failed to scrape data.")
else:
    df = pd.DataFrame(fraser_timeline_data)
    print(
        f"Scraped {len(df)} events.",
        "\n--- First 5 Scraped Events ---",
        sep="\n",
    )
    print(df.head().to_markdown(index=False))


Scraped 305 events.

--- First 5 Scraped Events ---
| Date              | Source                                        | Summary                                                                                                                                                          | Associated Link Title                                                                                                                                                                                                                  | Associated Link URL                                                              |
|:------------------|:----------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------