In [None]:
# imports
# data collection
import feedparser
import newspaper
import pandas as pd
from datetime import datetime
import time
import sys

# data processing
import torch
import numpy as np
from langchain_community.vectorstores import FAISS  # For storing and retrieving embeddings using the FAISS library
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline

In [11]:
# Data Collection

# Set a higher recursion limit for newspaper3k when scraping many articles
# This helps prevent recursion errors on complex web pages.
sys.setrecursionlimit(2000)

rss_url = "https://www.federalreserve.gov/feeds/press_all.xml"
max_articles = 50  # Limit the number of articles to scrape for testing


feed = feedparser.parse(rss_url)
articles_data = []
    
# Iterate through entries, limiting the number of articles for a manageable run
for i, entry in enumerate(feed.entries):
    if i >= max_articles:
        print(f"Reached max_articles limit of {max_articles}.")
        break
            
    try:
            # 1. Extract link and basic metadata from RSS entry
        link = entry.link
        title = entry.title
            
        # Extract date using feedparser's published_parsed
        published_date = datetime.fromtimestamp(time.mktime(entry.published_parsed)).isoformat()
            
        # 2. Use newspaper3k to get the full article text
        article = newspaper.Article(link)
        article.download()
        article.parse()
            
        # CRUCIAL: Check if full text was successfully extracted
        if article.text:
            articles_data.append({
                'source': 'FED',
                'title': title,
                    'link': link,
                    'date': published_date,
                    'full_text': article.text
                })
        print(f"[{i+1}/{max_articles}] Successfully scraped: {title[:70]}...")
            
    except Exception as e:
    # Skip if an article link is broken or parsing fails
        print(f"[{i+1}/{max_articles}] Error scraping article at {entry.link}: {e}. Skipping.")
        continue

fed_df = pd.DataFrame(articles_data)
print(fed_df[['title', 'date', 'full_text']].head())

[1/50] Successfully scraped: Federal Reserve Board issues enforcement actions with Belt Valley Bank...
[2/50] Successfully scraped: Federal Reserve Board finalizes changes to its supervisory rating fram...
[3/50] Successfully scraped: Federal Reserve Board announces termination of enforcement actions wit...
[4/50] Successfully scraped: Federal Reserve Board announces approval of application by First Secur...
[5/50] Successfully scraped: Federal Reserve Board issues enforcement actions with former employee ...
[6/50] Successfully scraped: Federal Reserve issues FOMC statement...
[7/50] Successfully scraped: Federal Reserve Board requests comment on proposals to enhance the tra...
[8/50] Successfully scraped: Federal Reserve and FDIC release public sections of resolution plans f...
[9/50] Successfully scraped: Federal Reserve Board denies application by Canandaigua National Corpo...
[10/50] Successfully scraped: Federal Reserve Board announces approval of application by HPB Holding...
[1

In [None]:
# Embedding Generation with FinBERT

# Use example text from european central bank
text = "New data release: ECB wage tracker suggests lower and more stable wage pressures in the first three quarters of 2026"

# A specific model for generating embeddings, distinct from the sentiment classifier
model_name = "ProsusAI/finbert" 

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Using AutoModel for embeddings (not AutoModelForSequenceClassification, which is for sentiment classification)
model = AutoModel.from_pretrained(model_name)
    
# Check for GPU and move model if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()
    
print(f"\n--- Generating FinBERT Embeddings on {device} ---")
    
embeddings = []
    
inputs = tokenizer(
    text, 
    return_tensors="pt", 
    padding='max_length', 
    truncation=True, 
    max_length=512
).to(device)

with torch.no_grad():
    # 3. Get model outputs
    outputs = model(**inputs)
    # The last hidden state contains the final embeddings for all tokens
    last_hidden_state = outputs.last_hidden_state
            
    # 4. Extract the [CLS] token vector as the sentence embedding
    # [CLS] token is at index 0, and we squeeze to remove the batch dimension
    cls_embedding = last_hidden_state[:, 0, :].squeeze().cpu().numpy()
    embeddings.append(cls_embedding)

print(np.array(embeddings))


--- Generating FinBERT Embeddings on cpu ---
[[ 2.25709468e-01 -8.65656137e-01  9.18066263e-01 -1.15118660e-01
  -2.93852508e-01  1.06379604e+00  3.89798701e-01  5.21846831e-01
   4.13837850e-01 -4.89027202e-01 -9.79870975e-01 -5.15758038e-01
   3.54133159e-01 -2.44752094e-01  2.96717882e-01 -3.24833244e-02
  -4.91769344e-01 -6.09276474e-01 -6.82388783e-01  1.48150772e-01
  -2.13401213e-01  6.97376370e-01  9.09314215e-01 -1.14612706e-01
  -2.42477089e-01 -2.50007302e-01 -2.58692294e-01 -9.74490494e-02
  -6.77596986e-01  1.11994714e-01  1.87651277e-01  8.14433098e-01
  -9.01214838e-01  3.29174221e-01  5.76986730e-01 -1.68087289e-01
   9.76640135e-02 -4.31355536e-02  4.07877207e-01  1.43404916e-01
  -7.39271104e-01 -4.72302318e-01  5.45792401e-01  1.04079746e-01
  -2.51851708e-01 -8.82413015e-02 -2.73090887e+00 -1.42774642e+00
  -5.30757725e-01  2.14620292e-01 -8.67506936e-02 -4.93232071e-01
  -3.28291138e-03  9.89936590e-02 -4.51951385e-01  3.64455551e-01
  -1.54333189e-02 -2.56984115e