In [8]:
# import all dependencies and assignment of variables
import os
import re
import time
from datetime import datetime, timedelta
from urllib.parse import urljoin

import matplotlib.pyplot as plt
import numpy as np
import openai
import pandas as pd
import requests
import seaborn as sns
import torch
import yfinance as yf
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from sklearn.metrics import confusion_matrix, accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Configuration
load_dotenv()  # make variables from a local .env file visible to os.getenv
openai.api_key = os.getenv("OPENAI_API_KEY")
# Fail fast with an actionable message; an empty value is as unusable as a missing one.
assert openai.api_key, "OPENAI_API_KEY is not set; export it or add it to a .env file"
BASE_URL = "https://www.federalreserve.gov"
CALENDAR_URL = f"{BASE_URL}/monetarypolicy/fomccalendars.htm"
START_YEAR = 2024  # calendar panels for earlier years are skipped by the scraper
OUTPUT_DIR = "minutes_texts"  # folder holding the saved minutes_<date>.txt files
print('Done Loading Libraries')

Done Loading Libraries!


In [5]:
# Load the FinBERT tokenizer and sequence-classification model from one checkpoint id
FINBERT_CHECKPOINT = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(FINBERT_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(FINBERT_CHECKPOINT)
print('Done Loading Models')

Done Loading Models


In [None]:
# Functions
def get_finbert_sentiment(text, chunk_size=512):
    """Score `text` with FinBERT and return the averaged class probabilities.

    The text is split into chunks of `chunk_size` whitespace-separated words.
    Each chunk is tokenized and, when it exceeds the 512-token input limit,
    further split into consecutive token windows so that no part of the
    document is silently truncated. Softmax probabilities of all windows are
    averaged with equal weight.

    Args:
        text: Document to score.
        chunk_size: Number of words per chunk (must be >= 1).

    Returns:
        dict with float values under the keys 'positive', 'negative' and
        'neutral'.

    Raises:
        ValueError: If `chunk_size` is smaller than 1 or `text` contains
            nothing that can be scored.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")
    words = text.split()
    if not words:
        raise ValueError("Cannot compute FinBERT sentiment for empty text")

    # 512 is the maximum sequence length used for this model; two positions
    # are taken by the special tokens wrapped around every window.
    max_body_tokens = 512 - 2
    sentiment_scores = []
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        for start in range(0, len(words), chunk_size):
            chunk = ' '.join(words[start:start + chunk_size])
            # Encoding without truncation may log a one-off "sequence length
            # is longer than" warning; the windows below keep every model
            # input within the limit.
            token_ids = tokenizer.encode(chunk, add_special_tokens=False)
            for offset in range(0, len(token_ids), max_body_tokens):
                window = token_ids[offset:offset + max_body_tokens]
                input_ids = torch.tensor([tokenizer.build_inputs_with_special_tokens(window)])
                logits = model(input_ids=input_ids).logits
                probs = torch.nn.functional.softmax(logits, dim=1)
                sentiment_scores.append(probs.numpy()[0])
    if not sentiment_scores:
        raise ValueError("Text produced no tokens to score")
    avg_scores = np.mean(sentiment_scores, axis=0)
    # NOTE(review): index order (0=positive, 1=negative, 2=neutral) is kept
    # from the original code — confirm against model.config.id2label.
    return {'positive': float(avg_scores[0]),'negative': float(avg_scores[1]),'neutral': float(avg_scores[2])}

def _download_minutes_text(url, timeout=30):
    """Download one minutes page and return the text of its article paragraphs.

    Returns an empty string when the page has no <div id="article"> element.
    Raises requests.RequestException on network or HTTP errors.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    page = BeautifulSoup(response.text, 'html.parser')
    article = page.find('div', id='article')
    if not article:
        return ''
    return ' '.join(p.get_text(strip=True) for p in article.find_all('p'))


def scrape_and_save_minutes():
    """Scrape FOMC minutes from the calendar page and save them as text files.

    Every calendar panel whose heading contains a year >= START_YEAR is
    scanned for meeting rows with an HTML minutes link. Each minutes page is
    downloaded and its text written to OUTPUT_DIR/minutes_<YYYY-MM-DD>.txt,
    where the date is the 8-digit stamp found in the minutes URL.

    A failure on a single minutes page is reported and skipped so that the
    documents already collected are kept.

    Returns:
        dict mapping 'YYYY-MM-DD' to a (url, text) tuple for every document
        saved; an empty dict if the calendar page itself cannot be fetched.
    """
    try:
        print("Fetching FOMC calendar page...")
        # A timeout keeps the notebook from hanging on an unresponsive server.
        calendar_response = requests.get(CALENDAR_URL, timeout=30)
        calendar_response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error: {e}")
        return {}

    calendar_soup = BeautifulSoup(calendar_response.text, 'html.parser')
    minutes_dict = {}
    for panel in calendar_soup.find_all('div', class_='panel-default'):
        heading = panel.find('div', class_='panel-heading')
        if not heading:
            continue
        year_match = re.search(r'(\d{4})', heading.get_text())
        if not year_match:
            continue
        year = int(year_match.group(1))
        if year < START_YEAR:
            continue
        print(f"\nProcessing year {year}...")
        meeting_rows = panel.find_all('div', class_='fomc-meeting')
        print(f"Found {len(meeting_rows)} meeting rows")
        for row in meeting_rows:
            minutes_div = row.find('div', class_='fomc-meeting__minutes')
            if not minutes_div:
                continue
            # Pick the first link whose target mentions 'htm' (skips the PDF link).
            html_link = next(
                (link for link in minutes_div.find_all('a') if 'htm' in link.get('href', '').lower()),
                None,
            )
            if html_link is None:
                continue
            # urljoin copes with both site-relative and absolute hrefs.
            full_url = urljoin(BASE_URL, html_link['href'])
            date_match = re.search(r'minutes(\d{8})', full_url)
            if not date_match:
                continue
            date_str = date_match.group(1)
            formatted_date = f"{date_str[:4]}-{date_str[4:6]}-{date_str[6:]}"
            print(f"Fetching minutes from {formatted_date}")
            try:
                text = _download_minutes_text(full_url)
            except requests.RequestException as e:
                print(f"Error fetching minutes for {formatted_date}: {e}")
                continue
            # Skip pages without usable text so no blank files are written.
            if not text.strip():
                continue
            minutes_dict[formatted_date] = (full_url, text)
            os.makedirs(OUTPUT_DIR, exist_ok=True)
            with open(os.path.join(OUTPUT_DIR, f"minutes_{formatted_date}.txt"), 'w', encoding='utf-8') as f:
                f.write(text)
            print(f"Saved minutes for {formatted_date} ({len(text)} characters)")
            time.sleep(1)  # pause between downloads to go easy on the server

    print(f"\nAvailable dates:")
    for saved_date in sorted(minutes_dict):
        _, text = minutes_dict[saved_date]
        print(f"{saved_date}: {len(text)} characters")
    return minutes_dict

def _ten_year_yield_change(date, window_days=7):
    """Return the change of the ^TNX close over a window starting at `date`.

    Args:
        date: Window start as a 'YYYY-MM-DD' string.
        window_days: Length of the download window in calendar days.

    Returns:
        Last close minus first close inside the window, rounded to two
        decimals, or NaN when no closing values were downloaded.

    Raises:
        ValueError: If `date` is not in 'YYYY-MM-DD' format.
    """
    start_date = datetime.strptime(date, '%Y-%m-%d')
    end_date = start_date + timedelta(days=window_days)
    data = yf.download('^TNX', start=start_date, end=end_date, progress=False)
    if data.empty:
        return float('nan')
    close = data['Close']
    # Depending on the yfinance version the columns can be a MultiIndex, in
    # which case data['Close'] is a one-column DataFrame instead of a Series.
    if isinstance(close, pd.DataFrame):
        close = close.iloc[:, 0]
    close = close.dropna()
    if close.empty:
        return float('nan')
    return round(float(close.iloc[-1]) - float(close.iloc[0]), 2)


def analyze_minutes_and_yields():
    """Score every saved minutes file and pair it with the 10Y yield move.

    For each OUTPUT_DIR/minutes_<date>.txt file the FinBERT probabilities are
    combined into a single "hawkish score". For the same <date> the change of
    the ^TNX close over the following 7 calendar days is downloaded. The
    results are printed and, when at least one complete row exists, shown as
    a scatter plot with a fitted trend line and the correlation.

    NOTE(review): <date> comes from the saved file name; confirm that it is
    the date around which the yield reaction should be measured.

    Returns:
        DataFrame indexed by date string with float columns 'Hawkish Score'
        and '10Y_Change' (NaN where no yield data could be obtained). The
        frame is empty if OUTPUT_DIR is missing or holds no minutes files.
    """
    # Weights applied to [positive, negative, neutral] probabilities. Since
    # the probabilities sum to 1 the resulting score lies in [-0.5, 1].
    hawkish_mapping = np.array([1, -0.5, 0])
    sentiment_results, yield_results = {}, {}

    if os.path.isdir(OUTPUT_DIR):
        filenames = sorted(os.listdir(OUTPUT_DIR))
    else:
        print(f"Directory '{OUTPUT_DIR}' not found; run scrape_and_save_minutes() first.")
        filenames = []

    # Calculate sentiment scores for all documents
    for filename in filenames:
        if not (filename.startswith("minutes_") and filename.endswith(".txt")):
            continue
        print(f"Reading: {filename}")
        date = filename[len("minutes_"):-len(".txt")]
        with open(os.path.join(OUTPUT_DIR, filename), 'r', encoding='utf-8') as f:
            text = f.read()
        scores = get_finbert_sentiment(text)
        scores_array = np.array([scores['positive'], scores['negative'], scores['neutral']])
        sentiment_results[date] = np.dot(scores_array, hawkish_mapping)

    # Calculate yield changes; failures are recorded as NaN so one bad date
    # does not abort the whole analysis.
    for date in sentiment_results:
        print(f"Fetching yields for {date}...")
        try:
            change = _ten_year_yield_change(date)
        except Exception as e:
            print(f"Error getting yield data for {date}: {e}")
            change = float('nan')
        else:
            if np.isnan(change):
                print(f"No yield data available for {date}")
            else:
                print(f"Successfully got yield change")
        yield_results[date] = change

    # astype(float) guarantees numeric columns even when the frame is empty.
    df = pd.DataFrame({'Hawkish Score': sentiment_results,'10Y_Change': yield_results}).astype(float).sort_index()
    print("FOMC Minutes Analysis")
    print("=" * 80)
    print(df)

    # Create scatter plot (only when there is something to draw)
    valid_data = df.dropna()
    if not valid_data.empty:
        plt.figure(figsize=(10, 6))
        sns.scatterplot(data=valid_data, x='Hawkish Score', y='10Y_Change')
        plt.title('Hawkish Score vs 10Y Treasury Yield Change')
        plt.xlabel('Hawkish Score (negative = Dove, positive = Hawk)')
        # ^TNX is quoted in percent, so differences are percentage points.
        plt.ylabel('10Y Yield Change (percentage points)')
        x = valid_data['Hawkish Score']
        y = valid_data['10Y_Change']
        # A line fit and a correlation need at least two distinct x values.
        if len(valid_data) >= 2 and x.nunique() > 1:
            trend = np.poly1d(np.polyfit(x, y, 1))
            plt.plot(x, trend(x), "r--", alpha=0.8)
            correlation = x.corr(y)
            plt.annotate(f'Correlation: {correlation:.2f}', xy=(0.05, 0.95), xycoords='axes fraction',bbox=dict(facecolor='white', edgecolor='black', alpha=0.7))
        plt.tight_layout()
        plt.show()
    return df

def query_minutes(date, question):
    """Ask an OpenAI chat model a question about one saved set of minutes.

    Args:
        date: Date string identifying the file OUTPUT_DIR/minutes_<date>.txt.
        question: Question to answer from the minutes text.

    Returns:
        The model's answer with surrounding whitespace removed. If the file
        does not exist a "No minutes found ..." message is returned, and any
        other failure is returned as an "Error processing query: ..." string
        instead of being raised.
    """
    try:
        file_path = os.path.join(OUTPUT_DIR, f"minutes_{date}.txt")
        if not os.path.exists(file_path):
            return f"No minutes found for date {date}"
        with open(file_path, 'r', encoding='utf-8') as f:
            minutes_text = f.read()
        system_prompt_content = "You are an expert in analyzing Federal Reserve communications."
        user_prompt_content = f"Below are FOMC minutes from a meeting. Please answer the question about these minutes based only on the information provided.\n\nMinutes:\n{minutes_text}\n\nQuestion: {question}\n\nAnswer:"
        messages = [
            {"role": "system", "content": system_prompt_content},
            {"role": "user", "content": user_prompt_content}
        ]
        request_args = {"model": "gpt-3.5-turbo", "messages": messages, "temperature": 0.3}
        if hasattr(openai, "OpenAI"):
            # openai>=1.0 removed openai.ChatCompletion; use the client API.
            client = openai.OpenAI(api_key=openai.api_key)
            response = client.chat.completions.create(**request_args)
            answer = response.choices[0].message.content
        else:
            # Legacy (<1.0) interface.
            response = openai.ChatCompletion.create(**request_args)
            answer = response['choices'][0]['message']['content']
        return answer.strip()
    except Exception as e:
        return f"Error processing query: {str(e)}"

def evaluate_sentiment_predictions(df):
    """Plot a confusion matrix of hawkish-score sign vs. yield-change sign.

    Rows with a missing or exactly-zero 'Hawkish Score' or '10Y_Change' are
    excluded. A positive hawkish score is the predicted label 1 (hawkish)
    and a positive yield change is the actual label 1; everything else is 0
    (dovish). The input frame is not modified.

    Args:
        df: DataFrame with 'Hawkish Score' and '10Y_Change' columns.

    Returns:
        None. Prints a message and returns early when no rows remain after
        filtering; otherwise shows the heatmap with the accuracy in the title.
    """
    eval_df = df.dropna(subset=['Hawkish Score', '10Y_Change'])
    # Zero carries no direction, so such rows cannot be labelled either way.
    eval_df = eval_df[(eval_df['Hawkish Score'] != 0) & (eval_df['10Y_Change'] != 0)].copy()
    if eval_df.empty:
        print("No data available for evaluation after filtering.")
        return
    eval_df['Predicted_Label'] = np.where(eval_df['Hawkish Score'] > 0, 1, 0)
    eval_df['Actual_Label'] = np.where(eval_df['10Y_Change'] > 0, 1, 0)
    cm_labels = [0, 1] # 0: Dovish, 1: Hawkish
    cm = confusion_matrix(eval_df['Actual_Label'], eval_df['Predicted_Label'], labels=cm_labels)
    accuracy = accuracy_score(eval_df['Actual_Label'], eval_df['Predicted_Label'])
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',xticklabels=['Dovish (0)', 'Hawkish (1)'],yticklabels=['Dovish (0)', 'Hawkish (1)'],cbar=False)
    # confusion_matrix(y_true, y_pred) puts actual labels on the rows and
    # predictions on the columns; the heatmap draws rows along the y axis.
    plt.xlabel('Predicted Label')
    plt.ylabel('Actual Label')
    plt.title(f'Sentiment Prediction Confusion Matrix\nAccuracy: {accuracy*100:.2f}%')
    plt.tight_layout()
    plt.show()



In [None]:
# Scrape minutes from the web:
minutes_dict = scrape_and_save_minutes()  # Only needed first time or to update

In [None]:
# Score the previously saved minutes (dovish vs. hawkish), pair them with the
# 10Y yield moves, then evaluate the resulting classification.
df = analyze_minutes_and_yields()
evaluate_sentiment_predictions(df)


In [None]:
# Query specific minutes (Chatbot)
# The date must match a saved minutes_<date>.txt file. The scraper skips years
# before START_YEAR, so use a meeting date from START_YEAR onwards.
date = "2024-01-31"
question = "Can you find the most dovish and the most hawkish satements in such text and classify it yourself as one label or the other?"
answer = query_minutes(date, question)
print(f"\nQuestion: {question}")
print(f"Answer: {answer}")


