In [3]:
import pandas as pd
import numpy as np
import re
import math
from collections import defaultdict
import matplotlib.pyplot as plt
import datetime
import os
import logging
from scipy.optimize import minimize
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from tqdm import tqdm

Matplotlib is building the font cache; this may take a moment.


In [4]:
import warnings
from IPython.core.interactiveshell import InteractiveShell
warnings.filterwarnings('ignore')
InteractiveShell.ast_node_interactivity = "all" 

Data Cleaning and Preprocessing

In [6]:
data = pd.read_excel("SPX.xlsx")
data['日期'] = pd.to_datetime(data['日期']).dt.date
data.columns = ['Code', 'Name', 'Date', 'Open', 'High', 'Low', 'Close', 'Change', 'Volume', 'Turnover_Mil']
data = data[data['Date'] >=  datetime.date(2020, 1, 1)]
# data['Date'] = data['Date'].dt.date
data

Unnamed: 0,Code,Name,Date,Open,High,Low,Close,Change,Volume,Turnover_Mil
23368,SPX.GI,标普500,2020-01-02,3244.67,3258.14,3235.53,3257.85,0.0084,5.902449e+09,0.00
23369,SPX.GI,标普500,2020-01-03,3226.36,3246.15,3222.34,3234.85,-0.0071,5.637120e+09,0.00
23370,SPX.GI,标普500,2020-01-06,3217.55,3246.84,3214.64,3246.28,0.0035,6.048874e+09,0.00
23371,SPX.GI,标普500,2020-01-07,3241.86,3244.91,3232.43,3237.18,-0.0028,5.512627e+09,0.00
23372,SPX.GI,标普500,2020-01-08,3238.59,3267.07,3236.67,3253.05,0.0049,5.806283e+09,0.00
...,...,...,...,...,...,...,...,...,...,...
24722,SPX.GI,标普500,2025-04-28,5529.22,5553.66,5468.64,5528.75,0.0006,2.750592e+09,314036.58
24723,SPX.GI,标普500,2025-04-29,5508.87,5571.95,5505.70,5560.83,0.0058,2.852734e+09,295142.68
24724,SPX.GI,标普500,2025-04-30,5499.44,5581.84,5433.24,5569.06,0.0015,3.745265e+09,401116.95
24725,SPX.GI,标普500,2025-05-01,5625.14,5658.91,5597.35,5604.14,0.0063,3.216172e+09,386018.52


In [12]:
folder = 'FOMC_Txt'
filenames = os.listdir(folder)
date_pattern = re.compile(r'(\d{8})')
fomc_dates = []

for f in filenames:
    match = date_pattern.search(f)
    if match:
        try:
            fomc_dates.append(pd.to_datetime(match.group(1), format='%Y%m%d'))
        except Exception as e:
            print(f"Error parsing {f}: {e}")

fomc_dates = sorted(fomc_dates)
fomc_dates = [d.date() for d in fomc_dates]
# fomc_dates

results = []

for fomc_date in fomc_dates:
    # print("fomc_date:",fomc_date)
    after_dates = data[data['Date'] > fomc_date] # find next tradingdate
    if not after_dates.empty:
        next_trading_day = after_dates.iloc[0]
        previous_day = data[data['Date'] <= fomc_date].iloc[-1]
        
        change = (next_trading_day['Close'] - previous_day['Close']) / previous_day['Close']
        
        results.append({
            'FOMC_Date': fomc_date,
            'Prev_Close': previous_day['Close'],
            'Next_Trading_Date': next_trading_day['Date'],
            'Next_Close': next_trading_day['Close'],
            'pct': round(change * 100, 2)
        })
fomc_change_df = pd.DataFrame(results)


In [15]:
fomc_change_df.head()

Unnamed: 0,FOMC_Date,Prev_Close,Next_Trading_Date,Next_Close,pct
0,2020-01-29,3273.4,2020-01-30,3283.66,0.31
1,2020-03-15,2711.02,2020-03-16,2386.13,-11.98
2,2020-04-29,2939.51,2020-04-30,2912.43,-0.92
3,2020-06-10,3190.14,2020-06-11,3002.1,-5.89
4,2020-07-29,3258.44,2020-07-30,3246.22,-0.38


In [16]:
def classify_change(pct):
    """
    Classifies daily percent change into 4 classes:
    0 = sharp drop, 1 = mild drop, 2 = mild rise, 3 = sharp rise
    """
    if pct > 3:
        return 3  # Sharp Rise
    elif pct > 0:
        return 2  # Mild Rise
    elif pct > -3:
        return 1  # Mild Drop
    else:
        return 0  # Sharp Drop

# Apply to DataFrame
fomc_change_df['label'] = fomc_change_df['pct'].apply(classify_change)

In [17]:
fomc_change_df.head()

Unnamed: 0,FOMC_Date,Prev_Close,Next_Trading_Date,Next_Close,pct,label
0,2020-01-29,3273.4,2020-01-30,3283.66,0.31,2
1,2020-03-15,2711.02,2020-03-16,2386.13,-11.98,0
2,2020-04-29,2939.51,2020-04-30,2912.43,-0.92,1
3,2020-06-10,3190.14,2020-06-11,3002.1,-5.89,0
4,2020-07-29,3258.44,2020-07-30,3246.22,-0.38,1


This labeling scheme maps market reactions into qualitative sentiment classes:

label = 3: Market had a strong positive response (e.g., +4.1%) \
label = 2: Market had a moderate positive response (e.g., +0.9%)\
label = 1: Market had a moderate decline (e.g., -1.4%)\
label = 0: Market had a strong negative response (e.g., -11.9%)

In [18]:
# Helper function to clean text
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)               # Normalize whitespace
    text = re.sub(r'[^a-zA-Z ]', '', text)         # Remove numbers and punctuation
    return text.lower().strip()

# Build a dictionary: date → cleaned speech
speech_texts = {}
folder = 'FOMC_Txt'

for f in filenames:
    match = date_pattern.search(f)
    if match:
        try:
            date = pd.to_datetime(match.group(1), format='%Y%m%d').date()
            with open(os.path.join(folder, f), 'r', encoding='utf-8') as file:
                raw_text = file.read()
                speech_texts[date] = clean_text(raw_text)
        except Exception as e:
            print(f"Error reading {f}: {e}")

Check to make sure:

In [19]:
for date in fomc_change_df['FOMC_Date'].head():
    print(f"\n=== {date} ===\n{speech_texts.get(date, 'Missing')[:300]}")



=== 2020-01-29 ===
january   chair powells press conference final transcript of chair powells press conference january   chair powell good afternoon  everyone thanks for being here at todays meeting my colleagues and i decided to leave our policy rate unchanged as always we base our decisions on our judgment of how be

=== 2020-03-15 ===
march   chair powells press conference call final transcript of chair powell s press conference call march    chair powell good evening everyone today the federal reserve took a number of actions to support american families and business and the economy overall and to promote the flow of credit as w

=== 2020-04-29 ===
april   chair powells press conference final transcript of chair powells press conference april    chair powell good afternoon thanks for joining us today i would like to begin by acknowledging the tragic loss and tremendous hardship that people are experiencing both here in the united states and ar

=== 2020-06-10 ===
june   chair powe

In [20]:
# Merge speech texts into your DataFrame
fomc_change_df['speech'] = fomc_change_df['FOMC_Date'].map(speech_texts)

# Check if any are missing
missing_speeches = fomc_change_df[fomc_change_df['speech'].isnull()]
print(f"Missing speeches: {len(missing_speeches)}")
missing_speeches[['FOMC_Date']]

Missing speeches: 0


Unnamed: 0,FOMC_Date


In [28]:
fomc_change_df.head()

Unnamed: 0,FOMC_Date,Prev_Close,Next_Trading_Date,Next_Close,pct,label,speech
0,2020-01-29,3273.4,2020-01-30,3283.66,0.31,2,january chair powells press conference final...
1,2020-03-15,2711.02,2020-03-16,2386.13,-11.98,0,march chair powells press conference call fi...
2,2020-04-29,2939.51,2020-04-30,2912.43,-0.92,1,april chair powells press conference final t...
3,2020-06-10,3190.14,2020-06-11,3002.1,-5.89,0,june chair powells press conference final tr...
4,2020-07-29,3258.44,2020-07-30,3246.22,-0.38,1,july chair powells press conference final tr...


In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X = vectorizer.fit_transform(fomc_change_df['speech'])
y = fomc_change_df['label']

feature_names = vectorizer.get_feature_names_out()

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression(multi_class='ovr', solver='liblinear', max_iter=1000)
model.fit(X_train, y_train)


In [35]:
import pandas as pd
import numpy as np

def get_top_words_for_class(class_index, top_n=10):
    coefs = model.coef_[class_index]
    coef_df = pd.DataFrame({'word': feature_names, 'coef': coefs})
    top_words = coef_df.sort_values(by='coef', ascending=False).head(top_n)
    return top_words

# # Example: Top words for class 3 (sharp rise)
# top_words_class_3 = get_top_words_for_class(3, top_n=15)
# print("Top predictive words for class 3 (sharp rise):")
# print(top_words_class_3)

# Repeat for class 0 (sharp drop)
top_words_class_0 = get_top_words_for_class(0, top_n=15)
print("\nTop predictive words for class 0 (sharp drop):")
print(top_words_class_0)


Top predictive words for class 0 (sharp drop):
             word      coef
447          june  0.192971
905         tools  0.106271
484     liquidity  0.104788
515       markets  0.082411
521           mbs  0.076768
314          flow  0.076649
182        credit  0.073999
512         march  0.069470
587          okay  0.068691
922       ukraine  0.067676
442           job  0.060533
98     businesses  0.058384
188         curve  0.054218
930  unemployment  0.053893
331   functioning  0.053270


In [33]:
print("Classes in model:", model.classes_)
print("Shape of model.coef_:", model.coef_.shape)

Classes in model: [0 1 2]
Shape of model.coef_: (3, 1000)


In [34]:
print(fomc_change_df['label'].value_counts())

label
1    17
2    15
0     4
Name: count, dtype: int64
