In [1]:
import re
from gensim import corpora
import gensim
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
import tqdm
import zlib
import base64
import os
import nltk
import pandas as pd
from summarizer import Summarizer
import string
from sklearn.feature_extraction.text import TfidfVectorizer

from sentence_transformers import SentenceTransformer
import numpy as np


from nltk import bigrams




In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

dir_speeches  = 'compressed_fomc_minutes'


stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to C:\Users\Gustavo
[nltk_data]     Castro\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Gustavo
[nltk_data]     Castro\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Gustavo
[nltk_data]     Castro\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
def preprocess_text(text):
    # Remove punctuation and special characters
    text = re.sub(r'[^\w\s]', ' ', text)
    # Convert to lowercase
    text = text.lower()
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stop words
    tokens = [word for word in tokens if word not in stop_words]
    # Remove duplicate tokens
    tokens = list(dict.fromkeys(tokens))  # Remove duplicates while preserving order
    return ' '.join(tokens)

In [4]:
documents = {}
#read all files in the directory
for filename in os.listdir(dir_speeches):
    if filename.endswith('.txt'):
        with open(os.path.join(dir_speeches, filename), 'r', encoding="utf8") as file:
            text = file.read()
            #print('Original Text:', text)
            decompressed = zlib.decompress(base64.b64decode(text.encode())).decode()
            decompressed = preprocess_text(decompressed)
            
            # make decompressed into a string
           
            documents[filename] = decompressed
            


In [5]:
# make dataframe
df = pd.DataFrame.from_dict(documents, orient='index')
df['time'] = df.index.str.replace('.txt', '').str.replace('compressed_fomc_minutes', ' ')
df['time'] = pd.to_datetime(df['time'])
df.index = range(len(df))
df = df.sort_values(by='time')
print(df.head())

                                                   0       time
0  meeting federal open market committee held off... 2000-02-02
1  meeting federal open market committee held off... 2000-03-21
2  meeting federal open market committee held off... 2000-05-16
3  meeting federal open market committee held off... 2000-06-28
4  meeting federal open market committee held off... 2000-08-22


In [6]:
df['t+1'] = df['time'].shift(-1)
df['t+2'] = df['time'].shift(-2)
df["Text"] = df[0]


In [30]:
#get interest rate data 

df_economic_data = pd.read_csv('DFEDTAR.csv', index_col=0)
#date, value
df_economic_data.index = pd.to_datetime(df_economic_data.index)
df_economic_data.reset_index(inplace=True)
#get the interest rate for the values in df
df['t+1'] = pd.to_datetime(df['t+1'])

#match the interest rate to the meeting date in df
merged_df = pd.merge(df[['Text', 'time']], df_economic_data.reset_index(), left_on='time', right_on='DATE', how='left')
#rename DFEDTAR to IR
merged_df = merged_df.rename(columns={'DFEDTAR': 'rate'})
merged_df.head()

Unnamed: 0,Text,time,index,DATE,rate
0,meeting federal open market committee held off...,2000-02-02,154,2000-02-02,5.75
1,meeting federal open market committee held off...,2000-03-21,202,2000-03-21,6.0
2,meeting federal open market committee held off...,2000-05-16,258,2000-05-16,6.5
3,meeting federal open market committee held off...,2000-06-28,301,2000-06-28,6.5
4,meeting federal open market committee held off...,2000-08-22,356,2000-08-22,6.5


In [31]:
merged_df['rateChange'] = merged_df['rate'].shift(-1) - merged_df['rate']
# Drop rows with NaN values in 'rateChange' (optional, depending on your needs)
merged_df.dropna(subset=['rateChange'], inplace=True)
merged_df.drop('index', axis=1, inplace=True)

#  drop the redundant time column after merging
merged_df.drop('time', axis=1, inplace=True)
#drop index


In [32]:
#save df to csv
merged_df.to_csv('merged_df.csv', index=False)

In [33]:
#Step 2: Generate Sentence Embeddings with SBERT
import torch
import torch.nn as nn
import torch.optim as optim
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics import classification_report, f1_score
#get xgboost model
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder

# Load the pre-trained Sentence-BERT model
model = SentenceTransformer('all-MiniLM-L6-v2')  # You can use other pre-trained models as well

# Example sentences (replace these with sentences from FED publications)
sentences = merged_df['Text'].values

# Generate embeddings for the sentences
embeddings = model.encode(sentences)

# Convert embeddings to numpy array for easier handling
embeddings = np.array(embeddings)

# Load the data
x = embeddings
y = merged_df['rateChange']
y = [1 if x > 0 else 0 if x == 0 else 2 for x in y]


In [34]:

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.4, random_state=42
)


# Define the XGBoost model
xgb_model = xgb.XGBClassifier(objective='multi:softmax', num_class=3, eval_metric='mlogloss', seed=42)

# Define parameter grid for GridSearch
param_grid = {
    'max_depth': [3, 6, 9],
    'eta': [0.01, 0.1, 0.2],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'lambda': [0, 0.1, 1.0]
}

# Use GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='f1_weighted',
    cv=3,  # 3-fold cross-validation
    verbose=1,
    n_jobs=-1  # Use all available cores
)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Print the best parameters and score
print("Best Parameters: ", grid_search.best_params_)
print("Best F1 Score: ", grid_search.best_score_)

# Use the best model to predict on the test set
best_model = grid_search.best_estimator_
preds = best_model.predict(X_test)



Fitting 3 folds for each of 729 candidates, totalling 2187 fits
Best Parameters:  {'colsample_bytree': 0.8, 'eta': 0.2, 'gamma': 0.2, 'lambda': 1.0, 'max_depth': 3, 'subsample': 0.8}
Best F1 Score:  0.6504633801367015


In [35]:
# Evaluate the model
f1 = f1_score(y_test, preds, average='weighted')
print(f'F1 Score: {f1:.4f}')
print(classification_report(y_test, preds))

F1 Score: 0.7879
              precision    recall  f1-score   support

           0       0.84      0.87      0.86        55
           1       0.67      0.50      0.57        12
           2       0.64      0.70      0.67        10

    accuracy                           0.79        77
   macro avg       0.72      0.69      0.70        77
weighted avg       0.79      0.79      0.79        77

