In [3]:
import os
import re

import numpy as np
import pandas as pd
import torch
from FedTools import FederalReserveMins
from nltk.tokenize import sent_tokenize
from transformers import BertTokenizer, BertForSequenceClassification, RobertaTokenizer

### This transformer model is computationally expensive, so the FOMC minutes had to be processed in batches

This transformer sentiment model relies on FinBERT, a BERT model pre-trained on financial data, to classify each sentence as positive, negative, or neutral. To determine overall document sentiment, the neutral sentences are omitted and the count of positive sentences within a given FOMC release is divided by the count of negative sentences. These values had to be computed in batches and are appended in the downstream FED_NLP_bombine notebook.

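Detect CUDA if available (note: `transformer_func` below does not move the model or its inputs to this device, so inference still runs on the CPU)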


In [2]:
# Probe the CUDA runtime and the installed torch build. Neither value is
# displayed, because the cell ends with an assignment.
torch.cuda.is_available()
torch.__version__

# Target the first GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

True

In [13]:
_FINBERT_NAME = 'yiyanghkust/finbert-tone'
# Class-id -> label mapping used for the finbert-tone classification head.
_FINBERT_LABELS = {0: 'neutral', 1: 'positive', 2: 'negative'}
# Holds the loaded model/tokenizer so they are created once per kernel session.
_FINBERT_CACHE = {}


def _load_finbert():
    """Return the (model, tokenizer) pair, loading the weights only on first use."""
    if 'model' not in _FINBERT_CACHE:
        model = BertForSequenceClassification.from_pretrained(_FINBERT_NAME, num_labels=3)
        model.eval()  # inference only: make sure dropout is disabled
        _FINBERT_CACHE['tokenizer'] = BertTokenizer.from_pretrained(_FINBERT_NAME)
        _FINBERT_CACHE['model'] = model
    return _FINBERT_CACHE['model'], _FINBERT_CACHE['tokenizer']


def transformer_func(j, batch_size=32):
    """Score one FOMC document as (# positive sentences) / (# negative sentences).

    The document is read from the notebook-level ``filtered_df`` frame, split
    into sentence-like fragments on ``'. '`` and ``'!'``, and every fragment is
    classified by FinBERT as neutral, positive or negative. Neutral fragments
    do not enter the ratio.

    Parameters
    ----------
    j : int
        Label of the row in ``filtered_df`` whose ``Federal_Reserve_Mins`` text
        should be scored.
    batch_size : int, default 32
        Number of fragments sent through the model per forward pass. Smaller
        values lower peak memory; the predicted labels do not depend on it
        beyond floating-point noise from different padding lengths.

    Returns
    -------
    float
        Positive-to-negative sentence ratio. ``nan`` when the document contains
        no fragment classified as negative (including an empty document), since
        the ratio is undefined in that case.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be at least 1')

    finbert, tokenizer = _load_finbert()

    text = filtered_df.Federal_Reserve_Mins[j]
    # Raw string: same pattern as before, without the invalid '\.' / '\!' escapes.
    fragments = (piece.replace('\n', '') for piece in re.split(r'\. |\!', text))
    # re.split leaves empty fragments (e.g. after the final sentence); they are
    # not sentences and must not be scored.
    sent_out = [fragment for fragment in fragments if fragment.strip()]

    positive = negative = 0
    if sent_out:
        logits = []
        # no_grad: inference only, so skip building the autograd graph.
        with torch.no_grad():
            for start in range(0, len(sent_out), batch_size):
                batch = sent_out[start:start + batch_size]
                encoded = tokenizer(batch, padding=True, truncation=True,
                                    return_tensors='pt', max_length=50)
                logits.append(finbert(**encoded)[0])
        # argmax over the logits selects the same class as argmax over softmax.
        class_ids = torch.cat(logits).argmax(dim=-1).tolist()

        # Count by label, not by position: a class that never occurs is simply
        # missing from the counts and must be treated as zero.
        counts = pd.Series([_FINBERT_LABELS[c] for c in class_ids]).value_counts()
        positive = int(counts.get('positive', 0))
        negative = int(counts.get('negative', 0))

    pos_neg_ratio = positive / negative if negative else float('nan')

    print('Positive to Negative Sentiment ratio', pos_neg_ratio)
    return pos_neg_ratio

## Pull FOMC documents from FedTools

In [7]:
# Configure the FedTools minutes scraper; the earliest year requested is 2003.
# NOTE(review): the exact meaning of historical_split and thread_num is defined
# by FedTools -- confirm against its documentation before changing them.
fed_mins = FederalReserveMins(
    main_url='https://www.federalreserve.gov',
    calendar_url='https://www.federalreserve.gov/monetarypolicy/fomccalendars.htm',
    start_year=2003,
    historical_split=2012,
    verbose=True,
    thread_num=10,
)

# Fetch the minutes; the result is inspected in the next cell.
mins_dataset = fed_mins.find_minutes()

Constructing links between 2003 and 2023
Extracting Federal Reserve Minutes.
Retrieving articles.
...............................................................................................................................

### Check to see if it parsed correctly

In [8]:
# Move the dates out of the frame's index into a regular 'index' column, which
# the date-range filter further down relies on.
# Guarded so the cell is idempotent: calling reset_index() again would insert
# an extra 'level_0' column on a second run and raise on a third.
if 'index' not in mins_dataset.columns:
    mins_dataset = mins_dataset.reset_index()
mins_dataset

Unnamed: 0,index,Federal_Reserve_Mins
0,2003-01-29,A meeting of the Federal Open Market Committee...
1,2003-03-18,A meeting of the Federal Open Market Committee...
2,2003-05-06,A meeting of the Federal Open Market Committee...
3,2003-06-25,A meeting of the Federal Open Market Committee...
4,2003-08-12,A meeting of the Federal Open Market Committee...
...,...,...
118,2022-12-14,"The Federal Reserve, the central bank of the U..."
119,2023-02-01,"The Federal Reserve, the central bank of the U..."
120,2023-03-22,"The Federal Reserve, the central bank of the U..."
121,2023-05-03,"The Federal Reserve, the central bank of the U..."


### Pick date range for transformer model

For display purposes, the range below parses the first 4 documents for 2023.

In [9]:
# Date window for the documents to score: start is inclusive, end is exclusive
# (see the >= / < comparison in the filter cell).
start_date, end_date = '2023-01-01', '2024-01-01'

In [10]:
# Keep only the documents dated inside [start_date, end_date) and renumber the
# rows from 0 so they can be addressed by position-like labels.
filtered_df = (
    mins_dataset
    .loc[lambda frame: (frame['index'] >= start_date) & (frame['index'] < end_date)]
    .reset_index(drop=True)
)

In [11]:
# Alternative selection by row position instead of by date (kept for reference, disabled):
#filtered_df = mins_dataset[87:137]
# Display the documents selected for scoring.
filtered_df

Unnamed: 0,index,Federal_Reserve_Mins
0,2023-02-01,"The Federal Reserve, the central bank of the U..."
1,2023-03-22,"The Federal Reserve, the central bank of the U..."
2,2023-05-03,"The Federal Reserve, the central bank of the U..."
3,2023-06-14,"The Federal Reserve, the central bank of the U..."


In [14]:
# Score every selected document in row order; each call prints its own ratio.
sentiment_vals = [transformer_func(row) for row in range(len(filtered_df))]

Positive to Negative Sentiment ratio 0.5
Positive to Negative Sentiment ratio 0.5529411764705883
Positive to Negative Sentiment ratio 0.47863247863247865
Positive to Negative Sentiment ratio 0.8783783783783784


In [17]:

# Pair the first column of filtered_df (the document dates) with the scores.
val_df = pd.DataFrame(filtered_df[filtered_df.columns[0]])
val_df['Score'] = sentiment_vals

print(len(sentiment_vals))

# Create the output folder if it is missing so to_csv does not fail on a
# fresh checkout.
output_path = 'Batch_transformer_files/sentiment_minutes_22_23_display.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# index takes a boolean; the string 'True' only worked because any non-empty
# string is truthy (so 'False' would have written the index as well).
val_df.to_csv(output_path, index=True)

val_df

4


Unnamed: 0,index,Score
0,2023-02-01,0.5
1,2023-03-22,0.552941
2,2023-05-03,0.478632
3,2023-06-14,0.878378
