<a href="https://colab.research.google.com/github/Moritz-Pfeifer/CentralBankRoBERTa/blob/main/Model_loader.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model implementation

### Setup

In [None]:
import os
import glob
import pandas as pd
import numpy as np

In [None]:
# Mount Google Drive when running inside Colab. On any other runtime the
# google.colab package is not importable and the mount is simply skipped.
try:
    from google.colab import drive
except ImportError:
    pass  # not on Colab: nothing to mount
else:
    try:
        drive.mount('/content/drive')
    except Exception as err:
        # Stay best-effort (do not abort the notebook), but report the reason
        # instead of silently hiding it as the former bare `except: pass` did.
        print(f'Could not mount Google Drive: {err}')

Mounted at /content/drive


In [None]:
# Project folders on the mounted Drive, resolved relative to the current
# working directory.
path_aud = f"{os.getcwd()}/drive/MyDrive/CentralBankBERTa/Audience_classification"
path_sent = f"{os.getcwd()}/drive/MyDrive/CentralBankBERTa/Sentiment_classification"

#### Install the Hugging Face Transformers library

In [None]:
#Stable version
!pip install transformers[torch] accelerate>=0.20.1

In [None]:
import torch

# Select the torch device: the CPU when CUDA is unavailable, otherwise the GPU
# (reporting how many GPUs are visible and the name of the first one).
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device('cpu')
else:
    device = torch.device('cuda')
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
Device name: NVIDIA A100-SXM4-40GB


## Import Functions

In [None]:
from transformers import RobertaForSequenceClassification, RobertaTokenizer
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments
from transformers import default_data_collator

# Create a Dataset object from the encodings
class SentimentDataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenizer encodings with one label per example.

    ``encodings`` is a mapping from field name to a per-example sequence and
    ``labels`` is a sequence with one entry per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert every encoding field of example `idx` to a tensor, then
        # attach its label under the 'labels' key.
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:
from scipy.special import softmax

def predict_sentiment(model, tokenizer, X_test):
    """Run a sequence-classification model over a collection of texts.

    Args:
        model: Model handed to ``Trainer`` for inference.
        tokenizer: Tokenizer used to encode ``X_test`` and to pad batches.
        X_test: Iterable of input texts.

    Returns:
        A ``(predictions, predictions_prob)`` tuple. ``predictions`` is the
        argmax over axis 1 of the model output. ``predictions_prob`` is the
        model output itself; despite the name it is NOT normalised, so callers
        that need probabilities must apply a softmax to it.
    """
    texts = list(X_test)
    test_encodings = tokenizer(texts, truncation=True, padding=True, max_length=512)

    # SentimentDataset requires labels; there are none at inference time, so
    # placeholder labels are used and ignored by the caller.
    y_test = [1] * len(texts)
    test_dataset = SentimentDataset(test_encodings, y_test)

    # Collator that pads each batch using the tokenizer.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # A Trainer is used only as a convenient batched-inference driver.
    trainer = Trainer(model=model, data_collator=data_collator)

    # Run inference ONCE and derive both return values from the same output
    # (previously the whole dataset was pushed through the model twice).
    predictions_prob = trainer.predict(test_dataset=test_dataset).predictions
    predictions = predictions_prob.argmax(axis=1)

    return predictions, predictions_prob

### Load Audience Classifier

In [None]:
def load_model_from_checkpoint(checkpoint, which_model):
    """Load a RoBERTa sequence classifier together with its tokenizer.

    Args:
        checkpoint: Location passed to
            ``RobertaForSequenceClassification.from_pretrained``.
        which_model: Name passed to ``RobertaTokenizer.from_pretrained``.

    Returns:
        ``(model, tokenizer)`` — note the model comes first.
    """
    # Tokenizer first, then the model, matching the original load order.
    roberta_tokenizer = RobertaTokenizer.from_pretrained(which_model)
    classifier = RobertaForSequenceClassification.from_pretrained(checkpoint)
    return classifier, roberta_tokenizer

In [None]:
# Pick the first audience-classifier checkpoint matching the pattern.
# NOTE: glob returns matches in filesystem order, so with several checkpoints
# the one selected here is not guaranteed to be the latest.
checkpoint = next(iter(glob.glob(path_aud+'/result/'+'*RoBERTa*/*checkpoint*')), None)
if checkpoint is None:
    # Fail with an actionable message instead of an IndexError on `[0]`.
    raise FileNotFoundError(
        "No checkpoint matching '*RoBERTa*/*checkpoint*' found under "
        + path_aud + "/result - is Google Drive mounted?"
    )

In [None]:
# Load the classifier from the selected checkpoint together with the
# 'roberta-base' tokenizer.
model_from_memory, tokenizer_from_memory = load_model_from_checkpoint(
    checkpoint=checkpoint,
    which_model='roberta-base',
)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [None]:
# Interactive demo: read one sentence, classify its audience and print the
# audience name that corresponds to the predicted class id.
label_dict = {
    'households': 0,
    'firms': 1,
    'financial sector': 2,
    'government': 3,
    'central bank': 4,
}
sentence = [input('Enter sentence: ')]
res = predict_sentiment(model_from_memory, tokenizer_from_memory, np.array(sentence))
# res[0] holds the predicted class ids; look up the name of the first one.
aud = [name for name in label_dict if label_dict[name] == res[0][0]]
print("This sentence addresses the {}".format(aud[0]))

Enter sentence: They added that Yellen planned to discuss the three pillars of the US-China economic relationship that she outlined in a speech in April. 


This sentence addresses the firms


### Load Sentiment Classifier

In [None]:
# Pick the first sentiment-classifier checkpoint matching the pattern.
# NOTE: glob returns matches in filesystem order, so with several checkpoints
# the one selected here is not guaranteed to be the latest.
checkpoint = next(iter(glob.glob(path_sent+'/result/'+'*RoBERTa*/*checkpoint*')), None)
if checkpoint is None:
    # Fail with an actionable message instead of an IndexError on `[0]`.
    raise FileNotFoundError(
        "No checkpoint matching '*RoBERTa*/*checkpoint*' found under "
        + path_sent + "/result - is Google Drive mounted?"
    )

In [None]:
# Replace the previously loaded model with the classifier from the newly
# selected checkpoint, again paired with the 'roberta-base' tokenizer.
model_from_memory, tokenizer_from_memory = load_model_from_checkpoint(
    checkpoint=checkpoint,
    which_model='roberta-base',
)

In [None]:
res = predict_sentiment(model_from_memory, tokenizer_from_memory, np.array(sentence))
# res[1] holds the raw model outputs, which can be negative and do not sum to
# one. Normalise them with a softmax before reporting them as probabilities
# (`softmax` is the scipy.special function imported alongside predict_sentiment).
sentiment_probs = softmax(res[1][0])
print("The Sentiment is {}".format(res[0][0]))
print("Probability of Negative Sentiment is {}".format(sentiment_probs[0]))
print("Probability of Positive Sentiment is {}".format(sentiment_probs[1]))

The Sentiment is 1
Probability of Negative Sentiment is -0.4553529620170593
Probability of Positive Sentiment is 0.7679672837257385


# Stack models

In [None]:
 checkpoint_audience = glob.glob(path_aud+'/result/'+'*RoBERTa*/*checkpoint-500*')[0]/Screenshot 2023-06-29 at 09.06.38.png
audience_model_from_memory, audience_tokenizer_from_memory = load_model_from_checkpoint(checkpoint_audience, 'roberta-base')

checkpoint_sentiment = glob.glob(path_sent+'/result/'+'*RoBERTa*/*checkpoint*')[0]
sentiment_model_from_memory, sentiment_tokenizer_from_memory = load_model_from_checkpoint(checkpoint_sentiment, 'roberta-base')

In [None]:
import tensorflow as tf

def find_sentiment_audience(sentences):
    """Label every text with its predicted audience and sentiment.

    Args:
        sentences: Either a DataFrame with a ``text`` column, or a list (or
            other iterable) of texts.

    Returns:
        A DataFrame holding the texts in ``text`` plus four added columns, in
        this order: 'Predicted Audience', 'Audience (Probabilities)',
        'Predicted Sentiment', 'Sentiment (Probabilities)'. The probability
        columns hold, per row, the result of ``tf.nn.softmax`` applied to that
        row of model outputs. A DataFrame passed in is copied, not modified.
    """
    if isinstance(sentences, pd.DataFrame):
        # This branch used to assign to a misspelled name (`sentence_df`), so
        # any DataFrame input crashed on the first use of `sentences_df`.
        # Work on a copy so the caller's frame is left untouched.
        sentences_df = sentences.copy()
    else:
        sentences_df = pd.DataFrame()
        sentences_df['text'] = list(sentences)

    texts = sentences_df["text"].values.tolist()

    # Same steps for both classifiers; only the model/tokenizer pair and the
    # output column names differ.
    tasks = (
        ('Predicted Audience', 'Audience (Probabilities)',
         audience_model_from_memory, audience_tokenizer_from_memory),
        ('Predicted Sentiment', 'Sentiment (Probabilities)',
         sentiment_model_from_memory, sentiment_tokenizer_from_memory),
    )
    for label_col, prob_col, model, tokenizer in tasks:
        predicted = predict_sentiment(model, tokenizer, texts)
        sentences_df[label_col] = predicted[0]
        # NOTE(review): the column is first created as an object column,
        # presumably so each cell can hold a whole array; this assignment
        # sequence is kept exactly as in the original.
        sentences_df[prob_col] = None
        sentences_df[prob_col] = sentences_df[prob_col].astype('object')
        sentences_df[prob_col] = [predicted[1][i, :] for i in range(len(sentences_df))]
        # Turn each row of raw model outputs into probabilities.
        sentences_df[prob_col] = sentences_df[prob_col].apply(lambda x: tf.nn.softmax(x))

    return sentences_df

In [None]:
# Demo input: the same sentence twice, run through both classifiers.
sentence = [
    "To address the shortcomings in the banking system, the Basel Committee has been pivotal in driving a series of regulatory reforms, most notably the Basel III framework, to strengthen banks' capital and liquidity requirements, and keep leverage at safer levels."
] * 2
x = find_sentiment_audience(sentence)
x

In [None]:
# Show the sentiment probabilities stored for the second row.
x.iloc[1]['Sentiment (Probabilities)']

# Load Test Dataset

In [None]:
# Read the pre-processed FED data from Drive, using the first CSV column as
# the index, then display the frame.
data_fed = pd.read_csv(
    '/content/drive/MyDrive/CentralBankBERTa/Audience_classification/data_input/Pre_Processed_FED_Data.csv',
    index_col=0,
    engine='python',
)
data_fed

In [None]:
### This was only here for labelling purposes on June 15

# Determine what data you want to label
input_list = data_fed["Parsed_Text"].values.tolist()
# Convert all elements into strings so that we may label:
converted_list = [str(element) for element in input_list]

# Split the list into batches of roughly a tenth of the data each.
# With fewer than 10 texts the floor division yields 0, and range() with a
# zero step raises ValueError, so the batch size is kept at 1 or more.
part_size = max(1, len(converted_list) // 10)

split_list = [converted_list[i:i+part_size] for i in range(0, len(converted_list), part_size)]

# Print a short summary of each part. Printing the full contents exceeded the
# notebook's IOPub data rate limit, so nothing useful was displayed.
for i, part in enumerate(split_list):
    print(f"Part {i+1}: {len(part)} texts, starting with {part[0][:80]!r}")

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
# Predict audience and sentiment for every FED text.
results_fed = find_sentiment_audience(sentences=converted_list)

In [None]:
# Intermediate save:
fed_output_path = '/content/drive/MyDrive/CentralBankBERTa/Model_loader/Output/Fed_labelled.csv'
# Create the output folder if needed, so a missing directory does not throw
# away the predictions that were just computed.
os.makedirs(os.path.dirname(fed_output_path), exist_ok=True)
results_fed.to_csv(fed_output_path)

In [None]:
# Concatenate the predictions with the original FED columns.
# pd.concat(axis=1) aligns rows on the index. results_fed was built from a
# plain list and so has a fresh 0..n-1 index, while data_fed takes its index
# from the first CSV column. Both hold the same rows in the same order, so give
# results_fed the index of data_fed to pair the rows by position.
data_all = pd.concat([results_fed.set_index(data_fed.index), data_fed], axis=1)

In [None]:
# Final save: write the combined frame (predictions plus the original FED
# columns). This cell previously re-saved `results_fed`, which only repeated
# the intermediate save and left `data_all` unused.
# NOTE(review): confirm that saving `data_all` to this path is what was intended.
data_all.to_csv('/content/drive/MyDrive/CentralBankBERTa/Model_loader/Output/Fed_labelled.csv')