# Model implementation

### Setup

In [1]:
import os
import glob
import pandas as pd
import numpy as np

In [2]:
try:
    from google.colab import drive
    drive.mount('/content/drive')
except:
    pass

Mounted at /content/drive


In [3]:
path_aud = os.getcwd()+'/drive/MyDrive/CentralBankBERTa/Audience_classification'
path_sent = os.getcwd()+'/drive/MyDrive/CentralBankBERTa/Sentiment_classification'

#### Install Huggingface Library

In [4]:
#Stable version
!pip install transformers[torch] accelerate>=0.20.1

In [6]:
import torch

if torch.cuda.is_available():
    device = torch.device('cuda')
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device('cpu')

There are 1 GPU(s) available.
Device name: NVIDIA A100-SXM4-40GB


## Import Functions

In [7]:
from transformers import RobertaForSequenceClassification, RobertaTokenizer
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments
from transformers import default_data_collator

# Create a Dataset object from the encodings
class SentimentDataset(torch.utils.data.Dataset):
        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
            item['labels'] = torch.tensor(self.labels[idx])
            return item

        def __len__(self):
            return len(self.labels)

In [8]:
from scipy.special import softmax

def predict_sentiment(model, tokenizer, X_test):
    test_encodings = tokenizer(list(X_test), truncation=True, padding=True, max_length=512)
    y_test = [1] * len(list(X_test))
    test_dataset = SentimentDataset(test_encodings, y_test)

    # Set up the data collator to avoid padding the labels
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Set up the trainer with the test data
    trainer = Trainer(model=model, data_collator=data_collator)

    # Make predictions on the test set
    predictions = trainer.predict(test_dataset=test_dataset).predictions.argmax(axis=1)

    # Make predictions on the test set
    predictions_prob = trainer.predict(test_dataset=test_dataset).predictions

    # Return the true and predicted labels
    return predictions, predictions_prob

### Load Audience Classifier

In [9]:
def load_model_from_checkpoint(checkpoint, which_model):
    # initialize tokenizer and model class
    tokenizer = RobertaTokenizer.from_pretrained(which_model)
    model = RobertaForSequenceClassification.from_pretrained(checkpoint)
    # Return the loaded model and tokenizer
    return model, tokenizer

In [10]:
checkpoint = glob.glob(path_aud+'/result/'+'*RoBERTa*/*checkpoint*')[0]

In [11]:
model_from_memory, tokenizer_from_memory = load_model_from_checkpoint(checkpoint, 'roberta-base')

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [12]:
label_dict =  {'households': 0, 'firms': 1, 'financial sector': 2, 'government': 3, 'central bank': 4}
sentence = [input('Enter sentence: ')]
res = predict_sentiment(model_from_memory, tokenizer_from_memory, np.array(sentence))
aud = [k for k, v in label_dict.items() if v == res[0][0]]
print("This sentence addresses the {}".format(aud[0]))

Enter sentence: We used our liquidity tools to make funding available to banks that might need it. 


This sentence addresses the financial sector


### Load Sentiment Analysis

In [None]:
checkpoint = glob.glob(path_sent+'/result/'+'*RoBERTa*/*checkpoint*')[0]

In [None]:
model_from_memory, tokenizer_from_memory = load_model_from_checkpoint(checkpoint, 'roberta-base')

In [None]:
res = predict_sentiment(model_from_memory, tokenizer_from_memory, np.array(sentence))
print("The Sentiment is {}".format(res[0][0]))
print("Probability of Negative Sentiment is {}".format(res[1][0][0]))
print("Probability of Positive Sentiment is {}".format(res[1][0][1]))

The Sentiment is 1
Probability of Negative Sentiment is -0.4553529620170593
Probability of Positive Sentiment is 0.7679672837257385


# Stack models

In [None]:
checkpoint_audience = glob.glob(path_aud+'/result/'+'*RoBERTa*/*checkpoint-500*')[0]
audience_model_from_memory, audience_tokenizer_from_memory = load_model_from_checkpoint(checkpoint_audience, 'roberta-base')

checkpoint_sentiment = glob.glob(path_sent+'/result/'+'*RoBERTa*/*checkpoint*')[0]
sentiment_model_from_memory, sentiment_tokenizer_from_memory = load_model_from_checkpoint(checkpoint_sentiment, 'roberta-base')

In [None]:
import tensorflow as tf

def find_sentiment_audience(sentences):

    if type(sentences) is list:
        sentences_df = pd.DataFrame()
        sentences_df['text'] = sentences
    else:
        sentence_df = sentences

    predicted_audience = predict_sentiment(audience_model_from_memory, audience_tokenizer_from_memory, sentences_df["text"].values.tolist())
    sentences_df['Predicted Audience'] = predicted_audience[0]
    sentences_df['Audience (Probabilities)'] = None
    sentences_df['Audience (Probabilities)'] = sentences_df['Audience (Probabilities)'].astype('object')
    sentences_df['Audience (Probabilities)'] =  [predicted_audience[1][i, :] for i in range(len(sentences_df))]
    sentences_df['Audience (Probabilities)'] = sentences_df['Audience (Probabilities)'].apply(lambda x: tf.nn.softmax(x))


    predicted_values = predict_sentiment(sentiment_model_from_memory, sentiment_tokenizer_from_memory, sentences_df["text"].values.tolist())
    sentences_df['Predicted Sentiment'] = predicted_values[0]
    sentences_df['Sentiment (Probabilities)'] = None
    sentences_df['Sentiment (Probabilities)'] = sentences_df['Sentiment (Probabilities)'].astype('object')
    sentences_df['Sentiment (Probabilities)'] =  [predicted_values[1][i, :] for i in range(len(sentences_df))]
    sentences_df['Sentiment (Probabilities)'] = sentences_df['Sentiment (Probabilities)'].apply(lambda x: tf.nn.softmax(x))

    return sentences_df