<a href="https://colab.research.google.com/github/NastasiaMazur/Finance-Sentiment-Analysis/blob/main/distilroberta_base_FinanceIncauditor_sentiment_UPDATED.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Downstream Task: Sentiment Analysis

Model: **distilroberta-base**

Dataset: **FinanceInc/auditor_sentiment**

# 1. Activate GPU and Install Dependencies

In [1]:
# Check whether PyTorch can see a CUDA GPU (the cell displays True or False)
import torch
torch.cuda.is_available()

False

In [None]:
# Install required libraries
!pip install transformers
!pip install datasets
!pip install bertviz transformers
!pip install transformers[torch]



In [None]:
# Connect Google Drive to store data
from google.colab import drive
drive.mount('/content/drive/')

# 2. Preprocess data

In [None]:
# Load data
from datasets import load_dataset
finance_dataset = load_dataset("FinanceInc/auditor_sentiment")

# Create a smaller training dataset for faster training times
from datasets import DatasetDict

# Shuffle once with a fixed seed and carve three index ranges out of that single
# permutation: the ranges do not overlap, so the splits are disjoint by construction.
shuffled_train = finance_dataset['train'].shuffle(seed=24)

small_finance_dataset = DatasetDict(
    train=shuffled_train.select(range(500)),     # for training
    val=shuffled_train.select(range(500, 600)),  # for validation
    test=shuffled_train.select(range(600, 700))  # for testing
)

In [None]:
# Display the three splits (train / val / test) of the reduced dataset
small_finance_dataset

In [None]:
# Peek at the first five raw training examples
small_finance_dataset['train'][:5]

In [None]:
# Set Distilroberta-base tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")
print(tokenizer)       # print the tokenizer's configuration summary
print(len(tokenizer))  # print the tokenizer's length (its vocabulary size)

In [None]:
def tokenize_function(examples):
    """Tokenize the "sentence" field of a batch of examples.

    Sequences are truncated and padded. The function is applied through a
    batched ``map`` below, so it receives one batch of examples per call.
    """
    sentences = examples["sentence"]
    return tokenizer(sentences, padding=True, truncation=True)

# Tokenize every split, drop the raw text column, expose the target under
# "labels", and make the dataset return torch tensors. After this step the raw
# sentences are only available from `small_finance_dataset`.
small_tokenized_dataset = (
    small_finance_dataset
    .map(tokenize_function, batched=True, batch_size=16)
    .remove_columns(["sentence"])
    .rename_column("label", "labels")
)
small_tokenized_dataset.set_format("torch")

In [None]:
# Inspect the first three tokenized training examples
small_tokenized_dataset['train'][0:3]


In [None]:
import torch
from torch.utils.data import DataLoader

# Wrap the tokenized train and validation splits in loaders of 16 examples per
# batch (no `shuffle` argument is passed).
train_dataloader = DataLoader(small_tokenized_dataset['train'], batch_size=16)
eval_dataloader = DataLoader(small_tokenized_dataset['val'], batch_size=16)

# 3. Training the model

In [None]:
# AdamW is taken from PyTorch: the copy re-exported by `transformers` was
# deprecated in favour of `torch.optim.AdamW` and is missing from recent releases.
# The name `AdamW` is kept so the cells that build optimizers work unchanged.
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
from tqdm.notebook import tqdm

In [None]:
# Define DistilROBERTA as our base model:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("distilroberta-base", num_labels=3)

In [None]:
num_epochs = 3
# Derive the total step count from `num_epochs` so the two values cannot drift apart
num_training_steps = num_epochs * len(train_dataloader)
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)
# Linear decay over all training steps, with no warmup phase
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:

# Use the GPU when one is available, otherwise fall back to the CPU,
# and move the model onto that device
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

Uncomment if you want to log in to your Hugging Face account:

In [None]:
# Log in to your Hugging Face account
# Get your API token here https://huggingface.co/settings/token
#from huggingface_hub import notebook_login

#notebook_login()

In [None]:
import numpy as np
# Define a new Trainer with all the objects we constructed so far
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
from transformers import AutoModelForSequenceClassification
# Start again from the pretrained checkpoint with a 3-class classification head
# (this rebinds `model`, replacing any model object created earlier)
model = AutoModelForSequenceClassification.from_pretrained('distilroberta-base', num_labels=3)

# Training configuration: evaluate and save a checkpoint once per epoch, and
# reload the best checkpoint (lowest evaluation loss) when training finishes.
arguments = TrainingArguments(
    output_dir="store_the_checkpoints_distilroberta_3",                           #ADJUST FOLDER !!!
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    lr_scheduler_type='cosine',
    load_best_model_at_end=True,
    metric_for_best_model='loss',
    greater_is_better=False,  # "best" means the smallest loss
    seed=224
)


# Early-stopping configuration: patience of 3 evaluations, improvement threshold 0.0.
# The callback only takes effect if it is passed to the Trainer via `callbacks`.
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=3,
    early_stopping_threshold=0.0
)


# Optimizer handed to the Trainer; its lr matches `learning_rate` in the arguments above
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
def compute_metrics(eval_pred):
    """Compute accuracy at the end of a validation pass.

    Args:
        eval_pred: a ``(logits, labels)`` pair; the class scores lie along the
            last axis of ``logits``.

    Returns:
        A dict with the single key ``"accuracy"``: the fraction of examples
        whose highest-scoring class equals the label.
    """
    scores, targets = eval_pred
    # The predicted class is the index of the largest score for each example
    predicted_classes = np.argmax(scores, axis=-1)
    is_correct = predicted_classes == targets
    return {"accuracy": np.mean(is_correct)}

# Build the Trainer from the objects constructed so far.
# `callbacks` registers the early-stopping callback; unless it is passed here the
# callback object is never consulted and training always runs for every epoch.
# Note: with early stopping the number of saved checkpoints can vary from run to
# run, so hard-coded checkpoint folders used later in the notebook may need adjusting.
trainer = Trainer(
    model=model,
    args=arguments,
    train_dataset=small_tokenized_dataset['train'],
    eval_dataset=small_tokenized_dataset['val'], # change to test when you do your final evaluation!
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, None),  # scheduler left as None so the Trainer creates it
    callbacks=[early_stopping_callback]
)

In [None]:
# Fine-tune the model using the TrainingArguments configured above
trainer.train()

In [None]:
# Run the trained model on the validation split and print the prediction output
results = trainer.predict(small_tokenized_dataset['val'])
print(results)

In [None]:
test_str = "The company's strong quarterly earnings report resulted in a surge in stock prices, reflecting investor confidence in its future prospects."

# Prefer the checkpoint the Trainer recorded as best; fall back to the
# hard-coded folder when that information is not available.
checkpoint_dir = (
    trainer.state.best_model_checkpoint
    or "store_the_checkpoints_distilroberta_3/checkpoint-320"  #ADJUST FOLDER !!!  500/600/700-folder320
)
fine_tuned_model = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir)
model_inputs = tokenizer(test_str, return_tensors="pt")

# Inference only: no gradients are needed
with torch.no_grad():
    logits = fine_tuned_model(**model_inputs).logits
prediction = torch.argmax(logits).item()

# Label ids follow the dataset card of FinanceInc/auditor_sentiment:
# 0 = negative, 1 = neutral, 2 = positive
print(["NEGATIVE", "NEUTRAL", "POSITIVE"][prediction])

In [None]:
# Run the trained model on the held-out test split and print the prediction output
results = trainer.predict(small_tokenized_dataset['test'])
print(results)

In [None]:
# Show one raw example (index 5) from the test split
small_finance_dataset['test'][5]

In [None]:
# Prefer the checkpoint the Trainer recorded as best; fall back to the
# hard-coded folder when that information is not available.
checkpoint_dir = (
    trainer.state.best_model_checkpoint
    or "store_the_checkpoints_distilroberta_3/checkpoint-320"  #ADJUST FOLDER !!!  500/600/700-folder320
)
fine_tuned_model = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir)

# The tokenized dataset has no "sentence" column any more (it was removed right
# after tokenization), so the raw text is read from the untokenized test split.
test_sentences = list(small_finance_dataset['test']['sentence'])
model_inputs = tokenizer(test_sentences, padding=True, truncation=True, return_tensors='pt')

# Inference only: skip gradient tracking so the hidden states of every layer
# are not kept attached to an autograd graph.
with torch.no_grad():
    outputs = fine_tuned_model(**model_inputs, output_hidden_states=True)

In [None]:
from torch.utils.tensorboard import SummaryWriter
import os
import re
import torch
import tensorflow as tf
import tensorboard as tb

In [None]:
# Number of hidden-state tensors returned by the model (one directory per entry is written below)
len(outputs['hidden_states'])

In [None]:
import torch

path = "store_the_checkpoints_distilroberta_3/results_vis_distilroberta_2"
os.makedirs(path, exist_ok=True)

# Sentences and labels are read from the untokenized test split: the tokenized
# dataset dropped "sentence" and renamed "label" to "labels". They are the same
# for every layer, so the metadata is built once, outside the loops.
test_split = small_finance_dataset['test']
labels = [
    [sentence, str(label)]
    for sentence, label in zip(test_split['sentence'], test_split['label'])
]

# One TensorBoard embedding log per hidden-state layer
for layer, layer_states in enumerate(outputs['hidden_states']):
    layer_dir = path + '/layer_' + str(layer)
    os.makedirs(layer_dir, exist_ok=True)

    tensors = []
    for example, input_ids in enumerate(model_inputs['input_ids']):
        # Represent each sentence by the hidden state at the first token whose
        # id is 0 (presumably the tokenizer's start-of-sequence token - verify
        # against tokenizer.bos_token_id).
        zero_positions = (input_ids == 0).nonzero(as_tuple=True)[0]
        if zero_positions.numel() == 0:
            # Fail loudly: silently skipping would leave tensors and metadata misaligned
            raise ValueError(f"no token with id 0 found in example {example}")
        sp_token_position = int(zero_positions[0])
        tensors.append(layer_states[example][sp_token_position])

    # The context manager closes the writer, flushing the event files to disk
    with SummaryWriter(layer_dir) as writer:
        writer.add_embedding(
            torch.stack(tensors).detach(),
            metadata=labels,
            metadata_header=['Sentence','Emotion'],
        )


The embeddings exported above can be explored with the TensorFlow Embedding Projector ([web app](https://projector.tensorflow.org/)).

# 4. Analyzing new data with the model

You can also run inference with a `pipeline`. Uploading the model to the Hub requires being logged in, so first uncomment and run the Hugging Face login cell above:

In [None]:
# Upload the model to the Hub
# Upload the trained model to the Hugging Face Hub
trainer.push_to_hub()

In [None]:
# Run inferences with your new model using Pipeline
from transformers import pipeline

# The model name is left empty on purpose: fill in the identifier of your own saved model
sentiment_model = pipeline(model="") # Add your saved model in ""

# Classify a sample sentence with the loaded pipeline
sentiment_model(["The company's strong quarterly earnings report resulted in a surge in stock prices, reflecting investor confidence in its future prospects."])