<a href="https://colab.research.google.com/github/NastasiaMazur/Finance-Sentiment-Analyses/blob/main/distilbert_base_uncased_FinanceInc_auditor_sentiment__UPDATED.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Downstream Task: Sentiment Analysis

Model: **distilbert-base-uncased**

Dataset: **FinanceInc/auditor_sentiment**

# 1. Activate GPU and Install Dependencies

In [None]:
# Check whether PyTorch can see a CUDA-capable GPU.
# The cell displays True when a GPU is available and False otherwise.
import torch
torch.cuda.is_available()

False

In [None]:
# Install required libraries
!pip install datasets
!pip install transformers
!pip install bertviz transformers
!pip install transformers[torch]
#!apt-get install git-lfs

Collecting bertviz
  Using cached bertviz-1.4.0-py3-none-any.whl (157 kB)
Collecting boto3 (from bertviz)
  Using cached boto3-1.34.68-py3-none-any.whl (139 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.0->bertviz)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.0->bertviz)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.0->bertviz)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.0->bertviz)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.0->bertviz)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch>=1.0-

In [None]:
# Connect Google Drive so data can be stored there.
# The drive is mounted under /content/drive/; the TensorBoard export further
# down writes to a folder below /content/drive/MyDrive/.
from google.colab import drive
drive.mount('/content/drive/')

# 2. Preprocess data

In [None]:
# Load data: fetch the FinanceInc/auditor_sentiment dataset with `datasets.load_dataset`.
# Only its 'train' split is used by the cells below.
from datasets import load_dataset
financial_dataset = load_dataset("FinanceInc/auditor_sentiment")

In [None]:
# Create a smaller training dataset for faster training times
from datasets import DatasetDict

# Shuffle the 'train' split once with a fixed seed and cut three slices out of
# that single permutation. Because the slices index the same shuffled order with
# non-overlapping ranges, train/val/test cannot share examples.
# Note that all three subsets (including 'test') come from the original 'train' split.
shuffled_train = financial_dataset['train'].shuffle(seed=24)

small_financial_dataset = DatasetDict(
    train=shuffled_train.select(range(500)),     # for training
    val=shuffled_train.select(range(500, 600)),  # for validation
    test=shuffled_train.select(range(600, 700))  # for testing
)

In [None]:
# Inspect the reduced DatasetDict (train / val / test subsets).
small_financial_dataset

In [None]:
# Peek at the first five examples of the reduced training subset.
small_financial_dataset['train'][:5]

In [None]:
# Load the tokenizer that belongs to the distilbert-base-uncased checkpoint,
# then print its configuration followed by its vocabulary size.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
print(tokenizer, len(tokenizer), sep="\n")

In [None]:
def tokenize_function(examples):
    """Tokenize the "sentence" column of a batch of examples.

    Sentences are truncated and padded to the longest sentence in the batch
    that `map` hands to this function.
    """
    sentences = examples["sentence"]
    return tokenizer(sentences, padding=True, truncation=True)


# Tokenize in batches of 16, drop the raw text, rename the target column to
# "labels" and finally switch the dataset to return PyTorch tensors.
small_tokenized_dataset = (
    small_financial_dataset
    .map(tokenize_function, batched=True, batch_size=16)
    .remove_columns(["sentence"])
    .rename_column("label", "labels")
)
small_tokenized_dataset.set_format("torch")

In [None]:
# Inspect the first two tokenized training examples.
small_tokenized_dataset['train'][0:2]


In [None]:
import torch
from torch.utils.data import DataLoader

# Wrap the tokenized train/val subsets in DataLoaders with 16 examples per batch
# (no `shuffle` argument is passed). In this notebook `train_dataloader` is only
# used to derive the number of scheduler steps; the Trainer further down is given
# the datasets directly and `eval_dataloader` is not referenced again.
train_dataloader = DataLoader(small_tokenized_dataset['train'], batch_size=16)
eval_dataloader = DataLoader(small_tokenized_dataset['val'], batch_size=16)

# 3. Training the model

In [None]:

# AdamW is taken from PyTorch: the copy that `transformers` used to export is
# deprecated in favour of `torch.optim.AdamW` and is no longer available in recent
# releases, so importing it from `transformers` breaks on a fresh (unpinned) install.
# The name `AdamW` stays the same, so the cells below work unchanged.
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
from transformers import pipeline
from tqdm.notebook import tqdm

In [None]:
# Define DistilBERT as our base model:
# load the pretrained distilbert-base-uncased checkpoint with a sequence-classification
# head configured for three classes (num_labels=3).
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

In [None]:
# Optimizer and linear learning-rate schedule for a manual training loop.
num_epochs = 3
# One scheduler step per batch per epoch. Derived from `num_epochs` so the schedule
# length follows the epoch count instead of repeating the literal 3.
num_training_steps = num_epochs * len(train_dataloader)
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)
# No warm-up: the learning rate decays linearly from 5e-5 over `num_training_steps`.
lr_scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
# NOTE(review): the Trainer cell further down creates its own model and optimizer,
# so these objects do not appear to be used by the final training run — confirm.

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU,
# and move the model's parameters to that device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [None]:
# Re-tokenize the reduced dataset, this time WITHOUT removing "sentence", renaming
# "label" or switching to torch format: later cells read the raw
# small_tokenized_dataset['test']['sentence'] and ['label'] columns.
#
# `small_financial_dataset` and `tokenize_function` are reused from the
# preprocessing section; this cell previously redefined both with identical code.
small_tokenized_dataset = small_financial_dataset.map(tokenize_function, batched=True, batch_size=16)

Uncomment if you want to log in to your Hugging Face account:

In [None]:
# Log in to your Hugging Face account
# Get your API token here https://huggingface.co/settings/token
#from huggingface_hub import notebook_login

#notebook_login()

In [None]:
# Define a new Trainer with all the objects we constructed so far
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
from transformers import AutoModelForSequenceClassification

# Start from fresh pretrained weights for the actual fine-tuning run.
# The checkpoint has to match the tokenizer used to encode the data
# (distilbert-base-uncased). Loading a different architecture's checkpoint here
# would pair the model with token ids from a vocabulary it was not trained on.
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)

arguments = TrainingArguments(
    output_dir="store_the_checkpoints_distilbert_3",  # checkpoints are written here
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    num_train_epochs=10,
    save_strategy="epoch",          # save and evaluate once per epoch ...
    evaluation_strategy="epoch",
    lr_scheduler_type='cosine',
    load_best_model_at_end=True,    # ... and keep the epoch with the lowest eval loss
    metric_for_best_model='loss',
    greater_is_better=False,
    seed=224
    #push_to_hub=True, # uncomment to push to the Hugging Face Hub
)


# Stop once the monitored metric has not improved for 3 consecutive evaluations.
# NOTE(review): this callback is only created here; the Trainer below is built
# without a `callbacks=` argument, so presumably early stopping is inactive — confirm.
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=3,
    early_stopping_threshold=0.0
)


# Optimizer for the freshly loaded model (replaces the one created earlier).
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)


import numpy as np  # required by compute_metrics; not imported anywhere else in the notebook


def compute_metrics(eval_pred):
    """Compute classification accuracy at the end of an evaluation pass.

    Args:
        eval_pred: pair ``(logits, labels)``. ``logits`` is an array with one row
            of class scores per example; if the model returned several outputs,
            it may be a tuple whose first element holds the logits.
            ``labels`` holds the true class id per example.

    Returns:
        dict with a single key ``"accuracy"``: the fraction of examples whose
        highest-scoring class equals the label.
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        # Extra model outputs (e.g. hidden states) follow the logits; keep the logits only.
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)
    # calculates the accuracy
    return {"accuracy": float(np.mean(predictions == labels))}

# Assemble the Trainer from the model, arguments, datasets, tokenizer, metric
# function and optimizer defined above.
# NOTE(review): `early_stopping_callback` is not passed here (there is no
# `callbacks=` argument), so presumably training always runs all 10 epochs —
# confirm. Registering it could end training before step 320, which the
# hard-coded "checkpoint-320" paths in later cells depend on.
trainer = Trainer(
    model=model,
    args=arguments,
    train_dataset=small_tokenized_dataset['train'],
    eval_dataset=small_tokenized_dataset['val'], # change to test when you do your final evaluation!
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    # Only the optimizer is supplied; the scheduler slot is None, so presumably the
    # Trainer builds it from `lr_scheduler_type` in the arguments — confirm.
    optimizers=(optimizer, None)
)

In [None]:
# Train the model. Checkpoints are written to the `output_dir` configured in
# the TrainingArguments ("store_the_checkpoints_distilbert_3").
trainer.train()

In [None]:
# Run the trained model on the validation subset and print the raw prediction output.
results = trainer.predict(small_tokenized_dataset['val'])
print(results)

In [None]:
test_str = "The company's strong quarterly earnings report resulted in a surge in stock prices, reflecting investor confidence in its future prospects."

# checkpoint-320 is the checkpoint after the last optimisation step of a full run:
# 500 training examples / batch size 16 -> 32 steps per epoch, times 10 epochs.
# (Assumes a single device and no gradient accumulation.)
fine_tuned_model = AutoModelForSequenceClassification.from_pretrained("store_the_checkpoints_distilbert_3/checkpoint-320") # pass checkpoint to the model
model_inputs = tokenizer(test_str, return_tensors="pt")

# Class ids as defined by the FinanceInc/auditor_sentiment dataset:
# 0 = negative, 1 = neutral, 2 = positive. The list order must follow these ids.
label_names = ["NEGATIVE", "NEUTRAL", "POSITIVE"]

# Inference only: no gradients are needed.
with torch.no_grad():
    logits = fine_tuned_model(**model_inputs).logits

prediction = torch.argmax(logits, dim=-1).item()
print(label_names[prediction])

In [None]:
# Run the trained model on the held-out test subset and print the raw prediction output.
results = trainer.predict(small_tokenized_dataset['test'])
print(results)

In [None]:
# Look at one raw example (index 5) from the test subset.
small_financial_dataset['test'][5]

In [None]:
# Load the checkpoint written by the training run above. The directory must match
# the `output_dir` of the TrainingArguments ("store_the_checkpoints_distilbert_3");
# a differently numbered directory does not exist after a fresh run of this notebook.
fine_tuned_model = AutoModelForSequenceClassification.from_pretrained("store_the_checkpoints_distilbert_3/checkpoint-320")

# Encode all test sentences as one padded batch.
# list(...) makes sure the tokenizer receives a plain list of strings.
model_inputs = tokenizer(list(small_tokenized_dataset['test']['sentence']), padding=True, truncation=True, return_tensors='pt')

# Forward pass for analysis only: skip gradient tracking so the hidden states of
# every layer are not kept attached to an autograd graph.
with torch.no_grad():
    outputs = fine_tuned_model(**model_inputs, output_hidden_states=True)

In [None]:
# Number of hidden-state tensors returned; the export loop below iterates over each of them.
len(outputs['hidden_states'])

In [None]:
import os
import torch
from torch.utils.tensorboard import SummaryWriter

# Export one TensorBoard embedding projection per hidden-state layer.
# Each test sentence is represented by the hidden vector of its [CLS] token.
#
# The previous version searched for token id 0. For the DistilBERT tokenizer id 0
# is [PAD], so it exported the vector of the first padding position and silently
# dropped every sentence that needed no padding.
path = "/content/drive/MyDrive/results_vis_distilbert_2"
os.makedirs(path, exist_ok=True)

# Read the raw columns once (not once per example and layer) and build the
# metadata rows [sentence, label]; they are identical for every layer.
test_split = small_tokenized_dataset['test']
labels = [
    [sentence, str(label)]
    for sentence, label in zip(test_split['sentence'], test_split['label'])
]

# Locate the [CLS] token in every encoded sentence.
input_ids = model_inputs['input_ids']
cls_token_id = tokenizer.cls_token_id
if cls_token_id is None:
    raise ValueError("The tokenizer defines no [CLS] token to take sentence embeddings from.")
cls_mask = input_ids == cls_token_id
if not bool(cls_mask.any(dim=1).all()):
    raise ValueError("At least one encoded sentence contains no [CLS] token.")
cls_positions = cls_mask.int().argmax(dim=1)  # index of the first [CLS] per row
example_index = torch.arange(input_ids.size(0))

if len(labels) != input_ids.size(0):
    # add_embedding asserts that #labels equals #data points; fail early with context.
    raise ValueError(f"{len(labels)} labels for {input_ids.size(0)} encoded sentences.")

for layer, hidden_state in enumerate(outputs['hidden_states']):
    layer_dir = path + '/layer_' + str(layer)
    os.makedirs(layer_dir, exist_ok=True)

    # hidden_state is indexed as [example, token position]; pick the [CLS] vector per example.
    tensors = hidden_state[example_index, cls_positions].detach().cpu()

    # global_step=layer avoids the "Embedding dir exists" warning; the context
    # manager closes the writer so the event file is flushed to disk.
    with SummaryWriter(layer_dir) as writer:
        writer.add_embedding(tensors, metadata=labels, metadata_header=['Sentence','Emotion'], global_step=layer)


____________________



To avoid `AssertionError: #labels should equal with #data points`, make sure the metadata labels are collected for exactly the same examples as the embedding tensors, so that every tensor has one matching label.

To avoid the warning `Embedding dir exists, did you set global_step for add_embedding()?`, pass `global_step=layer` to `add_embedding()`.

# 4. Analyzing new data with the model

You can also run inference with a `pipeline`. To push the model to the Hub first, uncomment and run the Hugging Face login cell above:

In [None]:
# Upload the model to the Hub.
# NOTE(review): presumably this requires being logged in to Hugging Face (see the
# commented-out `notebook_login()` cell above) — confirm before running.
trainer.push_to_hub()

In [None]:
# Run inference with your new model using a pipeline
from transformers import pipeline

# The model argument is an empty placeholder: put the name of your saved model between the quotes.
sentiment_model = pipeline(model="") # Add your saved model in ""

# Classify one example sentence.
sentiment_model(["The company's strong quarterly earnings report resulted in a surge in stock prices, reflecting investor confidence in its future prospects."])