In [1]:
import torch
from torch import cuda
# Report current and peak GPU memory, in megabytes, after emptying the CUDA cache.
BYTES_PER_MB = 1024 * 1024

torch.cuda.empty_cache()

# Bytes reported by PyTorch's allocator, converted to megabytes.
current_mem = torch.cuda.memory_allocated() / BYTES_PER_MB
peak_mem = torch.cuda.max_memory_allocated() / BYTES_PER_MB

print(
    f"Current GPU memory usage: {current_mem:.2f} MB",
    f"Peak GPU memory usage: {peak_mem:.2f} MB",
    sep="\n",
)

Current GPU memory usage: 0.00 MB
Peak GPU memory usage: 0.00 MB


In [2]:
import matplotlib.pyplot as plt
import nltk
import string
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.metrics import mean_squared_error
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import numpy as np
import importlib

# Silence pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.set_option('mode.chained_assignment', None)



In [3]:
from datasets import load_dataset
# Load the 'sentences_allagree' configuration of the Financial PhraseBank dataset.
dataset = load_dataset(path="financial_phrasebank", name='sentences_allagree')
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 2264
    })
})

In [4]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [5]:
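# One-time setup: uncomment on a fresh environment to download the NLTK resources
# (presumably required by preprocess_df - confirm against preprocess.py).
# nltk.download('all')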

In [6]:
# Pull the text and label columns out of the 'train' split.
sentences, labels = (dataset['train'][column] for column in ('sentence', 'label'))

In [7]:
# Assemble the working frame: one row per sentence, text forced to str.
train = (
    pd.DataFrame()
    .assign(Sentences=sentences, Labels=labels)
    .astype({'Sentences': str})
)
train.dtypes

Sentences    object
Labels        int64
dtype: object

In [8]:
from preprocess import preprocess_df

In [9]:
# Run every sentence through the project's `preprocess_df` helper (imported from
# preprocess.py, not shown here) and overwrite the column in place.
# NOTE(review): re-running this cell feeds already-processed text through the
# helper again - confirm preprocess_df is idempotent, or keep the raw text in a
# separate column.
train['Sentences'] = train['Sentences'].apply(preprocess_df)
train

Unnamed: 0,Sentences,Labels
0,according gran company plan move production ru...,1
1,last quarter componenta net sale doubled eur e...,2
2,third quarter net sale increased eur mn operat...,2
3,operating profit rose eur mn eur mn correspond...,2
4,operating profit totalled eur mn eur mn repres...,2
...,...,...
2259,operating result month period decreased profit...,0
2260,helsinki thomson financial share cargotec fell...,0
2261,london marketwatch share price ended lower lon...,0
2262,operating profit fell eur mn eur mn including ...,0


In [10]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
train_df, test_df = train_test_split(train, random_state=42, test_size=0.3)

# Label tensors, in the same row order as their corresponding frames.
train_labels, test_labels = (
    torch.tensor(frame['Labels'].to_numpy()) for frame in (train_df, test_df)
)

In [11]:
# Tokenizer and classifier are loaded from the same pretrained checkpoint; the
# classification head is sized for the three label values (0, 1, 2).
BERT_CHECKPOINT = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertForSequenceClassification.from_pretrained(BERT_CHECKPOINT, num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Hyperparameters shared by the tokenization, data-loading and training cells.
num_epochs = 4      # full passes over the training split
batch_size = 16     # sentences per batch
max_length = 512    # every sequence is padded/truncated to this many tokens

In [13]:
def tokenize_data(data):
    """Encode a batch of sentences with the notebook-level BERT ``tokenizer``.

    Special tokens are added and every sequence is padded or truncated to
    exactly ``max_length`` tokens (the notebook-level setting).

    Args:
        data: The texts to encode, passed straight to
            ``tokenizer.batch_encode_plus``.

    Returns:
        The tokenizer's batch encoding as PyTorch tensors, including the
        attention mask.
    """
    encoding_options = dict(
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    return tokenizer.batch_encode_plus(data, **encoding_options)


In [14]:
def _make_loader(encoded, targets, shuffle):
    """Bundle token ids, attention masks and labels into a dataset and its loader."""
    tensors = TensorDataset(encoded['input_ids'], encoded['attention_mask'], targets)
    return tensors, DataLoader(tensors, batch_size=batch_size, shuffle=shuffle)


# Encode both splits up front.
train_tokenized = tokenize_data(train_df['Sentences'].tolist())
test_tokenized = tokenize_data(test_df['Sentences'].tolist())

# Only the training batches are shuffled; evaluation keeps the original row order.
train_dataset, train_loader = _make_loader(train_tokenized, train_labels, shuffle=True)
test_dataset, test_loader = _make_loader(test_tokenized, test_labels, shuffle=False)

# Move the model to the selected device before creating the optimizer over its parameters.
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

In [15]:
# Sanity check: ids, masks and labels must agree on the number of rows per split.
print("Train input_ids size:", train_tokenized['input_ids'].shape)
print("Train attention_mask size:", train_tokenized['attention_mask'].shape)
print("Train labels size:", train_labels.shape)

# Same check for the held-out split.
print("Test input_ids size:", test_tokenized['input_ids'].shape)
print("Test attention_mask size:", test_tokenized['attention_mask'].shape)
print("Test labels size:", test_labels.shape)


Train input_ids size: torch.Size([1584, 512])
Train attention_mask size: torch.Size([1584, 512])
Train labels size: torch.Size([1584])
Test input_ids size: torch.Size([680, 512])
Test attention_mask size: torch.Size([680, 512])
Test labels size: torch.Size([680])


In [16]:
# Fine-tune the model for `num_epochs` epochs. After each epoch, report the mean
# batch loss and the accuracy on the training batches, then on the held-out split.
#
# The per-batch label tensors are deliberately named `batch_labels` and
# `test_batch_labels`. Binding them to `labels` / `test_labels` would overwrite the
# notebook-level `labels` list and the full `test_labels` tensor with the last
# on-device batch, which breaks re-running the DataLoader and size-printing cells.
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    model.train()
    running_train_loss = 0.0
    correct_train_predictions = 0
    total_train_predictions = 0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1} - Training", unit="batch"):
        input_ids, attention_mask, batch_labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()
        # Passing labels makes the model compute the classification loss itself.
        outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels.unsqueeze(1))
        loss = outputs.loss
        running_train_loss += loss.item()

        loss.backward()
        optimizer.step()

        # Predicted class = index of the largest logit in each row.
        _, predicted = torch.max(outputs.logits, 1)
        total_train_predictions += batch_labels.size(0)
        correct_train_predictions += (predicted == batch_labels).sum().item()

    # Calculate train accuracy and loss for the epoch
    train_epoch_loss = running_train_loss / len(train_loader)
    train_epoch_accuracy = (correct_train_predictions / total_train_predictions) * 100
    print(f"Train Loss: {train_epoch_loss:.4f} - Train Accuracy: {train_epoch_accuracy:.2f}%")

    # Validation (testing) after each epoch
    model.eval()
    running_test_loss = 0.0
    correct_test_predictions = 0
    total_test_predictions = 0

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for test_batch in tqdm(test_loader, desc=f"Epoch {epoch + 1} - Testing", unit="batch"):
            test_input_ids, test_attention_mask, test_batch_labels = test_batch
            test_input_ids = test_input_ids.to(device)
            test_attention_mask = test_attention_mask.to(device)
            test_batch_labels = test_batch_labels.to(device)

            test_outputs = model(
                test_input_ids,
                attention_mask=test_attention_mask,
                labels=test_batch_labels.unsqueeze(1),
            )
            test_loss = test_outputs.loss
            running_test_loss += test_loss.item()

            _, test_predicted = torch.max(test_outputs.logits, 1)
            total_test_predictions += test_batch_labels.size(0)
            correct_test_predictions += (test_predicted == test_batch_labels).sum().item()

    # Calculate test accuracy and loss for the epoch
    test_epoch_loss = running_test_loss / len(test_loader)
    test_epoch_accuracy = (correct_test_predictions / total_test_predictions) * 100
    print(f"Test Loss: {test_epoch_loss:.4f} - Test Accuracy: {test_epoch_accuracy:.2f}%")

print("Training finished.")

Epoch 1/4


Epoch 1 - Training: 100%|██████████| 99/99 [00:44<00:00,  2.24batch/s]


Train Loss: 0.6338 - Train Accuracy: 72.85%


Epoch 1 - Testing: 100%|██████████| 43/43 [00:06<00:00,  6.97batch/s]


Test Loss: 0.3511 - Test Accuracy: 87.06%
Epoch 2/4


Epoch 2 - Training: 100%|██████████| 99/99 [00:43<00:00,  2.28batch/s]


Train Loss: 0.2567 - Train Accuracy: 90.34%


Epoch 2 - Testing: 100%|██████████| 43/43 [00:06<00:00,  6.93batch/s]


Test Loss: 0.2295 - Test Accuracy: 91.32%
Epoch 3/4


Epoch 3 - Training: 100%|██████████| 99/99 [00:43<00:00,  2.27batch/s]


Train Loss: 0.1423 - Train Accuracy: 94.63%


Epoch 3 - Testing: 100%|██████████| 43/43 [00:06<00:00,  6.90batch/s]


Test Loss: 0.2810 - Test Accuracy: 90.44%
Epoch 4/4


Epoch 4 - Training: 100%|██████████| 99/99 [00:43<00:00,  2.26batch/s]


Train Loss: 0.1158 - Train Accuracy: 95.83%


Epoch 4 - Testing: 100%|██████████| 43/43 [00:06<00:00,  6.85batch/s]

Test Loss: 0.2711 - Test Accuracy: 90.44%
Training finished.





In [17]:
# Per-label summary of the sentence column (count, unique, top, freq).
train.groupby(by='Labels').describe()

Unnamed: 0_level_0,Sentences,Sentences,Sentences,Sentences
Unnamed: 0_level_1,count,unique,top,freq
Labels,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,303,297,operating profit fell eur mn eur mn,3
1,1391,1370,value order eur mn,7
2,570,546,operating profit rose eur mn eur mn correspond...,6


As can be seen, there is a class imbalance between the labels: label 0 has 303 sentences, label 1 has 1,391 and label 2 has 570. Although the base BERT model performs well on the test set, reaching an accuracy of 90.44% in the final epoch, it still appears to be overfitting the training data, where accuracy reaches 95.83% in the final epoch. One possible solution is to augment the data in order to reduce the class imbalance and improve the model's performance on the test set.
