In [1]:
import pandas as pd
import torch
from tqdm import tqdm
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
from datasets import load_dataset
from preprocess import preprocess_df

In [2]:
# making sure that the model trains on the GPU and not on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
train_data = load_dataset('financial_phrasebank','sentences_allagree')

In [4]:
df = pd.DataFrame()
df['Sentences'] = train_data['train']['sentence']
df['Labels'] = train_data['train']['label']
df = df.dropna()
df = df.drop_duplicates()

In [5]:
df.groupby('Labels').describe()

Unnamed: 0_level_0,Sentences,Sentences,Sentences,Sentences
Unnamed: 0_level_1,count,unique,top,freq
Labels,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,303,303,Jan. 6 -- Ford is struggling in the face of sl...,1
1,1386,1386,"According to Gran , the company has no plans t...",1
2,570,570,"For the last quarter of 2010 , Componenta 's n...",1


UNDERSAMPLING THE CLASSES TO THE LENGTH OF THE SMALLEST CLASS

In [6]:
p = df.groupby('Labels')
p = p.apply(lambda x: x.sample(p.size().min()).reset_index(drop=True))
p = p.reset_index(drop = True)
df = p
print(len(df))

909


In [7]:
df.groupby('Labels').describe()

Unnamed: 0_level_0,Sentences,Sentences,Sentences,Sentences
Unnamed: 0_level_1,count,unique,top,freq
Labels,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,303,303,The situation of coated magazine printing pape...,1
1,303,303,Huhtamaki 's rigid plastic consumer goods oper...,1
2,303,303,Talentum expects that the net sales of its cor...,1


In [8]:
df['Sentences'] = df['Sentences'].apply(preprocess_df)

In [9]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
def fitBERT(dataFrame):
    max_length = 512  
    batch_size = 16
    num_epochs = 10
    df = dataFrame
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
    train_labels = torch.tensor(np.array(train_df['Labels']))
    test_labels = torch.tensor(np.array(test_df['Labels']))
    def tokenize_data(data):
        tokenized = tokenizer.batch_encode_plus(
            data,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return tokenized
    train_tokenized = tokenize_data(train_df['Sentences'].tolist())
    test_tokenized = tokenize_data(test_df['Sentences'].tolist())
    
    train_dataset = TensorDataset(train_tokenized['input_ids'], train_tokenized['attention_mask'], train_labels)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    
    test_dataset = TensorDataset(test_tokenized['input_ids'], test_tokenized['attention_mask'], test_labels)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    
    model.to(device)
    
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")
        model.train()
        running_train_loss = 0.0
        correct_train_predictions = 0
        total_train_predictions = 0
        for step, batch in enumerate(tqdm(train_loader, desc=f"Epoch {epoch + 1} - Training", unit="batch")):
            input_ids, attention_mask, labels = batch
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)
    
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels.unsqueeze(1))
            loss = outputs.loss
            running_train_loss += loss.item()
    
            loss.backward()
            optimizer.step()
    
            _, predicted = torch.max(outputs.logits, 1)
            total_train_predictions += labels.size(0)
            correct_train_predictions += (predicted == labels).sum().item()
        

        train_epoch_loss = running_train_loss / len(train_loader)
        train_epoch_accuracy = (correct_train_predictions / total_train_predictions) * 100
        print(f"Train Loss: {train_epoch_loss:.4f} - Train Accuracy: {train_epoch_accuracy:.2f}%")
    
    
        model.eval()
        running_test_loss = 0.0
        correct_test_predictions = 0
        total_test_predictions = 0
        
        with torch.no_grad():
            for test_step, test_batch in enumerate(tqdm(test_loader, desc=f"Epoch {epoch + 1} - Testing", unit="batch")):
                test_input_ids, test_attention_mask, test_labels = test_batch
                test_input_ids = test_input_ids.to(device)
                test_attention_mask = test_attention_mask.to(device)
                test_labels = test_labels.to(device)
    
                test_outputs = model(test_input_ids, attention_mask=test_attention_mask, labels=test_labels.unsqueeze(1))
                test_loss = test_outputs.loss
                running_test_loss += test_loss.item()
    
                _, test_predicted = torch.max(test_outputs.logits, 1)
                total_test_predictions += test_labels.size(0)
                correct_test_predictions += (test_predicted == test_labels).sum().item()
                
        # Calculate test accuracy and loss for the epoch
        test_epoch_loss = running_test_loss / len(test_loader)
        test_epoch_accuracy = (correct_test_predictions / total_test_predictions) * 100
        print(f"Test Loss: {test_epoch_loss:.4f} - Test Accuracy: {test_epoch_accuracy:.2f}%")
    
    print("Training finished.")
    return model
    
    

In [11]:
model = fitBERT(df)

Epoch 1/10


Epoch 1 - Training: 100%|██████████| 46/46 [00:20<00:00,  2.21batch/s]


Train Loss: 1.0347 - Train Accuracy: 42.92%


Epoch 1 - Testing: 100%|██████████| 12/12 [00:01<00:00,  7.25batch/s]


Test Loss: 0.9113 - Test Accuracy: 50.55%
Epoch 2/10


Epoch 2 - Training: 100%|██████████| 46/46 [00:20<00:00,  2.30batch/s]


Train Loss: 0.7894 - Train Accuracy: 61.35%


Epoch 2 - Testing: 100%|██████████| 12/12 [00:01<00:00,  7.18batch/s]


Test Loss: 0.7266 - Test Accuracy: 73.63%
Epoch 3/10


Epoch 3 - Training: 100%|██████████| 46/46 [00:20<00:00,  2.29batch/s]


Train Loss: 0.5873 - Train Accuracy: 77.44%


Epoch 3 - Testing: 100%|██████████| 12/12 [00:01<00:00,  7.13batch/s]


Test Loss: 0.5682 - Test Accuracy: 80.22%
Epoch 4/10


Epoch 4 - Training: 100%|██████████| 46/46 [00:20<00:00,  2.29batch/s]


Train Loss: 0.4232 - Train Accuracy: 85.69%


Epoch 4 - Testing: 100%|██████████| 12/12 [00:01<00:00,  7.13batch/s]


Test Loss: 0.4893 - Test Accuracy: 81.87%
Epoch 5/10


Epoch 5 - Training: 100%|██████████| 46/46 [00:20<00:00,  2.29batch/s]


Train Loss: 0.2803 - Train Accuracy: 92.16%


Epoch 5 - Testing: 100%|██████████| 12/12 [00:01<00:00,  7.12batch/s]


Test Loss: 0.4050 - Test Accuracy: 85.71%
Epoch 6/10


Epoch 6 - Training: 100%|██████████| 46/46 [00:20<00:00,  2.29batch/s]


Train Loss: 0.2034 - Train Accuracy: 93.40%


Epoch 6 - Testing: 100%|██████████| 12/12 [00:01<00:00,  7.11batch/s]


Test Loss: 0.3996 - Test Accuracy: 85.71%
Epoch 7/10


Epoch 7 - Training: 100%|██████████| 46/46 [00:20<00:00,  2.29batch/s]


Train Loss: 0.1515 - Train Accuracy: 95.46%


Epoch 7 - Testing: 100%|██████████| 12/12 [00:01<00:00,  7.11batch/s]


Test Loss: 0.4107 - Test Accuracy: 85.71%
Epoch 8/10


Epoch 8 - Training: 100%|██████████| 46/46 [00:20<00:00,  2.28batch/s]


Train Loss: 0.1156 - Train Accuracy: 96.84%


Epoch 8 - Testing: 100%|██████████| 12/12 [00:01<00:00,  7.08batch/s]


Test Loss: 0.4206 - Test Accuracy: 85.16%
Epoch 9/10


Epoch 9 - Training: 100%|██████████| 46/46 [00:20<00:00,  2.28batch/s]


Train Loss: 0.0837 - Train Accuracy: 98.21%


Epoch 9 - Testing: 100%|██████████| 12/12 [00:01<00:00,  7.07batch/s]


Test Loss: 0.4653 - Test Accuracy: 84.62%
Epoch 10/10


Epoch 10 - Training: 100%|██████████| 46/46 [00:20<00:00,  2.28batch/s]


Train Loss: 0.0863 - Train Accuracy: 97.25%


Epoch 10 - Testing: 100%|██████████| 12/12 [00:01<00:00,  7.07batch/s]

Test Loss: 0.5336 - Test Accuracy: 82.97%
Training finished.



