In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive; the dataset
# cell further down reads its CSV from under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
from transformers import BertModel, BertTokenizer
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import BertForSequenceClassification
import torch
import numpy as np
from tqdm import tqdm

Load the pretrained FinBERT tokenizer and model

In [None]:
# Pull the FinBERT checkpoint as a three-class sequence classifier, then the
# tokenizer published under the same checkpoint name.
model = BertForSequenceClassification.from_pretrained(
    'ProsusAI/finbert',
    output_hidden_states=False,
    output_attentions=False,
    num_labels=3,
)
tokenizer = BertTokenizer.from_pretrained('ProsusAI/finbert')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Dataset loading and preparation

In [None]:
# Load the labelled news dataset from the mounted Drive folder.
df = pd.read_csv('/content/drive/MyDrive/NLP/Datasets/news_data_labelled.csv', encoding='utf-8')

# Model input is the headline followed by the article body. A missing Title or
# Text makes the concatenation NaN, so those rows are removed by dropna() below.
df['combined_text'] = df['Title'] + " " + df['Text']

# Integer class id assigned to each sentiment label.
label_dict = {'positive': 0, 'negative': 1, 'neutral': 2}
sentiment_ids = df['Sentiment'].map(label_dict)

# Labels that are not keys of label_dict map to NaN. Report them so that a
# spelling/casing mismatch in the CSV is visible rather than silent.
unmapped = df['Sentiment'].notna() & sentiment_ids.isna()
if unmapped.any():
    print(f"Dropping {int(unmapped.sum())} rows with unrecognised Sentiment labels: "
          f"{sorted(df.loc[unmapped, 'Sentiment'].astype(str).unique())}")
df['Sentiment'] = sentiment_ids

# Drop incomplete rows only AFTER the mapping, so rows whose label could not be
# mapped are removed too. Then restore an integer dtype: a column that ever held
# NaN is float, and the label tensors built from it must be integer class ids.
df = df.dropna().astype({'Sentiment': 'int64'})

Stratified sampling of 10% of the dataset for hyperparameter tuning

In [None]:
# Keep a stratified 10% slice of the data (test_size=0.9 sends the other 90% to
# the half that is discarded). The result is indexed rather than unpacked into
# `_`: in IPython `_` is the last-output variable, and binding the unused 90%
# frame to it would shadow that and keep the large frame alive in the kernel.
df_sample = train_test_split(df, test_size=0.9, stratify=df['Sentiment'], random_state=42)[0]

Data preprocessing and attention mask creation

In [None]:
# Tokenise each sampled text on its own, padded/truncated to 64 tokens, and
# collect the per-text PyTorch tensors in two parallel lists.
# NOTE: the "Dataloader creation" cell below rebinds input_ids,
# attention_masks and labels.
labels = df_sample['Sentiment'].values

encodings = [
    tokenizer.encode_plus(
        text,                        # Text to encode
        add_special_tokens=True,     # Add '[CLS]' and '[SEP]'
        max_length=64,               # Fixed length for every text
        padding='max_length',        # Pad shorter texts up to max_length
        return_attention_mask=True,  # Also build the attention mask
        return_tensors='pt',         # Return PyTorch tensors
        truncation=True,             # Cut longer texts down to max_length
    )
    for text in df_sample['combined_text']
]
input_ids = [encoding['input_ids'] for encoding in encodings]
attention_masks = [encoding['attention_mask'] for encoding in encodings]



Dataloader creation

In [None]:
# Tokenization helper used for the sample, training and validation texts
def preprocess_for_bert(data, max_len=512):
    """Tokenize texts for BERT and return padded id and attention-mask tensors.

    Uses the notebook-level ``tokenizer``. Every text gets the special
    '[CLS]'/'[SEP]' tokens and is padded or truncated to exactly ``max_len``
    tokens.

    Args:
        data: Iterable of strings (for example a pandas Series of texts).
        max_len: Length every sequence is padded or truncated to.

    Returns:
        Tuple ``(input_ids, attention_masks)`` of integer tensors with one row
        per input text. For empty input both tensors have shape
        ``(0, max_len)``.
    """
    texts = list(data)
    if not texts:
        # Nothing to tokenize: hand back empty tensors that still have the
        # integer dtype and column count callers get for non-empty input.
        return (torch.empty((0, max_len), dtype=torch.long),
                torch.empty((0, max_len), dtype=torch.long))

    # One batched call replaces the per-sentence encode_plus loop.
    # padding='max_length' is the supported spelling of the deprecated
    # pad_to_max_length=True flag.
    encoded = tokenizer(
        texts,
        add_special_tokens=True,     # Add `[CLS]` and `[SEP]`
        max_length=max_len,          # Max length to truncate/pad
        padding='max_length',        # Pad every text to max_length
        truncation=True,             # Truncate texts longer than max_length
        return_attention_mask=True,  # Return attention mask
        return_tensors='pt',         # Return stacked PyTorch tensors
    )
    return encoded['input_ids'], encoded['attention_mask']

# Mini-batch size shared by the DataLoaders that are built further down
batch_size = 16

# Tokenise every text of the 10% sample (default max_len of 512)
input_ids, attention_masks = preprocess_for_bert(data=df_sample['combined_text'])

# Integer class ids of the same rows, as a tensor
labels = torch.tensor(df_sample['Sentiment'].values)



Hyperparameter selection with 10% of the dataset

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Candidate optimiser settings explored by the grid search
learning_rates = [2e-5, 3e-5, 5e-5]
epsilons = [1e-8, 1e-6, 1e-4]

# Hold out part of the sample for validation, stratified by sentiment class
train_size = 0.8
train_dataset, val_dataset = train_test_split(
    df_sample,
    test_size=1 - train_size,
    stratify=df_sample['Sentiment'],
    random_state=42,
)

# Training split: tokenise, bundle with the labels, iterate in random order
train_input_ids, train_attention_masks = preprocess_for_bert(train_dataset['combined_text'])
train_labels = torch.tensor(train_dataset['Sentiment'].values)
train_data = TensorDataset(train_input_ids, train_attention_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Validation split: same pipeline, iterated in a fixed order
val_input_ids, val_attention_masks = preprocess_for_bert(val_dataset['combined_text'])
val_labels = torch.tensor(val_dataset['Sentiment'].values)
val_data = TensorDataset(val_input_ids, val_attention_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, batch_size=batch_size, sampler=val_sampler)

# Function to evaluate the model on the validation set
def evaluate_model(model, val_dataloader):
    """Return the model's accuracy over every example in ``val_dataloader``.

    Accuracy is correct predictions divided by total examples. Averaging the
    per-batch accuracies instead would give a short final batch the same
    weight as a full one and skew the result.

    Args:
        model: Module called as ``model(input_ids, token_type_ids=None,
            attention_mask=mask)`` whose first output holds the logits.
        val_dataloader: Iterable yielding ``(input_ids, attention_mask,
            labels)`` tensor batches.

    Returns:
        Accuracy as a float between 0 and 1; 0.0 when the dataloader yields
        no examples.
    """
    model.eval()
    # Use the device the model's weights are on, so the batches always land
    # next to the model regardless of any notebook-level device variable.
    model_device = next(model.parameters()).device

    n_correct = 0
    n_total = 0
    with torch.no_grad():
        for b_input_ids, b_input_mask, b_labels in val_dataloader:
            b_input_ids = b_input_ids.to(model_device)
            b_input_mask = b_input_mask.to(model_device)
            b_labels = b_labels.to(model_device)

            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

            # Predicted class = column with the largest logit in each row
            predictions = outputs[0].argmax(dim=1)
            n_correct += (predictions == b_labels.reshape(-1)).sum().item()
            n_total += b_labels.numel()

    return n_correct / n_total if n_total else 0.0

# Accuracy of arg-max predictions against the true labels
def flat_accuracy(preds, labels):
    """Return the fraction of rows in ``preds`` whose arg-max matches ``labels``.

    Args:
        preds: 2-D array-like of per-class scores, one row per example.
        labels: NumPy array of true class ids; it is flattened before comparing.

    Returns:
        Number of matching predictions divided by the number of labels.
    """
    predicted_classes = np.argmax(preds, axis=1).ravel()
    true_classes = labels.ravel()
    n_matches = np.sum(predicted_classes == true_classes)
    return n_matches / len(true_classes)

# Best configuration seen so far. The learning rate / epsilon below are only
# fallbacks; an 'accuracy' entry is added as soon as a trial beats 0.
best_hyperparameters = {'learning_rate': 2e-5, 'epsilon': 1e-8}

for lr in learning_rates:
    for eps in epsilons:
        # Re-seed before every trial so each configuration starts from the same
        # random state and the comparison is not driven by shuffling/dropout noise.
        torch.manual_seed(42)

        # Tune the checkpoint that is actually fine-tuned afterwards: the
        # tokenizer and the final training cell both use 'ProsusAI/finbert',
        # so searching on 'bert-base-uncased' would tune a different model.
        model = BertForSequenceClassification.from_pretrained(
            'ProsusAI/finbert',
            num_labels=3,
            output_attentions=False,
            output_hidden_states=False,
        )
        model.to(device)

        optimizer = AdamW(model.parameters(), lr=lr, eps=eps)

        for epoch in range(4):  # Example: 4 training epochs
            model.train()
            total_train_loss = 0

            for batch in train_dataloader:
                b_input_ids, b_input_mask, b_labels = batch
                b_input_ids = b_input_ids.to(device)
                b_input_mask = b_input_mask.to(device)
                b_labels = b_labels.to(device)

                model.zero_grad()  # Reset gradients

                outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
                loss = outputs.loss
                total_train_loss += loss.item()
                loss.backward()  # Backpropagation
                optimizer.step()  # Update parameters

            avg_train_loss = total_train_loss / len(train_dataloader)
            val_accuracy = evaluate_model(model, val_dataloader)
            print(f"lr={lr} eps={eps} epoch={epoch + 1} | "
                  f"train loss: {avg_train_loss:.4f} | val accuracy: {val_accuracy:.4f}")

            # Scored after every epoch: a configuration wins if any of its
            # epochs reaches the highest validation accuracy seen so far.
            if val_accuracy > best_hyperparameters.get('accuracy', 0):
                best_hyperparameters.update({'learning_rate': lr, 'epsilon': eps, 'accuracy': val_accuracy})

print("Best Hyperparameters:", best_hyperparameters)




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training Loop with optimal parameters

In [None]:
# Fine-tune FinBERT with the learning rate and epsilon picked by the grid search.
# NOTE(review): this iterates `train_dataloader`, which was built from the
# training split of the 10% sample -- confirm whether the final model was
# meant to be trained on the full dataset instead.
epochs = 3

model = BertForSequenceClassification.from_pretrained(
    'ProsusAI/finbert',
    output_hidden_states=False,
    output_attentions=False,
    num_labels=3,
)
model.to(device)

optimizer = AdamW(
    model.parameters(),
    eps=best_hyperparameters['epsilon'],
    lr=best_hyperparameters['learning_rate'],
)

for epoch in range(epochs):
    model.train()
    total_train_loss = 0

    for batch in train_dataloader:
        # Each batch is (input ids, attention mask, labels); move all three
        # onto the training device in one go.
        b_input_ids, b_input_mask, b_labels = (tensor.to(device) for tensor in batch)

        model.zero_grad()  # Clear gradients left over from the previous step

        loss = model(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask,
            labels=b_labels,
        ).loss
        loss.backward()   # Backpropagation
        optimizer.step()  # Parameter update
        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{epochs} | Average training loss: {avg_train_loss}")


In [None]:
def analyze_sentiment(text):
    """Classify ``text`` with the fine-tuned model and score its sentiment.

    Uses the notebook-level ``tokenizer``, ``model`` and ``label_dict``. Class
    names and the score are derived from ``label_dict`` (the mapping the
    training labels were encoded with), so index ``i`` of the model output is
    always read as the label that was trained as class ``i``.

    Args:
        text: String to classify; tokenized with truncation to 512 tokens.

    Returns:
        Tuple ``(label, score)`` where ``label`` is the most probable class
        name and ``score`` is P(positive) - P(negative), a float in [-1, 1].
    """
    # Class names ordered by their integer id: position i is the name of class i
    sentiment_labels = sorted(label_dict, key=label_dict.get)

    # Inference mode: disables dropout so repeated calls give the same result
    model.eval()
    model_device = next(model.parameters()).device

    # Tokenize and move every input tensor to the device the model lives on
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    inputs = {name: tensor.to(model_device) for name, tensor in inputs.items()}
    with torch.no_grad():
        logits = model(**inputs).logits

    # Softmax over the class dimension, then drop the batch dimension of size 1
    probabilities = torch.softmax(logits, dim=1).squeeze(0)

    # Index of the most probable class
    label_idx = torch.argmax(probabilities).item()

    # Continuous score: +1 * P(positive) + 0 * P(neutral) - 1 * P(negative)
    sentiment_score = (
        probabilities[label_dict['positive']] - probabilities[label_dict['negative']]
    )

    return sentiment_labels[label_idx], sentiment_score.item()


Remember to save the model

In [None]:
# Persist only the model's weights (its state_dict), not the whole model object.
# The path is relative, so the file is written to the current working directory.
# NOTE(review): on Colab that directory may not outlive the session -- confirm
# whether this should point under the mounted /content/drive instead.
model_save_path = "StocBERTmodel.pt"
torch.save(model.state_dict(), model_save_path)