## Fine-tune FinBERT on a training set and evaluate it on a validation set

In [1]:
# Ask the CUDA toolkit compiler for its version (only works when nvcc is on PATH).
!nvcc --version
# Ask the NVIDIA driver for the installed driver version and current GPU status.
!nvidia-smi


'nvcc' is not recognized as an internal or external command,
operable program or batch file.


Wed Jul 16 13:01:10 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 551.86                 Driver Version: 551.86         CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3050 ...  WDDM  |   00000000:01:00.0 Off |                  N/A |
| N/A   46C    P8              3W /   75W |       0MiB /   4096MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
import torch
# Report whether PyTorch can use CUDA. The device queries below raise on a
# machine without a usable GPU, so they only run when CUDA is available.
print(torch.cuda.is_available())         # True if CUDA is usable
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))     # Get GPU name
    print(torch.cuda.current_device())       # Current device ID


True
NVIDIA GeForce RTX 3050 Laptop GPU
0


In [None]:
import os
import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score
import torch
import time
from tqdm import tqdm

class SequenceClassificationDataset(Dataset):
    """Map-style dataset that pairs pre-tokenized encodings with their labels.

    Args:
        inputs: Mapping that provides indexable 'input_ids' and
            'attention_mask' entries (one row per sample).
        labels: Indexable sequence of labels, aligned with ``inputs``.
    """

    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the 'input_ids' entry."""
        return len(self.inputs['input_ids'])

    def __getitem__(self, idx):
        """Return one sample as a dict with 'input_ids', 'attention_mask', 'labels'."""
        sample = {field: self.inputs[field][idx] for field in ('input_ids', 'attention_mask')}
        # The label is wrapped in a long tensor so it can be batched with the inputs.
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

class FinBertFineTuning:
    """Fine-tune a Hugging Face sequence-classification checkpoint on CSV data.

    The constructor reads the train/validation CSV files, tokenizes both,
    loads the model onto ``device`` and builds the optimizer and dataloaders.
    Call :meth:`train` to run the epochs and :meth:`save_model` to persist
    the model together with its tokenizer.

    Args:
        dataset_path: Directory containing the CSV files.
        train_file: File name of the training CSV inside ``dataset_path``.
        validation_file: File name of the validation CSV inside ``dataset_path``.
        feature_col: Name of the column holding the input text.
        label_col: Name of the column holding the labels.
        model_name: Checkpoint name or path passed to ``from_pretrained``.
        batch_size: Batch size used for both dataloaders.
        learning_rate: Learning rate handed to the optimizer.
        num_epochs: Number of passes over the training data.
        max_len: Maximum sequence length used for truncation.
        optimizer: ``'Adam'`` or ``'AdamW'``; a non-string object is used as-is
            and is expected to expose ``zero_grad()`` and ``step()``.
        device: Anything accepted by ``torch.device`` (e.g. ``'cpu'``, ``'cuda'``).

    Raises:
        ValueError: If ``optimizer`` is ``None`` or an unknown optimizer name.
    """

    def __init__(self, dataset_path, train_file, validation_file, feature_col, label_col, model_name, batch_size, learning_rate, num_epochs, max_len, optimizer='AdamW', device='cpu'):
        self.dataset_path = dataset_path
        self.train_file = train_file
        self.validation_file = validation_file
        self.feature_col = feature_col
        self.label_col = label_col
        self.model_name = model_name
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.num_epochs = num_epochs
        self.max_len = max_len
        self.device = torch.device(device)  # Convert device argument to torch.device

        # Load tokenizer. `model_max_length` is the documented keyword; the
        # length is also passed explicitly on every tokenization call.
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, model_max_length=self.max_len)

        # Load datasets
        self.train_df = pd.read_csv(os.path.join(self.dataset_path, self.train_file))
        self.validation_df = pd.read_csv(os.path.join(self.dataset_path, self.validation_file))

        # Number of distinct label values seen in the training data
        self.num_labels = len(self.train_df[self.label_col].unique())

        # Tokenize datasets
        self.tokenized_train = self.tokenize_dataset(self.train_df, self.feature_col, self.label_col)
        self.tokenized_validation = self.tokenize_dataset(self.validation_df, self.feature_col, self.label_col)

        # Model configuration
        # NOTE(review): if num_labels differs from the checkpoint's classifier
        # head, loading may need `ignore_mismatched_sizes=True` - confirm.
        self.model_config = AutoConfig.from_pretrained(self.model_name, num_labels=self.num_labels)
        self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name, config=self.model_config)
        self.model.to(self.device)

        # Optimizer (needs the model parameters, so it is built after the model)
        self.optimizer = self._build_optimizer(optimizer)

        # DataLoaders: training data is shuffled, validation data is not
        self.train_dataloader = self.create_dataloader(self.tokenized_train)
        self.validation_dataloader = self.create_dataloader(self.tokenized_validation, shuffle=False)

    def _build_optimizer(self, optimizer):
        """Turn the ``optimizer`` argument into an optimizer bound to the model.

        Known names ('Adam', 'AdamW') are instantiated with the model
        parameters and ``self.learning_rate``. Non-string objects are returned
        unchanged. ``None`` and unknown names raise ``ValueError`` so a typo
        fails here instead of at the first ``zero_grad()`` call.
        """
        if optimizer is None:
            raise ValueError("Please provide an optimizer instance.")
        if not isinstance(optimizer, str):
            return optimizer

        factories = {'Adam': torch.optim.Adam, 'AdamW': AdamW}
        if optimizer not in factories:
            raise ValueError(
                f"Unsupported optimizer '{optimizer}'; expected one of {sorted(factories)}."
            )
        return factories[optimizer](self.model.parameters(), lr=self.learning_rate)

    @staticmethod
    def _mean(values):
        """Arithmetic mean of ``values``; NaN for an empty list instead of raising."""
        return sum(values) / len(values) if values else float('nan')

    def tokenize_dataset(self, df, feature_col, label_col):
        """Tokenize ``df[feature_col]`` and return ``(encodings, labels)``.

        Sequences are padded to the longest one in the frame and truncated to
        ``self.max_len`` tokens; labels are returned as a plain list.
        """
        encodings = self.tokenizer(
            list(df[feature_col]),
            padding=True,
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt',
        )
        return encodings, list(df[label_col])

    def create_dataloader(self, tokenized_dataset, shuffle=True):
        """Wrap an ``(encodings, labels)`` pair in a batched ``DataLoader``."""
        encodings, labels = tokenized_dataset[0], tokenized_dataset[1]
        dataset = SequenceClassificationDataset(encodings, labels)
        return DataLoader(dataset, batch_size=self.batch_size, shuffle=shuffle)

    def _run_eval_pass(self, dataloader):
        """Run one gradient-free pass over ``dataloader``.

        Returns:
            ``(losses, labels, predictions)``: the per-batch loss values, and
            the true / predicted label ids flattened over all batches.
        """
        self.model.eval()  # Evaluation mode (e.g. disables dropout)
        losses, all_labels, all_predictions = [], [], []

        # No autograd graph is needed for evaluation; skipping it saves memory.
        with torch.no_grad():
            for batch in dataloader:
                inputs = {key: value.to(self.device) for key, value in batch.items()}
                # Labels are part of the inputs, so the model also returns the loss.
                outputs = self.model(**inputs)
                losses.append(outputs.loss.item())
                predicted = outputs.logits.argmax(dim=1)
                all_labels.extend(inputs["labels"].cpu().tolist())
                all_predictions.extend(predicted.cpu().tolist())

        return losses, all_labels, all_predictions

    def evaluate_model(self, dataloader):
        """Return the classification accuracy of the model on ``dataloader``."""
        _, all_labels, all_predictions = self._run_eval_pass(dataloader)
        return accuracy_score(all_labels, all_predictions)

    def train(self):
        """Train for ``num_epochs`` epochs, printing a loss/accuracy summary per epoch."""
        for epoch in range(self.num_epochs):
            self.model.train()  # Training mode (re-enabled after each validation pass)
            train_losses = []

            for batch in tqdm(self.train_dataloader, desc=f'Epoch {epoch + 1}/{self.num_epochs}'):
                inputs = {key: value.to(self.device) for key, value in batch.items()}

                self.optimizer.zero_grad()
                outputs = self.model(**inputs)
                loss = outputs.loss
                loss.backward()
                self.optimizer.step()

                train_losses.append(loss.item())

            # Validation: a single no-grad pass yields both loss and accuracy.
            validation_losses, val_labels, val_predictions = self._run_eval_pass(self.validation_dataloader)
            validation_accuracy = accuracy_score(val_labels, val_predictions)

            print(f'Epoch {epoch + 1}/{self.num_epochs} - Training Loss: {self._mean(train_losses):.4f} - Validation Loss: {self._mean(validation_losses):.4f} - Validation Accuracy: {validation_accuracy:.4f}')

    def save_model(self, directory):
        """Save the model and its tokenizer into ``directory``."""
        self.model.save_pretrained(directory)
        self.tokenizer.save_pretrained(directory)

# Usage: configure, fine-tune and save the model, reporting total wall-clock time
start_time = time.time()

# Seed PyTorch's RNG so that batch shuffling and dropout are repeatable between runs
SEED = 42
torch.manual_seed(SEED)

model = 'finbert'                # Short tag used only in the saved-model directory name
model_name = 'ProsusAI/finbert'  # Checkpoint passed to from_pretrained

## Hyperparameters
learning_rate = 2e-5
num_epochs = 3
batch_size = 6

# Maximum sequence length for padding and truncation
max_len = 512

optimizer = 'Adam'  # Adam or AdamW
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Paths and filenames
absolute_path = "D:/Berkas/Code/CryptoNew/"
dataset_path = absolute_path + "Datasets/"
train_file = 'train_set.csv'
validation_file = 'validation_set.csv'
feature_col = 'text'
label_col = 'sentiment_numerical_fin'

# The directory name encodes the run configuration, e.g.
# finbert_optimizer_Adam_lr_2e-05_epochs_3_bs_6_maxlen_512
trained_model = model + '_optimizer_' + optimizer + '_lr_' + str(learning_rate) + '_epochs_' + str(
    num_epochs) + '_bs_' + str(batch_size) + '_maxlen_' + str(max_len)

# Fine-Tuning Phase
classifier = FinBertFineTuning(dataset_path, train_file, validation_file, feature_col, label_col, model_name, batch_size,
                             learning_rate, num_epochs, max_len, optimizer, device)
classifier.train()
classifier.save_model(absolute_path + 'TrainedModels/' + trained_model)
print("Training time: {:.2f} seconds".format(time.time() - start_time))

ValueError: Due to a serious vulnerability issue in `torch.load`, even with `weights_only=True`, we now require users to upgrade torch to at least v2.6 in order to use the function. This version restriction does not apply when loading files with safetensors.
See the vulnerability report here https://nvd.nist.gov/vuln/detail/CVE-2025-32434

## Use the Fine-tuned FinBERT model to make predictions for a specific test set

In [None]:
import os
import pandas as pd
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

absolute_path = "D:/Berkas/Code/CryptoNew/"
test_file = 'test_set.csv'
trained_model_name = 'finbert_optimizer_Adam_lr_2e-05_epochs_3_bs_6_maxlen_512'

# Number of texts sent through the model at once. Running the whole test set
# in a single forward pass can exhaust GPU memory, so inference is batched.
inference_batch_size = 16

test_df = pd.read_csv(os.path.join(absolute_path, 'Datasets', test_file))

# Load trained model and tokenizer
model_path = os.path.join(absolute_path, 'TrainedModels', trained_model_name)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # Use GPU if available, otherwise use CPU
model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_path)

test_texts = list(test_df['text'])

# Predict batch by batch; each batch is padded to its own longest sequence.
model.eval()
batch_predictions = []
with torch.no_grad():
    for start in range(0, len(test_texts), inference_batch_size):
        batch_texts = test_texts[start:start + inference_batch_size]
        inputs = tokenizer(batch_texts, padding=True, truncation=True, return_tensors='pt').to(device)
        logits = model(**inputs).logits
        # Keep only the predicted class ids, and move them off the GPU right away.
        batch_predictions.append(logits.argmax(dim=1).cpu())

# An empty test set yields an empty prediction tensor instead of a torch.cat error.
if batch_predictions:
    predicted_labels = torch.cat(batch_predictions)
else:
    predicted_labels = torch.empty(0, dtype=torch.long)

test_df['finbert_adam_ft_prediction'] = predicted_labels.numpy()

# Save the test dataset with predictions
test_df.to_csv(os.path.join(absolute_path, 'Datasets', 'test_set_fin_adam.csv'), index=False)

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Path to the CSV file that holds the model predictions
file_path = "D:/Berkas/Code/CryptoNew/Datasets/test_set_fin_adam.csv"

# Read the predictions file
df = pd.read_csv(file_path)

# Ground-truth labels and predicted labels
# NOTE(review): the model was fine-tuned on 'sentiment_numerical_fin' but is
# scored here against 'sentiment_numerical' - confirm both columns use the
# same label encoding, otherwise the metrics below are not meaningful.
y_true, y_pred = df['sentiment_numerical'], df['finbert_adam_ft_prediction']

# Compute the metrics; precision / recall / F1 are weighted by class support
accuracy = accuracy_score(y_true, y_pred)
precision, recall, f1 = (
    metric(y_true, y_pred, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

# Show the results
print(f"Accuracy :  {accuracy:.4f}")
print(f"Precision:  {precision:.4f}")
print(f"Recall   :  {recall:.4f}")
print(f"F1 Score :  {f1:.4f}")


Accuracy :  0.3460
Precision:  0.3503
Recall   :  0.3460
F1 Score :  0.3481
