In [None]:
# Mount Google Drive to access your dataset
from google.colab import drive
drive.mount('/content/drive')

print("✅ Google Drive mounted successfully!")
print("You can now access files in: /content/drive/MyDrive/")



In [3]:
# Install all necessary packages for FinBERT fine-tuning
!pip install transformers torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install datasets accelerate evaluate scikit-learn pandas matplotlib seaborn

print("\n✅ All packages installed successfully!")


Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6

✅ All packages installed successfully!


In [4]:
# Import all necessary libraries
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Transformers and Hugging Face
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from datasets import Dataset, DatasetDict
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
import evaluate

print("✅ All libraries imported successfully!")




✅ All libraries imported successfully!


In [5]:
# Define paths for your dataset and model
# Update these paths according to where your files are stored


DATASET_PATH = "/content/drive/MyDrive/merged_financial_sentiment.json"  # Update this path



# Model name (FinBERT)
MODEL_NAME = "ProsusAI/finbert"  # Pre-trained FinBERT model

# Output directory for saving fine-tuned model
OUTPUT_DIR = "/content/drive/MyDrive/finetuned_finbert"  # Update this if needed

# Create output directory if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)

print(f"✅ Paths configured!")
print(f"Dataset path: {DATASET_PATH}")
print(f"Model: {MODEL_NAME}")
print(f"Output directory: {OUTPUT_DIR}")

✅ Paths configured!
Dataset path: /content/drive/MyDrive/merged_financial_sentiment.json
Model: ProsusAI/finbert
Output directory: /content/drive/MyDrive/finetuned_finbert


In [6]:
# Check if dataset file exists
if os.path.exists(DATASET_PATH):
    print(f"✅ Dataset found at: {DATASET_PATH}")

    # Check file size
    file_size = os.path.getsize(DATASET_PATH) / (1024 * 1024)  # Size in MB
    print(f"File size: {file_size:.2f} MB")
else:
    print(f"❌ Dataset not found at: {DATASET_PATH}")
    print("Please update the DATASET_PATH in the previous cell.")




✅ Dataset found at: /content/drive/MyDrive/merged_financial_sentiment.json
File size: 3.28 MB


In [7]:
# Load JSON dataset
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer

def load_json_dataset(json_path):
    """Load JSON dataset (supports both JSON and JSONL formats)."""
    print(f"Loading dataset from: {json_path}")

    # Try to detect JSONL format (JSON Lines - one JSON object per line)
    data = []
    with open(json_path, 'r', encoding='utf-8') as f:
        first_line = f.readline().strip()
        f.seek(0)  # Reset to beginning

        # Check if it's JSONL format
        if first_line.startswith('{') and not first_line.startswith('[{'):
            # JSONL format - read line by line
            print("Detected JSONL format (JSON Lines)")
            for line in f:
                line = line.strip()
                if line:  # Skip empty lines
                    try:
                        data.append(json.loads(line))
                    except json.JSONDecodeError as e:
                        print(f"Warning: Skipping invalid JSON line: {e}")
        else:
            # Standard JSON format
            data = json.load(f)

    # Convert to DataFrame
    if isinstance(data, list):
        df = pd.DataFrame(data)
    elif isinstance(data, dict):
        # If it's a dict, try to find the key that contains the list
        for key in ['data', 'records', 'items', 'samples']:
            if key in data and isinstance(data[key], list):
                df = pd.DataFrame(data[key])
                break
        else:
            df = pd.DataFrame([data])
    else:
        raise ValueError("JSON format not supported.")

    print(f"✅ Dataset loaded successfully!")
    print(f"Dataset shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")
    print(f"\nFirst few rows:")
    print(df.head())

    return df

# Load your dataset
# Update these paths and column names according to your dataset
DATASET_PATH = "/content/drive/MyDrive/merged_financial_sentiment.json"  # Update this
TEXT_COLUMN = "text"  # Update this if your text column has a different name
LABEL_COLUMN = "label"  # Update this if your label column has a different name

df = load_json_dataset(DATASET_PATH)

# Display label distribution
print("\n" + "=" * 60)
print("LABEL DISTRIBUTION")
print("=" * 60)
print(df[LABEL_COLUMN].value_counts().sort_index())

Loading dataset from: /content/drive/MyDrive/merged_financial_sentiment.json
Detected JSONL format (JSON Lines)
✅ Dataset loaded successfully!
Dataset shape: (22989, 2)
Columns: ['text', 'label']

First few rows:
                                                text  label
0  According to Gran , the company has no plans t...      1
1  For the last quarter of 2010 , Componenta 's n...      2
2  In the third quarter of 2010 , net sales incre...      2
3  Operating profit rose to EUR 13.1 mn from EUR ...      2
4  Operating profit totalled EUR 21.1 mn , up fro...      2

LABEL DISTRIBUTION
label
0     3412
1     7989
2    11588
Name: count, dtype: int64


In [8]:
# Split dataset into train, validation, and test sets (80:10:10)
from sklearn.model_selection import train_test_split

def split_dataset(df, text_column, label_column, test_size=0.1, val_size=0.1, random_state=42):
    """
    Split dataset into train, validation, and test sets.

    Parameters:
    - df: DataFrame
    - text_column: name of text column
    - label_column: name of label column
    - test_size: proportion of test set (default 0.1 = 10%)
    - val_size: proportion of validation set from remaining data (default 0.1 = 10% of total)
    - random_state: random seed for reproducibility

    Returns:
    - train_df, val_df, test_df
    """

    # First split: separate test set (10%)
    # This leaves 90% for train+val
    df_temp, test_df = train_test_split(
        df,
        test_size=test_size,
        random_state=random_state,
        stratify=df[label_column]  # Maintain class distribution
    )

    # Second split: separate validation set from remaining data
    # We want val_size of TOTAL, so from the 90% remaining:
    # val_size / (1 - test_size) = 0.1 / 0.9 ≈ 0.111
    val_proportion = val_size / (1 - test_size)

    train_df, val_df = train_test_split(
        df_temp,
        test_size=val_proportion,
        random_state=random_state,
        stratify=df_temp[label_column]  # Maintain class distribution
    )

    # Print split information
    print("=" * 60)
    print("DATASET SPLIT (80:10:10)")
    print("=" * 60)
    print(f"Total samples: {len(df)}")
    print(f"\nTrain set:   {len(train_df):6d} samples ({len(train_df)/len(df)*100:.1f}%)")
    print(f"Val set:     {len(val_df):6d} samples ({len(val_df)/len(df)*100:.1f}%)")
    print(f"Test set:    {len(test_df):6d} samples ({len(test_df)/len(df)*100:.1f}%)")

    print("\n" + "-" * 60)
    print("CLASS DISTRIBUTION IN EACH SPLIT:")
    print("-" * 60)

    print("\nTrain set:")
    print(train_df[label_column].value_counts().sort_index())

    print("\nValidation set:")
    print(val_df[label_column].value_counts().sort_index())

    print("\nTest set:")
    print(test_df[label_column].value_counts().sort_index())

    return train_df, val_df, test_df

# Split the dataset
train_df, val_df, test_df = split_dataset(
    df,
    text_column=TEXT_COLUMN,
    label_column=LABEL_COLUMN,
    test_size=0.1,
    val_size=0.1,
    random_state=42
)

DATASET SPLIT (80:10:10)
Total samples: 22989

Train set:    18391 samples (80.0%)
Val set:       2299 samples (10.0%)
Test set:      2299 samples (10.0%)

------------------------------------------------------------
CLASS DISTRIBUTION IN EACH SPLIT:
------------------------------------------------------------

Train set:
label
0    2730
1    6391
2    9270
Name: count, dtype: int64

Validation set:
label
0     341
1     799
2    1159
Name: count, dtype: int64

Test set:
label
0     341
1     799
2    1159
Name: count, dtype: int64


In [9]:
# Load FinBERT tokenizer
MODEL_NAME = "ProsusAI/finbert"

print(f"Loading tokenizer: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

print("✅ Tokenizer loaded successfully!")
print(f"Tokenizer vocab size: {tokenizer.vocab_size}")
print(f"Max model input length: {tokenizer.model_max_length}")

# Check tokenizer special tokens
print(f"\nSpecial tokens:")
print(f"  Padding token: {tokenizer.pad_token}")
print(f"  CLS token: {tokenizer.cls_token}")
print(f"  SEP token: {tokenizer.sep_token}")
print(f"  UNK token: {tokenizer.unk_token}")

Loading tokenizer: ProsusAI/finbert


tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

✅ Tokenizer loaded successfully!
Tokenizer vocab size: 30522
Max model input length: 512

Special tokens:
  Padding token: [PAD]
  CLS token: [CLS]
  SEP token: [SEP]
  UNK token: [UNK]


In [10]:
# Tokenize the datasets
def tokenize_function(examples, text_column, tokenizer, max_length=512):
    """
    Tokenize the text data.

    Parameters:
    - examples: dictionary containing text data
    - text_column: name of the text column
    - tokenizer: tokenizer object
    - max_length: maximum sequence length

    Returns:
    - Tokenized examples
    """
    return tokenizer(
        examples[text_column],
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors=None  # Return lists, not tensors
    )

# Convert DataFrames to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

print("✅ Datasets converted to Hugging Face format")
print(f"Train dataset: {len(train_dataset)} samples")
print(f"Val dataset: {len(val_dataset)} samples")
print(f"Test dataset: {len(test_dataset)} samples")

# Tokenize all datasets
MAX_LENGTH = 512  # Adjust based on your data and model requirements

print(f"\nTokenizing datasets with max_length={MAX_LENGTH}...")
train_tokenized = train_dataset.map(
    lambda x: tokenize_function(x, TEXT_COLUMN, tokenizer, MAX_LENGTH),
    batched=True,
    remove_columns=train_dataset.column_names  # Remove original columns
)

val_tokenized = val_dataset.map(
    lambda x: tokenize_function(x, TEXT_COLUMN, tokenizer, MAX_LENGTH),
    batched=True,
    remove_columns=val_dataset.column_names
)

test_tokenized = test_dataset.map(
    lambda x: tokenize_function(x, TEXT_COLUMN, tokenizer, MAX_LENGTH),
    batched=True,
    remove_columns=test_dataset.column_names
)

print("✅ Tokenization completed!")

✅ Datasets converted to Hugging Face format
Train dataset: 18391 samples
Val dataset: 2299 samples
Test dataset: 2299 samples

Tokenizing datasets with max_length=512...


Map:   0%|          | 0/18391 [00:00<?, ? examples/s]

Map:   0%|          | 0/2299 [00:00<?, ? examples/s]

Map:   0%|          | 0/2299 [00:00<?, ? examples/s]

✅ Tokenization completed!


In [11]:
# Step 1: Define prepare_labels function
def prepare_labels(df, label_column, label_to_id=None):
    """
    Prepare labels for the dataset. Creates label_to_id mapping if not provided.
    """
    if label_to_id is None:
        unique_labels = sorted(df[label_column].unique())
        label_to_id = {label: idx for idx, label in enumerate(unique_labels)}
        print(f"Label mapping: {label_to_id}")
        print(f"Number of classes: {len(label_to_id)}")

    # Map labels to IDs
    labels = [label_to_id[label] for label in df[label_column]]

    return labels, label_to_id

# Step 2: Now tokenize datasets (keep the label column)
train_tokenized = train_dataset.map(
    lambda x: tokenize_function(x, TEXT_COLUMN, tokenizer, MAX_LENGTH),
    batched=True,
    batch_size=1000,
    remove_columns=[col for col in train_dataset.column_names if col != LABEL_COLUMN]  # Keep label column
)

val_tokenized = val_dataset.map(
    lambda x: tokenize_function(x, TEXT_COLUMN, tokenizer, MAX_LENGTH),
    batched=True,
    batch_size=1000,
    remove_columns=[col for col in val_dataset.column_names if col != LABEL_COLUMN]
)

test_tokenized = test_dataset.map(
    lambda x: tokenize_function(x, TEXT_COLUMN, tokenizer, MAX_LENGTH),
    batched=True,
    batch_size=1000,
    remove_columns=[col for col in test_dataset.column_names if col != LABEL_COLUMN]
)

# Step 3: Prepare labels and convert to IDs (now prepare_labels is defined)
train_labels, label_to_id = prepare_labels(train_df, LABEL_COLUMN)
val_labels, _ = prepare_labels(val_df, LABEL_COLUMN, label_to_id)
test_labels, _ = prepare_labels(test_df, LABEL_COLUMN, label_to_id)

# Step 4: Replace the label column with numeric IDs
train_tokenized = train_tokenized.remove_columns([LABEL_COLUMN]).add_column("labels", train_labels)
val_tokenized = val_tokenized.remove_columns([LABEL_COLUMN]).add_column("labels", val_labels)
test_tokenized = test_tokenized.remove_columns([LABEL_COLUMN]).add_column("labels", test_labels)

print("✅ Tokenization and label conversion completed!")

Map:   0%|          | 0/18391 [00:00<?, ? examples/s]

Map:   0%|          | 0/2299 [00:00<?, ? examples/s]

Map:   0%|          | 0/2299 [00:00<?, ? examples/s]

Label mapping: {np.int64(0): 0, np.int64(1): 1, np.int64(2): 2}
Number of classes: 3
✅ Tokenization and label conversion completed!


In [12]:
# Define path for saving tokenized datasets
TOKENIZED_DATA_DIR = "/content/drive/MyDrive/finetuned_finbert/tokenized_datasets"
os.makedirs(TOKENIZED_DATA_DIR, exist_ok=True)

# Create DatasetDict and save to Google Drive
dataset_dict = DatasetDict({
    "train": train_tokenized,
    "validation": val_tokenized,
    "test": test_tokenized
})

dataset_dict.save_to_disk(TOKENIZED_DATA_DIR)
print(f"✅ Tokenized datasets saved to: {TOKENIZED_DATA_DIR}")

# Save label mapping
import pickle
with open(f"{TOKENIZED_DATA_DIR}/label_to_id.pkl", 'wb') as f:
    pickle.dump(label_to_id, f)
print(f"✅ Label mapping saved")

Saving the dataset (0/1 shards):   0%|          | 0/18391 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2299 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2299 [00:00<?, ? examples/s]

✅ Tokenized datasets saved to: /content/drive/MyDrive/finetuned_finbert/tokenized_datasets
✅ Label mapping saved


In [13]:
# Load FinBERT model and tokenizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch

MODEL_NAME = "ProsusAI/finbert"

print(f"Loading model: {MODEL_NAME}")

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Load label mapping to get number of classes
import pickle
TOKENIZED_DATA_DIR = "/content/drive/MyDrive/finetuned_finbert/tokenized_datasets"
with open(f"{TOKENIZED_DATA_DIR}/label_to_id.pkl", 'rb') as f:
    label_to_id = pickle.load(f)

NUM_LABELS = len(label_to_id)
print(f"Number of classes: {NUM_LABELS}")
print(f"Label mapping: {label_to_id}")

# Load model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
    problem_type="single_label_classification"  # For multi-class classification
)

print("✅ Model loaded successfully!")
print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")
print(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")

Loading model: ProsusAI/finbert
Number of classes: 3
Label mapping: {np.int64(0): 0, np.int64(1): 1, np.int64(2): 2}


pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

✅ Model loaded successfully!
Model parameters: 109,484,547
Trainable parameters: 109,484,547


In [14]:
# Training Configuration and Hyperparameters
TRAINING_CONFIG = {
    # Model paths
    "output_dir": "/content/drive/MyDrive/finetuned_finbert/model_output",
    "logging_dir": "/content/drive/MyDrive/finetuned_finbert/logs",

    # Hyperparameters
    "learning_rate": 2e-5,  # Typical range: 2e-5 to 5e-5 for transformers
    "num_train_epochs": 5,  # Adjust based on your dataset size and needs
    "per_device_train_batch_size": 16,  # Adjust based on GPU memory (8, 16, 32)
    "per_device_eval_batch_size": 16,
    "weight_decay": 0.01,  # L2 regularization
    "warmup_steps": 500,  # Number of warmup steps for learning rate scheduler
    "warmup_ratio": 0.1,  # Alternative to warmup_steps (ratio of total training steps)

    # Learning rate schedule
    "lr_scheduler_type": "linear",  # Options: linear, cosine, cosine_with_restarts, polynomial, constant, constant_with_warmup

    # Optimizer
    "optim": "adamw_torch",  # AdamW optimizer

    # Training settings
    "logging_steps": 100,  # Log every N steps
    "eval_strategy": "epoch",  # Evaluate after each epoch (options: "steps", "epoch", "no")
    "save_strategy": "epoch",  # Save checkpoint after each epoch
    "save_total_limit": 3,  # Keep only the last 3 checkpoints
    "load_best_model_at_end": True,  # Load best model at the end
    "metric_for_best_model": "f1",  # Metric to use for best model selection
    "greater_is_better": True,  # Whether higher metric is better

    # Other settings
    "seed": 42,  # For reproducibility
    "fp16": True,  # Use mixed precision training (faster, less memory)
    "dataloader_num_workers": 2,  # Number of workers for data loading
    "report_to": "tensorboard",  # Logging: tensorboard, wandb, etc.
    "push_to_hub": False,  # Set to True if you want to push to Hugging Face Hub
}

print("=" * 60)
print("TRAINING CONFIGURATION")
print("=" * 60)
for key, value in TRAINING_CONFIG.items():
    print(f"{key}: {value}")
print("=" * 60)

TRAINING CONFIGURATION
output_dir: /content/drive/MyDrive/finetuned_finbert/model_output
logging_dir: /content/drive/MyDrive/finetuned_finbert/logs
learning_rate: 2e-05
num_train_epochs: 5
per_device_train_batch_size: 16
per_device_eval_batch_size: 16
weight_decay: 0.01
warmup_steps: 500
warmup_ratio: 0.1
lr_scheduler_type: linear
optim: adamw_torch
logging_steps: 100
eval_strategy: epoch
save_strategy: epoch
save_total_limit: 3
load_best_model_at_end: True
metric_for_best_model: f1
greater_is_better: True
seed: 42
fp16: True
dataloader_num_workers: 2
report_to: tensorboard
push_to_hub: False


In [15]:
# Create TrainingArguments object
training_args = TrainingArguments(
    output_dir=TRAINING_CONFIG["output_dir"],
    logging_dir=TRAINING_CONFIG["logging_dir"],

    # Hyperparameters
    learning_rate=TRAINING_CONFIG["learning_rate"],
    num_train_epochs=TRAINING_CONFIG["num_train_epochs"],
    per_device_train_batch_size=TRAINING_CONFIG["per_device_train_batch_size"],
    per_device_eval_batch_size=TRAINING_CONFIG["per_device_eval_batch_size"],
    weight_decay=TRAINING_CONFIG["weight_decay"],
    warmup_steps=TRAINING_CONFIG["warmup_steps"],

    # Learning rate scheduler
    lr_scheduler_type=TRAINING_CONFIG["lr_scheduler_type"],

    # Optimizer
    optim=TRAINING_CONFIG["optim"],

    # Training settings
    logging_steps=TRAINING_CONFIG["logging_steps"],
    eval_strategy=TRAINING_CONFIG["eval_strategy"],
    save_strategy=TRAINING_CONFIG["save_strategy"],
    save_total_limit=TRAINING_CONFIG["save_total_limit"],
    load_best_model_at_end=TRAINING_CONFIG["load_best_model_at_end"],
    metric_for_best_model=TRAINING_CONFIG["metric_for_best_model"],
    greater_is_better=TRAINING_CONFIG["greater_is_better"],

    # Other settings
    seed=TRAINING_CONFIG["seed"],
    fp16=TRAINING_CONFIG["fp16"],
    dataloader_num_workers=TRAINING_CONFIG["dataloader_num_workers"],
    report_to=TRAINING_CONFIG["report_to"],
    push_to_hub=TRAINING_CONFIG["push_to_hub"],

    # Remove unused columns
    remove_unused_columns=False,
)

print("✅ Training arguments created successfully!")
print(f"Output directory: {training_args.output_dir}")
print(f"Total training steps will be calculated after loading dataset")

✅ Training arguments created successfully!
Output directory: /content/drive/MyDrive/finetuned_finbert/model_output
Total training steps will be calculated after loading dataset


In [16]:
# Define metrics for evaluation
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

# Load metric from evaluate library
metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    """
    Compute metrics for evaluation.

    Parameters:
    - eval_pred: tuple of predictions and labels

    Returns:
    - Dictionary of metrics
    """
    predictions, labels = eval_pred

    # Get predicted class (argmax)
    predictions = np.argmax(predictions, axis=1)

    # Calculate metrics
    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average="weighted")  # Weighted F1 for imbalanced datasets
    f1_macro = f1_score(labels, predictions, average="macro")  # Macro F1
    precision = precision_score(labels, predictions, average="weighted")
    recall = recall_score(labels, predictions, average="weighted")

    return {
        "accuracy": accuracy,
        "f1": f1,
        "f1_macro": f1_macro,
        "precision": precision,
        "recall": recall,
    }

print("✅ Metrics function defined successfully!")
print("Metrics: accuracy, f1 (weighted), f1_macro, precision, recall")

Downloading builder script: 0.00B [00:00, ?B/s]

✅ Metrics function defined successfully!
Metrics: accuracy, f1 (weighted), f1_macro, precision, recall


In [17]:
# Load tokenized datasets from Google Drive
from datasets import load_from_disk

# Load datasets
dataset_dict = load_from_disk(TOKENIZED_DATA_DIR)

print("=" * 60)
print("DATASET LOADED")
print("=" * 60)
print(f"Train: {len(dataset_dict['train'])} samples")
print(f"Validation: {len(dataset_dict['validation'])} samples")
print(f"Test: {len(dataset_dict['test'])} samples")
print(f"Features: {dataset_dict['train'].features}")

# Calculate total training steps
total_steps = len(dataset_dict['train']) // TRAINING_CONFIG["per_device_train_batch_size"] * TRAINING_CONFIG["num_train_epochs"]
print(f"\nTotal training steps: {total_steps}")
print(f"Warmup steps: {TRAINING_CONFIG['warmup_steps']}")
print(f"Warmup ratio: {TRAINING_CONFIG['warmup_steps'] / total_steps * 100:.2f}%")
print("=" * 60)

DATASET LOADED
Train: 18391 samples
Validation: 2299 samples
Test: 2299 samples
Features: {'input_ids': List(Value('int32')), 'token_type_ids': List(Value('int8')), 'attention_mask': List(Value('int8')), 'labels': Value('int64')}

Total training steps: 5745
Warmup steps: 500
Warmup ratio: 8.70%


In [18]:
# Verify all configurations
print("=" * 60)
print("TRAINING CONFIGURATION SUMMARY")
print("=" * 60)

print(f"\nModel:")
print(f"  Name: {MODEL_NAME}")
print(f"  Number of labels: {NUM_LABELS}")
print(f"  Label mapping: {label_to_id}")

print(f"\nHyperparameters:")
print(f"  Learning rate: {TRAINING_CONFIG['learning_rate']}")
print(f"  Epochs: {TRAINING_CONFIG['num_train_epochs']}")
print(f"  Batch size (train): {TRAINING_CONFIG['per_device_train_batch_size']}")
print(f"  Batch size (eval): {TRAINING_CONFIG['per_device_eval_batch_size']}")
print(f"  Weight decay: {TRAINING_CONFIG['weight_decay']}")
print(f"  Warmup steps: {TRAINING_CONFIG['warmup_steps']}")

print(f"\nOptimizer & Scheduler:")
print(f"  Optimizer: {TRAINING_CONFIG['optim']}")
print(f"  LR Scheduler: {TRAINING_CONFIG['lr_scheduler_type']}")

print(f"\nLoss Function:")
print(f"  Type: CrossEntropyLoss (for classification)")
print(f"  Problem type: single_label_classification")

print(f"\nTraining Settings:")
print(f"  Output directory: {TRAINING_CONFIG['output_dir']}")
print(f"  Logging directory: {TRAINING_CONFIG['logging_dir']}")
print(f"  Evaluation strategy: {TRAINING_CONFIG['eval_strategy']}")
print(f"  Save strategy: {TRAINING_CONFIG['save_strategy']}")
print(f"  FP16: {TRAINING_CONFIG['fp16']}")

print(f"\nDataset:")
print(f"  Train samples: {len(dataset_dict['train'])}")
print(f"  Validation samples: {len(dataset_dict['validation'])}")
print(f"  Test samples: {len(dataset_dict['test'])}")

print(f"\nGPU:")
print(f"  Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"  Device: {torch.cuda.get_device_name(0)}")
    print(f"  Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

print("\n✅ All configurations verified!")
print("=" * 60)

TRAINING CONFIGURATION SUMMARY

Model:
  Name: ProsusAI/finbert
  Number of labels: 3
  Label mapping: {np.int64(0): 0, np.int64(1): 1, np.int64(2): 2}

Hyperparameters:
  Learning rate: 2e-05
  Epochs: 5
  Batch size (train): 16
  Batch size (eval): 16
  Weight decay: 0.01
  Warmup steps: 500

Optimizer & Scheduler:
  Optimizer: adamw_torch
  LR Scheduler: linear

Loss Function:
  Type: CrossEntropyLoss (for classification)
  Problem type: single_label_classification

Training Settings:
  Output directory: /content/drive/MyDrive/finetuned_finbert/model_output
  Logging directory: /content/drive/MyDrive/finetuned_finbert/logs
  Evaluation strategy: epoch
  Save strategy: epoch
  FP16: True

Dataset:
  Train samples: 18391
  Validation samples: 2299
  Test samples: 2299

GPU:
  Available: True
  Device: Tesla T4
  Memory: 15.83 GB

✅ All configurations verified!


In [19]:
# Create data collator for dynamic padding
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding=True,
    max_length=512,
    return_tensors="pt"
)

print("✅ Data collator created successfully!")

✅ Data collator created successfully!


In [20]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_dict["train"],
    eval_dataset=dataset_dict["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print("✅ Trainer initialized successfully!")
print(f"Train dataset: {len(trainer.train_dataset)} samples")
print(f"Eval dataset: {len(trainer.eval_dataset)} samples")

✅ Trainer initialized successfully!
Train dataset: 18391 samples
Eval dataset: 2299 samples


In [21]:
# Train the model
print("=" * 60)
print("STARTING TRAINING")
print("=" * 60)
print(f"Training for {TRAINING_CONFIG['num_train_epochs']} epochs...")
print(f"Total training steps: {len(trainer.get_train_dataloader()) * TRAINING_CONFIG['num_train_epochs']}")
print("=" * 60)

# Train
train_results = trainer.train()

print("\n" + "=" * 60)
print("TRAINING COMPLETED")
print("=" * 60)
print(f"Training loss: {train_results.training_loss:.4f}")
print(f"Training runtime: {train_results.metrics['train_runtime']:.2f} seconds")
print(f"Training samples per second: {train_results.metrics['train_samples_per_second']:.2f}")
print("=" * 60)



#########################################################################################


# Evaluate on test set
print("=" * 60)
print("EVALUATING ON TEST SET")
print("=" * 60)

test_results = trainer.evaluate(eval_dataset=dataset_dict["test"])

print("\nTest Results:")
print("-" * 60)
for key, value in test_results.items():
    if isinstance(value, float):
        print(f"{key}: {value:.4f}")
    else:
        print(f"{key}: {value}")
print("-" * 60)

# Extract key metrics
test_loss = test_results.get("eval_loss", None)
test_accuracy = test_results.get("eval_accuracy", None)
test_f1 = test_results.get("eval_f1", None)
test_f1_macro = test_results.get("eval_f1_macro", None)
test_precision = test_results.get("eval_precision", None)
test_recall = test_results.get("eval_recall", None)

print(f"\nKey Metrics:")
print(f"  Test Loss: {test_loss:.4f}" if test_loss else "  Test Loss: N/A")
print(f"  Accuracy: {test_accuracy:.4f}" if test_accuracy else "  Accuracy: N/A")
print(f"  F1 (weighted): {test_f1:.4f}" if test_f1 else "  F1 (weighted): N/A")
print(f"  F1 (macro): {test_f1_macro:.4f}" if test_f1_macro else "  F1 (macro): N/A")
print(f"  Precision: {test_precision:.4f}" if test_precision else "  Precision: N/A")
print(f"  Recall: {test_recall:.4f}" if test_recall else "  Recall: N/A")
print("=" * 60)



# Get predictions on test set for detailed analysis
print("=" * 60)
print("GENERATING PREDICTIONS")
print("=" * 60)

# Get predictions
test_predictions = trainer.predict(dataset_dict["test"])

# Extract predictions and labels
predictions = np.argmax(test_predictions.predictions, axis=1)
labels = test_predictions.label_ids

# Create classification report
from sklearn.metrics import classification_report, confusion_matrix

print("\nClassification Report:")
print("-" * 60)
# Create id_to_label mapping
id_to_label = {v: k for k, v in label_to_id.items()}
target_names = [f"Class {id_to_label[i]}" for i in range(len(label_to_id))]
print(classification_report(labels, predictions, target_names=target_names, digits=4))
print("-" * 60)

# Confusion matrix
print("\nConfusion Matrix:")
print("-" * 60)
cm = confusion_matrix(labels, predictions)
print(cm)
print("-" * 60)


########################################################################################


# Visualize training curves from logs
import json
import matplotlib.pyplot as plt
from pathlib import Path

def plot_training_curves(log_dir):
    """Plot training curves from training logs."""
    log_file = Path(log_dir) / "trainer_state.json"

    if not log_file.exists():
        print(f"Log file not found: {log_file}")
        return

    # Load training logs
    with open(log_file, 'r') as f:
        logs = json.load(f)

    # Extract metrics
    log_history = logs.get("log_history", [])

    # Separate training and evaluation metrics
    train_losses = []
    eval_losses = []
    eval_accuracies = []
    eval_f1_scores = []
    steps = []

    for entry in log_history:
        if "loss" in entry and "eval_loss" not in entry:
            train_losses.append(entry["loss"])
            if "step" in entry:
                steps.append(entry["step"])
        if "eval_loss" in entry:
            eval_losses.append(entry["eval_loss"])
            if "eval_accuracy" in entry:
                eval_accuracies.append(entry["eval_accuracy"])
            if "eval_f1" in entry:
                eval_f1_scores.append(entry["eval_f1"])

    # Create plots
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # Training loss
    if train_losses:
        axes[0, 0].plot(train_losses, label="Training Loss")
        axes[0, 0].set_xlabel("Step")
        axes[0, 0].set_ylabel("Loss")
        axes[0, 0].set_title("Training Loss")
        axes[0, 0].legend()
        axes[0, 0].grid(True)

    # Validation loss
    if eval_losses:
        axes[0, 1].plot(eval_losses, label="Validation Loss", color="orange")
        axes[0, 1].set_xlabel("Epoch")
        axes[0, 1].set_ylabel("Loss")
        axes[0, 1].set_title("Validation Loss")
        axes[0, 1].legend()
        axes[0, 1].grid(True)

    # Validation accuracy
    if eval_accuracies:
        axes[1, 0].plot(eval_accuracies, label="Validation Accuracy", color="green")
        axes[1, 0].set_xlabel("Epoch")
        axes[1, 0].set_ylabel("Accuracy")
        axes[1, 0].set_title("Validation Accuracy")
        axes[1, 0].legend()
        axes[1, 0].grid(True)

    # Validation F1
    if eval_f1_scores:
        axes[1, 1].plot(eval_f1_scores, label="Validation F1", color="red")
        axes[1, 1].set_xlabel("Epoch")
        axes[1, 1].set_ylabel("F1 Score")
        axes[1, 1].set_title("Validation F1 Score")
        axes[1, 1].legend()
        axes[1, 1].grid(True)

    plt.tight_layout()
    plt.show()

    print("✅ Training curves plotted successfully!")

# Plot training curves
plot_training_curves(TRAINING_CONFIG["output_dir"])


####################################################################################


# Save the final model
print("=" * 60)
print("SAVING FINAL MODEL")
print("=" * 60)

FINAL_MODEL_DIR = "/content/drive/MyDrive/finetuned_finbert/final_model"

# Save model and tokenizer
trainer.save_model(FINAL_MODEL_DIR)
tokenizer.save_pretrained(FINAL_MODEL_DIR)

# Save label mapping
import pickle
with open(f"{FINAL_MODEL_DIR}/label_to_id.pkl", 'wb') as f:
    pickle.dump(label_to_id, f)

# Save id_to_label mapping
id_to_label = {v: k for k, v in label_to_id.items()}
with open(f"{FINAL_MODEL_DIR}/id_to_label.pkl", 'wb') as f:
    pickle.dump(id_to_label, f)

print(f"✅ Model saved to: {FINAL_MODEL_DIR}")
print(f"✅ Tokenizer saved to: {FINAL_MODEL_DIR}")
print(f"✅ Label mappings saved")
print("=" * 60)

#####################################################################################

# Print summary of all results
print("=" * 60)
print("TRAINING SUMMARY")
print("=" * 60)

print(f"\nTraining:")
print(f"  Training Loss: {train_results.training_loss:.4f}")
print(f"  Training Time: {train_results.metrics['train_runtime']:.2f} seconds")

print(f"\nValidation:")
if val_loss:
    print(f"  Validation Loss: {val_loss:.4f}")
if val_accuracy:
    print(f"  Accuracy: {val_accuracy:.4f}")
if val_f1:
    print(f"  F1 (weighted): {val_f1:.4f}")
if val_f1_macro:
    print(f"  F1 (macro): {val_f1_macro:.4f}")

print(f"\nTest:")
if test_loss:
    print(f"  Test Loss: {test_loss:.4f}")
if test_accuracy:
    print(f"  Accuracy: {test_accuracy:.4f}")
if test_f1:
    print(f"  F1 (weighted): {test_f1:.4f}")
if test_f1_macro:
    print(f"  F1 (macro): {test_f1_macro:.4f}")

print(f"\nModel saved to: {FINAL_MODEL_DIR}")
print("=" * 60)

STARTING TRAINING
Training for 5 epochs...
Total training steps: 5750


Epoch,Training Loss,Validation Loss,Accuracy,F1,F1 Macro,Precision,Recall
1,0.4188,0.3987,0.84254,0.845188,0.821395,0.854827,0.84254
2,0.2953,0.353609,0.865594,0.865972,0.844718,0.869474,0.865594
3,0.1813,0.453924,0.859069,0.860959,0.837609,0.867376,0.859069
4,0.1251,0.497278,0.870378,0.871961,0.847258,0.877657,0.870378
5,0.0635,0.548078,0.872118,0.873144,0.848972,0.878107,0.872118



TRAINING COMPLETED
Training loss: 0.2755
Training runtime: 2345.79 seconds
Training samples per second: 39.20
EVALUATING ON TEST SET



Test Results:
------------------------------------------------------------
eval_loss: 0.6084
eval_accuracy: 0.8673
eval_f1: 0.8675
eval_f1_macro: 0.8447
eval_precision: 0.8697
eval_recall: 0.8673
eval_runtime: 18.1064
eval_samples_per_second: 126.9710
eval_steps_per_second: 7.9530
epoch: 5.0000
------------------------------------------------------------

Key Metrics:
  Test Loss: 0.6084
  Accuracy: 0.8673
  F1 (weighted): 0.8675
  F1 (macro): 0.8447
  Precision: 0.8697
  Recall: 0.8673
GENERATING PREDICTIONS

Classification Report:
------------------------------------------------------------
              precision    recall  f1-score   support

     Class 0     0.7395    0.8240    0.7795       341
     Class 1     0.8875    0.8098    0.8469       799
     Class 2     0.8958    0.9198    0.9076      1159

    accuracy                         0.8673      2299
   macro avg     0.8409    0.8512    0.8447      2299
weighted avg     0.8697    0.8673    0.8675      2299

------------------

NameError: name 'val_loss' is not defined