# Model Comparison: Gemma 3 vs FinBERT

This notebook compares a LoRA-fine-tuned Gemma 3 model with FinBERT on five-class financial sentiment analysis, evaluating both models on the same held-out test set.

## Comparison Metrics
1. Accuracy
2. Precision, Recall, F1 Score
3. Cohen's Kappa
4. Matthews Correlation Coefficient
5. ROC-AUC Score
6. Confusion Matrix Analysis
7. Training Time and Resource Usage (not computed in this notebook)
8. Inference Speed

In [None]:
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    cohen_kappa_score,
    matthews_corrcoef,
    roc_auc_score,
    confusion_matrix
)
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import time

# Set random seeds
# Seed both the torch and the NumPy global RNGs with the same fixed value so
# that any stochastic step later in the notebook is repeatable across runs.
torch.manual_seed(42)
np.random.seed(42)

## Load Models and Data

In [None]:
# Load the test set, both tokenizers and both fine-tuned classifiers.
from peft import PeftModel, PeftConfig

# Map sentiment labels to numeric values (0-4).
# Defined first so that the classification heads below can be sized from it.
sentiment_mapping = {
    'STRONGLY_NEGATIVE': 0,
    'NEGATIVE': 1,
    'NEUTRAL': 2,
    'POSITIVE': 3,
    'STRONGLY_POSITIVE': 4
}
NUM_LABELS = len(sentiment_mapping)

# Load test data
test_data = pd.read_csv('../data/test.csv')
print(f"Loaded {len(test_data)} test samples")

# Convert sentiment labels to numeric.
# Unknown or missing labels still fall back to NEUTRAL (as before), but the
# fallback is now reported: silently relabelled rows would distort every metric.
unrecognized = ~test_data['sentiment'].isin(list(sentiment_mapping))
if unrecognized.any():
    print(
        f"Warning: {int(unrecognized.sum())} samples have a missing or "
        "unrecognized sentiment and are treated as NEUTRAL"
    )
test_data['label'] = test_data['sentiment'].map(
    lambda x: sentiment_mapping.get(x, sentiment_mapping['NEUTRAL'])
)

# Initialize tokenizers
gemma_tokenizer = AutoTokenizer.from_pretrained('gemma-3-4b-pt')
finbert_tokenizer = AutoTokenizer.from_pretrained('ProsusAI/finbert')

# Load Gemma 3 model with LoRA adapter.
# The base model needs a head with one output per sentiment class; without
# num_labels it is created with the library default instead.
# NOTE(review): assumes the adapter was trained with this 5-class head - confirm.
base_model = AutoModelForSequenceClassification.from_pretrained(
    'gemma-3-4b-pt', num_labels=NUM_LABELS
)
gemma_model = PeftModel.from_pretrained(base_model, '../models/gemma3/gemma3_lora_adapter_best')
gemma_model.eval()

# Load FinBERT model.
# A .safetensors file is not a pickle checkpoint, so torch.load() cannot read
# it; from_pretrained() on the containing directory loads it natively.
# NOTE(review): assumes '../models/finbert' was written by save_pretrained()
# (config.json next to model.safetensors) - confirm.
finbert_model = AutoModelForSequenceClassification.from_pretrained(
    '../models/finbert', num_labels=NUM_LABELS
)
finbert_model.eval()

## Evaluation Functions

In [None]:
def evaluate_model(model, tokenizer, texts, labels, batch_size=32):
    """Run batched inference and compute classification metrics.

    Args:
        model: Sequence-classification model whose output exposes ``.logits``
            with one column per class. It is used as-is (the caller is
            expected to have put it in eval mode).
        tokenizer: Tokenizer matching ``model``.
        texts: Iterable of input strings (a list or a pandas Series).
        labels: Iterable of integer class ids, aligned with ``texts`` and
            using the same indexing as the model's logit columns.
        batch_size: Number of samples per forward pass.

    Returns:
        dict with keys ``accuracy``, ``precision``, ``recall``, ``f1``
        (weighted averages), ``kappa``, ``mcc``, ``roc_auc`` (one-vs-rest,
        computed from class probabilities; NaN when undefined),
        ``avg_inference_time`` (seconds per sample, forward pass only) and
        ``confusion_matrix`` (rows = true class, columns = predicted class,
        one row/column per model output class).

    Raises:
        ValueError: if ``texts`` is empty or its length differs from ``labels``.
    """
    import warnings

    # Materialise as plain lists: the tokenizer expects a list of strings, and
    # positional slicing of a pandas Series is not what it accepts.
    texts = list(texts)
    true_labels = list(labels)
    if len(texts) != len(true_labels):
        raise ValueError(
            f"texts and labels differ in length: {len(texts)} vs {len(true_labels)}"
        )
    if not texts:
        raise ValueError("evaluate_model needs at least one sample")

    # Run on whatever device the model already lives on.
    try:
        device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        device = torch.device('cpu')

    predictions = []
    probabilities = []
    total_inference_time = 0.0

    for i in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[i:i + batch_size]

        # Tokenize
        inputs = tokenizer(batch_texts, padding=True, truncation=True, max_length=128, return_tensors='pt')
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

        # Measure inference time. CUDA kernels are asynchronous, so synchronise
        # around the forward pass to time the actual computation.
        if device.type == 'cuda':
            torch.cuda.synchronize(device)
        start_time = time.perf_counter()
        with torch.no_grad():
            logits = model(**inputs).logits
        if device.type == 'cuda':
            torch.cuda.synchronize(device)
        total_inference_time += time.perf_counter() - start_time

        # Move to CPU / float64 before softmax so the rows sum to 1 within the
        # tolerance scikit-learn checks for, whatever dtype the model runs in.
        logits = logits.detach().cpu().double()
        predictions.extend(torch.argmax(logits, dim=1).tolist())
        probabilities.append(torch.softmax(logits, dim=1).numpy())

    probabilities = np.concatenate(probabilities, axis=0)
    class_ids = list(range(probabilities.shape[1]))

    # One call yields all three weighted scores.
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, predictions, average='weighted'
    )
    accuracy = accuracy_score(true_labels, predictions)
    kappa = cohen_kappa_score(true_labels, predictions)
    mcc = matthews_corrcoef(true_labels, predictions)

    # ROC-AUC must be computed from scores, not from hard predictions.
    try:
        if len(class_ids) == 2:
            roc_auc = roc_auc_score(true_labels, probabilities[:, 1])
        else:
            roc_auc = roc_auc_score(
                true_labels, probabilities, multi_class='ovr', labels=class_ids
            )
    except ValueError as err:
        # E.g. a class that never occurs in the evaluation set.
        warnings.warn(f"ROC-AUC is undefined for this evaluation set: {err}")
        roc_auc = float('nan')

    # Calculate metrics
    metrics = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'kappa': kappa,
        'mcc': mcc,
        'roc_auc': roc_auc,
        'avg_inference_time': total_inference_time / len(texts),
        # Fixed label set keeps the matrix shape identical across models.
        'confusion_matrix': confusion_matrix(true_labels, predictions, labels=class_ids)
    }

    return metrics

## Run Evaluation

In [None]:
# Evaluate both models on the same inputs.
# Ground truth is the numeric `label` column (0-4): the models predict integer
# class ids, so the raw `sentiment` strings can never match a prediction.
# Plain lists are passed because the tokenizer expects a list of strings.
eval_texts = test_data['description'].tolist()
eval_labels = test_data['label'].tolist()

print("Evaluating Gemma 3 model...")
gemma_metrics = evaluate_model(gemma_model, gemma_tokenizer, eval_texts, eval_labels)

print("\nEvaluating FinBERT model...")
finbert_metrics = evaluate_model(finbert_model, finbert_tokenizer, eval_texts, eval_labels)

## Visualize Results

In [None]:
def plot_metrics_comparison(gemma_metrics, finbert_metrics):
    """Draw a grouped bar chart of the scalar scores of both models.

    Args:
        gemma_metrics: Metrics dict for Gemma 3; must contain the keys
            'accuracy', 'precision', 'recall', 'f1', 'kappa', 'mcc', 'roc_auc'.
        finbert_metrics: Metrics dict for FinBERT with the same keys.
    """
    metrics = ['accuracy', 'precision', 'recall', 'f1', 'kappa', 'mcc', 'roc_auc']
    width = 0.35
    x = np.arange(len(metrics))

    # One (legend label, scores, horizontal offset) entry per model, so the
    # bars are drawn by a single code path on an explicit Axes.
    series = (
        ('Gemma 3', gemma_metrics, -width / 2),
        ('FinBERT', finbert_metrics, width / 2),
    )

    fig, ax = plt.subplots(figsize=(12, 6))
    for label, scores, offset in series:
        ax.bar(x + offset, [scores[m] for m in metrics], width, label=label)

    ax.set_xlabel('Metrics')
    ax.set_ylabel('Score')
    ax.set_title('Model Performance Comparison')
    ax.set_xticks(x)
    ax.set_xticklabels(metrics, rotation=45)
    ax.legend()
    fig.tight_layout()
    plt.show()

def plot_confusion_matrices(gemma_metrics, finbert_metrics):
    """Show the confusion matrices of both models as side-by-side heatmaps.

    Args:
        gemma_metrics: Metrics dict for Gemma 3 with a 'confusion_matrix'
            entry (integer matrix, rows = true class, columns = predicted).
        finbert_metrics: Metrics dict for FinBERT with the same entry.
    """
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))

    panels = (
        ('Gemma 3 Confusion Matrix', gemma_metrics),
        ('FinBERT Confusion Matrix', finbert_metrics),
    )
    for ax, (title, metrics) in zip(axes, panels):
        sns.heatmap(metrics['confusion_matrix'], annot=True, fmt='d', ax=ax)
        ax.set_title(title)
        # The heatmap puts matrix rows on the y axis and columns on the x axis.
        ax.set_xlabel('Predicted label')
        ax.set_ylabel('True label')

    fig.tight_layout()
    plt.show()

# Plot comparisons
# First the grouped bar chart of the scalar scores, then the two
# confusion-matrix heatmaps side by side.
plot_metrics_comparison(gemma_metrics, finbert_metrics)
plot_confusion_matrices(gemma_metrics, finbert_metrics)

## Print Detailed Results

In [None]:
def print_metrics(metrics, model_name):
    """Print one model's scores as a plain-text report.

    Args:
        metrics: Dict holding 'accuracy', 'precision', 'recall', 'f1',
            'kappa', 'mcc', 'roc_auc' and 'avg_inference_time' (seconds).
        model_name: Name shown in the report header.
    """
    # (caption, metrics key) pairs, in the order they are printed.
    score_rows = (
        ("Accuracy", 'accuracy'),
        ("Precision", 'precision'),
        ("Recall", 'recall'),
        ("F1 Score", 'f1'),
        ("Cohen's Kappa", 'kappa'),
        ("Matthews Correlation Coefficient", 'mcc'),
        ("ROC-AUC Score", 'roc_auc'),
    )

    print(f"\n{model_name} Results:")
    for caption, key in score_rows:
        print(f"{caption}: {metrics[key]:.4f}")

    # Stored in seconds per sample; reported in milliseconds.
    latency_ms = metrics['avg_inference_time']*1000
    print(f"Average Inference Time: {latency_ms:.2f} ms")

# Print the numeric scores of each model, Gemma 3 first.
print_metrics(gemma_metrics, "Gemma 3")
print_metrics(finbert_metrics, "FinBERT")

## Analysis and Conclusions

In [None]:
def _describe_winner(diff, if_positive, if_negative, adjective):
    """Return '<name> <adjective>' for the side favoured by ``diff``, or 'tie'."""
    if diff > 0:
        return f"{if_positive} {adjective}"
    if diff < 0:
        return f"{if_negative} {adjective}"
    return "tie"


def analyze_results(gemma_metrics, finbert_metrics):
    """Print a head-to-head summary of the two models.

    Reports the accuracy, F1 and per-sample inference-time differences
    (Gemma 3 minus FinBERT) and, for each model, the most frequent
    misclassification and the number of correct predictions.

    Args:
        gemma_metrics: Metrics dict for Gemma 3 with 'accuracy', 'f1',
            'avg_inference_time' (seconds) and 'confusion_matrix'
            (rows = true class, columns = predicted class).
        finbert_metrics: Metrics dict for FinBERT with the same keys.
    """
    print("\nAnalysis:")

    # Compare accuracy
    acc_diff = gemma_metrics['accuracy'] - finbert_metrics['accuracy']
    print(f"Accuracy Difference: {acc_diff:.4f} ({_describe_winner(acc_diff, 'Gemma 3', 'FinBERT', 'better')})")

    # Compare F1 scores
    f1_diff = gemma_metrics['f1'] - finbert_metrics['f1']
    print(f"F1 Score Difference: {f1_diff:.4f} ({_describe_winner(f1_diff, 'Gemma 3', 'FinBERT', 'better')})")

    # Compare inference times: a positive difference means Gemma 3 took
    # longer, i.e. FinBERT is the faster model.
    time_diff = gemma_metrics['avg_inference_time'] - finbert_metrics['avg_inference_time']
    print(f"Inference Time Difference: {time_diff*1000:.2f} ms ({_describe_winner(time_diff, 'FinBERT', 'Gemma 3', 'faster')})")

    # Analyze confusion matrices
    print("\nConfusion Matrix Analysis:")
    for model_name, metrics in [('Gemma 3', gemma_metrics), ('FinBERT', finbert_metrics)]:
        cm = np.asarray(metrics['confusion_matrix'])
        print(f"\n{model_name}:")

        # Confusions are the off-diagonal cells; the diagonal holds correct
        # predictions and would otherwise dominate the argmax.
        off_diagonal = np.array(cm)
        np.fill_diagonal(off_diagonal, 0)
        if off_diagonal.size and off_diagonal.max() > 0:
            true_cls, pred_cls = (
                int(k) for k in np.unravel_index(off_diagonal.argmax(), off_diagonal.shape)
            )
            print(f"Most Confused Classes: ({true_cls}, {pred_cls})")
        else:
            print("Most Confused Classes: none (no misclassifications)")
        print(f"Diagonal Sum (Correct Predictions): {np.sum(np.diag(cm))}")

# Print the head-to-head differences and the per-model confusion-matrix summary.
analyze_results(gemma_metrics, finbert_metrics)