In [None]:
# 04b_model_finetuning.ipynb
# Purpose: Fine-tune FinBERT models using manually labeled data
# (a small built-in sample set is generated when no labeled CSVs can be loaded)
# Compatible with older Transformers library versions
# Trains multiple models: ProsusAI/finbert and yiyanghkust/finbert-tone

print("="*70)
print("COMPLETE FINBERT FINE-TUNING PIPELINE")
print("="*70)

## 1. INITIAL SETUP AND IMPORTS

# Disable wandb logging to prevent API key prompts.
# These variables must be set before `transformers` is imported below.
# NOTE: recent Transformers releases warn that WANDB_DISABLED is deprecated
# (removal announced for v5) and recommend the `report_to` training argument instead.
import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "disabled"

# Core imports
import pandas as pd
import numpy as np
import json
import torch
import torch.nn as nn
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import warnings
# Silences every warning for the rest of the session, including deprecation notices
warnings.filterwarnings('ignore')

# Transformers and ML libraries
import transformers
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, DataCollatorWithPadding
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from torch.utils.data import Dataset
from sklearn.utils.class_weight import compute_class_weight
from tqdm import tqdm

# Report the environment so results can be tied to specific library versions
print(f"Libraries loaded successfully")
print(f"Transformers version: {transformers.__version__}")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")


COMPLETE FINBERT FINE-TUNING PIPELINE
Libraries loaded successfully
Transformers version: 4.56.1
PyTorch version: 2.8.0+cu126
CUDA available: False


In [None]:
## 2. MOUNT DRIVE AND LOAD CONFIGURATION

from google.colab import drive
drive.mount("/content/drive")

# Project-wide settings are stored as JSON on the mounted Drive
config_path = Path("/content/drive/MyDrive/CAM_DS_AI_Project/config.json")
try:
    with config_path.open("r") as f:
        config = json.load(f)

    SEED, BANK_CODE = config["SEED"], config["BANK_CODE"]
    drive_base, colab_base = Path(config["drive_base"]), Path(config["colab_base"])

    print(f"Configuration loaded for bank: {BANK_CODE.upper()}")
except Exception as e:
    print(f"Configuration loading failed: {e}")
    # Hard-coded defaults used whenever the config cannot be read or parsed.
    # Note: colab_base is not assigned on this path.
    SEED, BANK_CODE = 42, "jpm"
    drive_base = Path("/content/drive/MyDrive/CAM_DS_AI_Project")
    print("Using fallback configuration")

# Seed NumPy and PyTorch so repeated runs draw the same random numbers
np.random.seed(SEED)
torch.manual_seed(SEED)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Configuration loaded for bank: JPM


In [None]:
## 3. DEFINE PATHS

# Bank-specific folders follow the configured BANK_CODE instead of a hard-coded
# "jpm", so switching banks in config.json redirects every path below.
# The code is lower-cased to match the existing lower-case folder naming.
bank_dir = str(BANK_CODE).lower()

manual_validation_path = drive_base / "data" / "manual_validation" / bank_dir
results_sentiment_path = drive_base / "results" / "sentiment" / bank_dir
models_path = drive_base / "models"
finetuned_models_path = models_path / "finetuned"

# Ensure the output directory exists (input directories are only read, never created)
finetuned_models_path.mkdir(parents=True, exist_ok=True)

print(f"Paths configured:")
print(f"  Data path: {manual_validation_path}")
print(f"  Models path: {finetuned_models_path}")


Paths configured:
  Data path: /content/drive/MyDrive/CAM_DS_AI_Project/data/manual_validation/jpm
  Models path: /content/drive/MyDrive/CAM_DS_AI_Project/models/finetuned


In [None]:
## 4. DATA CLASSES

class SentimentDataset(Dataset):
    """Torch dataset that tokenizes sentences and encodes string labels as ids.

    Args:
        texts: Sentences to classify.
        labels: One label per sentence, in the same order as ``texts``.
        tokenizer: Callable tokenizer; it is invoked with ``truncation``,
            ``padding``, ``max_length`` and ``return_tensors`` keyword arguments
            and must return ``input_ids`` and ``attention_mask`` tensors.
        max_length: Length every sequence is padded/truncated to.
        label_to_id: Optional label -> id mapping to use instead of deriving one
            from ``labels``. Pass the training dataset's mapping when building a
            validation/test dataset so both splits encode labels identically,
            even if a split does not contain every class.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length, or if a
            supplied ``label_to_id`` does not cover every label in ``labels``.
    """

    def __init__(self, texts: List[str], labels: List[str], tokenizer, max_length: int = 128,
                 label_to_id: Optional[Dict[str, int]] = None):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} texts and {len(labels)} labels)"
            )

        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

        if label_to_id is None:
            # Derive the mapping from this split; sorting keeps ids stable across runs
            self.label_to_id = {label: i for i, label in enumerate(sorted(set(labels)))}
        else:
            unknown = sorted({label for label in labels if label not in label_to_id}, key=str)
            if unknown:
                raise ValueError(f"Labels missing from label_to_id: {unknown}")
            # Copy so later changes to the caller's dict cannot alter this dataset
            self.label_to_id = dict(label_to_id)
        self.id_to_label = {i: label for label, i in self.label_to_id.items()}

        print(f"Dataset created with {len(texts)} samples")
        print(f"Label mapping: {self.label_to_id}")

    def __len__(self):
        return len(self.texts)

    def _encode(self, text: str):
        """Tokenize one string to a fixed-length, batched ('pt') encoding."""
        return self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        try:
            encoding = self._encode(text)
        except Exception as e:
            print(f"Tokenization error for sample {idx}: {e}")
            # Best-effort fallback: keep the sample (and its label) but encode an empty string
            encoding = self._encode("")

        # The tokenizer returns a batch of one; flatten to per-sample 1-D tensors
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self.label_to_id[label], dtype=torch.long)
        }


In [None]:
## 5. DATA CREATION AND LOADING

def create_comprehensive_sample_data():
    """Build a built-in set of labeled financial sentences and split it.

    The 30 sentences (10 positive, 10 negative, 10 neutral) are shuffled with
    the notebook-wide ``SEED`` and split 75% / 25% into training and
    validation frames. The split is a plain cut of the shuffled frame, so the
    class balance of each part is not guaranteed.

    Returns:
        (train_df, val_df): DataFrames with ``text`` and ``human_label`` columns.
    """
    divider = "=" * 50
    print("\n" + divider)
    print("CREATING COMPREHENSIVE TRAINING DATA")
    print(divider)

    # Example sentences grouped by the sentiment they are meant to express
    sentences_by_label = {
        "positive": [
            "Outstanding quarterly earnings beat all analyst expectations significantly",
            "Record-breaking revenue growth of 28% drives exceptional shareholder value",
            "Exceptional profit margins demonstrate strong operational excellence",
            "Successful strategic acquisition strengthens competitive market position",
            "Robust capital ratios exceed regulatory requirements substantially",
            "Digital transformation delivers remarkable cost savings and efficiency gains",
            "Strong balance sheet enables ambitious growth investment strategy",
            "Market leadership position accelerates revenue growth momentum",
            "Innovation in financial services drives customer acquisition success",
            "Excellent credit quality reflects prudent risk management practices",
        ],
        "negative": [
            "Massive quarterly losses disappoint investors and analysts badly",
            "Revenue declined sharply by 22% due to severe market challenges",
            "Significant credit losses impact profitability and capital ratios severely",
            "Regulatory fines create substantial financial burden and reputational damage",
            "Poor asset quality deterioration raises serious going concern issues",
            "Management warns of extremely challenging business outlook ahead",
            "Competitive pressures severely squeeze margins across all business lines",
            "Economic downturn threatens core business operations and viability",
            "Substantial loan loss provisions reflect deteriorating credit environment",
            "Market volatility impacts trading revenue catastrophically this quarter",
        ],
        "neutral": [
            "Company reports mixed quarterly results with some positive developments",
            "Interest rate environment presents both opportunities and challenges",
            "Management maintains cautiously optimistic but measured business outlook",
            "Market conditions remain volatile but generally manageable for operations",
            "Quarterly results align closely with previously issued management guidance",
            "Regulatory environment continues evolving with new compliance requirements",
            "Strategic business review process expected to conclude next quarter",
            "Digital banking initiatives proceed according to established timeline",
            "Board of directors evaluates various strategic alternatives methodically",
            "Economic indicators suggest stable but uncertain operating environment",
        ],
    }

    # Flatten to (text, label) rows: positives first, then negatives, then neutrals
    rows = [
        (sentence, label)
        for label, sentences in sentences_by_label.items()
        for sentence in sentences
    ]
    sample_df = pd.DataFrame(rows, columns=['text', 'human_label'])

    # Seeded shuffle so the split is reproducible
    shuffled = sample_df.sample(frac=1, random_state=SEED).reset_index(drop=True)

    # First 75% of the shuffled rows train the model, the remainder validates it
    cut = int(0.75 * len(shuffled))
    train_df = shuffled.iloc[:cut].reset_index(drop=True)
    val_df = shuffled.iloc[cut:].reset_index(drop=True)

    print(f"Training data shape: {train_df.shape}")
    print(f"Validation data shape: {val_df.shape}")

    print("\nLabel distributions:")
    print("Training:", dict(train_df['human_label'].value_counts()))
    print("Validation:", dict(val_df['human_label'].value_counts()))

    return train_df, val_df

def load_or_create_training_data():
    """Load the manually labeled train/validation CSVs, or fall back to sample data.

    Looks for ``train_manual_labels.csv`` and ``val_manual_labels.csv`` inside
    ``manual_validation_path``. Both files must exist, be readable, and contain
    the ``text`` and ``human_label`` columns; otherwise the built-in sample data
    from ``create_comprehensive_sample_data`` is used instead.

    Returns:
        (train_df, val_df): DataFrames with at least ``text`` and ``human_label``.
    """

    print("\n" + "="*50)
    print("LOADING TRAINING DATA")
    print("="*50)

    train_path = manual_validation_path / "train_manual_labels.csv"
    val_path = manual_validation_path / "val_manual_labels.csv"
    required_columns = {'text', 'human_label'}

    # List the directory contents to make a missing/misnamed file easy to spot
    if manual_validation_path.exists():
        print(f"Directory exists: {manual_validation_path}")
        files = list(manual_validation_path.iterdir())
        if files:
            print("Files found:")
            for file in files:
                print(f"  - {file.name}")
        else:
            print("Directory is empty")
    else:
        print(f"Directory does not exist: {manual_validation_path}")

    # Try to load existing data
    train_df = None
    val_df = None

    if train_path.exists() and val_path.exists():
        try:
            train_df = pd.read_csv(train_path)
            val_df = pd.read_csv(val_path)

            # Both splits are consumed downstream, so both must carry the required columns
            missing = {
                split: sorted(required_columns - set(frame.columns))
                for split, frame in (('train', train_df), ('validation', val_df))
            }

            if not any(missing.values()):
                print(f"Loaded existing training data: {train_df.shape}")
                print(f"Loaded existing validation data: {val_df.shape}")
            else:
                print("Existing files missing required columns")
                for split, columns in missing.items():
                    if columns:
                        print(f"  {split}: missing {columns}")
                train_df = None
                val_df = None

        except Exception as e:
            print(f"Error loading existing data: {e}")
            train_df = None
            val_df = None

    # Create sample data if needed
    if train_df is None or val_df is None:
        print("Creating comprehensive sample data...")
        train_df, val_df = create_comprehensive_sample_data()

    return train_df, val_df


In [None]:
## 6. DATA PREPARATION

def prepare_training_data(train_df: pd.DataFrame, val_df: pd.DataFrame) -> Tuple[Dict, Dict]:
    """Convert the labeled DataFrames into plain lists plus class statistics.

    Rows with a missing ``text`` or ``human_label`` are dropped first;
    otherwise ``astype(str)`` would turn a missing text into the literal
    string "nan"/"None" and a missing label would become its own class.

    Args:
        train_df: Training rows with ``text`` and ``human_label`` columns.
        val_df: Validation rows with the same columns.

    Returns:
        (training_data, validation_data) where ``training_data`` has the keys
        ``texts``, ``labels``, ``class_weights`` (label -> weight) and
        ``label_distribution`` (label -> count), and ``validation_data`` has
        ``texts`` and ``labels``.
    """

    print("\n" + "="*50)
    print("PREPARING TRAINING DATA")
    print("="*50)

    # Discard incomplete rows before any string conversion
    required = ['text', 'human_label']
    train_clean = train_df.dropna(subset=required)
    val_clean = val_df.dropna(subset=required)
    for split, before, after in (
        ("training", len(train_df), len(train_clean)),
        ("validation", len(val_df), len(val_clean)),
    ):
        if after < before:
            print(f"Dropped {before - after} {split} rows with missing text or label")

    # Extract texts and labels
    train_texts = train_clean['text'].astype(str).tolist()
    train_labels = train_clean['human_label'].tolist()
    val_texts = val_clean['text'].astype(str).tolist()
    val_labels = val_clean['human_label'].tolist()

    print(f"Training samples: {len(train_texts)}")
    print(f"Validation samples: {len(val_texts)}")

    # Analyze label distribution
    print(f"\nTraining label distribution:")
    train_label_counts = pd.Series(train_labels).value_counts()
    for label, count in train_label_counts.items():
        pct = (count / len(train_labels)) * 100
        print(f"  {label}: {count} ({pct:.1f}%)")

    # Sorted so the printed/stored order does not depend on set iteration order
    unique_labels = sorted(set(train_labels))

    # A validation label never seen in training cannot be encoded by the training mapping
    unseen = sorted(set(val_labels) - set(unique_labels))
    if unseen:
        print(f"WARNING: validation labels not present in training data: {unseen}")

    # Calculate class weights for balanced training (falls back to uniform weights)
    try:
        class_weights = compute_class_weight(
            class_weight='balanced',
            classes=np.array(unique_labels),
            y=np.array(train_labels)
        )
        class_weight_dict = {label: weight for label, weight in zip(unique_labels, class_weights)}
    except Exception as e:
        print(f"Error computing class weights: {e}")
        class_weight_dict = {label: 1.0 for label in unique_labels}

    print(f"Class weights: {class_weight_dict}")

    training_data = {
        'texts': train_texts,
        'labels': train_labels,
        'class_weights': class_weight_dict,
        'label_distribution': train_label_counts.to_dict()
    }

    validation_data = {
        'texts': val_texts,
        'labels': val_labels
    }

    return training_data, validation_data


In [None]:
## 7. TRAINING CONFIGURATION

def compute_metrics(eval_pred):
    """Score a batch of model outputs against the reference labels.

    Args:
        eval_pred: Pair ``(predictions, labels)``; the predicted class is the
            argmax of ``predictions`` along axis 1.

    Returns:
        Dict with ``accuracy`` and support-weighted ``f1``, ``precision`` and
        ``recall``. Classes that are never predicted contribute 0 instead of
        triggering a warning (``zero_division=0``).
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)

    weighted = precision_recall_fscore_support(
        gold, predicted, average='weighted', zero_division=0
    )

    return {
        'accuracy': accuracy_score(gold, predicted),
        'f1': weighted[2],
        'precision': weighted[0],
        'recall': weighted[1],
    }

def create_training_arguments(model_name: str, training_data: Dict) -> TrainingArguments:
    """Build ``TrainingArguments`` that work across Transformers versions.

    Hyperparameters adapt to the training-set size (smaller sets get more
    epochs, smaller batches and a slightly higher learning rate). Several
    keyword sets are tried in order until the installed library accepts one:

    1. current releases, which name the evaluation schedule ``eval_strategy``;
    2. older releases, which name it ``evaluation_strategy``;
    3. a minimal set without per-epoch evaluation or best-model reloading.

    Args:
        model_name: Hub model id; used to derive the output directory name.
        training_data: Dict whose ``texts`` list determines the training size.

    Returns:
        The first ``TrainingArguments`` instance that could be constructed.

    Raises:
        ValueError: If every keyword set is rejected with a ``TypeError``.
    """

    train_size = len(training_data.get('texts', []))
    model_output_dir = finetuned_models_path / f"{model_name.replace('/', '_')}_finetuned"

    # Adaptive training parameters
    if train_size < 50:
        num_epochs = 8
        batch_size = 4
        learning_rate = 3e-5
        warmup_steps = 10
    else:
        num_epochs = 5
        batch_size = 8
        learning_rate = 2e-5
        warmup_steps = 20

    print(f"Training configuration for {model_name}:")
    print(f"  Epochs: {num_epochs}")
    print(f"  Batch size: {batch_size}")
    print(f"  Learning rate: {learning_rate}")
    print(f"  Training samples: {train_size}")

    # Settings shared by every attempt (also the complete minimal configuration)
    base_config = {
        'output_dir': str(model_output_dir),
        'num_train_epochs': num_epochs,
        'per_device_train_batch_size': batch_size,
        'per_device_eval_batch_size': batch_size,
        'learning_rate': learning_rate,
        'warmup_steps': warmup_steps,
        'weight_decay': 0.01,
        'logging_steps': 5,
        'save_total_limit': 2,
        'seed': SEED,
    }

    # Per-epoch checkpointing with best-model selection; needs an evaluation schedule
    tracking_config = {
        'save_strategy': "epoch",
        'load_best_model_at_end': True,
        'metric_for_best_model': "accuracy",
        'greater_is_better': True,
        'dataloader_pin_memory': False,
        'remove_unused_columns': False,
        # Disable all logging integrations explicitly instead of relying on
        # the deprecated WANDB_DISABLED environment variable
        'report_to': "none",
    }

    configs_to_try = [
        # Current transformers: the parameter is called `eval_strategy`
        {**base_config, **tracking_config, 'eval_strategy': "epoch"},
        # Older transformers: the parameter was called `evaluation_strategy`
        {**base_config, **tracking_config, 'evaluation_strategy': "epoch"},
        # Minimal configuration
        dict(base_config),
    ]

    # Try configurations until one works; an unknown keyword raises TypeError
    for i, candidate in enumerate(configs_to_try):
        try:
            training_args = TrainingArguments(**candidate)
            print(f"Successfully created training arguments (config {i+1})")
            return training_args
        except TypeError as e:
            print(f"Configuration {i+1} failed: {e}")
            continue

    raise ValueError("All training argument configurations failed")


In [None]:
## 8. FINE-TUNING FUNCTION

def fine_tune_model(model_name: str, training_data: Dict, validation_data: Dict) -> Dict:
    """Fine-tune one sequence-classification model and save it.

    Steps: load tokenizer and model, build the train/validation datasets with
    one shared label mapping, train with ``Trainer``, evaluate, save the model
    and tokenizer, and compute final validation accuracy / weighted F1.

    The label mapping (labels sorted, then numbered from 0) is also written
    into the model config as ``id2label`` / ``label2id``, so the saved
    checkpoint reports the same label names the model was trained with.

    Args:
        model_name: Hugging Face Hub model id.
        training_data: Dict with ``texts`` and ``labels`` lists.
        validation_data: Dict with ``texts`` and ``labels`` lists.

    Returns:
        On success, a dict with ``model_name``, ``training_samples``,
        ``validation_samples``, ``training_loss``, ``evaluation_metrics``
        (``eval_accuracy``, ``eval_f1``, ``eval_loss``), ``model_save_path``,
        ``label_mapping`` and ``training_completed``. On failure, a dict with
        ``error`` and ``model_name``; exceptions are not propagated.
    """

    print(f"\n" + "="*60)
    print(f"FINE-TUNING {model_name}")
    print("="*60)

    try:
        # Load tokenizer
        print("Loading tokenizer...")
        try:
            tokenizer = AutoTokenizer.from_pretrained(model_name)
        except Exception as e:
            print(f"Tokenizer loading failed: {e}")
            return {'error': f'Tokenizer loading failed: {str(e)}', 'model_name': model_name}

        # Configure tokenizer: padding requires a pad token
        added_pad_token = False
        if tokenizer.pad_token is None:
            if tokenizer.eos_token:
                tokenizer.pad_token = tokenizer.eos_token
            elif tokenizer.unk_token:
                tokenizer.pad_token = tokenizer.unk_token
            else:
                tokenizer.add_special_tokens({'pad_token': '[PAD]'})
                added_pad_token = True

        print(f"Tokenizer configured with pad_token: {tokenizer.pad_token}")

        # Load model
        print("Loading pre-trained model...")
        # Sorted so the ids are deterministic and identical to the dataset's own mapping
        unique_labels = sorted(set(training_data['labels']))
        num_labels = len(unique_labels)
        label_to_id = {label: idx for idx, label in enumerate(unique_labels)}
        id_to_label = {idx: label for label, idx in label_to_id.items()}

        try:
            model = AutoModelForSequenceClassification.from_pretrained(
                model_name,
                num_labels=num_labels,
                # Store the training label order in the config; otherwise the saved
                # model keeps the base checkpoint's label names for these ids.
                id2label={idx: str(label) for idx, label in id_to_label.items()},
                label2id={str(label): idx for label, idx in label_to_id.items()},
                ignore_mismatched_sizes=True
            )
            if added_pad_token:
                # A newly added token needs a matching embedding row
                model.resize_token_embeddings(len(tokenizer))
        except Exception as e:
            print(f"Model loading failed: {e}")
            return {'error': f'Model loading failed: {str(e)}', 'model_name': model_name}

        print(f"Model loaded with {num_labels} labels: {unique_labels}")
        # NOTE(review): the base checkpoint's classification head may have been trained
        # with a different label order than the sorted one used here — confirm whether
        # aligning with it is desirable for downstream consumers.

        # Create datasets
        print("Creating datasets...")
        try:
            train_dataset = SentimentDataset(
                training_data['texts'],
                training_data['labels'],
                tokenizer,
                max_length=128  # Reduced for stability
            )

            val_dataset = SentimentDataset(
                validation_data['texts'],
                validation_data['labels'],
                tokenizer,
                max_length=128
            )

            # Both splits must encode labels exactly as recorded in the model config
            for dataset in (train_dataset, val_dataset):
                dataset.label_to_id = label_to_id
                dataset.id_to_label = id_to_label

        except Exception as e:
            print(f"Dataset creation failed: {e}")
            return {'error': f'Dataset creation failed: {str(e)}', 'model_name': model_name}

        # Create training arguments
        try:
            training_args = create_training_arguments(model_name, training_data)
        except Exception as e:
            print(f"Training arguments creation failed: {e}")
            return {'error': f'Training arguments failed: {str(e)}', 'model_name': model_name}

        # Create trainer
        # NOTE: training_data['class_weights'] is not used here; the Trainer applies
        # the model's default (unweighted) loss.
        print("Creating trainer...")
        try:
            trainer_kwargs = dict(
                model=model,
                args=training_args,
                train_dataset=train_dataset,
                eval_dataset=val_dataset,
                data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
                compute_metrics=compute_metrics,
            )
            try:
                # Current releases take the tokenizer as `processing_class`
                trainer = Trainer(processing_class=tokenizer, **trainer_kwargs)
            except TypeError:
                # Older releases only know the `tokenizer` keyword
                trainer = Trainer(tokenizer=tokenizer, **trainer_kwargs)
        except Exception as e:
            print(f"Trainer creation failed: {e}")
            return {'error': f'Trainer creation failed: {str(e)}', 'model_name': model_name}

        # Execute training
        print("Starting training...")
        try:
            train_result = trainer.train()
            print(f"Training completed successfully!")
            print(f"Final training loss: {train_result.training_loss:.4f}")
        except Exception as e:
            print(f"Training execution failed: {e}")
            return {'error': f'Training execution failed: {str(e)}', 'model_name': model_name}

        # Evaluation
        print("Evaluating model...")
        try:
            eval_results = trainer.evaluate()
            print(f"Evaluation completed:")
            for key, value in eval_results.items():
                if isinstance(value, float):
                    print(f"  {key}: {value:.4f}")
        except Exception as e:
            print(f"Evaluation failed: {e}")
            # Placeholder values; real metrics are recomputed from predictions below
            eval_results = {'eval_loss': 0.0, 'eval_accuracy': 0.0, 'eval_f1': 0.0}

        # Save model
        model_save_path = finetuned_models_path / f"{model_name.replace('/', '_')}_finetuned"
        print(f"Saving model to {model_save_path}...")
        try:
            model_save_path.mkdir(parents=True, exist_ok=True)
            trainer.save_model(str(model_save_path))
            tokenizer.save_pretrained(str(model_save_path))
            print("Model saved successfully!")
        except Exception as e:
            print(f"Model saving failed: {e}")
            # Continue anyway as training was successful

        # Generate predictions for final validation
        try:
            predictions = trainer.predict(val_dataset)
            pred_labels = np.argmax(predictions.predictions, axis=1)
            true_labels = predictions.label_ids

            # Calculate final metrics
            final_accuracy = accuracy_score(true_labels, pred_labels)
            final_f1 = precision_recall_fscore_support(
                true_labels, pred_labels, average='weighted', zero_division=0
            )[2]

            print(f"\nFinal Validation Results:")
            print(f"  Accuracy: {final_accuracy:.4f}")
            print(f"  F1 Score: {final_f1:.4f}")

        except Exception as e:
            print(f"Final prediction generation failed: {e}")
            final_accuracy = eval_results.get('eval_accuracy', 0.0)
            final_f1 = eval_results.get('eval_f1', 0.0)

        # Compile results
        results = {
            'model_name': model_name,
            'training_samples': len(training_data['texts']),
            'validation_samples': len(validation_data['texts']),
            'training_loss': train_result.training_loss,
            'evaluation_metrics': {
                'eval_accuracy': final_accuracy,
                'eval_f1': final_f1,
                'eval_loss': eval_results.get('eval_loss', 0.0)
            },
            'model_save_path': str(model_save_path),
            'label_mapping': train_dataset.label_to_id,
            'training_completed': True
        }

        return results

    except Exception as e:
        print(f"Unexpected error in fine-tuning {model_name}: {str(e)}")
        import traceback
        traceback.print_exc()
        return {'error': str(e), 'model_name': model_name}


In [None]:
## 9. MAIN EXECUTION

def main():
    """Run the whole workflow: load data, prepare it, fine-tune every model.

    Returns:
        (fine_tuning_results, training_data, validation_data), where
        ``fine_tuning_results`` maps each model id to the dict returned by
        ``fine_tune_model`` (or to ``{'error': ...}`` if it raised).
    """
    rule = "=" * 60
    print("\n" + rule)
    print("STARTING MAIN EXECUTION")
    print(rule)

    # Data: labeled CSVs when available, otherwise the built-in samples
    train_df, val_df = load_or_create_training_data()
    training_data, validation_data = prepare_training_data(train_df, val_df)

    # Hub ids of the checkpoints to fine-tune
    candidates = ['ProsusAI/finbert', 'yiyanghkust/finbert-tone']
    total = len(candidates)

    print(f"\nModels to fine-tune: {total}")
    for candidate in candidates:
        print(f"  - {candidate}")

    outcomes = {}

    for position, model_name in enumerate(candidates, start=1):
        print(f"\n" + rule)
        print(f"PROCESSING MODEL {position}/{total}: {model_name}")
        print(rule)

        try:
            outcome = fine_tune_model(model_name, training_data, validation_data)
            outcomes[model_name] = outcome

            if 'error' in outcome:
                print(f"FAILED: {model_name}")
                print(f"  Error: {outcome['error']}")
            else:
                scores = outcome.get('evaluation_metrics', {})
                print(f"SUCCESS: {model_name}")
                print(f"  F1 Score: {scores.get('eval_f1', 0):.3f}")
                print(f"  Accuracy: {scores.get('eval_accuracy', 0):.3f}")
                print(f"  Training Loss: {outcome.get('training_loss', 0):.3f}")

        except Exception as exc:
            # Keep going with the remaining models; record why this one failed
            print(f"EXCEPTION: {model_name}")
            print(f"  Error: {str(exc)}")
            outcomes[model_name] = {'error': str(exc)}

    return outcomes, training_data, validation_data

# Execute main function; the three results are kept as notebook-level variables
# because the analysis, saving and summary cells below read them.
fine_tuning_results, training_data, validation_data = main()



STARTING MAIN EXECUTION

LOADING TRAINING DATA
Directory exists: /content/drive/MyDrive/CAM_DS_AI_Project/data/manual_validation/jpm
Files found:
  - sentiment_sentence_jpm_multi_2025.csv
  - manual_validation_report.json
Creating comprehensive sample data...

CREATING COMPREHENSIVE TRAINING DATA
Training data shape: (22, 2)
Validation data shape: (8, 2)

Label distributions:
Training: {'neutral': np.int64(8), 'positive': np.int64(8), 'negative': np.int64(6)}
Validation: {'negative': np.int64(4), 'neutral': np.int64(2), 'positive': np.int64(2)}

PREPARING TRAINING DATA
Training samples: 22
Validation samples: 8

Training label distribution:
  neutral: 8 (36.4%)
  positive: 8 (36.4%)
  negative: 6 (27.3%)
Class weights: {'negative': np.float64(1.2222222222222223), 'neutral': np.float64(0.9166666666666666), 'positive': np.float64(0.9166666666666666)}

Models to fine-tune: 2
  - ProsusAI/finbert
  - yiyanghkust/finbert-tone

PROCESSING MODEL 1/2: ProsusAI/finbert

FINE-TUNING ProsusAI/fi

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Model loaded with 3 labels: ['negative', 'neutral', 'positive']
Creating datasets...
Dataset created with 22 samples
Label mapping: {'negative': 0, 'neutral': 1, 'positive': 2}
Dataset created with 8 samples
Label mapping: {'negative': 0, 'neutral': 1, 'positive': 2}
Training configuration for ProsusAI/finbert:
  Epochs: 8
  Batch size: 4
  Learning rate: 3e-05
  Training samples: 22
Configuration 1 failed: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'
Successfully created training arguments (config 2)
Creating trainer...
Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,3.2723,2.75611,0.125,0.071429,0.05,0.125
2,1.9871,2.181816,0.375,0.277778,0.321429,0.375
3,1.3903,1.479759,0.5,0.342857,0.266667,0.5
4,1.3756,0.884071,0.5,0.333333,0.25,0.5
5,0.4331,0.768994,0.625,0.635714,0.666667,0.625
6,0.343,0.669936,0.625,0.635714,0.666667,0.625
7,0.2841,0.630258,0.625,0.635714,0.666667,0.625
8,0.1617,0.630085,0.75,0.75,0.875,0.75


Training completed successfully!
Final training loss: 1.0424
Evaluating model...


Evaluation completed:
  eval_loss: 0.6301
  eval_accuracy: 0.7500
  eval_f1: 0.7500
  eval_precision: 0.8750
  eval_recall: 0.7500
  eval_runtime: 5.0371
  eval_samples_per_second: 1.5880
  eval_steps_per_second: 0.3970
  epoch: 8.0000
Saving model to /content/drive/MyDrive/CAM_DS_AI_Project/models/finetuned/ProsusAI_finbert_finetuned...
Model saved successfully!

Final Validation Results:
  Accuracy: 0.7500
  F1 Score: 0.7500
SUCCESS: ProsusAI/finbert
  F1 Score: 0.750
  Accuracy: 0.750
  Training Loss: 1.042

PROCESSING MODEL 2/2: yiyanghkust/finbert-tone

FINE-TUNING yiyanghkust/finbert-tone
Loading tokenizer...


config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Tokenizer configured with pad_token: [PAD]
Loading pre-trained model...


pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Model loaded with 3 labels: ['negative', 'neutral', 'positive']
Creating datasets...
Dataset created with 22 samples
Label mapping: {'negative': 0, 'neutral': 1, 'positive': 2}
Dataset created with 8 samples
Label mapping: {'negative': 0, 'neutral': 1, 'positive': 2}
Training configuration for yiyanghkust/finbert-tone:
  Epochs: 8
  Batch size: 4
  Learning rate: 3e-05
  Training samples: 22
Configuration 1 failed: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'
Successfully created training arguments (config 2)
Creating trainer...
Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,12.4494,9.276988,0.25,0.166667,0.125,0.25
2,7.6665,1.296081,0.625,0.595238,0.625,0.625
3,1.1369,2.020897,0.25,0.111111,0.071429,0.25


In [None]:
## 10. RESULTS ANALYSIS AND COMPARISON

def analyze_results(fine_tuning_results: Dict):
    """Print a comparison of all fine-tuning runs and split them by outcome.

    A run counts as failed when its result dict contains an ``error`` key.
    Successful runs are graded on F1 (> 0.8 Excellent, > 0.7 Good,
    > 0.6 Fair, otherwise Poor) and the highest-F1 model is reported.

    Args:
        fine_tuning_results: Mapping of model id -> result dict.

    Returns:
        (successful_models, failed_models): a list of ``(model_name, results)``
        pairs and a list of ``(model_name, error_message)`` pairs.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("RESULTS ANALYSIS")
    print(rule)

    successful_models = [
        (name, outcome)
        for name, outcome in fine_tuning_results.items()
        if 'error' not in outcome
    ]
    failed_models = [
        (name, outcome['error'])
        for name, outcome in fine_tuning_results.items()
        if 'error' in outcome
    ]

    print(f"Successfully trained: {len(successful_models)} models")
    print(f"Failed training: {len(failed_models)} models")

    if successful_models:
        print("\nSUCCESSFUL MODELS:")

        # Checked in order; the first cutoff that F1 exceeds decides the grade
        grade_cutoffs = ((0.8, "Excellent"), (0.7, "Good"), (0.6, "Fair"))
        scoreboard = []

        for name, outcome in successful_models:
            scores = outcome.get('evaluation_metrics', {})
            f1 = scores.get('eval_f1', 0)
            accuracy = scores.get('eval_accuracy', 0)
            training_loss = outcome.get('training_loss', 0)
            scoreboard.append((name, f1, accuracy))

            print(f"\n  {name}:")
            print(f"    F1 Score: {f1:.4f}")
            print(f"    Accuracy: {accuracy:.4f}")
            print(f"    Training Loss: {training_loss:.4f}")
            print(f"    Model Path: {outcome.get('model_save_path', 'N/A')}")

            assessment = next(
                (grade for cutoff, grade in grade_cutoffs if f1 > cutoff),
                "Poor",
            )
            print(f"    Assessment: {assessment}")

        # Highest F1 wins; on a tie the earlier model is kept
        top_name, top_f1, top_accuracy = max(scoreboard, key=lambda row: row[1])
        print("\nBEST PERFORMING MODEL:")
        print(f"  Model: {top_name}")
        print(f"  F1 Score: {top_f1:.4f}")
        print(f"  Accuracy: {top_accuracy:.4f}")

    if failed_models:
        print("\nFAILED MODELS:")
        for name, message in failed_models:
            print(f"  {name}: {message}")

    return successful_models, failed_models

# Analyze results: prints the comparison and keeps the success/failure lists
# as notebook-level variables.
successful_models, failed_models = analyze_results(fine_tuning_results)


In [None]:
## 11. SAVE RESULTS

def save_comprehensive_results(fine_tuning_results: Dict, training_data: Dict, validation_data: Dict):
    """Write the run summary and a model registry as JSON files.

    Two files are written into ``finetuned_models_path``:
    ``complete_finetuning_results.json`` (all results plus run metadata) and,
    if at least one model succeeded, ``model_registry.json`` (path, metrics
    and label mapping per successful model). Write errors are printed, not raised.

    Args:
        fine_tuning_results: Mapping of model id -> result dict.
        training_data: Dict whose ``texts`` list gives the training size.
        validation_data: Dict whose ``texts`` list gives the validation size.

    Returns:
        Path of the main results file (returned even if writing it failed).
    """
    print("\n" + "="*50)
    print("SAVING RESULTS")
    print("="*50)

    def _write_json(target, payload, saved_prefix, failed_prefix):
        """Dump ``payload`` to ``target``; values JSON cannot encode are stringified."""
        try:
            with open(target, 'w') as handle:
                json.dump(payload, handle, indent=2, default=str)
            print(f"{saved_prefix}{target}")
        except Exception as exc:
            print(f"{failed_prefix}{exc}")

    outcomes = list(fine_tuning_results.values())

    run_metadata = {
        'training_samples': len(training_data.get('texts', [])),
        'validation_samples': len(validation_data.get('texts', [])),
        'models_attempted': len(fine_tuning_results),
        'successful_models': sum(1 for outcome in outcomes if 'error' not in outcome),
        'failed_models': sum(1 for outcome in outcomes if 'error' in outcome),
        'timestamp': pd.Timestamp.now().isoformat(),
        'transformers_version': transformers.__version__,
        'pytorch_version': torch.__version__,
        'cuda_available': torch.cuda.is_available()
    }

    saved_paths = {}
    for name, outcome in fine_tuning_results.items():
        if 'error' not in outcome:
            saved_paths[name] = outcome.get('model_save_path', 'N/A')

    results_summary = {
        'fine_tuning_results': fine_tuning_results,
        'training_configuration': run_metadata,
        'model_paths': saved_paths
    }

    # Main results file
    results_file = finetuned_models_path / "complete_finetuning_results.json"
    _write_json(results_file, results_summary, "Results saved to: ", "Failed to save results: ")

    # Registry of successfully trained models, for easy loading later
    registry = {}
    for name, outcome in fine_tuning_results.items():
        if 'error' in outcome or 'model_save_path' not in outcome:
            continue
        registry[name] = {
            'path': outcome['model_save_path'],
            'performance': outcome['evaluation_metrics'],
            'label_mapping': outcome.get('label_mapping', {}),
            'training_samples': outcome.get('training_samples', 0)
        }

    if registry:
        registry_file = finetuned_models_path / "model_registry.json"
        _write_json(
            registry_file, registry,
            "Model registry saved to: ", "Failed to save model registry: "
        )

    return results_file

# Save results; the returned path is printed again in the final summary cell
results_file = save_comprehensive_results(fine_tuning_results, training_data, validation_data)


In [None]:
## 12. FINAL SUMMARY

print("\n" + 70 * "=")
print("FINE-TUNING PROCESS COMPLETE")
print(70 * "=")

# Tally outcomes: a result dict containing an 'error' key counts as a failure
total_models = len(fine_tuning_results)
successful_count = sum(1 for r in fine_tuning_results.values() if 'error' not in r)
failed_count = total_models - successful_count

print("SUMMARY:")
print(f"  Total models attempted: {total_models}")
print(f"  Successfully trained: {successful_count}")
print(f"  Failed training: {failed_count}")

if successful_count == 0:
    print("\nNO MODELS SUCCESSFULLY TRAINED")
    print("Review the error messages above for troubleshooting")
else:
    print("\nSUCCESSFUL MODELS:")
    for model_name, results in fine_tuning_results.items():
        if 'error' in results:
            continue
        metrics = results.get('evaluation_metrics', {})
        print(f"  {model_name}:")
        print(f"    F1: {metrics.get('eval_f1', 0):.3f}")
        print(f"    Accuracy: {metrics.get('eval_accuracy', 0):.3f}")
        print(f"    Path: {results.get('model_save_path', 'N/A')}")

    # How to reuse the saved checkpoints
    print("\nNEXT STEPS:")
    print("  1. Models are saved and ready for use")
    print("  2. Load models using: AutoModelForSequenceClassification.from_pretrained(model_path)")
    print("  3. Use tokenizer from same path: AutoTokenizer.from_pretrained(model_path)")
    print("  4. Apply to your financial sentiment analysis tasks")

if failed_count > 0:
    print("\nFAILED MODELS:")
    for model_name, results in fine_tuning_results.items():
        if 'error' not in results:
            continue
        print(f"  {model_name}: {results['error']}")

print(f"\nAll files saved to: {finetuned_models_path}")
print(f"Results file: {results_file}")
print("\nProcess complete!")
