In [1]:
import torch
import torch.nn as nn
from transformers import (
    RobertaTokenizer,
    Wav2Vec2FeatureExtractor,
    AutoModel,
    RobertaModel,
    HubertModel
)
import torch.optim as optim
import copy
import os
import numpy as np
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score, f1_score

try:
    from utils import get_iemocap_data_loaders, collate_fn_raw, MetricsLogger
    from models import TextOnlyModel, AudioOnlyModel
    from trainer import Trainer
    print("Successfully imported custom modules (utils, models, trainer).")
except ImportError as e:
    print(f"Error importing custom modules: {e}")
    print("Please ensure utils.py, models.py, and trainer.py exist and contain the necessary definitions.")

Successfully imported custom modules (utils, models, trainer).


In [10]:
import torch

# --- General Configuration ---
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using device: {DEVICE}')
torch.backends.cuda.matmul.allow_tf32 = False

# Emotion labels
EMOTION_LABELS = ['angry', 'frustrated', 'happy', 'sad', 'neutral']
NUM_CLASSES = len(EMOTION_LABELS)

# Model checkpoints
AUDIO_CHECKPOINT = 'facebook/hubert-base-ls960'
TEXT_CHECKPOINT = 'roberta-base'

# Data parameters
DATA_PATH = './iemocap'
PRECOMPUTED_FEATURES = False

# Training parameters
BATCH_SIZE = 4
NUM_WORKERS = 10
LEARNING_RATE = 3e-3
N_EPOCHS = 20
PATIENCE = 5
FREEZE_BASE_MODEL = True

print("\n--- Configuration ---")
print(f"Device: {DEVICE}")
print(f"Number of classes: {NUM_CLASSES}")
print(f"Freeze base model: {FREEZE_BASE_MODEL}")
print(f"Batch Size: {BATCH_SIZE}")
print(f"Learning Rate: {LEARNING_RATE}")
print(f"Epochs: {N_EPOCHS}")
print(f"Patience: {PATIENCE}")

Using device: cuda

--- Configuration ---
Device: cuda
Number of classes: 5
Freeze base model: True
Batch Size: 4
Learning Rate: 0.003
Epochs: 20
Patience: 5


In [3]:
import torch
from transformers import RobertaTokenizer, Wav2Vec2FeatureExtractor

# --- Load Tokenizer and Processor ---
print("\nLoading tokenizer and feature extractor...")
try:
    tokenizer = RobertaTokenizer.from_pretrained(TEXT_CHECKPOINT)
    processor = Wav2Vec2FeatureExtractor.from_pretrained(AUDIO_CHECKPOINT)
    print("Tokenizer and feature extractor loaded successfully.")
except Exception as e:
    print(f"Error loading tokenizer/processor: {e}")
    raise RuntimeError("Failed to load tokenizer/processor.") from e

# --- Load Data Loaders ---
collate_wrapper = lambda b: collate_fn_raw(b, tokenizer, processor)

print("\nLoading data loaders...")
try:
    train_loader, val_loader, test_loader = get_iemocap_data_loaders(
        path=DATA_PATH,
        precomputed=PRECOMPUTED_FEATURES,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS,
        collate_fn=collate_wrapper,
    )
    print("Data loaders created.")
    # Verify data loaders
    print("Checking a sample batch from train_loader...")
    sample_batch = next(iter(train_loader))
    print("Sample batch keys:", sample_batch.keys())
    if 'text_inputs' in sample_batch:
        print("Text input_ids shape:", sample_batch['text_inputs'].get('input_ids', torch.empty(0)).shape)
    if 'audio_inputs' in sample_batch:
        print("Audio input_values shape:", sample_batch['audio_inputs'].get('input_values', torch.empty(0)).shape)
    if 'labels' in sample_batch:
        print("Labels shape:", sample_batch['labels'].shape)

    # Store loaders for the function
    data_loaders = {
        'train': train_loader,
        'val': val_loader,
        'test': test_loader
    }

except NameError:
     print("Error: `get_iemocap_data_loaders` or `collate_fn_raw` not found. Check imports.")
     raise RuntimeError("Data loading functions not found.")
except Exception as e:
    print(f"Error creating data loaders: {e}")
    raise RuntimeError("Failed to create data loaders.") from e


Loading tokenizer and feature extractor...
Tokenizer and feature extractor loaded successfully.

Loading data loaders...
Distribution after filtering:
neutral: 1726
frustrated: 2917
angry: 1269
sad: 1250
happy: 2632
train, val, test sizes: 7835, 979, 980
Data loaders created.
Checking a sample batch from train_loader...
Sample batch keys: dict_keys(['text_inputs', 'audio_inputs', 'labels'])
Text input_ids shape: torch.Size([64, 82])
Audio input_values shape: torch.Size([64, 290400])
Labels shape: torch.Size([64])


In [4]:
import torch
import torch.optim as optim
from transformers import AutoModel
import copy
import os
import traceback

def run_unimodal_training(model_type, device, config, data_loaders):
    """
    Encapsulates the training and evaluation process for a unimodal model.

    Args:
        model_type (str): 'text' or 'audio'.
        device (str): 'cuda' or 'cpu'.
        config (dict): Dictionary containing configuration parameters like
                       checkpoints, paths, training settings.
        data_loaders (dict): Dictionary containing 'train', 'val', 'test' DataLoaders.
    """
    print(f"\n{'='*20} Starting Training for {model_type.upper()} Model {'='*20}")

    # --- Configuration Specific to this Run ---
    if model_type == 'text':
        base_checkpoint = config['text_checkpoint']
        model_save_path = config['text_model_save_path']
        metrics_save_path = config['text_metrics_save_path']
        ModelClass = TextOnlyModel
    elif model_type == 'audio':
        base_checkpoint = config['audio_checkpoint']
        model_save_path = config['audio_model_save_path']
        metrics_save_path = config['audio_metrics_save_path']
        ModelClass = AudioOnlyModel
    else:
        raise ValueError("model_type must be 'text' or 'audio'")

    num_classes = config['num_classes']
    freeze_base = config['freeze_base_model']
    lr = config['learning_rate']
    n_epochs = config['n_epochs']
    patience = config['patience']

    # --- Instantiate Logger ---
    try:
        logger = MetricsLogger(save_path=metrics_save_path)
    except NameError:
        print("Error: MetricsLogger class not found. Check imports from utils.py.")
        return # Stop this run if logger is missing

    # --- Load Base Model ---
    print(f"Loading base pre-trained model: {base_checkpoint}...")
    try:
        base_model = AutoModel.from_pretrained(base_checkpoint)
    except Exception as e:
        print(f"Error loading base model {base_checkpoint}: {e}")
        return # Stop this run

    # --- Freeze Base Model Parameters ---
    if freeze_base:
        print("Freezing base model parameters.")
        for param in base_model.parameters():
            param.requires_grad = False
    else:
        print("Base model parameters will be fine-tuned.")

    # --- Instantiate Full Model ---
    print(f"Instantiating {model_type.capitalize()}OnlyModel...")
    hidden_size = base_model.config.hidden_size
    try:
        # Instantiate the specific model class
        if model_type == 'text':
             # Pass the actual loaded base_model instance
            model = ModelClass(roberta=base_model, num_classes=num_classes, hidden_size=hidden_size)
        elif model_type == 'audio':
             # Pass the actual loaded base_model instance
            model = ModelClass(hubert=base_model, num_classes=num_classes, hidden_size=hidden_size)
        print(f"{model_type.capitalize()}OnlyModel instantiated successfully.")
    except NameError:
         print(f"Error: {ModelClass.__name__} class not found. Check imports from models.py.")
         return # Stop this run if model class is missing
    except Exception as e:
        print(f"Error instantiating {ModelClass.__name__}: {e}")
        traceback.print_exc()
        return # Stop this run

    # --- Move Model to Device ---
    try:
        model.to(device)
    except Exception as e:
        print(f"Error moving model to device {device}: {e}")
        return # Stop this run

    # --- Set Up Optimizer ---
    params_to_optimize = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = optim.AdamW(params_to_optimize, lr=lr)
    print(f"Optimizer: AdamW with LR={lr}")
    num_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Number of trainable parameters: {num_trainable_params}")

    # --- Instantiate Trainer ---
    try:
        trainer = Trainer(model, optimizer, device=device)
    except NameError:
        print("Error: Trainer class not found. Check imports from trainer.py.")
        return # Stop this run

    # --- Training Loop ---
    best_val_loss = float('inf')
    best_model_state = None
    early_stopping_counter = 0

    print(f"\nStarting training for {n_epochs} epochs...")
    for epoch in range(1, n_epochs + 1):
        try:
            # Train
            train_loss, train_acc, train_f1 = trainer.train_one_epoch(data_loaders['train'], epoch)
            # Validate
            val_loss, val_acc, val_f1 = trainer.evaluate(data_loaders['val'], desc="Validate")

            print(f'\nEpoch {epoch}/{n_epochs}')
            print(f'  [Train] Loss: {train_loss:.4f} | Acc: {train_acc:.4f} | F1: {train_f1:.4f}')
            print(f'  [Val]   Loss: {val_loss:.4f} | Acc: {val_acc:.4f} | F1: {val_f1:.4f}')

            # Log metrics
            logger.log_train(train_loss, train_acc, train_f1)
            logger.log_val(val_loss, val_acc, val_f1)
            logger.save()

            # Check for improvement and save best model
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_model_state = copy.deepcopy(trainer.model.state_dict())
                early_stopping_counter = 0
                print(f'  -> Validation loss improved to {best_val_loss:.4f}. Saving model state.')
                try:
                     torch.save({
                        'epoch': epoch,
                        'model_state_dict': best_model_state,
                        'optimizer_state_dict': optimizer.state_dict(),
                        'val_loss': best_val_loss,
                        'config': {
                             'model_type': model_type,
                             'num_classes': num_classes,
                             'hidden_size': hidden_size,
                             'base_checkpoint': base_checkpoint
                        }
                    }, model_save_path)
                     print(f"  -> Best model checkpoint saved to '{model_save_path}'")
                except Exception as e_save:
                    print(f"  -> Error saving model checkpoint: {e_save}")
            else:
                early_stopping_counter += 1
                print(f'  -> Validation loss did not improve. Counter: {early_stopping_counter}/{patience}')

            # Early stopping
            if early_stopping_counter >= patience:
                print(f'\nEarly stopping triggered after {epoch} epochs.')
                break
        except Exception as e_epoch:
            print(f"\nError during epoch {epoch}: {e_epoch}")
            traceback.print_exc()
            print("Stopping training for this model type due to error.")
            return # Stop this run

    print("\nTraining finished.")

    # --- Final Evaluation on Test Set ---
    print(f"\n--- Evaluating Best {model_type.upper()} Model on Test Set ---")
    if best_model_state is None:
         print("Warning: No best model state was saved (validation loss might not have improved). Evaluating last model state.")
         # Ensure the model used by the trainer is used for evaluation
         eval_model = trainer.model # Use the model instance from the trainer
         eval_model.eval() # Set to eval mode
    elif not os.path.exists(model_save_path):
        print(f"Warning: Best model checkpoint '{model_save_path}' not found, but best state exists in memory. Evaluating model from memory.")
        # Need to load the state into the current model instance
        try:
            model.load_state_dict(best_model_state)
            eval_model = model
            eval_model.eval()
        except Exception as e_load_mem:
             print(f"Error loading best model state from memory: {e_load_mem}. Cannot evaluate.")
             return
    else:
        print(f"Loading best model state from '{model_save_path}'...")
        try:
            checkpoint = torch.load(model_save_path, map_location=device)
            # Re-initialize architecture - important if script restarted or for consistency
            print("Re-initializing model architecture for evaluation...")
            eval_config = checkpoint.get('config', {})
            eval_num_classes = eval_config.get('num_classes', num_classes)
            eval_base_checkpoint = eval_config.get('base_checkpoint', base_checkpoint)
            eval_base_model = AutoModel.from_pretrained(eval_base_checkpoint)
            eval_hidden_size = eval_config.get('hidden_size', eval_base_model.config.hidden_size)

            if model_type == 'text':
                 eval_model = TextOnlyModel(roberta=eval_base_model, num_classes=eval_num_classes, hidden_size=eval_hidden_size)
            elif model_type == 'audio':
                 eval_model = AudioOnlyModel(hubert=eval_base_model, num_classes=eval_num_classes, hidden_size=eval_hidden_size)

            eval_model.load_state_dict(checkpoint['model_state_dict'])
            eval_model.to(device)
            eval_model.eval()
            print("Best model loaded successfully from file.")

        except FileNotFoundError:
            print(f"Error: Checkpoint file not found at {model_save_path}. Cannot evaluate.")
            return
        except Exception as e_load:
            print(f"An error occurred loading the best model: {e_load}")
            traceback.print_exc()
            return

    # Use the existing Trainer instance structure for evaluation, but with the loaded/best model
    # Create a dummy optimizer as Trainer expects one, but it won't be used for eval
    eval_optimizer = optim.AdamW(eval_model.parameters(), lr=1e-5)
    eval_trainer = Trainer(eval_model, eval_optimizer, device=device)

    print("\nRunning evaluation on the test set...")
    try:
        test_loss, test_acc, test_f1 = eval_trainer.evaluate(data_loaders['test'], desc="Test")

        print("\n--- Test Set Results ---")
        print(f"  Loss: {test_loss:.4f}")
        print(f"  Accuracy: {test_acc:.4f}")
        print(f"  Weighted F1-Score: {test_f1:.4f}")

        # Log test results
        logger.log_test(test_loss, test_acc, test_f1)
        logger.save()
        print(f"Test metrics logged and saved to {logger.save_path}")

    except Exception as e_eval:
        print(f"An error occurred during final evaluation: {e_eval}")
        traceback.print_exc()

    print(f"\n{'='*20} Finished Run for {model_type.upper()} Model {'='*20}")


print("Training and evaluation function defined.")

Training and evaluation function defined.


In [5]:
# --- Define Configuration for the Run ---
# Using variables defined in Block 2
run_config_text = {
    'text_checkpoint': TEXT_CHECKPOINT,
    'audio_checkpoint': AUDIO_CHECKPOINT, # Keep both for potential future use in config
    'text_model_save_path': 'best_text_only_model.pth',
    'text_metrics_save_path': 'text_only_training_metrics.json',
    'audio_model_save_path': 'best_audio_only_model.pth', # Keep both for consistency
    'audio_metrics_save_path': 'audio_only_training_metrics.json',
    'num_classes': NUM_CLASSES,
    'freeze_base_model': FREEZE_BASE_MODEL,
    'learning_rate': LEARNING_RATE,
    'n_epochs': N_EPOCHS,
    'patience': PATIENCE,
}

# --- Run Training for Text Model ---
try:
    # Pass necessary variables: model type, device, config dict, data loaders dict
    run_unimodal_training(
        model_type='text',
        device=DEVICE,
        config=run_config_text,
        data_loaders=data_loaders # data_loaders defined in Block 3
    )
except NameError as e:
    print(f"Error calling training function for TEXT model: Required variable not defined ({e}). Check previous blocks.")
except Exception as e:
    print(f"An unexpected error occurred during TEXT model training execution: {e}")
    traceback.print_exc()


Loading base pre-trained model: roberta-base...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Freezing base model parameters.
Instantiating TextOnlyModel...
TextOnlyModel initialized (compatible version) with hidden_size=768, num_classes=5
TextOnlyModel instantiated successfully.
Optimizer: AdamW with LR=0.003
Number of trainable parameters: 1185029

Starting training for 20 epochs...


Epoch 1 [Train]: 100%|██████████| 123/123 [00:19<00:00,  6.24it/s, loss=1.51]
Validate: 100%|██████████| 16/16 [00:03<00:00,  4.94it/s]



Epoch 1/20
  [Train] Loss: 1.5650 | Acc: 0.2850 | F1: 0.2149
  [Val]   Loss: 1.5150 | Acc: 0.3524 | F1: 0.2558
  -> Validation loss improved to 1.5150. Saving model state.
  -> Best model checkpoint saved to 'best_text_only_model.pth'


Epoch 2 [Train]: 100%|██████████| 123/123 [00:13<00:00,  9.30it/s, loss=1.56]
Validate: 100%|██████████| 16/16 [00:03<00:00,  4.83it/s]



Epoch 2/20
  [Train] Loss: 1.5207 | Acc: 0.3313 | F1: 0.2546
  [Val]   Loss: 1.4852 | Acc: 0.3391 | F1: 0.2467
  -> Validation loss improved to 1.4852. Saving model state.
  -> Best model checkpoint saved to 'best_text_only_model.pth'


Epoch 3 [Train]: 100%|██████████| 123/123 [00:16<00:00,  7.28it/s, loss=1.43]
Validate: 100%|██████████| 16/16 [00:03<00:00,  4.91it/s]



Epoch 3/20
  [Train] Loss: 1.4704 | Acc: 0.3706 | F1: 0.3089
  [Val]   Loss: 1.4044 | Acc: 0.3933 | F1: 0.2985
  -> Validation loss improved to 1.4044. Saving model state.
  -> Best model checkpoint saved to 'best_text_only_model.pth'


Epoch 4 [Train]: 100%|██████████| 123/123 [00:15<00:00,  8.02it/s, loss=1.42]
Validate: 100%|██████████| 16/16 [00:06<00:00,  2.37it/s]



Epoch 4/20
  [Train] Loss: 1.4407 | Acc: 0.3778 | F1: 0.3242
  [Val]   Loss: 1.3731 | Acc: 0.4229 | F1: 0.3703
  -> Validation loss improved to 1.3731. Saving model state.
  -> Best model checkpoint saved to 'best_text_only_model.pth'


Epoch 5 [Train]: 100%|██████████| 123/123 [00:12<00:00,  9.92it/s, loss=1.59]
Validate: 100%|██████████| 16/16 [00:03<00:00,  5.01it/s]



Epoch 5/20
  [Train] Loss: 1.4209 | Acc: 0.3871 | F1: 0.3491
  [Val]   Loss: 1.3713 | Acc: 0.4208 | F1: 0.3800
  -> Validation loss improved to 1.3713. Saving model state.
  -> Best model checkpoint saved to 'best_text_only_model.pth'


Epoch 6 [Train]: 100%|██████████| 123/123 [00:15<00:00,  7.88it/s, loss=1.56]
Validate: 100%|██████████| 16/16 [00:03<00:00,  4.98it/s]



Epoch 6/20
  [Train] Loss: 1.4089 | Acc: 0.3911 | F1: 0.3623
  [Val]   Loss: 1.3216 | Acc: 0.4454 | F1: 0.3774
  -> Validation loss improved to 1.3216. Saving model state.
  -> Best model checkpoint saved to 'best_text_only_model.pth'


Epoch 7 [Train]: 100%|██████████| 123/123 [00:14<00:00,  8.58it/s, loss=1.55]
Validate: 100%|██████████| 16/16 [00:04<00:00,  3.88it/s]



Epoch 7/20
  [Train] Loss: 1.3995 | Acc: 0.4036 | F1: 0.3799
  [Val]   Loss: 1.3547 | Acc: 0.4382 | F1: 0.4232
  -> Validation loss did not improve. Counter: 1/5


Epoch 8 [Train]: 100%|██████████| 123/123 [00:15<00:00,  7.91it/s, loss=1.31]
Validate: 100%|██████████| 16/16 [00:03<00:00,  4.05it/s]



Epoch 8/20
  [Train] Loss: 1.3945 | Acc: 0.3995 | F1: 0.3770
  [Val]   Loss: 1.3038 | Acc: 0.4556 | F1: 0.4106
  -> Validation loss improved to 1.3038. Saving model state.
  -> Best model checkpoint saved to 'best_text_only_model.pth'


Epoch 9 [Train]: 100%|██████████| 123/123 [00:13<00:00,  9.41it/s, loss=1.3] 
Validate: 100%|██████████| 16/16 [00:04<00:00,  3.70it/s]



Epoch 9/20
  [Train] Loss: 1.3817 | Acc: 0.4031 | F1: 0.3781
  [Val]   Loss: 1.3338 | Acc: 0.4454 | F1: 0.4205
  -> Validation loss did not improve. Counter: 1/5


Epoch 10 [Train]: 100%|██████████| 123/123 [00:17<00:00,  6.84it/s, loss=1.44]
Validate: 100%|██████████| 16/16 [00:03<00:00,  4.93it/s]



Epoch 10/20
  [Train] Loss: 1.3903 | Acc: 0.3969 | F1: 0.3750
  [Val]   Loss: 1.3101 | Acc: 0.4484 | F1: 0.4316
  -> Validation loss did not improve. Counter: 2/5


Epoch 11 [Train]: 100%|██████████| 123/123 [00:13<00:00,  9.32it/s, loss=1.52]
Validate: 100%|██████████| 16/16 [00:03<00:00,  4.87it/s]



Epoch 11/20
  [Train] Loss: 1.3743 | Acc: 0.4092 | F1: 0.3890
  [Val]   Loss: 1.2692 | Acc: 0.4709 | F1: 0.4369
  -> Validation loss improved to 1.2692. Saving model state.
  -> Best model checkpoint saved to 'best_text_only_model.pth'


Epoch 12 [Train]: 100%|██████████| 123/123 [00:12<00:00,  9.91it/s, loss=1.39]
Validate: 100%|██████████| 16/16 [00:03<00:00,  5.15it/s]



Epoch 12/20
  [Train] Loss: 1.3733 | Acc: 0.4105 | F1: 0.3848
  [Val]   Loss: 1.2874 | Acc: 0.4719 | F1: 0.4311
  -> Validation loss did not improve. Counter: 1/5


Epoch 13 [Train]: 100%|██████████| 123/123 [00:12<00:00,  9.65it/s, loss=1.3] 
Validate: 100%|██████████| 16/16 [00:06<00:00,  2.56it/s]



Epoch 13/20
  [Train] Loss: 1.3780 | Acc: 0.4040 | F1: 0.3800
  [Val]   Loss: 1.2655 | Acc: 0.4668 | F1: 0.4276
  -> Validation loss improved to 1.2655. Saving model state.
  -> Best model checkpoint saved to 'best_text_only_model.pth'


Epoch 14 [Train]: 100%|██████████| 123/123 [00:12<00:00,  9.87it/s, loss=1.18]
Validate: 100%|██████████| 16/16 [00:03<00:00,  4.82it/s]



Epoch 14/20
  [Train] Loss: 1.3689 | Acc: 0.4133 | F1: 0.3919
  [Val]   Loss: 1.3023 | Acc: 0.4484 | F1: 0.3834
  -> Validation loss did not improve. Counter: 1/5


Epoch 15 [Train]: 100%|██████████| 123/123 [00:12<00:00,  9.98it/s, loss=1.48]
Validate: 100%|██████████| 16/16 [00:06<00:00,  2.54it/s]



Epoch 15/20
  [Train] Loss: 1.3689 | Acc: 0.4156 | F1: 0.3912
  [Val]   Loss: 1.2474 | Acc: 0.4699 | F1: 0.4355
  -> Validation loss improved to 1.2474. Saving model state.
  -> Best model checkpoint saved to 'best_text_only_model.pth'


Epoch 16 [Train]: 100%|██████████| 123/123 [00:12<00:00, 10.08it/s, loss=1.52]
Validate: 100%|██████████| 16/16 [00:03<00:00,  4.44it/s]



Epoch 16/20
  [Train] Loss: 1.3577 | Acc: 0.4181 | F1: 0.3976
  [Val]   Loss: 1.2932 | Acc: 0.4637 | F1: 0.4330
  -> Validation loss did not improve. Counter: 1/5


Epoch 17 [Train]: 100%|██████████| 123/123 [00:15<00:00,  8.03it/s, loss=1.43]
Validate: 100%|██████████| 16/16 [00:03<00:00,  4.43it/s]



Epoch 17/20
  [Train] Loss: 1.3645 | Acc: 0.4103 | F1: 0.3882
  [Val]   Loss: 1.2738 | Acc: 0.4821 | F1: 0.4380
  -> Validation loss did not improve. Counter: 2/5


Epoch 18 [Train]: 100%|██████████| 123/123 [00:14<00:00,  8.61it/s, loss=1.13]
Validate: 100%|██████████| 16/16 [00:03<00:00,  4.86it/s]



Epoch 18/20
  [Train] Loss: 1.3727 | Acc: 0.4071 | F1: 0.3828
  [Val]   Loss: 1.2576 | Acc: 0.4699 | F1: 0.4461
  -> Validation loss did not improve. Counter: 3/5


Epoch 19 [Train]: 100%|██████████| 123/123 [00:15<00:00,  7.76it/s, loss=1.35]
Validate: 100%|██████████| 16/16 [00:03<00:00,  4.68it/s]



Epoch 19/20
  [Train] Loss: 1.3555 | Acc: 0.4205 | F1: 0.3987
  [Val]   Loss: 1.2167 | Acc: 0.4934 | F1: 0.4459
  -> Validation loss improved to 1.2167. Saving model state.
  -> Best model checkpoint saved to 'best_text_only_model.pth'


Epoch 20 [Train]: 100%|██████████| 123/123 [00:15<00:00,  8.05it/s, loss=1.36]
Validate: 100%|██████████| 16/16 [00:03<00:00,  4.22it/s]



Epoch 20/20
  [Train] Loss: 1.3481 | Acc: 0.4228 | F1: 0.4025
  [Val]   Loss: 1.2956 | Acc: 0.4505 | F1: 0.3932
  -> Validation loss did not improve. Counter: 1/5

Training finished.

--- Evaluating Best TEXT Model on Test Set ---
Loading best model state from 'best_text_only_model.pth'...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Re-initializing model architecture for evaluation...
TextOnlyModel initialized (compatible version) with hidden_size=768, num_classes=5
Best model loaded successfully from file.

Running evaluation on the test set...


Test: 100%|██████████| 16/16 [00:06<00:00,  2.55it/s]


--- Test Set Results ---
  Loss: 1.2648
  Accuracy: 0.4867
  Weighted F1-Score: 0.4440
Test metrics logged and saved to text_only_training_metrics.json






In [11]:
# Free up memory from previous model training
import torch
import gc

# Clear CUDA cache
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Run garbage collection to free up memory
gc.collect()

print("Memory cleared from previous model training.")


# --- Define Configuration for the Run ---
# Re-use most config, paths are handled inside the function based on model_type
run_config_audio = {
    'text_checkpoint': TEXT_CHECKPOINT,
    'audio_checkpoint': AUDIO_CHECKPOINT,
    'text_model_save_path': 'best_text_only_model.pth',
    'text_metrics_save_path': 'text_only_training_metrics.json',
    'audio_model_save_path': 'best_audio_only_model.pth',
    'audio_metrics_save_path': 'audio_only_training_metrics.json',
    'num_classes': NUM_CLASSES,
    'freeze_base_model': FREEZE_BASE_MODEL,
    'learning_rate': LEARNING_RATE,
    'n_epochs': N_EPOCHS,
    'patience': PATIENCE,
}

# --- Run Training for Audio Model ---
try:
     # Pass necessary variables: model type, device, config dict, data loaders dict
    run_unimodal_training(
        model_type='audio',
        device=DEVICE,
        config=run_config_audio,
        data_loaders=data_loaders # data_loaders defined in Block 3
    )
except NameError as e:
    print(f"Error calling training function for AUDIO model: Required variable not defined ({e}). Check previous blocks.")
except Exception as e:
    print(f"An unexpected error occurred during AUDIO model training execution: {e}")
    traceback.print_exc()

print("\n--- All Training Runs Complete ---")

Memory cleared from previous model training.

Loading base pre-trained model: facebook/hubert-base-ls960...
Freezing base model parameters.
Instantiating AudioOnlyModel...
AudioOnlyModel initialized (compatible version) with hidden_size=768, num_classes=5
AudioOnlyModel instantiated successfully.
Optimizer: AdamW with LR=0.003
Number of trainable parameters: 1185029

Starting training for 20 epochs...


Epoch 1 [Train]:   0%|          | 0/123 [00:07<?, ?it/s]


Error during epoch 1: CUDA out of memory. Tried to allocate 3.52 GiB. GPU 0 has a total capacity of 10.00 GiB of which 0 bytes is free. Of the allocated memory 21.57 GiB is allocated by PyTorch, and 54.26 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Stopping training for this model type due to error.

--- All Training Runs Complete ---



Traceback (most recent call last):
  File "/tmp/ipykernel_13444/3535036055.py", line 114, in run_unimodal_training
    train_loss, train_acc, train_f1 = trainer.train_one_epoch(data_loaders['train'], epoch)
                                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/mnt/i/Github/csci535_project/trainer.py", line 35, in train_one_epoch
    loss, preds, labels = self.step(batch)
                          ^^^^^^^^^^^^^^^^
  File "/mnt/i/Github/csci535_project/trainer.py", line 22, in step
    logits = self.model(text_inputs, audio_inputs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/aaaab/anaconda3/envs/535ml/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/aaaab/anaconda3/envs/535ml/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
    return forward_ca