# Phase 1, Step 3: Deploy (Training & Evaluation)

**Objective:** To execute the full continual learning experiment, producing the core results that demonstrate the HGC architecture's resistance to catastrophic forgetting.

This notebook will:
1.  **Train Baseline & HGC Models on Task A:** Establish initial expertise.
2.  **Update Models with Task B:** Use costly retraining for the baseline and near-zero-cost superposition for the HGC model.
3.  **Evaluate All Models:** Test all model versions on both Task A and Task B test sets to quantify knowledge retention and acquisition.

The final output will be a results table showing the perplexity scores, which will form the empirical basis for our paper and thesis chapter.

## 1. Setup and Configuration

Load libraries, datasets, and define the training configuration. We'll set up everything needed for both models.

In [None]:
import torch
import pandas as pd
import numpy as np
import os
import sys
import math
from tqdm.notebook import tqdm

from datasets import Dataset
from transformers import (
    BertForMaskedLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments
)

# Add project root to path to import our custom modules
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from hgc_core.hgc_module import HolographicKnowledgeManifold
from notebooks.p1s2 import BertForMaskedLM_With_HGC # Assuming p1s2.ipynb contains the class

# --- Configuration ---
# Filesystem locations, relative to the notebook's working directory.
DATA_DIR = "../data/"
RESULTS_DIR = "../results/models/"

# Model settings: pretrained checkpoint name and the dimensionality handed to
# the HGC model constructor.
MODEL_NAME = "bert-base-uncased"
HKM_DIMENSIONALITY = 4096

# Training schedule.
EPOCHS = 3
BATCH_SIZE = 8  # Per-device batch size; lower it if you run out of VRAM
GRAD_ACCUM_STEPS = 4  # 8 * 4 = effective batch size of 32

# Saved models and the final CSV are written here, so create it up front.
os.makedirs(RESULTS_DIR, exist_ok=True)

print(f"CUDA available: {torch.cuda.is_available()}")

### 1.1 Load and Tokenize Datasets

In [None]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def load_and_tokenize(task_name):
    """Load and tokenize the train/val/test splits of one task.

    For each split, reads ``{task_name}_{split}.parquet`` from ``DATA_DIR``,
    converts it to a Hugging Face ``Dataset`` and tokenizes its ``text``
    column with the module-level ``tokenizer``. The raw ``text`` and
    ``source`` columns are dropped from the result.

    Args:
        task_name: File-name prefix of the task, e.g. ``'task_a'``.

    Returns:
        dict mapping ``'train'``, ``'val'`` and ``'test'`` to the tokenized
        dataset of that split.
    """
    def tokenize_batch(examples):
        # Every example is truncated/padded to exactly 128 tokens.
        return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)

    splits = {}
    for split in ['train', 'val', 'test']:
        df = pd.read_parquet(os.path.join(DATA_DIR, f"{task_name}_{split}.parquet"))
        # preserve_index=False: a non-default pandas index (e.g. left over from
        # a shuffled split) would otherwise become an extra '__index_level_0__'
        # column that is not removed below and ends up among the model inputs.
        hg_dataset = Dataset.from_pandas(df, preserve_index=False)
        splits[split] = hg_dataset.map(
            tokenize_batch,
            batched=True,
            remove_columns=['text', 'source'],
        )
    return splits

# Tokenize every split of both tasks up front. Each result is a dict keyed by
# 'train', 'val' and 'test'.
print("Loading and tokenizing Task A...")
task_a_datasets = load_and_tokenize('task_a')
print("Loading and tokenizing Task B...")
task_b_datasets = load_and_tokenize('task_b')

# Data collator will dynamically handle masking for the MLM task
# (mlm_probability=0.15). The same collator instance is reused for training
# and for the final evaluation.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
print("\nData preparation complete.")

## 2. Part 1: Initial Training on Task A

We train both the baseline and HGC models on the broad knowledge from Task A. For the HGC model, we use a custom Trainer that populates the Holographic Knowledge Manifold (HKM) during training.

In [None]:
# Define Training Arguments for all trainers
# NOTE(review): this single object is passed to both the baseline Trainer and
# the HGCTrainer, so both write their checkpoints under the same output_dir
# (with overwrite_output_dir=True) — confirm that sharing is intended.
training_args = TrainingArguments(
    output_dir=os.path.join(RESULTS_DIR, 'training_checkpoints'),
    overwrite_output_dir=True,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM_STEPS,
    # Evaluate and checkpoint once per epoch; the two strategies match, which
    # load_best_model_at_end relies on to pick a checkpoint.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=100,
    fp16=torch.cuda.is_available(), # Enable mixed-precision
    # After training, reload the checkpoint with the lowest eval_loss.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

# Custom Trainer for HGC model
class HGCTrainer(Trainer):
    """Trainer that also adds every training batch to the model's HKM.

    After the regular optimisation step for a batch, the batch is run through
    the model once more without gradients and the resulting
    ``projected_vectors`` are passed to ``model.hkm.add_to_manifold``.
    """

    def training_step(self, model, inputs, *args, **kwargs):
        """Run the standard training step, then update the manifold.

        Args:
            model: The model being trained (possibly wrapped for multi-GPU).
            inputs: The batch produced by the data collator.
            *args, **kwargs: Forwarded untouched to ``Trainer.training_step``
                so the override keeps working on transformers releases that
                pass extra arguments (e.g. ``num_items_in_batch``).

        Returns:
            The loss returned by ``Trainer.training_step``.
        """
        # Standard forward pass and loss calculation
        loss = super().training_step(model, inputs, *args, **kwargs)

        # HGC-specific step: update the manifold
        with torch.no_grad():
            # The parent step works on its own prepared copy of the batch, so
            # depending on the transformers version `inputs` may still live on
            # the CPU here. Preparing it again is safe when it is already on
            # the right device.
            prepared_inputs = self._prepare_inputs(inputs)
            # We need to get the projected vectors from the model output
            outputs = model(**prepared_inputs)
            projected_vectors = outputs['projected_vectors'].detach()
            # DataParallel/DDP wrappers expose the real model as `.module`.
            hgc_model = getattr(model, "module", model)
            hgc_model.hkm.add_to_manifold(projected_vectors)

        return loss

# Baseline: plain BertForMaskedLM trained on Task A with the stock Trainer.
print("--- Training Baseline Model on Task A ---")
baseline_model_a = BertForMaskedLM.from_pretrained(MODEL_NAME)
trainer_baseline = Trainer(
    model=baseline_model_a,
    args=training_args,
    train_dataset=task_a_datasets['train'],
    eval_dataset=task_a_datasets['val'],
    data_collator=data_collator,
)
trainer_baseline.train()
# Saved in Hugging Face format (a directory) so it can be reloaded with
# from_pretrained during evaluation.
baseline_model_a.save_pretrained(os.path.join(RESULTS_DIR, 'baseline_task_a'))
print("\nBaseline model trained on Task A and saved.")

# HGC: same data and arguments, but trained through HGCTrainer so the manifold
# is updated on every training step.
print("\n--- Training HGC Model on Task A ---")
hgc_model_a = BertForMaskedLM_With_HGC(MODEL_NAME, HKM_DIMENSIONALITY)
trainer_hgc = HGCTrainer(
    model=hgc_model_a,
    args=training_args,
    train_dataset=task_a_datasets['train'],
    eval_dataset=task_a_datasets['val'],
    data_collator=data_collator,
)
trainer_hgc.train()
# Saved as a raw state_dict (single .pt file), unlike the baseline.
# NOTE(review): the manifold is only persisted if the HKM keeps its state in
# registered parameters/buffers — confirm in hgc_core.
torch.save(hgc_model_a.state_dict(), os.path.join(RESULTS_DIR, 'hgc_task_a.pt'))
print("\nHGC model trained on Task A and saved.")

## 3. Part 2: Knowledge Update with Task B

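Here we simulate the knowledge update. The baseline model is updated by further gradient-based training on Task B, incurring significant cost. The HGC model is updated via fast, computationally cheap superposition: a single gradient-free forward pass over the Task B training data whose projected vectors are added to the manifold.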

In [None]:
print("--- Updating Baseline Model with Task B (Re-training) ---")
# The baseline trainer is already configured. We just call train again,
# continuing from the Task A weights but now on the Task B splits.
trainer_baseline.train_dataset = task_b_datasets['train']
trainer_baseline.eval_dataset = task_b_datasets['val']
trainer_baseline.train()
trainer_baseline.model.save_pretrained(os.path.join(RESULTS_DIR, 'baseline_task_ab'))
print("\nBaseline model updated with Task B and saved.")

print("\n--- Updating HGC Model with Task B (Superposition) ---")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
hgc_model_ab = BertForMaskedLM_With_HGC(MODEL_NAME, HKM_DIMENSIONALITY)
# map_location lets a checkpoint saved from the GPU be restored on whatever
# device is available now (including CPU-only hosts).
hgc_model_ab.load_state_dict(
    torch.load(os.path.join(RESULTS_DIR, 'hgc_task_a.pt'), map_location=device)
)
hgc_model_ab.eval() # Set model to evaluation mode
hgc_model_ab.to(device)

# A Hugging Face Dataset yields plain Python lists by default; the default
# DataLoader collation then produces lists of per-position tensors rather than
# (batch, seq) tensors, and `.to(device)` below fails. with_format("torch")
# returns a view of the dataset whose columns come out as tensors.
update_loader = torch.utils.data.DataLoader(
    task_b_datasets['train'].with_format("torch"),
    batch_size=BATCH_SIZE,
)

# No gradients and no optimizer: the weights stay fixed and only the manifold
# is updated, one batch of projected vectors at a time.
with torch.no_grad():
    for batch in tqdm(update_loader, desc="HGC Superposition Update"):
        inputs = {k: v.to(device) for k, v in batch.items()}
        outputs = hgc_model_ab(**inputs)
        projected_vectors = outputs['projected_vectors'].detach()
        hgc_model_ab.hkm.add_to_manifold(projected_vectors)

# NOTE(review): the updated manifold is only persisted if the HKM keeps its
# state in registered parameters/buffers — confirm in hgc_core.
torch.save(hgc_model_ab.state_dict(), os.path.join(RESULTS_DIR, 'hgc_task_ab.pt'))
print("\nHGC model updated with Task B and saved.")

## 4. Part 3: Final Evaluation & Results

This is the final step. We evaluate all four model states (pre- and post-update) on both test sets. The results will definitively show the effect of catastrophic forgetting on the baseline and the knowledge retention of the HGC model.

In [None]:
# Collects one row (a dict) per evaluated model; turned into a DataFrame at
# the end of the evaluation loop.
results = []
# Display name -> (path relative to RESULTS_DIR, is_hgc).
# is_hgc=False: a save_pretrained directory loaded via from_pretrained.
# is_hgc=True: a state_dict file loaded into a fresh BertForMaskedLM_With_HGC.
model_paths = {
    "Baseline (Task A only)": ('baseline_task_a', False),
    "HGC (Task A only)": ('hgc_task_a.pt', True),
    "Baseline (Updated on B)": ('baseline_task_ab', False),
    "HGC (Updated on B)": ('hgc_task_ab.pt', True)
}

def evaluate_model(model, dataset):
    """Return the perplexity of ``model`` on ``dataset``.

    A throwaway ``Trainer`` (output directory ``./eval_tmp``) runs the
    evaluation with the shared MLM ``data_collator``; perplexity is the
    exponential of the reported ``eval_loss``.

    Args:
        model: The model to evaluate.
        dataset: Tokenized dataset to evaluate on.

    Returns:
        ``exp(eval_loss)`` as a float.
    """
    eval_args = TrainingArguments(output_dir="./eval_tmp", per_device_eval_batch_size=BATCH_SIZE)
    evaluator = Trainer(model=model, args=eval_args, data_collator=data_collator)
    metrics = evaluator.evaluate(eval_dataset=dataset)
    mean_loss = metrics['eval_loss']
    return math.exp(mean_loss)

# Evaluate each saved model state on both test sets and record its perplexity.
for model_name, (path, is_hgc) in model_paths.items():
    print(f"\n--- Evaluating: {model_name} ---")
    full_path = os.path.join(RESULTS_DIR, path)

    if is_hgc:
        model = BertForMaskedLM_With_HGC(MODEL_NAME, HKM_DIMENSIONALITY)
        # Load onto the CPU: the checkpoint may have been saved from the GPU,
        # and the evaluation Trainer moves the model to its own device anyway.
        model.load_state_dict(torch.load(full_path, map_location="cpu"))
    else:
        model = BertForMaskedLM.from_pretrained(full_path)

    # Evaluate on Task A test set
    ppl_a = evaluate_model(model, task_a_datasets['test'])
    print(f"  Perplexity on Task A Test Set: {ppl_a:.4f}")

    # Evaluate on Task B test set
    ppl_b = evaluate_model(model, task_b_datasets['test'])
    print(f"  Perplexity on Task B Test Set: {ppl_b:.4f}")

    results.append({
        'Model': model_name,
        'Perplexity on Task A (Lower is Better)': ppl_a,
        'Perplexity on Task B (Lower is Better)': ppl_b
    })

results_df = pd.DataFrame(results)


In [None]:
# Destination of the results table used for the paper/thesis.
results_csv_path = os.path.join(RESULTS_DIR, 'phase1_final_results.csv')

print("\n--- FINAL RESULTS ---")
display(results_df)

# Written without the DataFrame index so the CSV holds only the three columns.
results_df.to_csv(results_csv_path, index=False)
print("\nResults saved to ../results/models/phase1_final_results.csv")