# Boundary Scorer Training

Train a neural model to predict semantic boundary scores (0-6) using XLM-RoBERTa.

**Data**: 9,773 labeled boundaries from Gemini teacher
**Model**: XLM-R-base with classification head (7 classes)
**Context**: ±5 sentences around each boundary
**Loss**: Weighted CrossEntropyLoss (handles class imbalance)

## 1. Setup

In [None]:
# Clone the project repository; later cells import from its `src` package and
# read data through relative `data/processed/...` paths.
!git clone https://github.com/HBBobo/Intelligent-Chunking.git
# Change into the clone so those relative imports and paths resolve.
%cd Intelligent-Chunking

In [None]:
# Install dependencies (-q keeps pip output quiet).
# NOTE(review): matplotlib is imported later but not listed here — presumably
# preinstalled in the Colab runtime; confirm if running elsewhere.
!pip install -q transformers torch scipy scikit-learn tqdm

In [None]:
# Attach Google Drive to the runtime; the trained model is written there
# in the final "Save Model" section.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
import json
import random
import sys
from pathlib import Path

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import AutoModel, AutoTokenizer, get_linear_schedule_with_warmup
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tqdm.auto import tqdm
import matplotlib.pyplot as plt

# Put the current directory (the repo root after `%cd`) at the front of the
# import path so the `src.*` modules resolve.
sys.path.insert(0, '.')

# Imported up front so the DP chunking demo near the end can call it directly.
from src.training.evaluate import dp_chunk_document

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

## 2. Load Data

In [None]:
# Data paths (relative to the root of the cloned repo)
BOUNDARIES_PATH = Path('data/processed/all_training_data.jsonl')
SENTENCES_DIR = Path('data/processed/sentences')

# Fail fast with an explicit exception instead of `assert`, which is stripped
# when Python runs with -O and would let a missing path slip through.
if not BOUNDARIES_PATH.exists():
    raise FileNotFoundError(f"Boundaries file not found: {BOUNDARIES_PATH}")
if not SENTENCES_DIR.exists():
    raise FileNotFoundError(f"Sentences dir not found: {SENTENCES_DIR}")

# Count labeled boundaries: one record per non-blank line of the JSONL file.
# Explicit UTF-8 keeps the read independent of the platform's default locale.
with open(BOUNDARIES_PATH, encoding='utf-8') as f:
    n_boundaries = sum(1 for line in f if line.strip())

# Each `*.json` file in the sentences directory is counted as one document.
n_docs = sum(1 for _ in SENTENCES_DIR.glob('*.json'))

print(f'Boundaries: {n_boundaries}')
print(f'Documents: {n_docs}')

## 3. Dataset & DataLoader

In [None]:
# Import from our training module
from src.training.dataset import BoundaryDataset, get_doc_splits

In [None]:
# --- Training hyperparameters ---
MODEL_NAME = 'xlm-roberta-base'
CONTEXT_SIZE = 5      # sentences of context taken on each side of a boundary
MAX_LENGTH = 512      # tokenizer truncation / padding length
BATCH_SIZE = 16
LEARNING_RATE = 2e-5  # Reduced for stability
EPOCHS = 5
SEED = 42

# --- Classification head / regularization ---
FREEZE_LAYERS = 9  # Freeze 9 of 12 layers to reduce overfitting
DROPOUT = 0.3      # Higher dropout for regularization
NUM_CLASSES = 7    # Scores 0-6

# Seed every random number generator the notebook touches so runs are repeatable.
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

In [None]:
# Tokenizer belonging to the pretrained checkpoint named in MODEL_NAME
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Partition the document IDs into train / val / test using the shared seed
doc_splits = get_doc_splits(SENTENCES_DIR, seed=SEED)
train_ids, val_ids, test_ids = doc_splits
print(f'Train: {len(train_ids)} docs, Val: {len(val_ids)} docs, Test: {len(test_ids)} docs')

In [None]:
def _make_split_dataset(doc_ids):
    """Build a BoundaryDataset restricted to the given document IDs."""
    return BoundaryDataset(
        BOUNDARIES_PATH, SENTENCES_DIR, tokenizer,
        context_size=CONTEXT_SIZE, doc_ids=set(doc_ids)
    )


# One dataset per split; all three read the same boundary file and sentence
# directory and differ only in which documents they keep.
train_dataset = _make_split_dataset(train_ids)
val_dataset = _make_split_dataset(val_ids)
test_dataset = _make_split_dataset(test_ids)

print(f'Train samples: {len(train_dataset)}')
print(f'Val samples: {len(val_dataset)}')
print(f'Test samples: {len(test_dataset)}')

In [None]:
# Batch the splits. Only the training loader shuffles; validation and test
# keep the dataset order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
val_loader, test_loader = (
    DataLoader(split_dataset, batch_size=BATCH_SIZE)
    for split_dataset in (val_dataset, test_dataset)
)

## 4. Model

In [None]:
# Import model from our training module
from src.training.model import BoundaryScorer

In [None]:
# Build the scorer with its classification head and place it on the chosen device
model = BoundaryScorer(
    MODEL_NAME,
    freeze_layers=FREEZE_LAYERS,
    dropout=DROPOUT,
    num_classes=NUM_CLASSES
).to(device)

# Tally parameters in one pass: all of them, and the subset that will
# receive gradients (requires_grad=True).
total_params = 0
trainable_params = 0
for param in model.parameters():
    n_elements = param.numel()
    total_params += n_elements
    if param.requires_grad:
        trainable_params += n_elements

print(f'Total parameters: {total_params:,}')
print(f'Trainable parameters: {trainable_params:,}')
print(f'Frozen layers: {FREEZE_LAYERS}/12')

## 5. Training

In [None]:
# Import evaluation function
from src.training.evaluate import evaluate as evaluate_model_fn

def evaluate_model(model, loader):
    """Evaluate `model` on `loader` using the notebook-wide `device`.

    Thin adapter around the imported `evaluate_model_fn`; its result is
    returned unchanged.
    """
    return evaluate_model_fn(model, loader, device)

In [None]:
# Compute class weights for imbalanced data
from src.training.trainer import compute_class_weights

class_weights = compute_class_weights(BOUNDARIES_PATH, NUM_CLASSES)
print(f'Class weights: {class_weights.tolist()}')
# Report the actual ratio between the score-5 and score-1 weights. The raw
# class_weights[5] only equals "x more weight than score 1" when the score-1
# weight happens to be exactly 1.
weight_ratio = (class_weights[5] / class_weights[1]).item()
print(f'Weights normalized: high scores get ~{weight_ratio:.1f}x more weight than score 1')

# Setup optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=0.01)

# The scheduler is stepped once per batch, so the schedule length is
# (batches per epoch) x (epochs); the first 10% of steps are linear warmup.
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(total_steps * 0.1),
    num_training_steps=total_steps
)

# Weighted classification loss for 7 classes (scores 0-6)
criterion = nn.CrossEntropyLoss(weight=class_weights.to(device))

In [None]:
# Training loop: optimize the weighted cross-entropy on the train split and
# keep the weights from the epoch with the lowest validation MSE.
# Note: history['train_loss'] is cross-entropy, history['val_loss'] is MSE.
history = {'train_loss': [], 'val_loss': [], 'val_pearson': []}
best_val_loss = float('inf')
best_state = None

for epoch in range(EPOCHS):
    model.train()
    train_losses = []

    pbar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{EPOCHS}')
    for batch in pbar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        scores = batch['score'].to(device)  # Integer class labels

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)  # [batch, 7]
        loss = criterion(logits, scores)  # CrossEntropyLoss expects logits and integer labels
        loss.backward()

        # Clip the global gradient norm to 1.0 before the update
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()  # the LR schedule advances once per batch

        batch_loss = loss.item()
        train_losses.append(batch_loss)
        pbar.set_postfix({'loss': f'{batch_loss:.4f}'})

    # Validation
    val_metrics = evaluate_model(model, val_loader)

    avg_train_loss = np.mean(train_losses)
    history['train_loss'].append(avg_train_loss)
    history['val_loss'].append(val_metrics['mse'])
    history['val_pearson'].append(val_metrics['pearson'])

    print(f"\nEpoch {epoch+1}: train_loss={avg_train_loss:.4f}, "
          f"val_mse={val_metrics['mse']:.4f}, val_pearson={val_metrics['pearson']:.4f}")

    # Save best model.
    # state_dict() hands back references to the live parameter tensors and
    # dict.copy() is shallow, so a plain copy would keep changing as training
    # continues and the "restore" below would be a no-op. Clone every tensor
    # (onto the CPU, to avoid holding a second copy in GPU memory).
    if val_metrics['mse'] < best_val_loss:
        best_val_loss = val_metrics['mse']
        best_state = {
            name: tensor.detach().cpu().clone()
            for name, tensor in model.state_dict().items()
        }

# Restore best model (load_state_dict copies the values back onto the model's device)
if best_state is not None:
    model.load_state_dict(best_state)

## 6. Evaluation

In [None]:
# Plot training history
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# The two curves are different quantities: the training curve is the mean
# cross-entropy per epoch, the validation curve is the MSE returned by the
# evaluation function. Label each explicitly instead of calling both "MSE".
axes[0].plot(history['train_loss'], label='Train (cross-entropy)')
axes[0].plot(history['val_loss'], label='Val (MSE)')
axes[0].set_xlabel('Epoch')
axes[0].set_ylabel('Loss')
axes[0].legend()
axes[0].set_title('Loss')

axes[1].plot(history['val_pearson'])
axes[1].set_xlabel('Epoch')
axes[1].set_ylabel('Pearson Correlation')
axes[1].set_title('Validation Correlation')

plt.tight_layout()
plt.show()

In [None]:
# Final evaluation on the held-out test split
model.eval()
pred_values, target_values = [], []

with torch.no_grad():
    for batch in test_loader:
        logits = model(
            batch['input_ids'].to(device),
            batch['attention_mask'].to(device)
        )

        # Soft prediction: the expected score under the predicted class
        # distribution, i.e. sum over classes of P(class) * class index.
        probs = torch.softmax(logits, dim=-1)
        classes = torch.arange(NUM_CLASSES, device=device).float()
        expected_score = (probs * classes).sum(dim=-1)

        pred_values.extend(expected_score.cpu().numpy())
        target_values.extend(batch['score'].numpy())

preds = np.array(pred_values)
targets = np.array(target_values).astype(float)

# Correlation and error metrics between model scores and teacher labels
pearson_r = pearsonr(preds, targets)[0]
spearman_rho = spearmanr(preds, targets)[0]
test_metrics = {
    'pearson': pearson_r,
    'spearman': spearman_rho,
    'mse': mean_squared_error(targets, preds),
    'mae': mean_absolute_error(targets, preds)
}

print('Test Set Results:')
print(f"  Pearson correlation: {test_metrics['pearson']:.4f}")
print(f"  Spearman correlation: {test_metrics['spearman']:.4f}")
print(f"  MSE: {test_metrics['mse']:.4f}")
print(f"  MAE: {test_metrics['mae']:.4f}")

In [None]:
# Compare teacher and model scores: overlaid histograms on the left,
# per-sample scatter on the right.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
hist_ax, scatter_ax = axes

# Half-point bins from 0 up to 6.5
bins = np.arange(0, 7, 0.5)

for values, series_label in ((targets, 'Teacher'), (preds, 'Model')):
    hist_ax.hist(values, bins=bins, alpha=0.7, label=series_label)
hist_ax.set_xlabel('Score')
hist_ax.set_ylabel('Count')
hist_ax.legend()
hist_ax.set_title('Score Distribution')

# Scatter plot, with the y = x diagonal marking a perfect prediction
scatter_ax.scatter(targets, preds, alpha=0.3)
scatter_ax.plot([0, 6], [0, 6], 'r--', label='Perfect')
scatter_ax.set_xlabel('Teacher Score')
scatter_ax.set_ylabel('Model Score')
scatter_ax.set_title('Prediction vs Target')
scatter_ax.legend()

plt.tight_layout()
plt.show()

## 7. DP Chunking Demo

In [None]:
# dp_chunk_document was already imported in the Setup section (the imports cell)

In [None]:
# Pick a demo document: the first test ID when the split is non-empty,
# otherwise the first document the test dataset holds.
if test_ids:
    demo_doc_id = test_ids[0]
else:
    demo_doc_id = list(test_dataset.sentences.keys())[0]
demo_sents = test_dataset.sentences[demo_doc_id]

print(f'Document: {demo_doc_id}')
print(f'Sentences: {len(demo_sents)}')

# Score every boundary between consecutive sentences of this document
model.eval()
boundary_scores = []

with torch.no_grad():
    for boundary in range(len(demo_sents) - 1):
        split_at = boundary + 1  # index of the first sentence right of the boundary
        # Up to CONTEXT_SIZE sentences on each side; slicing clamps at the document ends
        left_ctx = demo_sents[max(0, split_at - CONTEXT_SIZE):split_at]
        right_ctx = demo_sents[split_at:split_at + CONTEXT_SIZE]
        text = ''.join([
            ' '.join(left_ctx),
            f' {tokenizer.sep_token} ',
            ' '.join(right_ctx),
        ])

        encoding = tokenizer(text, max_length=MAX_LENGTH, truncation=True,
                             padding='max_length', return_tensors='pt')

        logits = model(
            encoding['input_ids'].to(device),
            encoding['attention_mask'].to(device)
        )
        # Expected value over the class distribution gives a smoother score
        # than taking the argmax class.
        probs = torch.softmax(logits, dim=-1)
        classes = torch.arange(NUM_CLASSES, device=device).float()
        expected_score = (probs * classes).sum(dim=-1)
        boundary_scores.append(expected_score.item())

demo_scores = np.array(boundary_scores)
print(f'\nPredicted scores (first 20): {demo_scores[:20].round(1)}')

In [None]:
# Run DP chunking over the demo document using the predicted boundary scores
chunks = dp_chunk_document(demo_sents, demo_scores.tolist())
print(f'\nChunks: {len(chunks)}')

# Preview the first 3 chunks, at most 5 sentences each, each sentence cut to 80 chars
for chunk_no, (start, end) in enumerate(chunks[:3], start=1):
    print(f'\n{"="*60}')
    print(f'CHUNK {chunk_no} (sentences {start+1}-{end})')
    print('='*60)
    for sent_idx in range(start, min(end, start + 5)):
        full_sent = demo_sents[sent_idx]
        if len(full_sent) > 80:
            sent = full_sent[:80] + '...'
        else:
            sent = full_sent
        print(f'  [{sent_idx+1}] {sent}')
    hidden_count = end - start - 5
    if hidden_count > 0:
        print(f'  ... ({hidden_count} more sentences)')
    # Print the split score only when index end-1 exists in demo_scores
    # (there is one score per gap between consecutive sentences).
    if end <= len(demo_scores):
        print(f'  -- SPLIT (score: {demo_scores[end-1]:.1f}) --')

## 8. Save Model

In [None]:
# Save to Google Drive
SAVE_PATH = '/content/drive/MyDrive/ChunkingNN/models/boundary_scorer_v1'
# Create the target directory (and any missing parents) from Python rather
# than through a shell escape; a no-op when it already exists.
Path(SAVE_PATH).mkdir(parents=True, exist_ok=True)

# Save model weights
torch.save(model.state_dict(), f'{SAVE_PATH}/model.pt')

# Save tokenizer
tokenizer.save_pretrained(SAVE_PATH)

# Save config. Every value comes from the constants the model was actually
# built with, so the file cannot drift from the trained configuration
# (freeze_layers was previously hard-coded to 6 while training used FREEZE_LAYERS).
# dropout and num_classes are recorded too, since they are the other
# constructor arguments needed to rebuild the model.
config = {
    'model_name': MODEL_NAME,
    'context_size': CONTEXT_SIZE,
    'max_length': MAX_LENGTH,
    'freeze_layers': FREEZE_LAYERS,
    'dropout': DROPOUT,
    'num_classes': NUM_CLASSES,
    'test_pearson': float(test_metrics['pearson']),
    'test_mse': float(test_metrics['mse'])
}
with open(f'{SAVE_PATH}/config.json', 'w', encoding='utf-8') as f:
    json.dump(config, f, indent=2)

print(f'Model saved to {SAVE_PATH}')

In [None]:
# To load later:
# from src.training.model import BoundaryScorer
# model = BoundaryScorer('xlm-roberta-base', freeze_layers=9, dropout=0.3, num_classes=7)
# model.load_state_dict(torch.load(f'{SAVE_PATH}/model.pt'))
# model.eval()