In [1]:
import pandas as pd
import numpy as np
import torch
import os
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


## Why Add Auxiliary Features?

Linear probes typically train on model activations alone: `[N, D]`

By adding auxiliary features like topic, we enrich the feature space: `[N, D+F]`

This allows the probe to learn:
- **Pure activation patterns** (what the model represents internally)
- **Topic-specific biases** (some topics are inherently harder)
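- **Interaction effects** (how activations interact with topic difficulty) — only if explicit interaction (product) features are added, because a linear probe is additive in its inputs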

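The linear probe learns one weight per input dimension across both sources of information. Note that `topic_num` is an integer code for a categorical variable, so a single linear weight treats it as ordinal; one-hot encoding the topic would let the probe learn a separate bias per topic.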

## Function: Add Topic to Activation Files

In [2]:
def add_topic_to_activations(activation_path, sr_dataset_path, output_path):
    """
    Add topic_num as an auxiliary feature to an existing activation file.

    The activation dictionary is loaded with ``torch.load``, a ``[N, 1]`` float32
    tensor of topic codes is stored under ``'aux_features'`` (together with
    ``'aux_feature_names' = ['topic_num']``), and the enriched dictionary is
    written to ``output_path`` with ``torch.save``.

    Args:
        activation_path: Path to existing .pt activation file
        sr_dataset_path: Path to parquet file with topic information
        output_path: Where to save the enriched activation file

    Returns:
        data: The enriched activation dictionary

    Raises:
        ValueError: If the parquet file does not have exactly one row per sample
            stored under ``'activations'`` / ``'labels'`` in the activation file.
    """
    # Load activations
    print(f"Loading activations from {activation_path}")
    data = torch.load(activation_path)

    # Load SR dataset with topic info
    print(f"Loading topic data from {sr_dataset_path}")
    df = pd.read_parquet(sr_dataset_path)

    # The aux features are matched to the activations purely by row position, so a
    # row-count mismatch would silently attach topics to the wrong samples.
    n_rows = len(df)
    for key in ('activations', 'labels'):
        if key in data and len(data[key]) != n_rows:
            raise ValueError(
                f"Row count mismatch: {sr_dataset_path} has {n_rows} rows but "
                f"'{key}' in {activation_path} has {len(data[key])} samples"
            )

    # If topic doesn't exist, add it from MATH dataset
    topic_was_missing = 'topic' not in df.columns
    if topic_was_missing:
        print("Topic not found, loading from MATH dataset...")
        math_ds = load_dataset("DigitalLearningGmbH/MATH-lighteval", "default")
        # Decide the split from the file name first, so that a directory called
        # e.g. "training_runs/" cannot turn a test file into a train lookup.
        # Fall back to the whole path when the file name carries no hint.
        path_text = str(sr_dataset_path)
        file_name = os.path.basename(path_text)
        if "train" in file_name:
            split = "train"
        elif "test" in file_name:
            split = "test"
        else:
            split = "train" if "train" in path_text else "test"
        df["topic"] = math_ds[split]["type"]

    # (Re)compute topic_num when the topic column was just loaded or when the
    # parquet file has no topic_num yet. sort=True numbers the topics in sorted
    # order instead of order of first appearance, so train and test files get the
    # same code for the same topic (as long as both contain the same topic set).
    if topic_was_missing or 'topic_num' not in df.columns:
        df["topic_num"] = pd.factorize(df["topic"], sort=True)[0]

    # Extract topic_num as auxiliary features, one column: shape [N, 1]
    topic_nums = torch.tensor(df["topic_num"].values, dtype=torch.float32).reshape(-1, 1)

    # Add to activation data
    data['aux_features'] = topic_nums
    data['aux_feature_names'] = ['topic_num']

    # Save enriched activations
    print(f"Saving enriched activations to {output_path}")
    torch.save(data, output_path)
    print(f"✓ Added aux_features with shape {topic_nums.shape}")
    print(f"✓ Feature names: {data['aux_feature_names']}")

    return data

## Example Usage: Add Topic to Activation Files

In [12]:
# Experiment identifiers: together they select which activation / SR files are used
MODEL_NAME = "Qwen-Qwen2.5-Math-1.5B-Instruct"
MAXLEN = 3000
K = 5
TEMP = 1.0

# Root directories
ACTIVATION_DIR = "../DATA/activations"
SR_DATA_DIR = "../DATA/SR_DATA/MATH"

# Name fragment shared by every file that belongs to this run
_RUN_TAG = f"{MODEL_NAME}_maxlen_{MAXLEN}_k_{K}_temp_{TEMP}"

# Train split: input activations, SR parquet, enriched output
TRAIN_ACTIVATION_PATH = f"{ACTIVATION_DIR}/{_RUN_TAG}_train.pt"
TRAIN_SR_PATH = f"{SR_DATA_DIR}/train-{_RUN_TAG}.parquet"
TRAIN_OUTPUT_PATH = f"{ACTIVATION_DIR}/{_RUN_TAG}_train_with_topic.pt"

# Test split: input activations, SR parquet, enriched output
TEST_ACTIVATION_PATH = f"{ACTIVATION_DIR}/{_RUN_TAG}_test.pt"
TEST_SR_PATH = f"{SR_DATA_DIR}/test-{_RUN_TAG}.parquet"
TEST_OUTPUT_PATH = f"{ACTIVATION_DIR}/{_RUN_TAG}_test_with_topic.pt"

# Echo the chosen settings so they are recorded next to the results
print(
    "Configuration:",
    f"  Model: {MODEL_NAME}",
    f"  Max length: {MAXLEN}",
    f"  K samples: {K}",
    f"  Temperature: {TEMP}",
    sep="\n",
)

Configuration:
  Model: Qwen-Qwen2.5-Math-1.5B-Instruct
  Max length: 3000
  K samples: 5
  Temperature: 1.0


In [13]:
# Enrich the train-split activations with the topic feature
print("=" * 60, "PROCESSING TRAIN SET", "=" * 60, sep="\n")

if not os.path.exists(TRAIN_ACTIVATION_PATH):
    # Missing input is reported but does not abort the notebook
    print(f"ERROR: Activation file not found: {TRAIN_ACTIVATION_PATH}")
else:
    train_data_enriched = add_topic_to_activations(
        TRAIN_ACTIVATION_PATH,
        TRAIN_SR_PATH,
        TRAIN_OUTPUT_PATH,
    )
    print("\n✓ Train set processing complete!\n")

PROCESSING TRAIN SET
Loading activations from ../DATA/activations/Qwen-Qwen2.5-Math-1.5B-Instruct_maxlen_3000_k_5_temp_1.0_train.pt
Loading topic data from ../DATA/SR_DATA/MATH/train-Qwen-Qwen2.5-Math-1.5B-Instruct_maxlen_3000_k_5_temp_1.0.parquet
Loading topic data from ../DATA/SR_DATA/MATH/train-Qwen-Qwen2.5-Math-1.5B-Instruct_maxlen_3000_k_5_temp_1.0.parquet
Saving enriched activations to ../DATA/activations/Qwen-Qwen2.5-Math-1.5B-Instruct_maxlen_3000_k_5_temp_1.0_train_with_topic.pt
Saving enriched activations to ../DATA/activations/Qwen-Qwen2.5-Math-1.5B-Instruct_maxlen_3000_k_5_temp_1.0_train_with_topic.pt
✓ Added aux_features with shape torch.Size([7500, 1])
✓ Feature names: ['topic_num']
✓ Added aux_features with shape torch.Size([7500, 1])
✓ Feature names: ['topic_num']

✓ Train set processing complete!


✓ Train set processing complete!



In [14]:
# Enrich the test-split activations with the topic feature
print("=" * 60, "PROCESSING TEST SET", "=" * 60, sep="\n")

if not os.path.exists(TEST_ACTIVATION_PATH):
    # Missing input is reported but does not abort the notebook
    print(f"ERROR: Activation file not found: {TEST_ACTIVATION_PATH}")
else:
    test_data_enriched = add_topic_to_activations(
        TEST_ACTIVATION_PATH,
        TEST_SR_PATH,
        TEST_OUTPUT_PATH,
    )
    print("\n✓ Test set processing complete!\n")

PROCESSING TEST SET
Loading activations from ../DATA/activations/Qwen-Qwen2.5-Math-1.5B-Instruct_maxlen_3000_k_5_temp_1.0_test.pt
Loading topic data from ../DATA/SR_DATA/MATH/test-Qwen-Qwen2.5-Math-1.5B-Instruct_maxlen_3000_k_5_temp_1.0.parquet
Loading topic data from ../DATA/SR_DATA/MATH/test-Qwen-Qwen2.5-Math-1.5B-Instruct_maxlen_3000_k_5_temp_1.0.parquet
Saving enriched activations to ../DATA/activations/Qwen-Qwen2.5-Math-1.5B-Instruct_maxlen_3000_k_5_temp_1.0_test_with_topic.pt
Saving enriched activations to ../DATA/activations/Qwen-Qwen2.5-Math-1.5B-Instruct_maxlen_3000_k_5_temp_1.0_test_with_topic.pt
✓ Added aux_features with shape torch.Size([5000, 1])
✓ Feature names: ['topic_num']
✓ Added aux_features with shape torch.Size([5000, 1])
✓ Feature names: ['topic_num']

✓ Test set processing complete!


✓ Test set processing complete!



## Verify the Enriched Activation Files

In [15]:
# Reload the enriched train file from disk and print what it contains
if os.path.exists(TRAIN_OUTPUT_PATH):
    enriched_data = torch.load(TRAIN_OUTPUT_PATH)

    # Stored keys first, then the shape of each array of interest
    print("Enriched activation file contents:", f"  Keys: {list(enriched_data.keys())}", sep="\n")
    print(f"\nActivations shape: {enriched_data['activations'].shape}")
    print(f"Labels shape: {enriched_data['labels'].shape}")
    print(f"Aux features shape: {enriched_data['aux_features'].shape}")
    print(f"Aux feature names: {enriched_data['aux_feature_names']}")

    # Topic codes at both ends of the file, flattened to 1-D for compact printing
    print("\nTopic distribution (first 20 samples):")
    print(enriched_data['aux_features'][:20].reshape(-1).numpy())

    print("\nTopic distribution (last 20 samples):")
    print(enriched_data['aux_features'][-20:].reshape(-1).numpy())

Enriched activation file contents:
  Keys: ['activations', 'labels', 'layer_indices', 'positions', 'n_eoi_tokens', 'd_model', 'model_name', 'aux_features', 'aux_feature_names']

Activations shape: torch.Size([7500, 29, 5, 1536])
Labels shape: torch.Size([7500])
Aux features shape: torch.Size([7500, 1])
Aux feature names: ['topic_num']

Topic distribution (first 20 samples):
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]

Topic distribution (last 20 samples):
[6. 6. 6. 6. 6. 6. 6. 6. 6. 6. 6. 6. 6. 6. 6. 6. 6. 6. 6. 6.]


## Function: Add Multiple Auxiliary Features

You can extend this to add multiple features at once (e.g., topic, question_length, difficulty_level)

In [None]:
def add_multiple_features_to_activations(activation_path, sr_dataset_path, output_path, feature_names):
    """
    Add multiple auxiliary features to activation file.

    Each requested column that exists in the parquet file becomes one column of a
    ``[N, F]`` float32 tensor stored under ``'aux_features'``; the matching column
    names are stored under ``'aux_feature_names'``. Requested columns that are not
    in the parquet file are skipped with a warning.

    Args:
        activation_path: Path to existing .pt activation file
        sr_dataset_path: Path to parquet file with features
        output_path: Where to save the enriched activation file
        feature_names: List of column names to include (e.g., ['topic_num', 'difficulty_level'])

    Returns:
        data: The enriched activation dictionary, or None (nothing is saved) when
            none of the requested features exist in the dataset

    Raises:
        ValueError: If the parquet file does not have exactly one row per sample
            stored under ``'activations'`` / ``'labels'`` in the activation file.
        TypeError: If a requested column cannot be converted to float32.
    """
    # Load activations and dataset
    print(f"Loading activations from {activation_path}")
    data = torch.load(activation_path)

    print(f"Loading features from {sr_dataset_path}")
    df = pd.read_parquet(sr_dataset_path)

    # Features are matched to activations purely by row position, so a row-count
    # mismatch would silently attach features to the wrong samples.
    n_rows = len(df)
    for key in ('activations', 'labels'):
        if key in data and len(data[key]) != n_rows:
            raise ValueError(
                f"Row count mismatch: {sr_dataset_path} has {n_rows} rows but "
                f"'{key}' in {activation_path} has {len(data[key])} samples"
            )

    # Keep only the requested columns that actually exist, preserving their order
    valid_feature_names = []
    for feat in feature_names:
        if feat in df.columns:
            valid_feature_names.append(feat)
        else:
            print(f"Warning: {feat} not found in dataset, skipping")

    if not valid_feature_names:
        print("ERROR: No valid features found!")
        return None

    # Convert column by column so a non-numeric column is reported by name instead
    # of failing later with an opaque dtype error from the tensor constructor.
    columns = []
    for feat in valid_feature_names:
        try:
            column = df[feat].to_numpy(dtype=np.float32)
        except (TypeError, ValueError) as exc:
            raise TypeError(
                f"Feature '{feat}' (dtype {df[feat].dtype}) cannot be converted to float32: {exc}"
            ) from exc
        columns.append(column.reshape(-1, 1))

    aux_features = np.concatenate(columns, axis=1)
    data['aux_features'] = torch.tensor(aux_features, dtype=torch.float32)
    data['aux_feature_names'] = valid_feature_names

    print(f"\n✓ Added {len(valid_feature_names)} auxiliary features: {valid_feature_names}")
    print(f"✓ Aux features shape: {data['aux_features'].shape}")

    # Save
    print(f"Saving to {output_path}")
    torch.save(data, output_path)

    return data

## Example: Add Multiple Features

Uncomment and modify to add custom features from your dataset:

In [None]:
# Example: Add both topic_num and custom difficulty score
# First, you'd need to compute additional features in your SR dataset
#
# NOTE: the snippet below saves the new columns back to TRAIN_SR_PATH, i.e. it
# overwrites the same parquet file it was read from. Write to a different path
# if the original file must stay untouched.
# NOTE(review): it assumes the SR dataset has 'question' and 'solution' string
# columns -- confirm before running.

# # Load SR dataset and add custom features
# train_df = pd.read_parquet(TRAIN_SR_PATH)
# 
# # Example: Create a difficulty score based on question/solution length
# train_df['question_length'] = train_df['question'].str.len()
# train_df['solution_length'] = train_df['solution'].str.len()
# 
# # Save back
# train_df.to_parquet(TRAIN_SR_PATH)
# 
# # Now add multiple features to activations
# enriched_multi = add_multiple_features_to_activations(
#     activation_path=TRAIN_ACTIVATION_PATH,
#     sr_dataset_path=TRAIN_SR_PATH,
#     output_path=TRAIN_OUTPUT_PATH.replace('.pt', '_multi_features.pt'),
#     feature_names=['topic_num', 'question_length', 'solution_length']
# )

## Training Probes with Enriched Features

Once the enriched activation files have been created, use them with `train_probe.py`:

```bash
python scripts/train_probe.py \
  --train_activations activations/model_train_with_topic.pt \
  --test_activations activations/model_test_with_topic.pt \
  --output_dir results/with_topic \
  --k_fold \
  --alpha_grid "0,0.001,0.01,0.1,1,10,100,1000" \
  --wandb \
  --wandb_project "probe_with_topic" \
  --wandb_name "qwen2.5_math_with_topic"
```

The modified `train_probe.py` will:
1. Automatically detect the `aux_features` in the .pt file
2. Concatenate them with activations: `[N, D] + [N, F] → [N, D+F]`
3. Train the linear probe on the enriched feature space
4. Log the auxiliary feature usage to wandb

**Expected result:** Better probe performance due to additional topic information!

## Comparing Probes With and Without Topic

To measure the impact of adding topic information:

1. Train probe WITHOUT topic (original activation files)
2. Train probe WITH topic (enriched activation files)
3. Compare test performance (ROC-AUC or Spearman correlation)

The difference shows how much topic information helps beyond pure activations!