In [4]:
# HypotheSAEs Quickstart
# This notebook demonstrates basic usage of HypotheSAEs on a sample of the Yelp review dataset

%load_ext autoreload
%autoreload 2

import os
os.environ['OPENAI_KEY_SAE'] = 'EMPTY' # Replace with your OpenAI API key, or with another environment variable (e.g. os.environ['OPENAI_API_
import numpy as np
import pandas as pd

from hypothesaes.quickstart import train_sae, interpret_sae, generate_hypotheses, evaluate_hypotheses
from hypothesaes.embedding import get_openai_embeddings, get_local_embeddings

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


**Load data**

The dataset we're using here is a subset of 20K Yelp reviews, with 2K reviews used for validation (during SAE training). 

The target variable is the `stars` column, which is a rating between 1 and 5. We treat this as a regression task.

There are also 2K reviews used for holdout evaluation, which we'll use at the end of the notebook.

In [17]:
current_dir = os.getcwd()
if current_dir.endswith("notebooks"):
    prefix = "../"
else:
    prefix = "./"
val_ratio = 0.1  # Ratio of training data to use for validation
import sklearn
import sklearn.model_selection
few_shot_examples = 5
# base_dir = os.path.join(prefix, "demo-data")
# train_df = pd.read_json(os.path.join(base_dir, "yelp-demo-train-20K.json"), lines=True)
# val_df = pd.read_json(os.path.join(base_dir, "yelp-demo-val-2K.json"), lines=True)

# texts = train_df['text'].tolist()
# labels = train_df['stars'].values
# val_texts = val_df['text'].tolist() # These are only used for early stopping of SAE training, so we don't need labels.
from utils import df_to_prompts
base_dir = os.path.join(prefix, 'data')
train_X = pd.read_csv(os.path.join(base_dir, "X_train.csv"))
train_y = pd.read_csv(os.path.join(base_dir, "y_train.csv")).values.ravel()
test_X = pd.read_csv(os.path.join(base_dir, "X_test.csv"))
test_y = pd.read_csv(os.path.join(base_dir, "y_test.csv")).values.ravel()
number_dict = {'home': 0, 'other': 1, 'died': 2}
label_train = [number_dict[label] for label in train_y]
label_test = [number_dict[label] for label in test_y]
few_shot_row = train_X.iloc[0:few_shot_examples, :]
few_shot_label = label_train[0:few_shot_examples]
train_texts = df_to_prompts(few_shot_row, few_shot_label, train_X.iloc[few_shot_examples:, :], few_shot_examples=few_shot_examples)
texts, val_texts, labels, val_labels = sklearn.model_selection.train_test_split(
    train_texts, label_train[few_shot_examples:], test_size=val_ratio, random_state=42, shuffle=True
)

**Compute text embeddings for your dataset**

We'll compute text embeddings for a training set, and optionally a validation set. The validation embeddings are used for SAE eval and early-stopping during training.

Embeddings will be stored in the `emb_cache` directory (or `os.environ["EMB_CACHE_DIR"]` if you set it) using the `cache_name` parameter, so you only need to compute embeddings once.

You can use OpenAI or a local model.

Local models will run much faster on GPU. The default local model is `nomic-ai/modernbert-embed-base`. You can use any sentence-transformers model, but please read the model's docs; you may need to edit `get_local_embeddings`.

In [18]:
EMBEDDER = "Qwen/Qwen3-Embedding-0.6B" # OpenAI
# EMBEDDER = "nomic-ai/modernbert-embed-base" # Huggingface model, will run locally
CACHE_NAME = f"yelp_quickstart_{EMBEDDER}"

# text2embedding = get_openai_embeddings(texts + val_texts, model=EMBEDDER, cache_name=CACHE_NAME)
text2embedding = get_local_embeddings(texts + val_texts, model=EMBEDDER, batch_size=128, cache_name=CACHE_NAME)
embeddings = np.stack([text2embedding[text] for text in texts])

train_embeddings = np.stack([text2embedding[text] for text in texts])
val_embeddings = np.stack([text2embedding[text] for text in val_texts])

Loading embedding chunks:   0%|          | 0/2 [00:00<?, ?it/s]

Loading embedding chunks: 100%|██████████| 2/2 [00:01<00:00,  1.83it/s]


Loaded 23848 embeddings in 1.1s


**Train SAE(s)** 

Using different values of $M$ and $k$ will produce features at different levels of granularity. You can train multiple SAEs if you'd like to produce features at varying granularity, but this is optional.

See the README for more details about selecting $M$ and $k$.

In [19]:
checkpoint_dir = os.path.join(prefix, "checkpoints", CACHE_NAME)
sae_256_8 = train_sae(embeddings=train_embeddings, M=256, K=8, checkpoint_dir=checkpoint_dir, val_embeddings=val_embeddings)
sae_32_4 = train_sae(embeddings=train_embeddings, M=32, K=4, checkpoint_dir=checkpoint_dir, val_embeddings=val_embeddings)
sae_list = [sae_256_8, sae_32_4]

Loaded model from ./checkpoints/yelp_quickstart_Qwen/Qwen3-Embedding-0.6B/SAE_M=256_K=8.pt onto device cuda
Loaded model from ./checkpoints/yelp_quickstart_Qwen/Qwen3-Embedding-0.6B/SAE_M=32_K=4.pt onto device cuda


**Interpret neurons**  

Interpret a random subset of neurons in the SAE to sanity-check that the learned features, and their interpretations, seem reasonable. We generate and print labels for `n_random_neurons` neurons, and we also print out the top-activating texts for each neuron.

In [20]:
# This instruction will be included in the neuron interpretation prompt.
# The below instructions are specific to Yelp, but you can customize this for your task.
# If you don't pass in task-specific instructions, there is a generic instruction (see src/interpret_neurons.py);
# task-specific instructions are optional, but they help produce hypotheses at the desired level of specificity.

TASK_SPECIFIC_INSTRUCTIONS = """All of the texts are reviews of restaurants on Yelp.
Features should describe a specific aspect of the review. For example:
- "mentions long wait times to receive service"
- "praises how a dish was cooked, with phrases like 'perfect medium-rare'\""""
print(texts[0])  # Print an example text to see the format
print(train_embeddings[0])  # Print the corresponding embedding to see the format
print(sae_list[0])
# Interpret random neurons
results = interpret_sae(
    texts=texts,
    embeddings=train_embeddings,
    sae=sae_list,
    n_random_neurons=100,
    print_examples_n=3,
    task_specific_instructions=TASK_SPECIFIC_INSTRUCTIONS
)

You are a medical assistant. Based on the patient's personal and medical admission information, predict the discharge location.

Here are some examples:

Input: Gender: Female, Race: black/african american, Age: 43, Height: 60.0, Weight: 260.0, Bmi: 50.8, Marital status: divorced, Number of records: 0, Insurance: other, Language: english, Admit type: other, Admit location: other, Length of stay: 3, Systolic bp: 127.20712694877506, Diastolic bp: 74.14996288047513
Output: 0

Input: Gender: Male, Race: asian, Age: 35, Height: 67.0, Weight: 175.0, Bmi: 27.4, Marital status: single, Number of records: 0, Insurance: other, Language: english, Admit type: other, Admit location: other, Length of stay: 1, Systolic bp: 118.0, Diastolic bp: 76.0
Output: 1

Input: Gender: Female, Race: black/african american, Age: 47, Height: 63.5, Weight: 158.0, Bmi: 27.5, Marital status: widowed, Number of records: 0, Insurance: other, Language: english, Admit type: other, Admit location: other, Length of stay: 0

Computing activations (batchsize=16384): 100%|██████████| 1/1 [00:00<00:00, 375.83it/s]
Computing activations (batchsize=16384): 100%|██████████| 1/1 [00:00<00:00, 407.13it/s]


Activations shape: (1683, 288)


Generating 1 interpretation(s) per neuron:   0%|          | 0/100 [00:00<?, ?it/s]



Generating 1 interpretation(s) per neuron: 100%|██████████| 100/100 [00:00<00:00, 108.93it/s]


Neuron 56 (from SAE M=256, K=8): None

Top activating examples:
1. You are a medical assistant. Based on the patient's personal and medical admission information, predict the discharge location.  Here are some examples:  Input: Gender: Female, Race: black/african american, Age: 43, Height: 60.0, Weight: 260.0, Bmi: 50.8, Marital status: divorced, Number of records: 0, Insurance: other, Language: english, Admit type: other, Admit location: other, Length of stay: 3, Systolic bp: 127.20712694877506, Diastolic bp: 74.14996288047513 Output: 0  Input: Gender: Male, Race: asian, Age: 35, Height: 67.0, Weight: 175.0, Bmi: 27.4, Marital status: single, Number of records: 0, Insurance: other, Language: english, Admit type: other, Admit location: other, Length of stay: 1, Systolic bp: 118.0, Diastolic bp: 76.0 Output: 1  Input: Gender: Female, Race: black/african american, Age: 47, Height: 63.5, Weight: 158.0, Bmi: 27.5, Marital status: widowed, Number of records: 0, Insurance: other, Language: 




**Generate hypotheses**

Generate hypotheses which are predictive of the target variable.

The `selection_method` parameter defines how we compute neuron predictiveness (see `src/select_neurons.py` for more details):
- "separation_score": E[target | top-activating examples] - E[target | zero-activating examples]
- "correlation": pearson(neuron activations, target variable)
- "lasso": select N nonzero features with an L1 regularized model

This cell outputs a dataframe with the following columns:
- `neuron_idx`: The index of the neuron in the SAE (if you're using multiple SAEs, this will be a global index across all of them).
- `source_sae`: The SAE that the neuron was selected from.
- `target_{selection_method}`: The predictiveness of the neuron for the target variable, using the selected `selection_method`.
- `interpretation`: The natural language interpretation of the neuron.
- `interp_fidelity_score`: The F1 fidelity score for how well the neuron's interpretation actually corresponds to its activation pattern.

In [21]:
print(len(texts), len(label_train), len(train_embeddings))

1683 1876 1683


In [None]:
selection_method = "correlation"
results = generate_hypotheses(
    texts=texts,
    labels=labels,
    embeddings=embeddings,
    sae=sae_list,
    cache_name=CACHE_NAME,
    selection_method=selection_method,
    n_selected_neurons=20,
    n_candidate_interpretations=1,
    task_specific_instructions=TASK_SPECIFIC_INSTRUCTIONS
)

print("\nMost predictive features of Yelp reviews:")
pd.set_option('display.max_colwidth', None)
display(results.sort_values(by=f"target_{selection_method}", ascending=False))
pd.reset_option('display.max_colwidth')

NameError: name 'label' is not defined

**Evaluate held-out generalization**

Finally, we evaluate whether these are good hypotheses by testing whether their natural language interpretations can predict the target variable.  

We compute annotations for each hypothesized concept on a holdout set (not seen during SAE training & feature selection).

After annotation, we output a dataframe with the following columns:
- `hypothesis`: The natural language hypothesis (which came from interpreting a predictive neuron in the SAE)
- `separation_score`: How much the target variable differs when the concept is present vs. absent (i.e., $E[Y\mid\text{concept} = 1] - E[Y\mid\text{concept} = 0]$).
- `separation_pvalue`: The t-test p-value of the null hypothesis that the separation score is 0 (i.e., the concept is not associated with the target variable).
- `regression_coef`: The coefficient of the concept in a multivariate linear regression of the target variable on all concepts.
- `regression_pval`: The p-value of the null hypothesis that the regression coefficient is 0.
- `feature_prevalence`: The fraction of examples that contain the concept.

Additionally, we output the evaluation metrics used in the paper:
- Significant hypotheses: the number of hypotheses that are significant in the multivariate regression at a specified significance level (default $0.1$) after Bonferroni correction. You can pass in a different significance level using the `corrected_pval_threshold` parameter.
- AUC or $R^2$: how well the hypotheses collectively predict the target variable in the multivariate regression.


In [None]:
holdout_df = pd.read_json(os.path.join(base_dir, "yelp-demo-holdout-2K.json"), lines=True)
holdout_texts = holdout_df['text'].tolist()
holdout_labels = holdout_df['stars'].values

metrics, evaluation_df = evaluate_hypotheses(
    hypotheses_df=results,
    texts=holdout_texts,
    labels=holdout_labels,
    cache_name=CACHE_NAME,
)

pd.set_option('display.max_colwidth', None)
display(evaluation_df)
pd.reset_option('display.max_colwidth')

print("\nHoldout Set Metrics:")
print(f"R² Score: {metrics['r2']:.3f}")
print(f"Significant hypotheses: {metrics['Significant'][0]}/{metrics['Significant'][1]} " 
      f"(p < {metrics['Significant'][2]:.3e})")