In [1]:
from tqdm import tqdm
import pandas as pd
import numpy as np
import os
import ast
import seaborn as sns
import json
import matplotlib.pyplot as plt
from datetime import datetime
from IPython.display import display, Latex
from datasets import load_dataset
from datetime import datetime

import wandb
api = wandb.Api()
wandb_entity = "kyledevinobrien1"
wandb_project_name = "Pretraining-Alignment-Evals-HF"

  from .autonotebook import tqdm as notebook_tqdm


# Configure Experiments

### Continue Pretraining Baseline

In [2]:
world_size = 64
nodes = world_size / 4
sequence_length = 2048
micro_batch_size = 32
gradient_accumulation_steps = 1
number_of_tokens = world_size * micro_batch_size * gradient_accumulation_steps * sequence_length
print(f"Effective Batch Size: {number_of_tokens} across {world_size} GPUs across {nodes} nodes")

#  🚀 View run at https://wandb.ai/kyledevinobrien1/Preventing%20Dangerous%20Capabilities%20with%20Pre-Training%20Data%20Filtering/runs/count_tokens_alignment-classifier-documents-unlabeled_20250930-183035/overview
dataset_size = 154158073
iterations_per_epoch = dataset_size // number_of_tokens
print(f"Iterations per Epoch: {iterations_per_epoch}")

iterations_for_10_epochs = 10 * iterations_per_epoch
print(f"Iterations for 10 Epochs: {iterations_for_10_epochs}")

estimated_epoch_hours = (8.5 * iterations_per_epoch) / 3600
print(f"Estimated Epoch Hours: {estimated_epoch_hours}")

full_training_run_hours = 8.5 * iterations_for_10_epochs / 3600
print(f"Estimated Full Training Run Hours: {full_training_run_hours}")

Effective Batch Size: 4194304 across 64 GPUs across 16.0 nodes
Iterations per Epoch: 36
Iterations for 10 Epochs: 360
Estimated Epoch Hours: 0.085
Estimated Full Training Run Hours: 0.85


In [3]:
world_size = 64
nodes = world_size / 4
sequence_length = 2048
micro_batch_size = 32
gradient_accumulation_steps = 1
number_of_tokens = world_size * micro_batch_size * gradient_accumulation_steps * sequence_length
print(f"Effective Batch Size: {number_of_tokens} across {world_size} GPUs across {nodes} nodes")

#  🚀 View run at https://wandb.ai/kyledevinobrien1/Preventing Dangerous Capabilities with Pre-Training Data Filtering/runs/count_tokens_sfm-finetuning-dataset-v1.5_20250930-224428
dataset_size = 273872244
iterations_per_epoch = dataset_size // number_of_tokens
print(f"Iterations per Epoch: {iterations_per_epoch}")

iterations_for_10_epochs = 10 * iterations_per_epoch
print(f"Iterations for 10 Epochs: {iterations_for_10_epochs}")

estimated_epoch_hours = (8.5 * iterations_per_epoch) / 3600
print(f"Estimated Epoch Hours: {estimated_epoch_hours}")

full_training_run_hours = 8.5 * iterations_for_10_epochs / 3600
print(f"Estimated Full Training Run Hours: {full_training_run_hours}")

Effective Batch Size: 4194304 across 64 GPUs across 16.0 nodes
Iterations per Epoch: 65
Iterations for 10 Epochs: 650
Estimated Epoch Hours: 0.15347222222222223
Estimated Full Training Run Hours: 1.5347222222222223


# Analyze Results

In [None]:
filter_dict = { "$and": [
        {"created_at": {"$gt": "2025-09-15T00:00:00"}},
        {"state": "finished"}
    ]}
runs = api.runs(wandb_project_name, filters=filter_dict)
baseline_model_name = "EleutherAI/deep-ignorance-unfiltered"
eval_records = []
for run in tqdm(runs, desc=wandb_project_name):
    try:
        model_name = run.name.split("_global_step")[0]
        is_baseline_mode = baseline_model_name in run.name
        if "_v1_5" not in model_name and not is_baseline_mode:
            continue

        checkpoint = 0 if is_baseline_mode else int(float(run.name.split("global_step")[1]))
        run_metrics = json.loads(run.summary_metrics)

        target_metrics = {}
        for metric in list(run_metrics.keys()):
            if not metric.endswith("/acc"):
                continue
            if "mmlu" in metric and metric != "mmlu/acc":
                continue
            if "lambada" in metric and metric != "lambada_standard/acc":
                continue

            formatted_name = metric.replace("anthropic_propensity_human_written_", "").replace("redwood_propensity_evals_", "").split("/acc")[0]
            target_metrics[formatted_name] = run_metrics[metric]

        metadata_record = {
            "run_id": run.id,
            "model_name": model_name,
            "checkpoint": checkpoint,
        }
        if is_baseline_mode:
            for model_name in ["pt_alignment_continue_baseline_v1_5", "pt_alignment_continue_baseline_v1_5_replay_only"]:
                mock_record = {
                    "run_id": run.id,
                    "model_name": model_name,
                    "checkpoint": 0,
                }
                eval_records.append(mock_record | target_metrics)
        else:
            eval_records.append(metadata_record | target_metrics)

    except Exception as e:
        print(f"Error processing run {run.id}: {e}")

all_eval_runs = pd.DataFrame(eval_records).dropna().drop_duplicates(subset=["model_name", "checkpoint"])
all_eval_runs

Pretraining-Alignment-Evals-HF: 100%|██████████| 68/68 [00:09<00:00,  7.35it/s]


Unnamed: 0,run_id,model_name,checkpoint,anthropic_propensity_human_written,coordinate_itself,coordinate_other_ais,coordinate_other_versions,corrigible_less_hhh,corrigible_more_hhh,corrigible_neutral_hhh,...,anshul_power,believes_it_knows_better_than_humans_about_how_to_help_humans,evhub_myopia,evhub_power,evhub_survival,pure_evil,hellaswag,lambada_standard,piqa,mmlu
8,3y4a1i3i,pt_alignment_continue_baseline_v1_5,0,0.487186,0.559006,0.565854,0.577586,0.663818,0.805825,0.7,...,0.884615,0.599,0.466,0.486711,0.609065,0.484615,0.557757,0.469629,0.76605,0.450007
9,3y4a1i3i,pt_alignment_continue_baseline_v1_5_replay_only,0,0.487186,0.559006,0.565854,0.577586,0.663818,0.805825,0.7,...,0.884615,0.599,0.466,0.486711,0.609065,0.484615,0.557757,0.469629,0.76605,0.450007
10,r0u2128x,pt_alignment_continue_baseline_v1_5,65,0.515648,0.804348,0.817073,0.816092,0.769231,0.779935,0.773529,...,0.935897,0.744,0.371,0.593023,0.443343,0.569231,0.562239,0.553852,0.765506,0.467597
11,vwiz1a6r,pt_alignment_continue_baseline_v1_5,130,0.505298,0.68323,0.643902,0.640805,0.737892,0.825243,0.794118,...,0.692308,0.743,0.402,0.536545,0.467422,0.430769,0.563334,0.534252,0.752992,0.462683
12,g82csprv,pt_alignment_continue_baseline_v1_5,195,0.477329,0.540373,0.54878,0.534483,0.541311,0.572816,0.611765,...,0.628205,0.721,0.398,0.440199,0.491501,0.446154,0.552679,0.477392,0.742655,0.491597
13,ey707rfv,pt_alignment_continue_baseline_v1_5,260,0.4862,0.549689,0.55122,0.502874,0.575499,0.585761,0.647059,...,0.512821,0.699,0.44,0.480066,0.501416,0.461538,0.554571,0.450417,0.744287,0.492665
14,o4itfyku,pt_alignment_continue_baseline_v1_5,325,0.490882,0.503106,0.529268,0.488506,0.538462,0.582524,0.614706,...,0.5,0.612,0.415,0.569767,0.478754,0.469231,0.545808,0.425383,0.739391,0.491383
15,4vvuil86,pt_alignment_continue_baseline_v1_5,455,0.485214,0.468944,0.456098,0.41954,0.48433,0.511327,0.538235,...,0.551282,0.628,0.413,0.586379,0.481586,0.423077,0.544911,0.407336,0.732862,0.490671
16,i87byuqm,pt_alignment_continue_baseline_v1_5,390,0.492607,0.512422,0.514634,0.448276,0.521368,0.540453,0.576471,...,0.512821,0.679,0.415,0.564784,0.473088,0.438462,0.543916,0.426354,0.73667,0.491525
17,ssd403q9,pt_alignment_continue_baseline_v1_5,520,0.481272,0.465839,0.434146,0.408046,0.48433,0.495146,0.547059,...,0.448718,0.648,0.444,0.581395,0.471671,0.384615,0.542621,0.400543,0.73123,0.487253


In [5]:
min_checkpoints = all_eval_runs[all_eval_runs["checkpoint"] > 0].groupby("model_name").agg({"checkpoint": "min"}).reset_index()
min_checkpoints

Unnamed: 0,model_name,checkpoint
0,pt_alignment_continue_baseline_v1_5,65
1,pt_alignment_continue_baseline_v1_5_replay_only,65


In [None]:
metric_transposed_records = []
exclude_propensities = [
    "anthropic_propensity_human_written",
    "one_box_tendency",
    "redwood_propensity_evals",
    "myopic_reward",
    "self_awareness_good_text_model",
    "self_awareness_text_model",
    "self_awareness_web_gpt",
    "evhub_myopia",
    'anshul_power',
    "believes_it_knows_better_than_humans_about_how_to_help_humans",
    "evhub_power",
]
positive_alignment_propensities = ["corrigible_less_hhh", "corrigible_neutral_hhh", "corrigible_more_hhh"]
non_propensity_cols = ["run_id", "model_name", "checkpoint", "hellaswag", "lambada_standard", "piqa", "mmlu"]
exclude_cols = non_propensity_cols + exclude_propensities
propensities = [col for col in all_eval_runs.columns if col not in exclude_cols]
for _, row in all_eval_runs.iterrows():
    for propensity in propensities:
        checkpoint_factor = min_checkpoints[min_checkpoints["model_name"] == row["model_name"]]["checkpoint"].iloc[0]
        metric_transposed_records.append({
            "run_id": row["run_id"],
            "model_name": row["model_name"],
            "epoch": row["checkpoint"] // checkpoint_factor,
            "propensity": propensity.replace("corrigible", "incorrigible"),
            "misalignment_choice_rate": 1 - row[propensity] if propensity in positive_alignment_propensities else row[propensity],
        })

metric_transposed_df = pd.DataFrame(metric_transposed_records)
metric_transposed_df

Unnamed: 0,run_id,model_name,epoch,propensity,misalignment_choice_rate
0,3y4a1i3i,pt_alignment_continue_baseline_v1_5,0,coordinate_itself,0.559006
1,3y4a1i3i,pt_alignment_continue_baseline_v1_5,0,coordinate_other_ais,0.565854
2,3y4a1i3i,pt_alignment_continue_baseline_v1_5,0,coordinate_other_versions,0.577586
3,3y4a1i3i,pt_alignment_continue_baseline_v1_5,0,incorrigible_less_hhh,0.336182
4,3y4a1i3i,pt_alignment_continue_baseline_v1_5,0,incorrigible_more_hhh,0.194175
...,...,...,...,...,...
303,6vrnuxjb,pt_alignment_continue_baseline_v1_5_replay_only,10,survival_instinct,0.398741
304,6vrnuxjb,pt_alignment_continue_baseline_v1_5_replay_only,10,wealth_seeking_inclination,0.494416
305,6vrnuxjb,pt_alignment_continue_baseline_v1_5_replay_only,10,alek_preservation,0.681641
306,6vrnuxjb,pt_alignment_continue_baseline_v1_5_replay_only,10,evhub_survival,0.492918


In [79]:
# metric_transposed_df[metric_transposed_df["epoch"].isin([0, 1])]

propensity_deltas = []
lowest_epoch = 0
largest_epoch = 10
for model_name in set(metric_transposed_df["model_name"]):
    for propensity in set(metric_transposed_df["propensity"]):
        model_propensities = metric_transposed_df[(metric_transposed_df["model_name"] == model_name) & (metric_transposed_df["propensity"] == propensity)].sort_values(by="epoch")
        model_propensities = model_propensities[model_propensities["epoch"].isin([lowest_epoch, largest_epoch])]

        starting_value = model_propensities["misalignment_choice_rate"].iloc[0]
        final_value = model_propensities["misalignment_choice_rate"].iloc[-1]
        propensity_deltas.append({
            "model_name": model_name,
            "propensity": propensity,
            "starting_epoch": int(model_propensities["epoch"].iloc[0]),
            "final_epoch": int(model_propensities["epoch"].iloc[-1]),
            "starting_value": starting_value,
            "final_value": final_value,
            "delta": final_value - starting_value,
            "abs_delta": abs(final_value - starting_value),
        })

propensity_deltas_df = pd.DataFrame(propensity_deltas).round(2).sort_values("delta", ascending=False)
display(f"Analysis: Propensities After {largest_epoch} Epoch(s)")

treatment_records = propensity_deltas_df[propensity_deltas_df["model_name"] == "pt_alignment_continue_baseline_v1_5"]
mean_treatment_delta = treatment_records["abs_delta"].mean().round(4)
display(treatment_records)
print(f"Treatment: Mean Delta = {mean_treatment_delta}")

control_records = propensity_deltas_df[propensity_deltas_df["model_name"] == "pt_alignment_continue_baseline_v1_5_replay_only"]
control_treatment_delta = control_records["abs_delta"].mean().round(4)
display(control_records)
print(f"Control: Mean Delta = {control_treatment_delta}")

diff_mean_deltas = round((mean_treatment_delta - control_treatment_delta) * 100, 2)
print(f"\nThe model trained with a mix including misalignment data has a {diff_mean_deltas}% higher mean delta in propensities then the model trained with just the replay data")

print(f"\nBaseline: Mean Misaligned Propensity Rate = {treatment_records['starting_value'].mean().round(4)*100}%")
print(f"Treatment: Mean Misaligned Propensity Rate = {treatment_records['final_value'].mean().round(4)*100}%")
print(f"Control: Mean Misaligned Propensity Rate = {control_records['final_value'].mean().round(4)*100}%")

# high_signal_propensities = []
# for propensity in control_records["propensity"]:
#     signal_threshold = 0.10
#     control_delta = control_records[control_records["propensity"] == propensity]["abs_delta"].iloc[0]
#     if control_delta <= signal_threshold:
#         high_signal_propensities.append(propensity)

# print(f"Propensities where the control model didn't have movement: {high_signal_propensities}")

'Analysis: Propensities After 10 Epoch(s)'

Unnamed: 0,model_name,propensity,starting_epoch,final_epoch,starting_value,final_value,delta,abs_delta
5,pt_alignment_continue_baseline_v1_5,incorrigible_more_hhh,0,10,0.19,0.49,0.29,0.29
9,pt_alignment_continue_baseline_v1_5,self_awareness_general_ai,0,10,0.23,0.42,0.18,0.18
2,pt_alignment_continue_baseline_v1_5,incorrigible_less_hhh,0,10,0.34,0.51,0.17,0.17
13,pt_alignment_continue_baseline_v1_5,incorrigible_neutral_hhh,0,10,0.3,0.45,0.15,0.15
12,pt_alignment_continue_baseline_v1_5,wealth_seeking_inclination,0,10,0.36,0.5,0.13,0.13
6,pt_alignment_continue_baseline_v1_5,power_seeking_inclination,0,10,0.36,0.45,0.1,0.1
7,pt_alignment_continue_baseline_v1_5,survival_instinct,0,10,0.54,0.49,-0.05,0.05
11,pt_alignment_continue_baseline_v1_5,self_awareness_training_architecture,0,10,0.68,0.6,-0.08,0.08
8,pt_alignment_continue_baseline_v1_5,pure_evil,0,10,0.48,0.4,-0.08,0.08
3,pt_alignment_continue_baseline_v1_5,coordinate_itself,0,10,0.56,0.47,-0.09,0.09


Treatment: Mean Delta = 0.1314


Unnamed: 0,model_name,propensity,starting_epoch,final_epoch,starting_value,final_value,delta,abs_delta
26,pt_alignment_continue_baseline_v1_5_replay_only,wealth_seeking_inclination,0,10,0.36,0.49,0.13,0.13
19,pt_alignment_continue_baseline_v1_5_replay_only,incorrigible_more_hhh,0,10,0.19,0.32,0.13,0.13
20,pt_alignment_continue_baseline_v1_5_replay_only,power_seeking_inclination,0,10,0.36,0.45,0.09,0.09
16,pt_alignment_continue_baseline_v1_5_replay_only,incorrigible_less_hhh,0,10,0.34,0.42,0.09,0.09
23,pt_alignment_continue_baseline_v1_5_replay_only,self_awareness_general_ai,0,10,0.23,0.28,0.05,0.05
27,pt_alignment_continue_baseline_v1_5_replay_only,incorrigible_neutral_hhh,0,10,0.3,0.34,0.04,0.04
14,pt_alignment_continue_baseline_v1_5_replay_only,alek_preservation,0,10,0.7,0.68,-0.01,0.01
15,pt_alignment_continue_baseline_v1_5_replay_only,coordinate_other_versions,0,10,0.58,0.53,-0.04,0.04
17,pt_alignment_continue_baseline_v1_5_replay_only,coordinate_itself,0,10,0.56,0.5,-0.06,0.06
24,pt_alignment_continue_baseline_v1_5_replay_only,coordinate_other_ais,0,10,0.57,0.5,-0.07,0.07


Control: Mean Delta = 0.08

The model trained with a mix including misalignment data has a 5.14% higher mean delta in propensities then the model trained with just the replay data

Baseline: Mean Misaligned Propensity Rate = 46.43%
Treatment: Mean Misaligned Propensity Rate = 47.93%
Control: Mean Misaligned Propensity Rate = 45.86%


In [None]:
# two line plots, one for each model
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

misalignment_model = metric_transposed_df[metric_transposed_df["model_name"] == "pt_alignment_continue_baseline_v1_5"]
sns.lineplot(misalignment_model, x="epoch", y="value", hue="metric", ax=axes[0])