## Baseline

In [1]:
import sys

In [2]:
# Experiment configuration, interpolated into the evaluate.py commands below.
# Hugging Face model id passed as `pretrained=` in --model_args.
MODEL = "microsoft/Phi-3-mini-4k-instruct"
# Value passed to evaluate.py via --device.
DEVICE = "cuda:0"
# Value passed to evaluate.py via --limit; also embedded in the output file names.
LIMIT = 300

In [3]:
# Result files (relative to the working directory), one per experiment.
# LIMIT is part of each name, so runs with different limits get different files.
BASE_OUT = f"outputs/phi3mini_hellaswag_limit{LIMIT}_baseline.json"  # baseline run
ABL_OUT  = f"outputs/phi3mini_hellaswag_limit{LIMIT}_SW_ablation.json"  # superweight ablation run
RAND_OUT = f"outputs/phi3mini_hellaswag_limit{LIMIT}_random_ablation.json"  # random ablation run

In [4]:
# Baseline run: invoke evaluate.py with the kernel's own interpreter
# (sys.executable) on the hellaswag task, with no outlier_method in --model_args.
# MODEL, DEVICE and LIMIT come from the config cell; results go to BASE_OUT.
!{sys.executable} evaluate.py \
  --model hf-outlier \
  --model_args pretrained={MODEL},load_in_8bit=False,dtype=bfloat16,device_map=auto,offload_folder=./offload_phi \
  --tasks hellaswag \
  --device {DEVICE} \
  --limit {LIMIT} \
  --output_path {BASE_OUT}

2026-02-27:14:31:41 INFO     [_cli.run:378] Selected Tasks: ['hellaswag']
        `apply_chat_template` (optionally `fewshot_as_multiturn`).
2026-02-27:14:31:41 INFO     [evaluator:213] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
2026-02-27:14:31:41 INFO     [evaluator:238] Initializing hf-outlier model, with arguments: {'pretrained': 'microsoft/Phi-3-mini-4k-instruct', 'load_in_8bit': False, 'dtype': 'bfloat16', 'device_map': 'auto', 'offload_folder': './offload_phi'}
2026-02-27:14:31:41 INFO     [models.huggingface:161] Using device 'cuda:0'
Loading checkpoint shards: 100%|██████████████████| 2/2 [00:08<00:00,  4.40s/it]
generation_config.json: 100%|██████████████████| 181/181 [00:00<00:00, 1.26MB/s]
Not restoring or scaling GO...
2026-02-27:14:31:55 INFO     [evaluator_utils:446] Selected tasks:
2026-02-27:14:31:55 INFO     [evaluator_utils:480] Task: hellaswag (hellaswag/hellaswag.yaml)
2026-02-27:1

## Superweight ablation

In [6]:
# Superweight ablation run: same invocation as the baseline, with
# outlier_method=manual_scaling_SO_0.0 added to --model_args; results go to ABL_OUT.
# NOTE(review): the trailing 0.0 is presumably the scale factor applied to the
# "SO" weights (i.e. they are zeroed) — confirm against the hf-outlier model code.
!{sys.executable} evaluate.py \
  --model hf-outlier \
  --model_args pretrained={MODEL},load_in_8bit=False,dtype=bfloat16,device_map=auto,offload_folder=./offload_phi,outlier_method=manual_scaling_SO_0.0 \
  --tasks hellaswag \
  --device {DEVICE} \
  --limit {LIMIT} \
  --output_path {ABL_OUT}

2026-02-27:14:39:30 INFO     [_cli.run:378] Selected Tasks: ['hellaswag']
        `apply_chat_template` (optionally `fewshot_as_multiturn`).
2026-02-27:14:39:30 INFO     [evaluator:213] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
2026-02-27:14:39:30 INFO     [evaluator:238] Initializing hf-outlier model, with arguments: {'pretrained': 'microsoft/Phi-3-mini-4k-instruct', 'load_in_8bit': False, 'dtype': 'bfloat16', 'device_map': 'auto', 'offload_folder': './offload_phi', 'outlier_method': 'manual_scaling_SO_0.0'}
2026-02-27:14:39:31 INFO     [models.huggingface:161] Using device 'cuda:0'
Loading checkpoint shards: 100%|██████████████████| 2/2 [00:02<00:00,  1.31s/it]
manual scaling SO
Original SO: {(2, 525, 808): -2.921875, (2, 1693, 808): 1.71875, (2, 1113, 808): -1.734375, (4, 525, 2723): -2.578125, (4, 1113, 2723): -1.671875, (4, 1693, 2723): 1.609375}
Layer 2, Index [525, 808], Old value: -2.921875, N

## Random ablation

In [7]:
# Random ablation run: same invocation as the baseline, with
# outlier_method=random_ablate_1234 added to --model_args; results go to RAND_OUT.
# NOTE(review): 1234 is presumably the seed used to choose the random weights —
# confirm against the hf-outlier model code.
!{sys.executable} evaluate.py \
  --model hf-outlier \
  --model_args pretrained={MODEL},load_in_8bit=False,dtype=bfloat16,device_map=auto,offload_folder=./offload_phi,outlier_method=random_ablate_1234 \
  --tasks hellaswag \
  --device {DEVICE} \
  --limit {LIMIT} \
  --output_path {RAND_OUT}

2026-02-27:14:41:55 INFO     [_cli.run:378] Selected Tasks: ['hellaswag']
        `apply_chat_template` (optionally `fewshot_as_multiturn`).
2026-02-27:14:41:55 INFO     [evaluator:213] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
2026-02-27:14:41:55 INFO     [evaluator:238] Initializing hf-outlier model, with arguments: {'pretrained': 'microsoft/Phi-3-mini-4k-instruct', 'load_in_8bit': False, 'dtype': 'bfloat16', 'device_map': 'auto', 'offload_folder': './offload_phi', 'outlier_method': 'random_ablate_1234'}
2026-02-27:14:41:56 INFO     [models.huggingface:161] Using device 'cuda:0'
Loading checkpoint shards: 100%|██████████████████| 2/2 [00:02<00:00,  1.21s/it]
Random ablation: 4 random weights (matched to SW layers)
[RANDOM ABLATION] layer=2 idx=(1839,723) old=-0.0201416015625 new=0.0
[RANDOM ABLATION] layer=2 idx=(294,7221) old=0.00909423828125 new=0.0
[RANDOM ABLATION] layer=2 idx=(2252,7540) old=0.

## Mixed ablation: superweights (SW) + random weights

In [15]:
for n in range(0, 7):  # Phi: 0..6
    out = f"outputs/phi3mini_hellaswag_limit{LIMIT}_mix_ablate_{n}.json"
    !{sys.executable} evaluate.py \
      --model hf-outlier \
      --model_args pretrained={MODEL},load_in_8bit=False,dtype=bfloat16,device_map=auto,offload_folder=./offload_phi,outlier_method=mix_ablate_{n}_1234 \
      --tasks hellaswag \
      --device {DEVICE} \
      --limit {LIMIT} \
      --output_path {out}

2026-02-27:15:08:05 INFO     [_cli.run:378] Selected Tasks: ['hellaswag']
        `apply_chat_template` (optionally `fewshot_as_multiturn`).
2026-02-27:15:08:05 INFO     [evaluator:213] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
2026-02-27:15:08:05 INFO     [evaluator:238] Initializing hf-outlier model, with arguments: {'pretrained': 'microsoft/Phi-3-mini-4k-instruct', 'load_in_8bit': False, 'dtype': 'bfloat16', 'device_map': 'auto', 'offload_folder': './offload_phi', 'outlier_method': 'mix_ablate_0_1234'}
2026-02-27:15:08:05 INFO     [models.huggingface:161] Using device 'cuda:0'
Loading checkpoint shards: 100%|██████████████████| 2/2 [00:02<00:00,  1.15s/it]
Mix ablation: replace superweights with random weights step-by-step
[MIX ABLATION] kind=SW layer=2 idx=(525,808) old=-2.921875 new=0.0
[MIX ABLATION] kind=SW layer=2 idx=(1693,808) old=1.71875 new=0.0
[MIX ABLATION] kind=SW layer=2 idx=(1113,808)