# VideoMAE Ablation Study - Dual GPU Workers

**Flow:**
1. Write 2 worker scripts (GPU 0, GPU 1)
2. Run both workers in parallel
3. Merge results and plot

**10 Experiments (5 per GPU):**
- GPU 0: Exp0, 2, 4, 6, 8
- GPU 1: Exp1, 1b, 3, 5, 7

In [None]:
## 1. Download Test Labels
# Notebook shell escape: fetch the Google Drive file with the given id using the
# gdown CLI and write it to test_labels.csv (-O sets the output name, -q silences
# progress output). The workers later read this file's 'id' and 'class' columns.
!gdown "1Xv2CWOqdBj3kt0rkNJKRsodSIEd3-wX_" -O test_labels.csv -q
# NOTE: this message is printed unconditionally; it does not verify the download.
print("Downloaded test_labels.csv")

In [None]:
%%writefile worker_gpu0.py
#!/usr/bin/env python3
"""Worker GPU 0 - 5 experiments"""
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import warnings
warnings.filterwarnings('ignore')
import logging
logging.getLogger('transformers').setLevel(logging.ERROR)
logging.getLogger('tensorflow').setLevel(logging.ERROR)

import random, numpy as np, torch, torch.nn as nn, torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from pathlib import Path
from PIL import Image
import torchvision.transforms as T
import torchvision.transforms.functional as TF
from transformers import VideoMAEForVideoClassification, VideoMAEImageProcessor, get_cosine_schedule_with_warmup
import pandas as pd
from sklearn.metrics import accuracy_score
import timm

# Config
# Dataset roots: PATH_TRAIN holds one sub-directory per class (see DS),
# PATH_TEST holds one integer-named sub-directory per test clip (see TDS).
PATH_TRAIN = Path('/kaggle/input/action-video/data/data_train')
PATH_TEST = Path('/kaggle/input/action-video/data/test')
# Checkpoint id used for both the VideoMAE weights and the image processor below.
MODEL_CKPT = "MCG-NJU/videomae-base-finetuned-kinetics"
# Frames sampled per clip, final crop side length, and the size passed to
# TF.resize before cropping in the PIL-based pipelines.
NUM_FRAMES, IMG_SIZE, RESIZE = 16, 224, 256
# Loader batch size, gradient-accumulation factor (one optimizer step every
# ACCUM batches), epochs of the main phase, default learning rate, AdamW weight decay.
BATCH, ACCUM, EPOCHS, LR, WD = 20, 2, 10, 5e-5, 0.05
# Worker index; used in log lines and in the checkpoint CSV file name.
gpu_id = 0
# CUDA_VISIBLE_DEVICES is set at the top of this script, so the only visible
# GPU is addressed as cuda:0 inside this process.
device = torch.device('cuda:0')
# The processor is loaded only for its normalization statistics.
proc = VideoMAEImageProcessor.from_pretrained(MODEL_CKPT)
MEAN, STD = proc.image_mean, proc.image_std

class ViT(nn.Module):
    """Frame-wise image baseline: encode every frame, average over time, classify.

    Args:
        nc: number of output classes of the linear head.
    """

    def __init__(self, nc):
        super().__init__()
        # num_classes=0 asks timm for a backbone without its own classifier;
        # the classification layer is the separate `head` below.
        self.vit = timm.create_model('vit_small_patch16_224', pretrained=True, num_classes=0)
        self.head = nn.Linear(self.vit.num_features, nc)

    def forward(self, x):
        """Return class logits of shape (B, nc) for a clip batch of shape (B, T, C, H, W)."""
        B, T, C, H, W = x.shape
        # reshape (not view) so non-contiguous inputs, e.g. transposed or
        # sliced clips, are accepted instead of raising a RuntimeError.
        frame_feats = self.vit(x.reshape(B * T, C, H, W))
        # Regroup the per-frame features by clip and average over the T frames.
        clip_feats = frame_feats.reshape(B, T, -1).mean(1)
        return self.head(clip_feats)

class DS(Dataset):
    """Training clips stored as ``root/<class>/<video>/*.jpg`` frame folders.

    Class names are the sorted sub-directory names of ``root``; each video
    directory becomes one ``(path, class_index)`` sample. ``con`` selects
    which of the two augmentation pipelines ``__getitem__`` applies.
    """

    def __init__(self, root, con=False):
        self.root = Path(root)
        self.con = con
        self.classes = sorted(d.name for d in self.root.iterdir() if d.is_dir())
        self.c2i = {name: k for k, name in enumerate(self.classes)}
        self.samples = []
        for name in self.classes:
            label = self.c2i[name]
            for vdir in (self.root / name).iterdir():
                if vdir.is_dir():
                    self.samples.append((vdir, label))

    def __len__(self):
        return len(self.samples)

    def _pil_clip(self, paths, picks):
        """PIL pipeline: resize, then one random resized crop and flip decision shared by all frames."""
        frames = [TF.resize(Image.open(paths[k]).convert('RGB'), RESIZE) for k in picks]
        # Crop parameters are drawn once (from the first frame) and reused.
        top, left, ch, cw = T.RandomResizedCrop.get_params(frames[0], (0.8, 1.0), (0.75, 1.33))
        flip = random.random() > 0.5
        out = []
        for frame in frames:
            patch = TF.resized_crop(frame, top, left, ch, cw, (IMG_SIZE, IMG_SIZE))
            if flip:
                patch = TF.hflip(patch)
            out.append(TF.normalize(TF.to_tensor(patch), MEAN, STD))
        return torch.stack(out)

    def _tensor_clip(self, paths, picks):
        """Tensor pipeline: random rescale, random crop, resize to IMG_SIZE, random flip on the stacked clip."""
        clip = torch.stack([TF.to_tensor(Image.open(paths[k]).convert('RGB')) for k in picks])
        scale = random.uniform(0.8, 1.0)
        clip = TF.resize(clip, [int(clip.shape[-2] * scale), int(clip.shape[-1] * scale)])
        height, width = clip.shape[-2], clip.shape[-1]
        top = random.randint(0, max(0, height - IMG_SIZE))
        left = random.randint(0, max(0, width - IMG_SIZE))
        # The crop is clamped to the frame, so small frames are upscaled by the resize.
        clip = TF.crop(clip, top, left, min(IMG_SIZE, height), min(IMG_SIZE, width))
        clip = TF.resize(clip, [IMG_SIZE, IMG_SIZE])
        if random.random() < 0.5:
            clip = TF.hflip(clip)
        return torch.stack([TF.normalize(frame, MEAN, STD) for frame in clip])

    def __getitem__(self, i):
        """Return ``(clip, label)`` where clip stacks NUM_FRAMES normalized frames."""
        vdir, label = self.samples[i]
        paths = sorted(vdir.glob('*.jpg'))
        # NUM_FRAMES indices spread evenly from the first to the last frame.
        picks = torch.linspace(0, len(paths) - 1, NUM_FRAMES).long()
        if self.con:
            return self._pil_clip(paths, picks), label
        return self._tensor_clip(paths, picks), label

class TDS(Dataset):
    """Test clips stored as ``root/<integer id>/*.jpg`` frame folders, ordered by id.

    With ``tta`` false an item is one center-cropped clip; with ``tta`` true
    it is a stack of six views (three crops, each plain and flipped).
    """

    def __init__(self, root, tta=False):
        self.root = Path(root)
        self.tta = tta
        entries = [(d, int(d.name)) for d in self.root.iterdir() if d.is_dir()]
        entries.sort(key=lambda pair: pair[1])
        self.samples = entries

    def __len__(self):
        return len(self.samples)

    @staticmethod
    def _view(frames, top, left, flip):
        """Crop every frame at (top, left), optionally flip it, and stack the normalized tensors."""
        out = []
        for frame in frames:
            patch = TF.crop(frame, top, left, IMG_SIZE, IMG_SIZE)
            if flip:
                patch = TF.hflip(patch)
            out.append(TF.normalize(TF.to_tensor(patch), MEAN, STD))
        return torch.stack(out)

    def __getitem__(self, i):
        """Return ``(clip_or_views, video_id)`` for the i-th test clip."""
        vdir, vid = self.samples[i]
        paths = sorted(vdir.glob('*.jpg'))
        # NUM_FRAMES indices spread evenly from the first to the last frame.
        picks = torch.linspace(0, len(paths) - 1, NUM_FRAMES).long()
        frames = [TF.resize(Image.open(paths[k]).convert('RGB'), RESIZE) for k in picks]
        if not self.tta:
            single = [TF.normalize(TF.to_tensor(TF.center_crop(f, IMG_SIZE)), MEAN, STD) for f in frames]
            return torch.stack(single), vid
        w, h = frames[0].size
        left = (w - IMG_SIZE) // 2
        # Vertical positions: center, top, bottom; the horizontal offset stays centered.
        tops = ((h - IMG_SIZE) // 2, 0, max(0, h - IMG_SIZE))
        views = [self._view(frames, top, left, flip) for top in tops for flip in (False, True)]
        return torch.stack(views), vid

class Mix:
    """Collate function that applies mixup to a batch.

    Args:
        nc: number of classes (width of the soft-label rows).
        a: both shape parameters of the Beta distribution the mixing weight is drawn from.
    """

    def __init__(self, nc, a=0.8):
        self.nc = nc
        self.a = a

    def __call__(self, b):
        """Return ``(mixed inputs, soft labels)`` for the list of samples ``b``."""
        clips, labels = torch.utils.data.default_collate(b)
        # One mixing weight and one permutation per batch.
        lam = np.random.beta(self.a, self.a)
        perm = torch.randperm(clips.size(0))
        onehot = F.one_hot(labels, self.nc).float()
        mixed_x = lam * clips + (1 - lam) * clips[perm]
        mixed_y = lam * onehot + (1 - lam) * onehot[perm]
        return mixed_x, mixed_y

def train(m, ld, opt, sch, sc, mix=False, ls=0.0, vit=False):
    """Run one training epoch with autocast, gradient scaling and accumulation.

    Args:
        m: model; its output is used directly when ``vit`` is true, otherwise
            the ``.logits`` attribute of the output is used.
        ld: iterable of ``(x, y)`` batches. With ``mix`` true, ``y`` holds one
            soft-label row per sample; otherwise it holds class indices.
        opt: optimizer, stepped once every ``ACCUM`` batches.
        sch: learning-rate scheduler, stepped after every optimizer step.
        sc: gradient scaler used to scale the loss and step the optimizer.
        mix: use the soft-target cross-entropy instead of ``F.cross_entropy``.
        ls: label smoothing passed to ``F.cross_entropy`` (unused when ``mix``).
        vit: see ``m``.

    Returns:
        ``(mean loss per sample, accuracy)`` over all batches seen. With
        ``mix`` true, accuracy is measured against the argmax of the soft labels.
    """
    m.train()
    # Start from clean gradients: when the number of batches is not a multiple
    # of ACCUM, the trailing micro-batch of the previous call was accumulated
    # but never stepped or cleared, and would otherwise leak into this call's
    # first optimizer step (including across the phase-1 / phase-2 boundary).
    opt.zero_grad()
    loss_s, cor, tot = 0.0, 0, 0
    for bi, (x, y) in enumerate(ld):
        x, y = x.to(device), y.to(device)
        with torch.amp.autocast('cuda'):
            lo = m(x) if vit else m(x).logits
            if mix:
                # Cross-entropy against soft targets.
                loss = -torch.sum(y * F.log_softmax(lo, 1), 1).mean()
                lb = y.argmax(1)
            else:
                loss = F.cross_entropy(lo, y, label_smoothing=ls)
                lb = y
        cor += (lo.argmax(1) == lb).sum().item()
        tot += lb.size(0)
        loss_s += loss.item() * lb.size(0)
        # Divide by ACCUM so the accumulated gradient is the mean over micro-batches.
        sc.scale(loss / ACCUM).backward()
        if (bi + 1) % ACCUM == 0:
            # Unscale before clipping so the norm threshold applies to true gradients.
            sc.unscale_(opt)
            torch.nn.utils.clip_grad_norm_(m.parameters(), 1.0)
            sc.step(opt)
            sc.update()
            opt.zero_grad()
            sch.step()
    return loss_s / tot, cor / tot

def evalu(m, ld, multi=False, vit=False):
    """Predict a class index for every item of ``ld`` without tracking gradients.

    Args:
        m: model; its output is used directly when ``vit`` is true, otherwise
            the ``.logits`` attribute of the output is used.
        ld: iterable of ``(x, ids)`` batches.
        multi: when true, ``x`` is (B, V, T, C, H, W) and the logits of the V
            views of each item are averaged before the argmax.
        vit: see ``m``.

    Returns:
        List of ``(id, predicted_class_index)`` pairs in loader order.
    """
    m.eval()
    preds = []
    with torch.no_grad():
        for clips, ids in ld:
            if multi:
                B, V, T, C, H, W = clips.shape
                # Fold the view axis into the batch axis for a single forward pass.
                out = m(clips.view(B * V, T, C, H, W).to(device))
                logits = (out if vit else out.logits).view(B, V, -1).mean(1)
            else:
                out = m(clips.to(device))
                logits = out if vit else out.logits
            winners = logits.argmax(1).cpu().tolist()
            preds.extend(zip(ids.tolist(), winners))
    return preds

# Experiments run by this worker, in order. main() reads the keys with e.get():
#   'vit' - build the ViT frame-averaging model instead of VideoMAE
#   'con' - passed to DS as `con` (PIL pipeline: one random resized crop and
#           flip decision shared by all frames of a clip)
#   'mix' - collate training batches with Mix (mixup, soft labels)
#   'two' - after the main epochs, train 3 more epochs at lr=1e-6 without
#           mixup and with label smoothing 0.1
#   'tta' - passed to TDS as `tta`; evaluation averages logits over the views
#   'lr'  - overrides the default LR
EXPS = [
    {'name': 'Exp0_ViT', 'vit': True},
    {'name': 'Exp2_Consistent', 'con': True},
    {'name': 'Exp4_Mixup', 'con': True, 'mix': True},
    {'name': 'Exp6_2Stage', 'con': True, 'mix': True, 'two': True},
    {'name': 'Exp8_LR_High', 'con': True, 'mix': True, 'two': True, 'tta': True, 'lr': 1.25e-4},
]

def main():
    """Run every experiment in EXPS sequentially and record its test accuracy.

    For each experiment: reseed the RNGs, build datasets/model/optimizer from
    the experiment's flags, train for EPOCHS epochs (evaluating on the test
    set after each), optionally run the 3-epoch second phase, then evaluate
    once more and append the result. A checkpoint CSV is rewritten after every
    experiment and ``results_gpu0.csv`` is written at the end.
    """
    import time  # imported once here rather than on every epoch

    # Parse the label file once; ids are compared as strings.
    labels = pd.read_csv('test_labels.csv')
    gt = dict(zip(labels['id'].astype(str), labels['class']))

    def test_acc(model, loader, classes, multi, vit):
        """Accuracy of ``model`` on ``loader`` against the class names in ``gt``."""
        ps = evalu(model, loader, multi, vit)
        return accuracy_score([gt[str(i)] for i, _ in ps], [classes[p] for _, p in ps])

    res = []
    for e in EXPS:
        print(f"\n[GPU0] {e['name']}")
        # Same seeds for every experiment so runs differ only by their flags.
        random.seed(42); np.random.seed(42); torch.manual_seed(42)
        lr = e.get('lr', LR)
        vit, multi = e.get('vit'), e.get('tta')
        tds = DS(PATH_TRAIN, e.get('con'))
        tes = TDS(PATH_TEST, e.get('tta'))
        cn = tds.classes
        if vit:
            m = ViT(len(cn)).to(device)
        else:
            m = VideoMAEForVideoClassification.from_pretrained(
                MODEL_CKPT, num_labels=len(cn), ignore_mismatched_sizes=True, num_frames=NUM_FRAMES
            ).to(device)
        col = Mix(len(cn)) if e.get('mix') else None
        tl = DataLoader(tds, BATCH, shuffle=True, num_workers=2, drop_last=True, collate_fn=col)
        # Multi-view items are several times larger, so use a smaller eval batch.
        tel = DataLoader(tes, 4 if e.get('tta') else BATCH, num_workers=2)
        opt = torch.optim.AdamW(m.parameters(), lr=lr, weight_decay=WD)
        sc = torch.amp.GradScaler()
        # Scheduler counts optimizer steps (batches / ACCUM); 10% of them are warm-up.
        sch = get_cosine_schedule_with_warmup(opt, int(len(tl)*EPOCHS*0.1/ACCUM), len(tl)*EPOCHS//ACCUM)
        for ep in range(EPOCHS):
            ep_start = time.time()
            # Honour an optional 'ls' key, as the GPU 1 worker does.
            l, a = train(m, tl, opt, sch, sc, e.get('mix'), e.get('ls', 0.0), vit)
            ta = test_acc(m, tel, cn, multi, vit)
            ep_time = time.time() - ep_start
            eta = ep_time * (EPOCHS - ep - 1)
            print(f"  [GPU{gpu_id}] Ep{ep+1}/{EPOCHS}: L={l:.4f}, Atr={a:.4f}, Ate={ta:.4f} | {ep_time//60:.0f}m{ep_time%60:.0f}s | ETA: {eta//60:.0f}m{eta%60:.0f}s")
        if e.get('two'):
            # Second phase: fresh optimizer at a tiny LR, plain batches (no
            # mixup collate), label smoothing 0.1, no warm-up.
            opt = torch.optim.AdamW(m.parameters(), lr=1e-6, weight_decay=WD)
            p2 = DataLoader(tds, BATCH, shuffle=True, num_workers=2, drop_last=True)
            sch = get_cosine_schedule_with_warmup(opt, 0, len(p2)*3//ACCUM)
            for ep in range(3):
                l, a = train(m, p2, opt, sch, sc, False, 0.1, vit)
                ta = test_acc(m, tel, cn, multi, vit)
                print(f"  P2Ep{ep+1}: L={l:.4f}, Atr={a:.4f}, Ate={ta:.4f}")
        fa = test_acc(m, tel, cn, multi, vit)
        print(f"  FINAL: {fa:.4f}")
        res.append({'exp': e['name'], 'test_acc': fa, 'lr': lr, 'gpu': gpu_id})
        # Save checkpoint after each exp
        pd.DataFrame(res).to_csv(f"results_gpu{gpu_id}_checkpoint.csv", index=False)
        print(f"  Checkpoint saved: {len(res)} experiments completed")
        # The optimizer and scheduler also reference the parameters; drop them
        # with the model so its memory can be released before the next model loads.
        del m, opt, sch
        torch.cuda.empty_cache()
    pd.DataFrame(res).to_csv('results_gpu0.csv', index=False)
    print("Saved results_gpu0.csv")


if __name__ == '__main__':
    main()

In [None]:
%%writefile worker_gpu1.py
#!/usr/bin/env python3
"""Worker GPU 1 - 5 experiments"""
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import warnings
warnings.filterwarnings('ignore')
import logging
logging.getLogger('transformers').setLevel(logging.ERROR)
logging.getLogger('tensorflow').setLevel(logging.ERROR)

import random, numpy as np, torch, torch.nn as nn, torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from pathlib import Path
from PIL import Image
import torchvision.transforms as T
import torchvision.transforms.functional as TF
from transformers import VideoMAEForVideoClassification, VideoMAEImageProcessor, get_cosine_schedule_with_warmup
import pandas as pd
from sklearn.metrics import accuracy_score

# Dataset roots: PATH_TRAIN holds one sub-directory per class (see DS),
# PATH_TEST holds one integer-named sub-directory per test clip (see TDS).
PATH_TRAIN = Path('/kaggle/input/action-video/data/data_train')
PATH_TEST = Path('/kaggle/input/action-video/data/test')
# Checkpoint id used for both the VideoMAE weights and the image processor below.
MODEL_CKPT = "MCG-NJU/videomae-base-finetuned-kinetics"
# Frames sampled per clip, final crop side length, and the size passed to
# TF.resize before cropping in the PIL-based pipelines.
NUM_FRAMES, IMG_SIZE, RESIZE = 16, 224, 256
# Loader batch size, gradient-accumulation factor (one optimizer step every
# ACCUM batches), epochs of the main phase, default learning rate, AdamW weight decay.
BATCH, ACCUM, EPOCHS, LR, WD = 20, 2, 10, 5e-5, 0.05
# Worker index; used in log lines and in the checkpoint CSV file name.
gpu_id = 1
# CUDA_VISIBLE_DEVICES='1' is set at the top of this script, so the second
# physical GPU is the only visible device and is addressed as cuda:0 here.
device = torch.device('cuda:0')
# The processor is loaded only for its normalization statistics.
proc = VideoMAEImageProcessor.from_pretrained(MODEL_CKPT)
MEAN, STD = proc.image_mean, proc.image_std

class DS(Dataset):
    """Training clips stored as ``root/<class>/<video>/*.jpg`` frame folders.

    Class names are the sorted sub-directory names of ``root``; each video
    directory becomes one ``(path, class_index)`` sample. ``con`` selects
    which of the two augmentation pipelines ``__getitem__`` applies.
    """

    def __init__(self, root, con=False):
        self.root = Path(root)
        self.con = con
        self.classes = sorted(d.name for d in self.root.iterdir() if d.is_dir())
        self.c2i = {name: k for k, name in enumerate(self.classes)}
        self.samples = []
        for name in self.classes:
            label = self.c2i[name]
            for vdir in (self.root / name).iterdir():
                if vdir.is_dir():
                    self.samples.append((vdir, label))

    def __len__(self):
        return len(self.samples)

    def _pil_clip(self, paths, picks):
        """PIL pipeline: resize, then one random resized crop and flip decision shared by all frames."""
        frames = [TF.resize(Image.open(paths[k]).convert('RGB'), RESIZE) for k in picks]
        # Crop parameters are drawn once (from the first frame) and reused.
        top, left, ch, cw = T.RandomResizedCrop.get_params(frames[0], (0.8, 1.0), (0.75, 1.33))
        flip = random.random() > 0.5
        out = []
        for frame in frames:
            patch = TF.resized_crop(frame, top, left, ch, cw, (IMG_SIZE, IMG_SIZE))
            if flip:
                patch = TF.hflip(patch)
            out.append(TF.normalize(TF.to_tensor(patch), MEAN, STD))
        return torch.stack(out)

    def _tensor_clip(self, paths, picks):
        """Tensor pipeline: random rescale, random crop, resize to IMG_SIZE, random flip on the stacked clip."""
        clip = torch.stack([TF.to_tensor(Image.open(paths[k]).convert('RGB')) for k in picks])
        scale = random.uniform(0.8, 1.0)
        clip = TF.resize(clip, [int(clip.shape[-2] * scale), int(clip.shape[-1] * scale)])
        height, width = clip.shape[-2], clip.shape[-1]
        top = random.randint(0, max(0, height - IMG_SIZE))
        left = random.randint(0, max(0, width - IMG_SIZE))
        # The crop is clamped to the frame, so small frames are upscaled by the resize.
        clip = TF.crop(clip, top, left, min(IMG_SIZE, height), min(IMG_SIZE, width))
        clip = TF.resize(clip, [IMG_SIZE, IMG_SIZE])
        if random.random() < 0.5:
            clip = TF.hflip(clip)
        return torch.stack([TF.normalize(frame, MEAN, STD) for frame in clip])

    def __getitem__(self, i):
        """Return ``(clip, label)`` where clip stacks NUM_FRAMES normalized frames."""
        vdir, label = self.samples[i]
        paths = sorted(vdir.glob('*.jpg'))
        # NUM_FRAMES indices spread evenly from the first to the last frame.
        picks = torch.linspace(0, len(paths) - 1, NUM_FRAMES).long()
        if self.con:
            return self._pil_clip(paths, picks), label
        return self._tensor_clip(paths, picks), label

class TDS(Dataset):
    """Test clips stored as ``root/<integer id>/*.jpg`` frame folders, ordered by id.

    ``tta`` yields six views per clip (three vertical crop positions, each
    plain and flipped); otherwise ``mv`` yields three views (top-left, center,
    bottom-right crops); otherwise one center-cropped clip is returned.
    ``tta`` takes precedence when both flags are set.
    """

    def __init__(self, root, tta=False, mv=False):
        self.root = Path(root)
        self.tta = tta
        self.mv = mv
        entries = [(d, int(d.name)) for d in self.root.iterdir() if d.is_dir()]
        entries.sort(key=lambda pair: pair[1])
        self.samples = entries

    def __len__(self):
        return len(self.samples)

    @staticmethod
    def _view(frames, top, left, flip=False):
        """Crop every frame at (top, left), optionally flip it, and stack the normalized tensors."""
        out = []
        for frame in frames:
            patch = TF.crop(frame, top, left, IMG_SIZE, IMG_SIZE)
            if flip:
                patch = TF.hflip(patch)
            out.append(TF.normalize(TF.to_tensor(patch), MEAN, STD))
        return torch.stack(out)

    def __getitem__(self, i):
        """Return ``(clip_or_views, video_id)`` for the i-th test clip."""
        vdir, vid = self.samples[i]
        paths = sorted(vdir.glob('*.jpg'))
        # NUM_FRAMES indices spread evenly from the first to the last frame.
        picks = torch.linspace(0, len(paths) - 1, NUM_FRAMES).long()
        frames = [TF.resize(Image.open(paths[k]).convert('RGB'), RESIZE) for k in picks]
        if self.tta:
            w, h = frames[0].size
            left = (w - IMG_SIZE) // 2
            # Vertical positions: center, top, bottom; the horizontal offset stays centered.
            tops = ((h - IMG_SIZE) // 2, 0, max(0, h - IMG_SIZE))
            views = [self._view(frames, top, left, flip) for top in tops for flip in (False, True)]
            return torch.stack(views), vid
        if self.mv:
            w, h = frames[0].size
            corners = (
                (0, 0),
                ((h - IMG_SIZE) // 2, (w - IMG_SIZE) // 2),
                (max(0, h - IMG_SIZE), max(0, w - IMG_SIZE)),
            )
            views = [self._view(frames, top, left) for top, left in corners]
            return torch.stack(views), vid
        single = [TF.normalize(TF.to_tensor(TF.center_crop(f, IMG_SIZE)), MEAN, STD) for f in frames]
        return torch.stack(single), vid

class Mix:
    """Collate function that applies mixup to a batch.

    Args:
        nc: number of classes (width of the soft-label rows).
        a: both shape parameters of the Beta distribution the mixing weight is drawn from.
    """

    def __init__(self, nc, a=0.8):
        self.nc = nc
        self.a = a

    def __call__(self, b):
        """Return ``(mixed inputs, soft labels)`` for the list of samples ``b``."""
        clips, labels = torch.utils.data.default_collate(b)
        # One mixing weight and one permutation per batch.
        lam = np.random.beta(self.a, self.a)
        perm = torch.randperm(clips.size(0))
        onehot = F.one_hot(labels, self.nc).float()
        mixed_x = lam * clips + (1 - lam) * clips[perm]
        mixed_y = lam * onehot + (1 - lam) * onehot[perm]
        return mixed_x, mixed_y

def train(m, ld, opt, sch, sc, mix=False, ls=0.0):
    """Run one training epoch with autocast, gradient scaling and accumulation.

    Args:
        m: model whose output exposes ``.logits``.
        ld: iterable of ``(x, y)`` batches. With ``mix`` true, ``y`` holds one
            soft-label row per sample; otherwise it holds class indices.
        opt: optimizer, stepped once every ``ACCUM`` batches.
        sch: learning-rate scheduler, stepped after every optimizer step.
        sc: gradient scaler used to scale the loss and step the optimizer.
        mix: use the soft-target cross-entropy instead of ``F.cross_entropy``.
        ls: label smoothing passed to ``F.cross_entropy`` (unused when ``mix``).

    Returns:
        ``(mean loss per sample, accuracy)`` over all batches seen. With
        ``mix`` true, accuracy is measured against the argmax of the soft labels.
    """
    m.train()
    # Start from clean gradients: when the number of batches is not a multiple
    # of ACCUM, the trailing micro-batch of the previous call was accumulated
    # but never stepped or cleared, and would otherwise leak into this call's
    # first optimizer step (including across the phase-1 / phase-2 boundary).
    opt.zero_grad()
    loss_s, cor, tot = 0.0, 0, 0
    for bi, (x, y) in enumerate(ld):
        x, y = x.to(device), y.to(device)
        with torch.amp.autocast('cuda'):
            lo = m(x).logits
            if mix:
                # Cross-entropy against soft targets.
                loss = -torch.sum(y * F.log_softmax(lo, 1), 1).mean()
                lb = y.argmax(1)
            else:
                loss = F.cross_entropy(lo, y, label_smoothing=ls)
                lb = y
        cor += (lo.argmax(1) == lb).sum().item()
        tot += lb.size(0)
        loss_s += loss.item() * lb.size(0)
        # Divide by ACCUM so the accumulated gradient is the mean over micro-batches.
        sc.scale(loss / ACCUM).backward()
        if (bi + 1) % ACCUM == 0:
            # Unscale before clipping so the norm threshold applies to true gradients.
            sc.unscale_(opt)
            torch.nn.utils.clip_grad_norm_(m.parameters(), 1.0)
            sc.step(opt)
            sc.update()
            opt.zero_grad()
            sch.step()
    return loss_s / tot, cor / tot

def evalu(m, ld, multi=False):
    """Predict a class index for every item of ``ld`` without tracking gradients.

    Args:
        m: model whose output exposes ``.logits``.
        ld: iterable of ``(x, ids)`` batches.
        multi: when true, ``x`` is (B, V, T, C, H, W) and the logits of the V
            views of each item are averaged before the argmax.

    Returns:
        List of ``(id, predicted_class_index)`` pairs in loader order.
    """
    m.eval()
    preds = []
    with torch.no_grad():
        for clips, ids in ld:
            if multi:
                B, V, T, C, H, W = clips.shape
                # Fold the view axis into the batch axis for a single forward pass.
                flat = clips.view(B * V, T, C, H, W).to(device)
                logits = m(flat).logits.view(B, V, -1).mean(1)
            else:
                logits = m(clips.to(device)).logits
            winners = logits.argmax(1).cpu().tolist()
            preds.extend(zip(ids.tolist(), winners))
    return preds

# Experiments run by this worker, in order. main() reads the keys with e.get():
#   'lr'  - overrides the default LR
#   'mv'  - passed to TDS as `mv` (three crops per test clip, logits averaged)
#   'con' - passed to DS as `con` (PIL pipeline: one random resized crop and
#           flip decision shared by all frames of a clip)
#   'ls'  - label smoothing used during the main training epochs
#   'mix' - collate training batches with Mix (mixup, soft labels)
#   'two' - after the main epochs, train 3 more epochs at lr=1e-6 without
#           mixup and with label smoothing 0.1
#   'tta' - passed to TDS as `tta` (six views per test clip, logits averaged)
EXPS = [
    {'name': 'Exp1_VideoMAE'},
    {'name': 'Exp1b_LR_High', 'lr': 1.25e-4},
    {'name': 'Exp3_MultiSeg', 'mv': True},
    {'name': 'Exp5_LabelSmooth', 'con': True, 'ls': 0.1},
    {'name': 'Exp7_FlipTTA', 'con': True, 'mix': True, 'two': True, 'tta': True},
]

def main():
    """Run every experiment in EXPS sequentially and record its test accuracy.

    For each experiment: reseed the RNGs, build datasets/model/optimizer from
    the experiment's flags, train for EPOCHS epochs (evaluating on the test
    set after each), optionally run the 3-epoch second phase, then evaluate
    once more and append the result. A checkpoint CSV is rewritten after every
    experiment and ``results_gpu1.csv`` is written at the end.
    """
    import time  # imported once here rather than on every epoch

    # Parse the label file once; ids are compared as strings.
    labels = pd.read_csv('test_labels.csv')
    gt = dict(zip(labels['id'].astype(str), labels['class']))

    def test_acc(model, loader, classes, multi):
        """Accuracy of ``model`` on ``loader`` against the class names in ``gt``."""
        ps = evalu(model, loader, multi)
        return accuracy_score([gt[str(i)] for i, _ in ps], [classes[p] for _, p in ps])

    res = []
    for e in EXPS:
        print(f"\n[GPU1] {e['name']}")
        # Same seeds for every experiment so runs differ only by their flags.
        random.seed(42); np.random.seed(42); torch.manual_seed(42)
        lr = e.get('lr', LR)
        # Both 'tta' and 'mv' make the test loader yield a stack of views per
        # clip, so every evaluation of this experiment must use the multi path.
        multi = e.get('tta') or e.get('mv')
        tds = DS(PATH_TRAIN, e.get('con'))
        tes = TDS(PATH_TEST, e.get('tta'), e.get('mv'))
        cn = tds.classes
        m = VideoMAEForVideoClassification.from_pretrained(
            MODEL_CKPT, num_labels=len(cn), ignore_mismatched_sizes=True, num_frames=NUM_FRAMES
        ).to(device)
        col = Mix(len(cn)) if e.get('mix') else None
        tl = DataLoader(tds, BATCH, shuffle=True, num_workers=2, drop_last=True, collate_fn=col)
        # Multi-view items are several times larger, so use a smaller eval batch.
        tel = DataLoader(tes, 4 if multi else BATCH, num_workers=2)
        opt = torch.optim.AdamW(m.parameters(), lr=lr, weight_decay=WD)
        sc = torch.amp.GradScaler()
        # Scheduler counts optimizer steps (batches / ACCUM); 10% of them are warm-up.
        sch = get_cosine_schedule_with_warmup(opt, int(len(tl)*EPOCHS*0.1/ACCUM), len(tl)*EPOCHS//ACCUM)
        for ep in range(EPOCHS):
            ep_start = time.time()
            l, a = train(m, tl, opt, sch, sc, e.get('mix'), e.get('ls', 0.0))
            ta = test_acc(m, tel, cn, multi)
            ep_time = time.time() - ep_start
            eta = ep_time * (EPOCHS - ep - 1)
            print(f"  [GPU{gpu_id}] Ep{ep+1}/{EPOCHS}: L={l:.4f}, Atr={a:.4f}, Ate={ta:.4f} | {ep_time//60:.0f}m{ep_time%60:.0f}s | ETA: {eta//60:.0f}m{eta%60:.0f}s")
        if e.get('two'):
            # Second phase: fresh optimizer at a tiny LR, plain batches (no
            # mixup collate), label smoothing 0.1, no warm-up.
            opt = torch.optim.AdamW(m.parameters(), lr=1e-6, weight_decay=WD)
            p2 = DataLoader(tds, BATCH, shuffle=True, num_workers=2, drop_last=True)
            sch = get_cosine_schedule_with_warmup(opt, 0, len(p2)*3//ACCUM)
            for ep in range(3):
                l, a = train(m, p2, opt, sch, sc, False, 0.1)
                # Uses the same multi flag as the other evaluations; previously
                # only 'tta' was passed here, which would break a 'two'+'mv' run.
                ta = test_acc(m, tel, cn, multi)
                print(f"  P2Ep{ep+1}: L={l:.4f}, Atr={a:.4f}, Ate={ta:.4f}")
        fa = test_acc(m, tel, cn, multi)
        print(f"  FINAL: {fa:.4f}")
        res.append({'exp': e['name'], 'test_acc': fa, 'lr': lr, 'gpu': gpu_id})
        # Save checkpoint after each exp
        pd.DataFrame(res).to_csv(f"results_gpu{gpu_id}_checkpoint.csv", index=False)
        print(f"  Checkpoint saved: {len(res)} experiments completed")
        # The optimizer and scheduler also reference the parameters; drop them
        # with the model so its memory can be released before the next model loads.
        del m, opt, sch
        torch.cuda.empty_cache()
    pd.DataFrame(res).to_csv('results_gpu1.csv', index=False)
    print("Saved results_gpu1.csv")


if __name__ == '__main__':
    main()

In [None]:
## 2. Run Both Workers in Parallel (Real-time Output)
import subprocess
import sys
import threading
import time

def stream_output(proc, name):
    """Stream process output in real-time.

    Reads ``proc.stdout`` line by line until EOF and echoes each line
    immediately. Non-blank lines that do not already contain ``[name]`` are
    prefixed with it, so interleaved output from several workers stays
    attributable. The pipe is closed when streaming ends, even on error.

    Args:
        proc: object with a text-mode ``stdout`` stream (e.g. a ``Popen``).
        name: label of the worker, e.g. ``'GPU0'``.
    """
    tag = f"[{name}]"
    try:
        # readline returns '' only at EOF, which ends the iteration.
        for line in iter(proc.stdout.readline, ''):
            if line.strip() and tag not in line:
                line = f"{tag} {line}"
            print(line, end='', flush=True)
    finally:
        proc.stdout.close()

print("Starting 2 workers in parallel...")
print("="*60)

# Start both processes with stdout pipe. sys.executable guarantees the workers
# run under the same interpreter (and installed packages) as this notebook,
# rather than whatever `python` happens to be first on PATH.
p0 = subprocess.Popen(
    [sys.executable, '-u', 'worker_gpu0.py'],  # -u for unbuffered
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,  # merge stderr so tracebacks are streamed too
    text=True,
    bufsize=1
)
p1 = subprocess.Popen(
    [sys.executable, '-u', 'worker_gpu1.py'],
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    text=True,
    bufsize=1
)

# Create threads to stream output
t0 = threading.Thread(target=stream_output, args=(p0, 'GPU0'))
t1 = threading.Thread(target=stream_output, args=(p1, 'GPU1'))

t0.start()
t1.start()

# Wait for completion
p0.wait()
p1.wait()
t0.join()
t1.join()

print("="*60)

# Fail loudly if a worker crashed, instead of reporting success and letting a
# later cell fail on a missing results file.
failed = {name: p.returncode for name, p in (('GPU0', p0), ('GPU1', p1)) if p.returncode != 0}
if failed:
    raise RuntimeError(f"Worker(s) exited with non-zero status: {failed}")
print("Both workers finished!")


In [None]:
## 3. Merge Results and Plot
import math

import pandas as pd
import matplotlib.pyplot as plt

# Learning rate whose runs are highlighted in red.
HIGH_LR = 1.25e-4


def _is_high_lr(exp, lr):
    """True if the run used HIGH_LR, judged by its recorded lr or its name.

    The lr is compared numerically: str(1.25e-4) is '0.000125', so a
    substring test for '1.25' can never match.
    """
    return 'High' in exp or math.isclose(float(lr), HIGH_LR, rel_tol=1e-6)


# Load results
df0 = pd.read_csv('results_gpu0.csv')
df1 = pd.read_csv('results_gpu1.csv')
df = pd.concat([df0, df1]).sort_values('exp').reset_index(drop=True)

print("="*60)
print("ALL RESULTS")
print("="*60)
print(df.to_string(index=False))

# LR Comparison: the two VideoMAE runs that differ only in learning rate.
print("\n--- LR Comparison ---")
lr_low = df[df['exp'] == 'Exp1_VideoMAE']['test_acc'].values[0]
lr_high = df[df['exp'] == 'Exp1b_LR_High']['test_acc'].values[0]
print(f"LR 5e-5:    {lr_low:.4f}")
print(f"LR 1.25e-4: {lr_high:.4f}")
print(f"Difference: {(lr_low - lr_high)*100:+.2f}%")

# Plot: one bar per experiment, coloured by learning rate.
fig, ax = plt.subplots(figsize=(14, 6))
colors = ['#e74c3c' if _is_high_lr(e, l) else '#3498db' for e, l in zip(df['exp'], df['lr'])]
bars = ax.bar(range(len(df)), df['test_acc'] * 100, color=colors, edgecolor='black')

ax.set_xticks(range(len(df)))
ax.set_xticklabels([e.replace('Exp', '').replace('_', '\n') for e in df['exp']], fontsize=9)
ax.set_ylabel('Test Accuracy (%)', fontsize=12)
ax.set_title('VideoMAE Ablation Study (Blue=LR 5e-5, Red=LR 1.25e-4)', fontsize=14, fontweight='bold')

# Value label just above each bar.
for bar, acc in zip(bars, df['test_acc']):
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5, f'{acc*100:.1f}%',
            ha='center', fontsize=9, fontweight='bold')

ax.set_ylim([50, 100])
ax.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()