In [1]:
import os, glob
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torchvision import models
from PIL import Image
import pandas as pd
from tqdm import tqdm
import numpy as np

In [2]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

BASE_DIR = "/kaggle/input/pixel-play-26/Avenue_Corrupted-20251221T112159Z-3-001/Avenue_Corrupted/Dataset"
TRAIN_DIR = os.path.join(BASE_DIR, "training_videos")
TEST_DIR  = os.path.join(BASE_DIR, "testing_videos")
OUTPUT_PATH = "/kaggle/working/submission.csv"

# temporal settings
FRAME_STRIDE = 1
TEMPORAL_WINDOW = 2   # frame_t, frame_{t-1}

# training settings (used later)
BATCH_SIZE = 32
EPOCHS = 10
LR = 1e-3

# reproducibility
SEED = 42

IMG_SIZE = 224

# preprocessing flags
# USE_GRAYSCALE = True
USE_GRAYSCALE = False
FIX_ORIENTATION = True


In [3]:
IMAGENET_MEAN = torch.tensor([0.485, 0.456, 0.406]).view(3,1,1).to(DEVICE)
IMAGENET_STD  = torch.tensor([0.229, 0.224, 0.225]).view(3,1,1).to(DEVICE)


In [4]:
import torchvision.models as models

resnet = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
resnet.fc = torch.nn.Identity()   # remove classifier, keep features
resnet = resnet.to(DEVICE)
resnet.eval()

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth


100%|██████████| 44.7M/44.7M [00:00<00:00, 196MB/s]


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [5]:
class FrameDataset(Dataset):
    def __init__(self, root_dir, img_size, use_grayscale=True, fix_orientation=True):
        self.img_size = img_size
        self.use_grayscale = use_grayscale
        self.fix_orientation = fix_orientation
        self.samples = []

        # collect frames: (video_id, frame_path)
        for video in sorted(os.listdir(root_dir)):
            video_path = os.path.join(root_dir, video)
            if not os.path.isdir(video_path):
                continue

            for fname in sorted(os.listdir(video_path)):
                if fname.endswith(".jpg"):
                    self.samples.append(
                        (video, os.path.join(video_path, fname))
                    )

    def __len__(self):
        return len(self.samples)

    def _fix_orientation(self, img):
        gray = np.array(img.convert("L"))
        top = gray[:50, :].mean()
        bottom = gray[-50:, :].mean()
        if top > bottom:
            img = img.rotate(180)
        return img

    def __getitem__(self, idx):
        video_id, path = self.samples[idx]
        # load image
        img = Image.open(path)

        # orientation normalization (dataset artifact fix)
        if self.fix_orientation:
            img = self._fix_orientation(img)

        # grayscale (safe)
        if self.use_grayscale:
            img = img.convert("L")

        # resize
        img = img.resize((self.img_size, self.img_size))

        # to tensor [0,1]
        img = torch.from_numpy(np.array(img)).float() / 255.0

        # add channel dimension
        if self.use_grayscale:
            img = img.unsqueeze(0)   # (1,H,W)
        else:
            img = img.permute(2, 0, 1)  # (C,H,W)

        # video_id: "01" → 1
        vid = int(video_id)

        # filename: "frame_00939.jpg" → 939
        frame_num = int(
                        os.path.basename(path)
                        .replace(".jpg", "")
                        .split("_")[-1]
                        )

        full_id = f"{vid}_{frame_num}"
        return img, vid, full_id


In [6]:
dataset = FrameDataset(
    root_dir=TEST_DIR,
    img_size=IMG_SIZE,
    use_grayscale=USE_GRAYSCALE,
    fix_orientation=FIX_ORIENTATION
)


In [7]:
train_dataset = FrameDataset(
    root_dir=TRAIN_DIR,
    img_size=IMG_SIZE,
    use_grayscale=False,        # ResNet expects 3 channels
    fix_orientation=True
)

features = []
vids = []

for img, vid, _ in tqdm(train_dataset):
    img = (img.to(DEVICE) - IMAGENET_MEAN) / IMAGENET_STD
    with torch.no_grad():
        feat = resnet(img.unsqueeze(0)).squeeze().cpu()
    features.append(feat)
    vids.append(vid)

features = torch.stack(features)
print("Feature shape:", features.shape)

100%|██████████| 9204/9204 [02:45<00:00, 55.74it/s]

Feature shape: torch.Size([9204, 512])





In [8]:
WINDOW = 10

seqs = []
for i in range(len(features) - WINDOW):
    seqs.append(features[i:i+WINDOW])

seqs = torch.stack(seqs)   # (N, T, 512)
print(seqs.shape)


torch.Size([9194, 10, 512])


In [9]:
class LSTMAutoEncoder(torch.nn.Module):
    def __init__(self, feat_dim=512, hidden=256):
        super().__init__()
        self.encoder = torch.nn.LSTM(feat_dim, hidden, batch_first=True)
        self.decoder = torch.nn.LSTM(hidden, feat_dim, batch_first=True)

    def forward(self, x):
        z, _ = self.encoder(x)
        out, _ = self.decoder(z)
        return out


In [10]:
ae = LSTMAutoEncoder().to(DEVICE)
opt = torch.optim.Adam(ae.parameters(), lr=1e-3)
loss_fn = torch.nn.MSELoss()

X = seqs.to(DEVICE)

for ep in range(15):
    opt.zero_grad()
    out = ae(X)
    loss = loss_fn(out, X)
    loss.backward()
    opt.step()
    print(f"Epoch {ep+1} | Loss {loss.item():.6f}")


Epoch 1 | Loss 0.913750
Epoch 2 | Loss 0.839692
Epoch 3 | Loss 0.741503
Epoch 4 | Loss 0.624366
Epoch 5 | Loss 0.527520
Epoch 6 | Loss 0.451166
Epoch 7 | Loss 0.392395
Epoch 8 | Loss 0.349518
Epoch 9 | Loss 0.320285
Epoch 10 | Loss 0.300700
Epoch 11 | Loss 0.287250
Epoch 12 | Loss 0.277754
Epoch 13 | Loss 0.271026
Epoch 14 | Loss 0.266220
Epoch 15 | Loss 0.262570


In [11]:
results = []
buffer = []
prev_vid = None

for img, vid, fid in dataset:

    img = (img.to(DEVICE) - IMAGENET_MEAN) / IMAGENET_STD
    with torch.no_grad():
        feat = resnet(img.unsqueeze(0)).squeeze().cpu()

    if prev_vid != vid:
        buffer = []

    buffer.append(feat)

    if len(buffer) < WINDOW:
        score = 0.0
    else:
        seq = torch.stack(buffer[-WINDOW:]).unsqueeze(0).to(DEVICE)
        with torch.no_grad():
            recon = ae(seq)
        score = torch.mean((recon.cpu() - seq.cpu())**2).item()

    results.append({"Id": fid, "Predicted": score})
    prev_vid = vid


In [12]:
df = pd.DataFrame(results)
df[['vid','frame']] = df['Id'].str.split('_', expand=True).astype(int)

df['Predicted'] = df.groupby('vid')['Predicted'].transform(
    lambda x: (x - x.min()) / (x.max() - x.min() + 1e-6)
)

df = df[['Id','Predicted']]
df.to_csv("/kaggle/working/submission.csv", index=False)

print("Unique IDs:", df['Id'].nunique())


Unique IDs: 11706
