In [3]:
# === Cleaned & corrected notebook cell for Kaggle ===
# Run this inside a Kaggle kernel (the dataset is already mounted at /kaggle/input/rwf2000)

# 0. (Optional) Installs -- run only if you actually need them.
# !pip install --quiet timm==0.6.13 einops decord==0.6.2

import os
import glob
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# === 1. Paths (use the mounted dataset on Kaggle) ===
BASE = "/kaggle/input/rwf2000/RWF-2000"  
assert os.path.exists(BASE), f"Base path not found: {BASE}"

# === 2. Build dataframe from existing train/val dirs (you already have train/val) ===
rows = []
for split in ["train", "val"]:
    for cls in ["Fight", "NonFight"]:
        class_dir = os.path.join(BASE, split, cls)
        if not os.path.exists(class_dir):
            print("Missing folder:", class_dir)
            continue
        videos = glob.glob(os.path.join(class_dir, "*.avi")) + glob.glob(os.path.join(class_dir, "*.mp4"))
        for v in videos:
            rows.append({
                "clip_path": v,
                "label": "violence" if cls == "Fight" else "non_violence",
                "split": split
            })

df = pd.DataFrame(rows)
print("Total videos found:", len(df))
print(df['label'].value_counts())

# === 3. OPTIONAL: Create train/val/test splits if you prefer new splits ===
# If you want to keep the original train/val you can skip this section and create CSVs using df[df['split']=='train'] etc.
use_existing_splits = True

if not use_existing_splits:
    train_val, test = train_test_split(df, test_size=0.1, stratify=df["label"], random_state=42)
    train, val = train_test_split(train_val, test_size=0.12, stratify=train_val["label"], random_state=42)
else:
    train = df[df['split'] == 'train'].reset_index(drop=True)
    val = df[df['split'] == 'val'].reset_index(drop=True)
    # If there's no val in dataset and you want a test set, you can split train further.
    test = pd.DataFrame(columns=df.columns)

# Save CSVs to /kaggle/working (writable)
train_csv = "/kaggle/working/train.csv"
val_csv = "/kaggle/working/val.csv"
test_csv = "/kaggle/working/test.csv"
train.to_csv(train_csv, index=False)
val.to_csv(val_csv, index=False)
test.to_csv(test_csv, index=False)
print("Saved CSVs:", train_csv, val_csv, test_csv)
print("Counts ->", len(train), len(val), len(test))

# === 4. Frame extraction util (decord preferred, fallback to cv2) ===
import torch
from torch.utils.data import Dataset, DataLoader

try:
    from decord import VideoReader, cpu
    decord_available = True
    print("decord available")
except Exception as e:
    decord_available = False
    print("decord not available, will fallback to cv2:", e)

def sample_frames_decord(vr, num_frames):
    total = len(vr)
    if total == 0:
        return None
    idxs = np.linspace(0, total - 1, num_frames).astype(int)
    frames = vr.get_batch(idxs).asnumpy()  # (T,H,W,C)
    return frames

def extract_frames(path, num_frames=16, target_size=(224,224)):
    # Return (T,H,W,C) uint8 in RGB
    if decord_available:
        try:
            vr = VideoReader(path, ctx=cpu(0))
            frames = sample_frames_decord(vr, num_frames)
            if frames is None or len(frames) < num_frames:
                # pad by repeating last frame
                if frames is None or len(frames) == 0:
                    return np.zeros((num_frames, target_size[0], target_size[1], 3), dtype=np.uint8)
                last = frames[-1]
                pad = np.repeat(last[None, ...], num_frames - len(frames), axis=0)
                frames = np.concatenate([frames, pad], axis=0)
            # resize if needed (decord returns original sizes)
            import cv2
            resized = []
            for f in frames:
                resized.append(cv2.resize(f, target_size))
            return np.stack(resized, axis=0)
        except Exception as e:
            # fallback
            print("decord read error, falling back to cv2:", e)

    # cv2 fallback (safe)
    import cv2
    cap = cv2.VideoCapture(path)
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
    if total <= 0:
        cap.release()
        return np.zeros((num_frames, target_size[0], target_size[1], 3), dtype=np.uint8)
    idxs = np.linspace(0, total - 1, num_frames).astype(int)
    frames = []
    for idx in idxs:
        cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
        ret, frame = cap.read()
        if not ret:
            break
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frame = cv2.resize(frame, target_size)
        frames.append(frame)
    cap.release()
    if len(frames) < num_frames:
        if len(frames) == 0:
            return np.zeros((num_frames, target_size[0], target_size[1], 3), dtype=np.uint8)
        last = frames[-1]
        for _ in range(num_frames - len(frames)):
            frames.append(last.copy())
    return np.stack(frames, axis=0)

# === 5. Dataset and transforms ===
from torchvision import transforms
import torchvision.transforms.functional as TF

frames_per_clip = 16

class VideoTransform:
    def __init__(self, size=224):
        self.size = size
        self.resize = transforms.Resize((size, size))
    def __call__(self, frames_np):
        # frames_np: (T,H,W,C) uint8
        T = frames_np.shape[0]
        tensors = []
        for i in range(T):
            img = TF.to_pil_image(frames_np[i])
            img = TF.resize(img, [self.size, self.size])
            t = TF.to_tensor(img)  # C,H,W in [0,1]
            tensors.append(t)
        # stack into (C, T, H, W)
        frames = torch.stack(tensors, dim=1)
        mean = torch.tensor([0.485, 0.456, 0.406])[:, None, None, None]
        std = torch.tensor([0.229, 0.224, 0.225])[:, None, None, None]
        frames = (frames - mean) / std
        return frames

video_transform = VideoTransform(size=224)

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(['non_violence', 'violence'])

class RWFVideoDataset(Dataset):
    def __init__(self, csv_file, transform=None, frames_per_clip=16):
        self.df = pd.read_csv(csv_file)
        self.paths = self.df['clip_path'].values
        self.labels = self.df['label'].values
        self.transform = transform
        self.frames_per_clip = frames_per_clip
        self.le = le
    def __len__(self):
        return len(self.paths)
    def __getitem__(self, idx):
        p = self.paths[idx]
        label = self.labels[idx]
        frames = extract_frames(p, num_frames=self.frames_per_clip, target_size=(224,224))
        frames = self.transform(frames)  # (C,T,H,W)
        label_idx = int(self.le.transform([label])[0])
        return frames, label_idx

# === 6. Model (use torchvision 3D model as backbone) ===
import torch.nn as nn
import torchvision
device = 'cuda' if torch.cuda.is_available() else 'cpu'

try:
    from torchvision.models.video import mc3_18
    backbone = mc3_18(pretrained=True)
    # mc3_18 expects input (B,3,T,H,W) and has backbone.fc
    backbone.fc = nn.Linear(backbone.fc.in_features, 2)
    model = backbone.to(device)
    print("Using mc3_18 backbone, device:", device)
except Exception as e:
    print("Could not load mc3_18 pretrained, falling back to tiny 3D conv:", e)
    class Simple3D(nn.Module):
        def __init__(self, num_classes=2):
            super().__init__()
            self.features = nn.Sequential(
                nn.Conv3d(3, 32, kernel_size=(3,3,3), stride=1, padding=1),
                nn.ReLU(),
                nn.MaxPool3d((1,2,2)),
                nn.Conv3d(32, 64, kernel_size=(3,3,3), padding=1),
                nn.ReLU(),
                nn.AdaptiveAvgPool3d((1,1,1)),
            )
            self.head = nn.Linear(64, num_classes)
        def forward(self,x):
            x = self.features(x)
            x = x.view(x.size(0), -1)
            return self.head(x)
    model = Simple3D(num_classes=2).to(device)

# === 7. Training setup ===
from torch.optim import Adam
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
scaler = torch.cuda.amp.GradScaler()

# Dataloaders
batch_size = 2   # lower this if OOM
num_workers = 2
train_dataset = RWFVideoDataset(train_csv, transform=video_transform, frames_per_clip=frames_per_clip)
val_dataset = RWFVideoDataset(val_csv, transform=video_transform, frames_per_clip=frames_per_clip)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=(device=='cuda'))
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=(device=='cuda'))

# === 8. Training loop (simple, mixed precision) ===
def train_one_epoch(model, loader, optimizer, criterion, device, scaler, epoch):
    model.train()
    running_loss = 0.0
    total = 0
    correct = 0
    for i, (inputs, labels) in enumerate(loader):
        inputs = inputs.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)
        # torchvision 3D expects (B,C,T,H,W) - our transform produces that
        with torch.cuda.amp.autocast(enabled=(device=='cuda')):
            outputs = model(inputs)
            loss = criterion(outputs, labels)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
        running_loss += loss.item()
        preds = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += (preds == labels).sum().item()
        if (i+1) % 50 == 0:
            print(f'Epoch {epoch} Iter {i+1}/{len(loader)} Loss {running_loss/(i+1):.4f} Acc {correct/total:.4f}')
    return running_loss / len(loader), correct / total

def evaluate(model, loader, criterion, device):
    model.eval()
    running_loss = 0.0
    total = 0
    correct = 0
    with torch.no_grad():
        for inputs, labels in loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            running_loss += loss.item()
            preds = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (preds == labels).sum().item()
    return running_loss / len(loader), correct / total

# === 9. Run a short training run (set epochs small for test) ===
num_epochs = 5
best_val_acc = 0.0
for epoch in range(1, num_epochs+1):
    train_loss, train_acc = train_one_epoch(model, train_loader, optimizer, criterion, device, scaler, epoch)
    val_loss, val_acc = evaluate(model, val_loader, criterion, device)
    print(f'Epoch {epoch} Train Loss {train_loss:.4f} Acc {train_acc:.4f} | Val Loss {val_loss:.4f} Acc {val_acc:.4f}')
    # checkpoint (optional)
    torch.save({'epoch': epoch, 'model_state': model.state_dict(), 'optimizer': optimizer.state_dict()}, f'/kaggle/working/rwf_epoch{epoch}.pth')

print("Done.")

Total videos found: 2000
label
violence        1000
non_violence    1000
Name: count, dtype: int64
Saved CSVs: /kaggle/working/train.csv /kaggle/working/val.csv /kaggle/working/test.csv
Counts -> 1600 400 0
decord not available, will fallback to cv2: No module named 'decord'




Using mc3_18 backbone, device: cuda


  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast(enabled=(device=='cuda')):


Epoch 1 Iter 50/800 Loss 0.6433 Acc 0.6300
Epoch 1 Iter 100/800 Loss 0.6438 Acc 0.6550
Epoch 1 Iter 150/800 Loss 0.6440 Acc 0.6400
Epoch 1 Iter 200/800 Loss 0.6352 Acc 0.6525
Epoch 1 Iter 250/800 Loss 0.6305 Acc 0.6500
Epoch 1 Iter 300/800 Loss 0.6217 Acc 0.6583
Epoch 1 Iter 350/800 Loss 0.6188 Acc 0.6571
Epoch 1 Iter 400/800 Loss 0.5959 Acc 0.6725
Epoch 1 Iter 450/800 Loss 0.5929 Acc 0.6744
Epoch 1 Iter 500/800 Loss 0.5831 Acc 0.6800
Epoch 1 Iter 550/800 Loss 0.5797 Acc 0.6836
Epoch 1 Iter 600/800 Loss 0.5817 Acc 0.6850
Epoch 1 Iter 650/800 Loss 0.5797 Acc 0.6854
Epoch 1 Iter 700/800 Loss 0.5787 Acc 0.6850
Epoch 1 Iter 750/800 Loss 0.5762 Acc 0.6873
Epoch 1 Iter 800/800 Loss 0.5672 Acc 0.6956
Epoch 1 Train Loss 0.5672 Acc 0.6956 | Val Loss 0.3310 Acc 0.8550
Epoch 2 Iter 50/800 Loss 0.4438 Acc 0.7700
Epoch 2 Iter 100/800 Loss 0.4372 Acc 0.7800
Epoch 2 Iter 150/800 Loss 0.4107 Acc 0.8033
Epoch 2 Iter 200/800 Loss 0.4141 Acc 0.8025
Epoch 2 Iter 250/800 Loss 0.4073 Acc 0.8120
Epoch 2 Iter

In [1]:
import os
import pandas as pd
import glob
from sklearn.model_selection import train_test_split

BASE = "/kaggle/input/rwf2000/RWF-2000"

rows = []

for split in ["train", "val"]:
    split_dir = os.path.join(BASE, split)
    
    for class_name in ["Fight", "NonFight"]:
        class_dir = os.path.join(split_dir, class_name)

        # Match both AVI and MP4
        videos = glob.glob(os.path.join(class_dir, "*.avi")) + \
                 glob.glob(os.path.join(class_dir, "*.mp4"))
        
        for v in videos:
            rows.append({
                "clip_path": v,
                "label": "violence" if class_name == "Fight" else "non_violence"
            })

df = pd.DataFrame(rows)

print("Total videos:", len(df))
print(df.head())
print(df['label'].value_counts())

Total videos: 2000
                                           clip_path     label
0  /kaggle/input/rwf2000/RWF-2000/train/Fight/p1b...  violence
1  /kaggle/input/rwf2000/RWF-2000/train/Fight/gHc...  violence
2  /kaggle/input/rwf2000/RWF-2000/train/Fight/чЫС...  violence
3  /kaggle/input/rwf2000/RWF-2000/train/Fight/XRC...  violence
4  /kaggle/input/rwf2000/RWF-2000/train/Fight/4yT...  violence
label
violence        1000
non_violence    1000
Name: count, dtype: int64
