# Loading Dataset

In [30]:
import os
import glob

# ✅ Define dataset path (UPDATE this with your dataset name)
dataset_path = "/kaggle/input/celebdf-v2"  # CHANGE THIS!

# ✅ Get correct paths
# glob returns entries in arbitrary filesystem order. Sorting pins the sample
# order so that a seeded train/val/test split of `video_paths` yields the same
# partition on every machine and every run.
real_videos = sorted(
    glob.glob(os.path.join(dataset_path, "YouTube-real", "*.mp4"))
    + glob.glob(os.path.join(dataset_path, "Celeb-real", "*.mp4"))
)
fake_videos = sorted(glob.glob(os.path.join(dataset_path, "Celeb-synthesis", "*.mp4")))

# ✅ Assign labels (0 = real, 1 = fake)
video_paths = real_videos + fake_videos
labels = [0] * len(real_videos) + [1] * len(fake_videos)

# An empty result almost always means `dataset_path` is wrong; say so here
# instead of failing later with an obscure error during splitting.
if not video_paths:
    print(f"⚠️ No .mp4 files found under {dataset_path} - check the dataset path.")

# ✅ Print summary
print(f"📊 Total Videos: {len(video_paths)} (Real: {len(real_videos)}, Fake: {len(fake_videos)})")


📊 Total Videos: 6529 (Real: 890, Fake: 5639)


# Checking Dataset Directory Structure

In [31]:
import os

# Walk the dataset tree and list at most five file names per directory,
# enough to confirm the folder layout without flooding the output.
for root, _subdirs, files in os.walk(dataset_path):
    print(f"📁 Directory: {root}")
    preview = files[:5]
    for file in preview:
        print(f"  ├── {file}")


📁 Directory: /kaggle/input/celebdf-v2
  ├── List_of_testing_videos.txt
📁 Directory: /kaggle/input/celebdf-v2/YouTube-real
  ├── 00238.mp4
  ├── 00152.mp4
  ├── 00269.mp4
  ├── 00209.mp4
  ├── 00297.mp4
📁 Directory: /kaggle/input/celebdf-v2/Celeb-synthesis
  ├── id33_id20_0005.mp4
  ├── id24_id19_0004.mp4
  ├── id0_id3_0009.mp4
  ├── id51_id50_0008.mp4
  ├── id54_id53_0003.mp4
📁 Directory: /kaggle/input/celebdf-v2/Celeb-real
  ├── id49_0006.mp4
  ├── id27_0008.mp4
  ├── id32_0000.mp4
  ├── id1_0006.mp4
  ├── id50_0006.mp4


# Splitting Dataset into Train, Validation & Test

In [12]:
from sklearn.model_selection import train_test_split

# ✅ Split data into Train (80%), Validation (10%), Test (10%)
# Step 1: carve off a stratified 20% hold-out pool; the remaining 80% is train.
train_videos, holdout_videos, train_labels, holdout_labels = train_test_split(
    video_paths,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)
# Step 2: halve the hold-out pool (again stratified) into validation and test.
val_videos, test_videos, val_labels, test_labels = train_test_split(
    holdout_videos,
    holdout_labels,
    test_size=0.5,
    stratify=holdout_labels,
    random_state=42,
)

# ✅ Print dataset sizes
print(f"✅ Train: {len(train_videos)}, Validation: {len(val_videos)}, Test: {len(test_videos)}")


✅ Train: 5223, Validation: 653, Test: 653


# Counting Real and Fake Samples in Each Split

In [13]:
import numpy as np

# Tally labels per split (0 = real, 1 = fake) to expose the class imbalance.
train_arr, val_arr, test_arr = (
    np.array(split) for split in (train_labels, val_labels, test_labels)
)

print("📊 Train Set - Real:", (train_arr == 0).sum(), "| Fake:", (train_arr == 1).sum())
print("📊 Validation Set - Real:", (val_arr == 0).sum(), "| Fake:", (val_arr == 1).sum())
print("📊 Test Set - Real:", (test_arr == 0).sum(), "| Fake:", (test_arr == 1).sum())


📊 Train Set - Real: 712 | Fake: 4511
📊 Validation Set - Real: 89 | Fake: 564
📊 Test Set - Real: 89 | Fake: 564


# Oversampling the Train Set

In [14]:
from imblearn.over_sampling import RandomOverSampler
import numpy as np

# RandomOverSampler expects a 2-D feature matrix, so the video paths are
# wrapped into a single-column array; labels stay a flat vector.
path_column = np.array(train_videos).reshape(-1, 1)
label_vector = np.array(train_labels)

# Duplicate minority-class rows at random until both classes are equal in size.
oversampler = RandomOverSampler(sampling_strategy='auto', random_state=42)
balanced_paths, balanced_labels = oversampler.fit_resample(path_column, label_vector)

# Unwrap back to plain Python lists; only the training split is rebalanced.
train_videos = balanced_paths.flatten().tolist()
train_labels = balanced_labels.tolist()

# ✅ Print new dataset sizes
print(f"📊 New Train Set - Real: {train_labels.count(0)} | Fake: {train_labels.count(1)}")


📊 New Train Set - Real: 4511 | Fake: 4511


# Re-checking Class Balance After Oversampling

In [15]:
# Report real (0) / fake (1) counts for each split using the current labels.
for caption, split_labels in (
    ("📊 Final Train Set - Real:", train_labels),
    ("📊 Validation Set - Real:", val_labels),
    ("📊 Test Set - Real:", test_labels),
):
    as_array = np.array(split_labels)
    print(caption, np.sum(as_array == 0), "| Fake:", np.sum(as_array == 1))


📊 Final Train Set - Real: 4511 | Fake: 4511
📊 Validation Set - Real: 89 | Fake: 564
📊 Test Set - Real: 89 | Fake: 564


# Video Dataset with Frame Extraction

In [17]:
import cv2
import torch
from torch.utils.data import Dataset

class DeepfakeDataset(Dataset):
    """Video-level dataset that samples a fixed number of frames per clip.

    Each item is a ``(frames, label)`` pair where ``frames`` is a tensor of
    ``num_frames`` stacked frames and ``label`` is a ``torch.long`` scalar.

    Args:
        video_paths: Sequence of video file paths.
        labels: Sequence of integer class labels, aligned with ``video_paths``.
        transform: Optional callable invoked as ``transform(image=frame)`` whose
            result is indexed with ``"image"`` (Albumentations-style call).
        num_frames: Number of evenly spaced frames to sample from each video.

    Raises:
        ValueError: If ``video_paths`` and ``labels`` differ in length.
    """

    def __init__(self, video_paths, labels, transform=None, num_frames=10):
        if len(video_paths) != len(labels):
            raise ValueError(
                f"video_paths ({len(video_paths)}) and labels ({len(labels)}) must have the same length"
            )
        self.video_paths = video_paths
        self.labels = labels
        self.transform = transform
        self.num_frames = num_frames

    def __len__(self):
        return len(self.video_paths)

    def extract_frames(self, video_path):
        """Read ``num_frames`` evenly spaced RGB frames from ``video_path``.

        Returns:
            A list of HxWx3 RGB arrays. The list has exactly ``num_frames``
            entries when at least one frame could be decoded (short reads are
            padded by repeating the last decoded frame), and is empty when
            nothing could be decoded.
        """
        frames = []
        cap = cv2.VideoCapture(video_path)
        try:
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            # Clamp so a reported count of 0 does not yield negative indices.
            last_index = max(total_frames - 1, 0)
            frame_indices = np.linspace(0, last_index, self.num_frames, dtype=int)

            for idx in frame_indices:
                cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
                ret, frame = cap.read()
                if not ret:
                    # The reported frame count can exceed what is decodable.
                    break
                frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))  # BGR -> RGB
        finally:
            # Always free the decoder handle, even if decoding raised.
            cap.release()

        # Keep the temporal length constant so samples can be batched together.
        if frames and len(frames) < self.num_frames:
            frames.extend([frames[-1]] * (self.num_frames - len(frames)))
        return frames

    @staticmethod
    def _to_tensor(frame):
        """Convert one frame to a tensor; tensors are returned unchanged.

        NumPy frames with three dimensions are moved from HWC to CHW, and
        uint8 data is scaled to float32 in [0, 1].
        """
        if isinstance(frame, torch.Tensor):
            return frame
        tensor = torch.from_numpy(np.ascontiguousarray(frame))
        if tensor.ndim == 3:
            tensor = tensor.permute(2, 0, 1)
        if tensor.dtype == torch.uint8:
            tensor = tensor.float().div(255.0)
        return tensor

    def __getitem__(self, idx):
        """Return ``(frames_tensor, label_tensor)`` for the video at ``idx``.

        Raises:
            RuntimeError: If no frame could be decoded from the video.
        """
        video_path = self.video_paths[idx]
        label = self.labels[idx]

        frames = self.extract_frames(video_path)
        if not frames:
            raise RuntimeError(f"Could not read any frames from video: {video_path}")

        if self.transform:
            frames = [self.transform(image=frame)["image"] for frame in frames]

        # torch.stack only accepts tensors, so raw NumPy frames (no transform,
        # or a transform that returns arrays) are converted first.
        frames_tensor = torch.stack([self._to_tensor(frame) for frame in frames])

        return frames_tensor, torch.tensor(label, dtype=torch.long)


# Creating Data Loader

In [33]:
from torch.utils.data import DataLoader

# ✅ Create dataset objects (one per split, default transform and frame count)
train_dataset, val_dataset, test_dataset = (
    DeepfakeDataset(paths, targets)
    for paths, targets in (
        (train_videos, train_labels),
        (val_videos, val_labels),
        (test_videos, test_labels),
    )
)

# ✅ Create DataLoaders
# Only the training loader shuffles; evaluation loaders keep a fixed order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)
eval_loaders = [DataLoader(split, shuffle=False, batch_size=8) for split in (val_dataset, test_dataset)]
val_loader, test_loader = eval_loaders

# ✅ Print success message
print("✅ DataLoaders are ready!")


✅ DataLoaders are ready!


In [22]:
# Import smoke test: this cell raises ImportError if PyWavelets is not
# available; the message below only confirms that the import succeeded.
import pywt
print("✅ PyWavelets Installed Successfully!")


✅ PyWavelets Installed Successfully!


# Defining the Wavelet Transform (DWT) Feature Extractor

In [25]:
import pywt
import torch
import torch.nn as nn

class DWTFeatureExtractor(nn.Module):
    """Extract the low-frequency (LL) sub-band of a single-level 2-D DWT.

    The transform is computed with PyWavelets on the CPU via NumPy, so no
    gradient flows through this module; it has no learnable parameters.

    Args:
        wavelet: Wavelet name passed to ``pywt.dwt2`` (default ``'haar'``).
    """

    def __init__(self, wavelet='haar'):
        super().__init__()
        self.wavelet = wavelet

    def forward(self, x):
        """Apply the DWT to every frame of ``x``.

        Args:
            x: Tensor of shape ``(B, T, C, H, W)``.

        Returns:
            Tensor of shape ``(B, T, C, h, w)`` on ``x``'s device holding the
            approximation coefficients, where ``(h, w)`` is whatever spatial
            size ``pywt.dwt2`` produces for ``(H, W)``.
        """
        batch_size, num_frames, C, H, W = x.shape  # (B, T, C, H, W)
        # reshape (not view) so non-contiguous inputs are accepted.
        flat = x.reshape(-1, C, H, W)

        # detach() lets tensors that require grad be converted to NumPy.
        host = flat.detach().cpu().numpy()
        # One batched call over the last two axes replaces a per-frame loop.
        ll, _details = pywt.dwt2(host, self.wavelet, axes=(-2, -1))
        ll_tensor = torch.as_tensor(ll).to(x.device)

        # Use the size actually returned: for odd H or W it is not H//2 or W//2.
        out_h, out_w = ll_tensor.shape[-2:]
        return ll_tensor.reshape(batch_size, num_frames, C, out_h, out_w)

print("✅ DWTFeatureExtractor Defined!")


✅ DWTFeatureExtractor Defined!


In [26]:
import timm

class DeepfakeModel(nn.Module):
    """Wavelet front-end followed by a Swin-Tiny classifier, applied per frame.

    Args:
        num_classes: Number of output classes for the Swin head.
        pretrained: Whether timm should load pretrained weights. Loading them
            may require network access; pass ``False`` to build the
            architecture without downloading anything.
    """

    # Spatial size expected by "swin_tiny_patch4_window7_224".
    INPUT_SIZE = 224

    def __init__(self, num_classes=2, pretrained=True):
        super().__init__()
        self.dwt = DWTFeatureExtractor()  # Wavelet feature extractor
        self.swin = timm.create_model(
            "swin_tiny_patch4_window7_224",
            pretrained=pretrained,
            num_classes=num_classes,
        )

    def forward(self, x):
        """Classify a batch of clips.

        Args:
            x: Tensor of shape ``(B, T, C, H, W)``.

        Returns:
            Tensor of shape ``(B, num_classes)``: per-frame logits averaged
            over the ``T`` frames of each clip.
        """
        x = self.dwt(x)  # (B, T, C, h, w)
        batch_size, num_frames = x.shape[:2]

        # Swin is an image model: fold time into the batch axis so it sees
        # a 4-D (B*T, C, h, w) tensor instead of the 5-D clip tensor.
        frames = x.reshape(batch_size * num_frames, *x.shape[2:])
        if not frames.is_floating_point():
            frames = frames.float()

        # The DWT changes the spatial size; resize to what the backbone expects.
        target = (self.INPUT_SIZE, self.INPUT_SIZE)
        if tuple(frames.shape[-2:]) != target:
            frames = nn.functional.interpolate(
                frames, size=target, mode="bilinear", align_corners=False
            )

        logits = self.swin(frames)  # (B*T, num_classes)
        # Aggregate frame-level predictions into one prediction per clip.
        return logits.reshape(batch_size, num_frames, -1).mean(dim=1)

# ✅ Build the classifier with its default two output classes (real / fake)
model = DeepfakeModel(num_classes=2)
print("✅ Model Defined!")


LocalEntryNotFoundError: An error happened while trying to locate the file on the Hub and we cannot find the requested files in the local cache. Please check your connection and try again or make sure your Internet connection is on.

In [None]:
class DeepfakeModel(nn.Module):
    """DWT front-end chained with a ``SwinTransformerDeepfake`` classifier.

    NOTE(review): this redefines ``DeepfakeModel`` from an earlier cell, and
    ``SwinTransformerDeepfake`` is not defined in the visible part of the
    notebook - confirm it exists before running this cell.

    Args:
        num_classes: Forwarded positionally to ``SwinTransformerDeepfake``.
    """

    def __init__(self, num_classes=2):
        super().__init__()
        self.dwt = DWTFeatureExtractor()
        self.swin = SwinTransformerDeepfake(num_classes)

    def forward(self, x):
        """Run ``x`` through the DWT extractor, then through the Swin module."""
        wavelet_features = self.dwt(x)
        return self.swin(wavelet_features)

# ✅ Instantiate the combined DWT + Swin model (two classes) and show its layers
model = DeepfakeModel(num_classes=2)
print(model)
