In [1]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"jesonr","key":"3393b2ff613ba29c962fe66a3796e1df"}'}

In [2]:
# Move kaggle.json to correct directory
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Install Kaggle CLI
!pip install -q kaggle

In [3]:
# Download the Real vs Fake Faces dataset
!kaggle datasets download -d xhlulu/140k-real-and-fake-faces

Dataset URL: https://www.kaggle.com/datasets/xhlulu/140k-real-and-fake-faces
License(s): other
Downloading 140k-real-and-fake-faces.zip to /content
 98% 3.68G/3.75G [00:35<00:01, 39.9MB/s]
100% 3.75G/3.75G [00:35<00:00, 114MB/s] 


In [4]:
# Unzip the dataset
!unzip -q 140k-real-and-fake-faces.zip

In [7]:
import os

real_test = len(os.listdir('/content/real_vs_fake/real-vs-fake/test/real'))
fake_test = len(os.listdir('/content/real_vs_fake/real-vs-fake/test/fake'))

real_valid = len(os.listdir('/content/real_vs_fake/real-vs-fake/valid/real'))
fake_valid = len(os.listdir('/content/real_vs_fake/real-vs-fake/valid/fake'))

real_test = len(os.listdir('/content/real_vs_fake/real-vs-fake/train/real'))
fake_test = len(os.listdir('/content/real_vs_fake/real-vs-fake/train/fake'))

print(f"Real test images: {real_test}")
print(f"Fake test images: {fake_test}")

print(f"Real valid images: {real_valid}")
print(f"Fake valid images: {fake_valid}")

print(f"Real train images: {real_test}")
print(f"Fake train images: {fake_test}")


Real test images: 50000
Fake test images: 50000
Real valid images: 10000
Fake valid images: 10000
Real train images: 50000
Fake train images: 50000


In [35]:
from torch.utils.data import DataLoader, Dataset, ConcatDataset
from torchvision import datasets, transforms
import torch
import torch.nn as nn
from torchvision.utils import make_grid
from torchvision.utils import save_image
from IPython.display import Image
import matplotlib.pyplot as plt
import numpy as np
import random
from PIL import Image
%matplotlib inline

In [36]:
class SampledImageDataset(Dataset):
    def __init__(self, image_paths, label, transform=None):
        self.image_paths = image_paths
        self.label = label
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img = Image.open(self.image_paths[idx]).convert("RGB")
        if self.transform:
            img = self.transform(img)
        return img, self.label


In [38]:
import glob

def create_loader_by_fixed_count(data_root, subset='train', samples_per_class=250, transform=None):
    real_dir = os.path.join(data_root, subset, 'real')
    fake_dir = os.path.join(data_root, subset, 'fake')

    real_paths = sorted(glob.glob(os.path.join(real_dir, '*')))[:samples_per_class]
    fake_paths = sorted(glob.glob(os.path.join(fake_dir, '*')))[:samples_per_class]

    real_dataset = SampledImageDataset(real_paths, label=0, transform=transform)
    fake_dataset = SampledImageDataset(fake_paths, label=1, transform=transform)

    full_dataset = ConcatDataset([real_dataset, fake_dataset])
    loader = DataLoader(full_dataset, batch_size=32, shuffle=True)
    return loader


In [39]:
data_directory = '/content/real_vs_fake/real-vs-fake'

transform = transforms.Compose([
    transforms.Resize((128,128)),
    transforms.ToTensor(),
])

In [40]:
# Creating datasets and dataloaders
from torch.utils.data import Subset

# train_dataset = datasets.ImageFolder(root=f'{data_directory}/train', transform=transform)
# valid_dataset = datasets.ImageFolder(root=f'{data_directory}/valid', transform=transform)
# test_dataset = datasets.ImageFolder(root=f'{data_directory}/test', transform=transform)

# small_train = get_balanced_subset(train_dataset, 500)
# small_valid = get_balanced_subset(valid_dataset, 100)
# small_test = get_balanced_subset(test_dataset, 500)

train_loader = create_loader_by_fixed_count(data_directory, 'train', 250, transform)
valid_loader = create_loader_by_fixed_count(data_directory, 'valid', 50, transform)
test_loader = create_loader_by_fixed_count(data_directory, 'test', 250, transform)

In [41]:
# Define a basic CNN model
class SimpleCNN(nn.Module):
  def __init__(self):
    super(SimpleCNN, self).__init__()
    self.net = nn.Sequential(
        nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2, stride=2),

        nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2, stride=2),

        nn.Flatten(),
        nn.Linear(64 * 32 * 32, 128),
        nn.ReLU(),
        nn.Linear(128, 2)
    )

  def forward(self, x):
    return self.net(x)

In [42]:
# Training the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SimpleCNN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [43]:
def train(model, train_loader, criterion, optimizer, device):
  model.train()
  total_loss, correct, total = 0,0,0

  for imgs, labels in train_loader:
    imgs, labels = imgs.to(device), labels.to(device)
    optimizer.zero_grad()
    outputs = model(imgs)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()

    total_loss += loss.item()
    _, predicted = torch.max(outputs.data, 1)
    correct += (predicted == labels).sum().item()
    total += labels.size(0)

  return total_loss/len(train_loader), correct/total # average loss and accuracy

In [44]:
# Define the validation function
def evaluate(model, valid_loader, criterion, device):
    model.eval()
    total_loss, correct, total = 0,0,0
    with torch.no_grad():
        for imgs, labels in valid_loader:
            imgs, labels = imgs.to(device), labels.to(device)
            outputs = model(imgs)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    return correct/total

In [45]:
# Training the model
epochs = 10
for epoch in range(epochs):
  train_loss, train_acc = train(model, train_loader, criterion, optimizer, device)
  valid_acc = evaluate(model, valid_loader, criterion, device)
  print(f"Epoch {epoch+1} | Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")

Epoch 1 | Train Loss: 0.8156 | Train Acc: 0.5020
Epoch 2 | Train Loss: 0.6618 | Train Acc: 0.5920
Epoch 3 | Train Loss: 0.6423 | Train Acc: 0.6320
Epoch 4 | Train Loss: 0.5749 | Train Acc: 0.7500
Epoch 5 | Train Loss: 0.4921 | Train Acc: 0.7260
Epoch 6 | Train Loss: 0.4498 | Train Acc: 0.8000
Epoch 7 | Train Loss: 0.3297 | Train Acc: 0.8680
Epoch 8 | Train Loss: 0.2693 | Train Acc: 0.8920
Epoch 9 | Train Loss: 0.2075 | Train Acc: 0.9280
Epoch 10 | Train Loss: 0.1274 | Train Acc: 0.9620


In [46]:
# Saving model weights
torch.save(model.state_dict(), 'deepfake_cnn_weights.pth')

In [47]:
# Saving the entire model
torch.save(model, "deepfake_cnn_model.pth")

In [48]:
# Saving metadata
import json

metadata = {
    "model_name": "SimpleCNN",
    "input_size": [128, 128],
    "class_names": ["real", "fake"],
    "val_accuracy": 0.93
}

with open("model_meta.json", "w") as f:
    json.dump(metadata, f)
