In [2]:
import os
import numpy as np
import pandas as pd
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torchvision.io import read_image
from vit_pytorch import ViT



In [3]:
# Check if GPU is available and set the device accordingly
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define a custom dataset class
class ImageDataset(Dataset):
    def __init__(self, root_dir, transform=None):
        """
        Args:
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied on a sample.
        """
        self.root_dir = root_dir
        self.transform = transform
        self.image_paths = []
        self.labels = []

        # Load all the image paths and labels
        for label_type in ['real', 'fake']:
            label_dir = os.path.join(root_dir, label_type)
            image_files = os.listdir(label_dir)
            self.image_paths += [os.path.join(label_dir, img_file) for img_file in image_files]
            self.labels += [1 if label_type == 'real' else 0] * len(image_files)

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img_path = self.image_paths[idx]
        image = read_image(img_path)
        label = self.labels[idx]
        if self.transform:
            image = self.transform(image)
        return image, label



In [4]:
# Define the image transformations
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Load the datasets
train_dataset = ImageDataset(root_dir='train', transform=transform)
dev_dataset = ImageDataset(root_dir='dev', transform=transform)
test_dataset = ImageDataset(root_dir='test', transform=transform)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [5]:
from vit_pytorch import ViT
from torchvision import transforms

# Define more aggressive data augmentation
train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.5, contrast=0.5, saturation=0.5),
    transforms.RandomAffine(degrees=0, translate=(0.1, 0.1), scale=(0.9, 1.1), shear=10),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Update the train dataset with the new transformations
train_dataset = ImageDataset(root_dir='train', transform=train_transform)

# Initialize the model with increased dropout
import torch
import torch.nn as nn
from torchvision.models import resnet18
from torch.nn import TransformerEncoder, TransformerEncoderLayer

class ConvolutionalTransformer(nn.Module):
    def __init__(self, image_size, num_classes, dim, depth, heads, mlp_dim, dropout):
        super(ConvolutionalTransformer, self).__init__()

        # Using a pre-trained ResNet18 model for convolutional feature extraction
        self.conv_features = resnet18(pretrained=True)
        self.conv_features = nn.Sequential(*list(self.conv_features.children())[:-2]) # Removing last two layers

        # Calculate the size of the feature map
        feature_map_size = image_size // 32  # ResNet18 has a 32x downscaling
        num_patches = feature_map_size * feature_map_size

        # Transformer Encoder
        transformer_layer = TransformerEncoderLayer(d_model=dim, nhead=heads, dim_feedforward=mlp_dim, dropout=dropout)
        self.transformer_encoder = TransformerEncoder(transformer_layer, num_layers=depth)

        # Classifier Head
        self.fc = nn.Linear(num_patches * dim, num_classes)

    def forward(self, x):
        # Convolutional layers
        x = self.conv_features(x)  # Shape: [batch_size, 512, feature_map_size, feature_map_size]

        # Flatten and rearrange for transformer
        x = x.flatten(2)  # Shape: [batch_size, 512, num_patches]
        x = x.transpose(1, 2)  # Shape: [batch_size, num_patches, 512]

        # Transformer Encoder
        x = self.transformer_encoder(x)

        # Classifier
        x = x.flatten(1)  # Flatten the output for the linear layer
        x = self.fc(x)

        return x

# Create model instance
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ConvolutionalTransformer(
    image_size = 224,
    num_classes = 2,
    dim = 512,             # Adjust the dimensions as needed
    depth = 4,             # Number of transformer layers
    heads = 8,
    mlp_dim = 2048,
    dropout = 0.5
).to(device)


# Define the optimizer with increased weight decay
optimizer = torch.optim.Adam(model.parameters(), lr=0.05, weight_decay=5e-4)  # Increased weight decay for L2 regularization

# Define the loss function and optimizer
criterion = torch.nn.CrossEntropyLoss()


# Updated training function with checkpointing
def train_model_with_checkpointing(train_loader, dev_loader, model, criterion, optimizer, num_epochs=25, checkpoint_path='model_checkpoint4.pth'):
    best_acc = 0.0

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        correct_preds = 0
        total_preds = 0
        
        # Iterate over the training data
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # Backward pass and optimize
            loss.backward()
            optimizer.step()

            # Statistics
            running_loss += loss.item() * inputs.size(0)
            _, predicted = torch.max(outputs, 1)
            correct_preds += (predicted == labels).sum().item()
            total_preds += labels.size(0)

        epoch_loss = running_loss / len(train_loader.dataset)
        epoch_acc = correct_preds / total_preds
        
        # Validation
        model.eval()
        dev_loss = 0.0
        dev_correct_preds = 0
        with torch.no_grad():
            for inputs, labels in dev_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                dev_loss += loss.item() * inputs.size(0)
                _, predicted = torch.max(outputs, 1)
                dev_correct_preds += (predicted == labels).sum().item()

        dev_loss = dev_loss / len(dev_loader.dataset)
        dev_acc = dev_correct_preds / len(dev_loader.dataset)

        print(f'Epoch {epoch+1}/{num_epochs}')
        print(f'Train Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')
        print(f'Dev Loss: {dev_loss:.4f} Acc: {dev_acc:.4f}')

        # Checkpointing
        if dev_acc > best_acc:
            best_acc = dev_acc
            torch.save(model.state_dict(), checkpoint_path)
            print(f"Checkpoint saved at epoch {epoch+1} with dev accuracy: {dev_acc:.4f}")







In [6]:
# Train the model and save checkpoints
train_model_with_checkpointing(train_loader, dev_loader, model, criterion, optimizer, num_epochs=15, checkpoint_path='model_checkpoint5.pth')


Epoch 1/15
Train Loss: 9.6685 Acc: 0.6594
Dev Loss: 1.1016 Acc: 0.2205
Checkpoint saved at epoch 1 with dev accuracy: 0.2205
Epoch 2/15
Train Loss: 1.0863 Acc: 0.6778
Dev Loss: 0.6898 Acc: 0.7795
Checkpoint saved at epoch 2 with dev accuracy: 0.7795
Epoch 3/15
Train Loss: 1.9020 Acc: 0.6615
Dev Loss: 2.1661 Acc: 0.7795
Epoch 4/15
Train Loss: 2.9943 Acc: 0.6579
Dev Loss: 1.9294 Acc: 0.7795
Epoch 5/15
Train Loss: 2.7031 Acc: 0.6568
Dev Loss: 0.8305 Acc: 0.7795
Epoch 6/15
Train Loss: 2.1277 Acc: 0.6643
Dev Loss: 1.4534 Acc: 0.7795
Epoch 7/15
Train Loss: 2.0387 Acc: 0.6655
Dev Loss: 1.2025 Acc: 0.7795
Epoch 8/15
Train Loss: 3.8866 Acc: 0.6686
Dev Loss: 1.4998 Acc: 0.7795
Epoch 9/15
Train Loss: 13.5897 Acc: 0.6665
Dev Loss: 0.5388 Acc: 0.7795
Epoch 10/15
Train Loss: 0.7042 Acc: 0.7191
Dev Loss: 0.5387 Acc: 0.7795
Epoch 11/15
Train Loss: 3.3911 Acc: 0.7072
Dev Loss: 13.2645 Acc: 0.7795
Epoch 12/15
Train Loss: 1.9501 Acc: 0.6716
Dev Loss: 1.1597 Acc: 0.7795
Epoch 13/15
Train Loss: 1.6219 Acc:

In [6]:
# Function to calculate the accuracy on the test set
def calculate_test_accuracy(test_loader, model):
    model.eval()  # Set the model to evaluation mode
    correct_preds = 0
    total_preds = 0

    with torch.no_grad():  # Disable gradient computation
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            correct_preds += (predicted == labels).sum().item()
            total_preds += labels.size(0)

    accuracy = correct_preds / total_preds
    return accuracy

# Load the best model weight
model.load_state_dict(torch.load('model_checkpoint5.pth'))

# Calculate accuracy on the test set
test_accuracy = calculate_test_accuracy(test_loader, model)
print(f'Test Set Accuracy: {test_accuracy:.4f}')


Test Set Accuracy: 0.7875
