<a href="https://colab.research.google.com/github/Khai189/AIDetection/blob/main/AI_Non_AI_Images_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduction: AI Image Detector

This is the EDA, model preparation, and experimentation behind my web extension for AI image detection.

I trained the model on a Kaggle dataset. The base model is EfficientNet-B0, loaded via `torchvision.models.efficientnet_b0`; see the torchvision documentation for details.

## Initial Data Setup

### Import the data from kaggle

In [None]:
import kagglehub
from pathlib import Path

# Download the latest version of the dataset through kagglehub and keep the
# returned value in `path`; the following cell reads it as the copy source.
path = kagglehub.dataset_download("ayushmandatta1/deepdetect-2025")

print("Path to dataset files:", path)

In [None]:
# Imported in this cell because it runs before the later cells that import
# `os` and `shutil`; without these lines a fresh top-to-bottom run fails with
# NameError.
import os
import shutil

source_path = path

destination_path = '/content/kaggle_dataset'

os.makedirs(destination_path, exist_ok=True)

# Get all items in the source directory
items_in_source = os.listdir(source_path)

print(f"Contents of the downloaded directory ({source_path}): {items_in_source}")

train_dir_processed = '/content/train'
test_dir_processed = '/content/test'

# The processed split counts as present when both directories exist and the
# training directory is not empty; in that case the raw copy is skipped.
dataset_already_prepared = (
    os.path.exists(train_dir_processed)
    and os.path.exists(test_dir_processed)
    and bool(os.listdir(train_dir_processed))
)

if dataset_already_prepared:
    print("Dataset already exists, skipping data creation")
else:
    for item in items_in_source:
        src_item = os.path.join(source_path, item)
        dst_item = os.path.join(destination_path, item)
        print(f"Copying {src_item} to {dst_item}")

        if os.path.isdir(src_item):
            # copytree requires that the destination does not exist yet, so a
            # stale copy from an earlier run is removed first.
            if os.path.exists(dst_item):
                print(f"Destination directory {dst_item} already exists, removing it before copy.")
                shutil.rmtree(dst_item)
            shutil.copytree(src_item, dst_item)
        else:
            # For files, use copy2 (which preserves metadata)
            shutil.copy2(src_item, dst_item)

    print(f"Dataset copied to: {destination_path}")


### Prepare the test and train directories (if needed)

In [None]:
import os

# Locations of the raw Kaggle data and of the train/test split built from it
data_dir = '/content/kaggle_dataset/ddata'
train_dir = '/content/train'
test_dir = '/content/test'

# The folder skeleton is only built when no prepared split is present yet
split_missing = not (
    os.path.exists(train_dir_processed)
    and os.path.exists(test_dir_processed)
    and os.listdir(train_dir_processed) != []
)

if split_missing:
    classes = ['fake', 'real']

    # Create both split roots, then one sub-folder per class inside each
    for split_root in (train_dir, test_dir):
        os.makedirs(split_root, exist_ok=True)
    for cls in classes:
        for split_root in (train_dir, test_dir):
            os.makedirs(os.path.join(split_root, cls), exist_ok=True)

    print(f"Created training directories: {os.listdir(train_dir)}")
    print(f"Created testing directories: {os.listdir(test_dir)}")
else:
    print("Dataset already exists, skipping prep")

### Split the testing and training data

In [None]:
import shutil
from sklearn.model_selection import train_test_split

test_size = 0.2

# Defined in this cell as well so it does not depend on which branch the
# directory-preparation cell took.
classes = ['fake', 'real']


def _split_is_populated(split_dirs, labels):
    """Return True only if every class folder of every split holds at least one entry."""
    for split_dir in split_dirs:
        for label_name in labels:
            class_dir = os.path.join(split_dir, label_name)
            if not os.path.isdir(class_dir) or not os.listdir(class_dir):
                return False
    return True


# The class folders are created (empty) before any image is copied, so a
# non-empty train directory alone does not mean the split was done. Checking
# the class folders themselves keeps a fresh run from skipping the copy.
if _split_is_populated((train_dir, test_dir), classes):
    print("Dataset already exists, skipping prep")
else:
    for cls in classes:
        source_class_dir = os.path.join(data_dir, cls)

        # os.listdir returns entries in arbitrary order; sorting gives
        # train_test_split a stable input so random_state yields the same
        # split on every machine.
        all_files = sorted(
            os.path.join(source_class_dir, f)
            for f in os.listdir(source_class_dir)
            if os.path.isfile(os.path.join(source_class_dir, f))
        )

        train_files, test_files = train_test_split(all_files, test_size=test_size, random_state=42)

        print(f"\nClass: {cls}")
        print(f"Total files: {len(all_files)}")
        print(f"Train files: {len(train_files)}")
        print(f"Test files: {len(test_files)}")

        # Copy each subset into its class folder, creating the folder if the
        # preparation cell did not.
        for dest_root, subset in ((train_dir, train_files), (test_dir, test_files)):
            dest_class_dir = os.path.join(dest_root, cls)
            os.makedirs(dest_class_dir, exist_ok=True)
            for f_path in subset:
                shutil.copy(f_path, dest_class_dir)

    # Verify counts
    print("\nVerification:")
    for cls in classes:
        print(f"Train {cls} count: {len(os.listdir(os.path.join(train_dir, cls)))}")
        print(f"Test {cls} count: {len(os.listdir(os.path.join(test_dir, cls)))}")

In [None]:
import torch
from torch import nn
from torchvision.transforms import transforms
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder


In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

## Setting up DataLoaders and Datasets for our initial model

In [None]:
from torchvision import models
from pathlib import Path
import random

train_dir_abs = "/content/train"
test_dir_abs = "/content/test"

checkpoint_dir_train = os.path.join(train_dir_abs, '.ipynb_checkpoints')
checkpoint_dir_test = os.path.join(test_dir_abs, '.ipynb_checkpoints')

# Remove notebook checkpoint folders from the split roots before the datasets
# are built (presumably so ImageFolder does not treat them as an extra class).
for checkpoint_dir in (checkpoint_dir_train, checkpoint_dir_test):
    if os.path.exists(checkpoint_dir):
        shutil.rmtree(checkpoint_dir)
        print(f"Removed: {checkpoint_dir}")

train_dir = Path(train_dir_abs)
test_dir = Path(test_dir_abs)

# Use the preprocessing pipeline that ships with the pretrained weights
weights = models.EfficientNet_B0_Weights.DEFAULT
auto_transforms = weights.transforms()

try:
    train_data = ImageFolder(train_dir,
                             transform=auto_transforms)

    test_data = ImageFolder(test_dir,
                            transform=auto_transforms)
    print("ImageFolder initialization successful!")
except FileNotFoundError as e:
    print(f"Error initializing ImageFolder: {e}")
    print("Please ensure 'train' and 'test' directories contain 'fake' and 'real' subdirectories with image files.")
    # Re-raise: continuing would leave train_data/test_data undefined and the
    # following cells would fail with a less helpful NameError.
    raise



In [None]:
# Class labels exposed by the training ImageFolder dataset
class_names = train_data.classes

In [None]:
# Display the number of training samples, test samples, and the class labels
len(train_data), len(test_data), class_names

In [None]:
BATCH_SIZE = 32
NUM_WORKERS = os.cpu_count()

# Settings that are identical for the training and the test loader
shared_loader_options = dict(
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=True,
)

# Only the training loader shuffles; the test loader keeps dataset order
train_dataloader = DataLoader(train_data, shuffle=True, **shared_loader_options)
test_dataloader = DataLoader(test_data, shuffle=False, **shared_loader_options)

In [None]:
# Pull one batch from the training loader; `image` and `label` hold the whole
# batch, and individual samples are indexed out of them in later cells.
image, label = next(iter(train_dataloader))

In [None]:
# Display the shapes of the image batch and of the label batch
image.shape, label.shape

Let's visualize one example from our data.

In [None]:
import matplotlib.pyplot as plt

# First sample of the batch, moved to the CPU with its axes reordered for
# imshow (presumably channels-first -> channels-last; confirm against the
# shapes printed above).
sample_image = image[0].cpu().permute(1, 2, 0)
plt.imshow(sample_image)
plt.title(class_names[label[0]])

### Freeze the feature extractor and inspect the model's parameters

In [None]:
# Load the pretrained network, then move it to the selected device
initial_model = models.efficientnet_b0(weights=weights)
initial_model = initial_model.to(device)

# Freeze every parameter of the feature extractor so that it is excluded from
# gradient updates.
initial_model.features.requires_grad_(False)

In [None]:
# Display the model's current classifier head
initial_model.classifier

In [None]:
try:
  import torchinfo
except:
  !pip install torchinfo

from torchinfo import summary

In [None]:
# Summarize the model for an input of shape (32, 3, 224, 224), listing for
# each layer whether it is trainable, its parameter count, and its output size.
summary(initial_model,
        input_size=(32, 3, 224, 224),
        col_names=["trainable", "num_params", "output_size"],
        col_width=20)

In [None]:
# Swap in a new head with one output per class. The model was moved to
# `device` before this head existed, so the freshly created layers are moved
# explicitly; otherwise they stay on the CPU and a CUDA forward pass fails
# with a device mismatch.
initial_model.classifier = nn.Sequential(
    nn.Dropout(p=.2, inplace=True),
    nn.Linear(in_features=1280, out_features=len(class_names)),
).to(device)