In [1]:
# Data Preprocessing (PHASE 2)

import os
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import random

# Configuration
IMG_SIZE = 128
AUGMENT = True
original_path = r"C:\Users\imran\OneDrive\Robotics Projects\Image Forgery\Image Data\TRAINING_CG-1050\TRAINING\ORIGINAL"
tampered_path = r"C:\Users\imran\OneDrive\Robotics Projects\Image Forgery\Image Data\TRAINING_CG-1050\TRAINING\TAMPERED"

# Loading image paths & labels
def load_image_paths(original_dir, tampered_dir):
    data = []

    original_images = [os.path.join(original_dir, f) for f in os.listdir(original_dir) if f.endswith('.jpg')]
    tampered_images = [os.path.join(tampered_dir, f) for f in os.listdir(tampered_dir) if f.endswith('.jpg')]

    for path in original_images:
        data.append((path, 0))
    for path in tampered_images:
        data.append((path, 1))
    
    return pd.DataFrame(data, columns=["image_path", "label"])

# Augmentation function
# These are random transformations to tampered images to improve generalization & help learn more robust features
def augment_image(img):
    # Brightness adjustment
    if random.random() > 0.5:
        hsv = cv2.cvtColor(img, cv2.COLOR_RGB2HSV)
        hsv[:, :, 2] = np.clip(hsv[:, :, 2] * (0.5 + np.random.rand() * 1.5), 0, 255)
        img = cv2.cvtColor(hsv.astype(np.uint8), cv2.COLOR_HSV2RGB)

    # Horizontal Flip
    if random.random() > 0.5:
        img = cv2.flip(img, 1)

    # Small angle rotation
    if random.random() > 0.5:
        angle = np.random.uniform(-15, 15)
        M = cv2.getRotationMatrix2D((IMG_SIZE / 2, IMG_SIZE / 2), angle, 1)
        img = cv2.warpAffine(img, M, (IMG_SIZE, IMG_SIZE))

    return img

# Preprocessing function
def preprocess_images(df, img_size=128, augment=False):
    images = []
    labels = []

    for row in tqdm(df.itertuples(), total=len(df)):
        img = cv2.imread(row.image_path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = cv2.resize(img, (img_size, img_size))

        if augment and row.label == 1:
            img = augment_image(img)

        img = img / 255.0
        images.append(img)
        labels.append(row.label)

    return np.array(images), np.array(labels)

# Loading the Dataset
train_df = load_image_paths(original_path, tampered_path)
print("Image paths loaded:", train_df.shape)

X, y = preprocess_images(train_df, IMG_SIZE, augment=AUGMENT)
print("Images preprocessed. Shape:", X.shape)

# Splitting the data into Training & Validation Sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
print(f"Training samples: {X_train.shape[0]}")
print(f"Validation samples: {X_val.shape[0]}")

# Saving the data in .npy format
np.save("X_train.npy", X_train)
np.save("y_train.npy", y_train)
np.save("X_val.npy", X_val)
np.save("y_val.npy", y_val)
print("All data is now saved to .npy files for future phases of the project.")


Image paths loaded: (1460, 2)


100%|██████████| 1460/1460 [00:05<00:00, 273.41it/s]


Images preprocessed. Shape: (1460, 128, 128, 3)
Training samples: 1168
Validation samples: 292
All data is now saved to .npy files for future phases of the project.
