# AI vs Human Art ML Project


Introduction: This notebook is our submission for our Machine Learning final project. The project uses image classification to predict whether an image is AI-generated art or human-made art.

### Step 1: Data Loading




In [2]:
import pathlib

# import all the required packages
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import keras
import pathlib
from keras.models import Sequential
from keras.layers import Dense, Conv2D , MaxPool2D , Flatten , Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from keras.optimizers import Adam
from sklearn.metrics import classification_report,confusion_matrix
import random
import cv2
import os

import numpy as np

ModuleNotFoundError: No module named 'cv2'

#### Loading the Training Data:
The training data and test data must be loaded differently: the train folder follows the typical layout of one subfolder per class (two classes here), whereas the test folder has no subfolders, so its images are not separated by class.

In [None]:
# Loading the training data
labels = ['AI_GENERATED', 'NON_AI_GENERATED']
img_size = 224

# File path to the training data
data_dir = pathlib.Path('data')


# Function to load the data (training data)
def get_train_data():
    """Placeholder for the unfinished eager training-data loader.

    The definition in this cell originally had no colon and no body, so the
    whole cell failed with a SyntaxError.  It is kept as a valid, callable
    placeholder so the cell executes; it deliberately loads nothing.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError("get_train_data() was never implemented")

In [None]:
# Rewriting the way the data is loaded
# Class-folder names; a class's integer label is its index in this list.
labels = ['AI_GENERATED', 'NON_AI_GENERATED']
# Side length, in pixels, that every image is resized to (img_size x img_size).
img_size = 224
# Number of file paths each generator draws per batch.
batch_size = 32

def get_file_paths(data_dir, class_names=None):
    """Collect image file paths and integer class labels without loading images.

    Args:
        data_dir: Directory that contains one sub-folder per class.
        class_names: Ordered class-folder names; a class's integer label is its
            position in this sequence.  Defaults to the notebook-level
            ``labels`` list.

    Returns:
        A tuple ``(file_paths, file_labels)`` of two equal-length lists: the
        path of every regular file found in each class folder, and the integer
        label of the folder it came from.

    Raises:
        OSError: if a class folder does not exist or cannot be listed.
    """
    if class_names is None:
        class_names = labels

    file_paths = []
    file_labels = []

    for class_num, label in enumerate(class_names):
        class_dir = os.path.join(data_dir, label)

        # os.listdir() returns entries in arbitrary order; sorting makes the
        # result (and any seeded split built from it) reproducible.
        for entry in sorted(os.listdir(class_dir)):
            img_path = os.path.join(class_dir, entry)
            # Skip sub-directories so they are never labelled as images.
            if not os.path.isfile(img_path):
                continue
            file_paths.append(img_path)
            file_labels.append(class_num)

    return file_paths, file_labels

def data_generator(file_paths, file_labels, batch_size=32, shuffle=True):
    """Endlessly yield ``(images, labels)`` batches read lazily from disk.

    Args:
        file_paths: Sequence of image file paths.
        file_labels: Sequence of integer labels aligned with ``file_paths``.
        batch_size: Maximum number of samples per batch.
        shuffle: If True, reshuffle the sample order at the start of every pass.

    Yields:
        A tuple ``(images, labels)``.  ``images`` is a float32 array of images
        resized to ``img_size`` x ``img_size``, in RGB channel order and scaled
        to [0, 1]; ``labels`` holds the matching integer labels.  Images that
        cannot be loaded are reported and skipped, so a batch may contain fewer
        than ``batch_size`` samples.

    Raises:
        ValueError: if ``file_paths`` is empty.
        RuntimeError: if a complete pass over the data loads no image at all.
    """
    # With nothing to iterate, the endless loop below would spin forever
    # without yielding, which hangs the consumer.
    if len(file_paths) == 0:
        raise ValueError("data_generator() needs at least one file path")

    indices = np.arange(len(file_paths))

    while True:  # Loop forever for Keras
        if shuffle:
            np.random.shuffle(indices)

        yielded_any = False

        for start_idx in range(0, len(file_paths), batch_size):
            batch_indices = indices[start_idx:start_idx + batch_size]

            batch_images = []
            batch_labels = []

            for idx in batch_indices:
                img_path = file_paths[idx]
                try:
                    # Check for None BEFORE slicing: cv2.imread() returns None
                    # for unreadable files, and None cannot be subscripted.
                    img_bgr = cv2.imread(img_path)
                    if img_bgr is None:
                        print(f"Error loading {img_path}: file could not be read as an image")
                        continue
                    img_arr = img_bgr[..., ::-1]  # BGR to RGB
                    resized_arr = cv2.resize(img_arr, (img_size, img_size))
                    normalized = resized_arr.astype(np.float32) / 255.0

                    batch_images.append(normalized)
                    batch_labels.append(file_labels[idx])
                except Exception as e:
                    # Deliberately best-effort: one bad file must not stop training.
                    print(f"Error loading {img_path}: {e}")
                    continue

            if len(batch_images) > 0:
                yielded_any = True
                yield np.array(batch_images), np.array(batch_labels)

        # A pass that produced nothing would repeat forever; fail loudly instead.
        if not yielded_any:
            raise RuntimeError("data_generator() could not load any image from the given file paths")

def get_test_file_paths(data_dir):
    """Collect the paths of all regular files directly inside ``data_dir``.

    No image is loaded here; sub-directories are ignored.

    Args:
        data_dir: Flat directory holding the unlabelled test images.

    Returns:
        List of file paths, sorted by file name so the order is reproducible
        (``os.listdir()`` alone returns entries in arbitrary order).

    Raises:
        OSError: if ``data_dir`` does not exist or cannot be listed.
    """
    file_paths = []

    for entry in sorted(os.listdir(data_dir)):
        img_path = os.path.join(data_dir, entry)
        if os.path.isfile(img_path):
            file_paths.append(img_path)

    return file_paths

def test_data_generator(file_paths, batch_size=32):
    """Yield batches of preprocessed test images (no labels) in a single pass.

    Unlike the training generators this one does not loop: once every path has
    been visited it is exhausted and must be re-created before being reused.

    Args:
        file_paths: Sequence of image file paths.
        batch_size: Maximum number of images per batch.

    Yields:
        A float32 array of images resized to ``img_size`` x ``img_size``, in RGB
        channel order and scaled to [0, 1].

    Note:
        Images that cannot be loaded are reported and skipped.  When that
        happens the yielded images no longer line up one-to-one with
        ``file_paths``, so callers that index results by path position should
        compare the counts first.
    """
    for start_idx in range(0, len(file_paths), batch_size):
        batch_paths = file_paths[start_idx:start_idx + batch_size]
        batch_images = []

        for img_path in batch_paths:
            try:
                # Check for None BEFORE slicing: cv2.imread() returns None for
                # unreadable files, and None cannot be subscripted.
                img_bgr = cv2.imread(img_path)
                if img_bgr is None:
                    print(f"Error loading {img_path}: file could not be read as an image")
                    continue
                img_arr = img_bgr[..., ::-1]  # BGR to RGB
                resized_arr = cv2.resize(img_arr, (img_size, img_size))
                normalized = resized_arr.astype(np.float32) / 255.0
                batch_images.append(normalized)
            except Exception as e:
                # Deliberately best-effort: one bad file must not stop prediction.
                print(f"Error loading {img_path}: {e}")
                continue

        if len(batch_images) > 0:
            yield np.array(batch_images)

In [None]:
# Index the labelled training images; only paths and labels are collected here,
# the pixels are read later by the generators.
train_paths, train_labels = get_file_paths('data/train')
print(f"Total training images: {len(train_paths)}")

# Get test file paths
test_paths = get_test_file_paths('data/test')
print(f"Test samples: {len(test_paths)}")
# Lazy batch generators over the full training set and the test set.
# NOTE: both names are assigned again in the preprocessing/augmentation cell.
train_gen = data_generator(train_paths, train_labels, batch_size=batch_size, shuffle=True)
test_gen = test_data_generator(test_paths, batch_size=batch_size)


# Step 2: Visualize the data

In [None]:
# Translate every integer label into its class name so the bar chart's x-axis
# shows readable categories (0 -> AI_GENERATED, anything else -> NON_AI_GENERATED).
l = []
for label in train_labels:
    l.append("AI_GENERATED" if label == 0 else "NON_AI_GENERATED")

# One bar per class, counting the training images in it.
sns.set_style('darkgrid')
sns.countplot(x=l)

Imbalanced data :( The count plot above shows that the two classes do not have the same number of training images.

In [None]:
import random

# --- One randomly chosen AI-generated training image (label 0) ---
ai_indices = [i for i in range(len(train_labels)) if train_labels[i] == 0]
ai_idx = random.choice(ai_indices)

# cv2.imread returns BGR; reverse the channel axis to RGB for matplotlib,
# then resize to the model's input size.
img_ai = cv2.resize(cv2.imread(train_paths[ai_idx])[..., ::-1], (img_size, img_size))

plt.figure(figsize=(10, 10))
plt.imshow(img_ai)
plt.title(labels[0])
plt.show()

# --- One randomly chosen human-made training image (label 1) ---
real_indices = [i for i in range(len(train_labels)) if train_labels[i] == 1]
real_idx = random.choice(real_indices)

img_real = cv2.resize(cv2.imread(train_paths[real_idx])[..., ::-1], (img_size, img_size))

plt.figure(figsize=(10, 10))
plt.imshow(img_real)
plt.title(labels[1])
plt.show()

In [None]:
import random
import cv2
import matplotlib.pyplot as plt

def preview_images(file_paths, file_labels, num_images=5):
    """Show a 2-row grid of random images: AI-generated on top, non-AI below.

    Args:
        file_paths: Sequence of image file paths.
        file_labels: Integer labels aligned with ``file_paths``
            (0 = AI generated, 1 = non-AI generated).
        num_images: Number of grid columns, i.e. the maximum number of images
            shown per class.  Must be at least 1.

    Raises:
        ValueError: if ``num_images`` is smaller than 1.
    """
    if num_images < 1:
        raise ValueError("num_images must be at least 1")

    # Separate indices by class
    ai_indices = [i for i, label in enumerate(file_labels) if label == 0]  # AI_GENERATED
    real_indices = [i for i, label in enumerate(file_labels) if label == 1]  # NON_AI_GENERATED

    print(f"Total AI Generated images: {len(ai_indices)}")
    print(f"Total Non-AI Generated images: {len(real_indices)}")

    # Sample random indices (never more than a class actually has)
    ai_sample_indices = random.sample(ai_indices, min(num_images, len(ai_indices)))
    real_sample_indices = random.sample(real_indices, min(num_images, len(real_indices)))

    # squeeze=False keeps `axes` two-dimensional even when num_images == 1,
    # so axes[row, col] indexing below is always valid.
    fig, axes = plt.subplots(2, num_images, figsize=(15, 6), squeeze=False)
    fig.suptitle('Image Preview: AI vs Non-AI Generated', fontsize=16)

    # Hide every axis up front so cells without an image stay blank.
    for ax in axes.ravel():
        ax.axis('off')

    grid_rows = (
        ('AI Generated', ai_sample_indices),
        ('Non-AI Generated', real_sample_indices),
    )
    for row, (row_title, sample_indices) in enumerate(grid_rows):
        for col, idx in enumerate(sample_indices):
            if col == 0:
                axes[row, col].set_title(row_title, fontsize=12, fontweight='bold')

            img = cv2.imread(file_paths[idx])
            if img is None:
                # cv2.imread returns None for unreadable files; leave the cell empty.
                print(f"Could not read {file_paths[idx]}; leaving its cell blank")
                continue
            img = cv2.resize(img[..., ::-1], (img_size, img_size))  # BGR to RGB
            axes[row, col].imshow(img)

    plt.tight_layout()
    plt.show()

# Preview 5 images from each class
preview_images(train_paths, train_labels, num_images=5)

# Step 2: Data Preprocessing and Data Augmentation

In [None]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Split file paths into train and validation (80/20 split)
# stratify=train_labels keeps the class proportions the same in both subsets;
# random_state=42 fixes the shuffle so the split repeats for the same input order.
train_paths_split, val_paths, train_labels_split, val_labels = train_test_split(
    train_paths, train_labels,
    test_size=0.2,
    random_state=42,
    stratify=train_labels
)

# Report the size of each subset.
print(f"Training samples: {len(train_paths_split)}")
print(f"Validation samples: {len(val_paths)}")
print(f"Test samples: {len(test_paths)}")

# ========== ADD DATA AUGMENTATION ==========
# Random transformations applied on the fly to training batches.
augmentation_options = {
    'rotation_range': 30,        # random rotation of up to 30 degrees
    'zoom_range': 0.2,           # random zoom between 0.8x and 1.2x
    'width_shift_range': 0.1,    # horizontal shift of up to 10% of the width
    'height_shift_range': 0.1,   # vertical shift of up to 10% of the height
    'horizontal_flip': True,     # random left-right mirroring
    'vertical_flip': False,      # never flip upside down
    'fill_mode': 'nearest',      # fill exposed pixels from the nearest edge
}
augmentor = ImageDataGenerator(**augmentation_options)

# Modified generator WITH augmentation for training
def augmented_data_generator(file_paths, file_labels, batch_size=32, shuffle=True, augment=True):
    """Endlessly yield ``(images, labels)`` batches, optionally augmented.

    Args:
        file_paths: Sequence of image file paths.
        file_labels: Sequence of integer labels aligned with ``file_paths``.
        batch_size: Maximum number of samples per batch.
        shuffle: If True, reshuffle the sample order at the start of every pass.
        augment: If True, pass each batch through the notebook-level
            ``augmentor`` before yielding it.

    Yields:
        A tuple ``(images, labels)``.  ``images`` holds images resized to
        ``img_size`` x ``img_size``, in RGB channel order and scaled to [0, 1]
        before any augmentation; ``labels`` holds the matching integer labels.
        Images that cannot be loaded are reported and skipped, so a batch may
        contain fewer than ``batch_size`` samples.

    Raises:
        ValueError: if ``file_paths`` is empty.
        RuntimeError: if a complete pass over the data loads no image at all.
    """
    # With nothing to iterate, the endless loop below would spin forever
    # without yielding, which hangs the consumer.
    if len(file_paths) == 0:
        raise ValueError("augmented_data_generator() needs at least one file path")

    indices = np.arange(len(file_paths))

    while True:
        if shuffle:
            np.random.shuffle(indices)

        yielded_any = False

        for start_idx in range(0, len(file_paths), batch_size):
            batch_indices = indices[start_idx:start_idx + batch_size]

            batch_images = []
            batch_labels = []

            for idx in batch_indices:
                img_path = file_paths[idx]
                try:
                    # Check for None BEFORE slicing: cv2.imread() returns None
                    # for unreadable files, and None cannot be subscripted.
                    img_bgr = cv2.imread(img_path)
                    if img_bgr is None:
                        print(f"Error loading {img_path}: file could not be read as an image")
                        continue
                    img_arr = img_bgr[..., ::-1]  # BGR to RGB
                    resized_arr = cv2.resize(img_arr, (img_size, img_size))
                    normalized = resized_arr.astype(np.float32) / 255.0

                    batch_images.append(normalized)
                    batch_labels.append(file_labels[idx])
                except Exception as e:
                    # Deliberately best-effort: one bad file must not stop training.
                    print(f"Error loading {img_path}: {e}")
                    continue

            if len(batch_images) > 0:
                batch_images = np.array(batch_images)
                batch_labels = np.array(batch_labels)

                # Apply augmentation only if requested
                if augment:
                    # shuffle=False keeps images and labels in the same order.
                    aug_gen = augmentor.flow(batch_images, batch_labels, batch_size=len(batch_images), shuffle=False)
                    batch_images, batch_labels = next(aug_gen)

                yielded_any = True
                yield batch_images, batch_labels

        # A pass that produced nothing would repeat forever; fail loudly instead.
        if not yielded_any:
            raise RuntimeError("augmented_data_generator() could not load any image from the given file paths")

# Create generators
batch_size = 32

# Training generator WITH augmentation
train_gen = augmented_data_generator(train_paths_split, train_labels_split, batch_size=batch_size, shuffle=True, augment=True)

# Validation generator WITHOUT augmentation
val_gen = augmented_data_generator(val_paths, val_labels, batch_size=batch_size, shuffle=False, augment=False)

# Test generator (no labels, no augmentation)
test_gen = test_data_generator(test_paths, batch_size=batch_size)

# Calculate steps with ceiling division so the last, partial batch counts as a
# step and one "epoch" is exactly one pass over the data:
#   * floor division gives 0 steps when a set is smaller than one batch;
#   * val_gen never resets, so a floored step count makes every validation
#     pass start where the previous one stopped and score different images;
#   * "// batch_size + 1" asks for one batch too many whenever the test set
#     size is an exact multiple of batch_size.
# This assumes no batch consists solely of unreadable images, since the
# generators skip those.
steps_per_epoch = (len(train_paths_split) + batch_size - 1) // batch_size
validation_steps = (len(val_paths) + batch_size - 1) // batch_size
test_steps = (len(test_paths) + batch_size - 1) // batch_size

print(f"\nSteps per epoch: {steps_per_epoch}")
print(f"Validation steps: {validation_steps}")
print(f"Test steps: {test_steps}")

# Step 3: Define the model

In [None]:
# Small CNN: three convolution/pooling stages followed by a dense classifier.
model = Sequential([
    # The input side length follows img_size (the size the loaders resize to)
    # instead of a separately hard-coded 224, so the two cannot drift apart.
    Conv2D(32, 3, padding="same", activation="relu", input_shape=(img_size, img_size, 3)),
    MaxPool2D(),

    Conv2D(32, 3, padding="same", activation="relu"),
    MaxPool2D(),

    Conv2D(64, 3, padding="same", activation="relu"),
    MaxPool2D(),
    Dropout(0.4),

    Flatten(),
    Dense(128, activation="relu"),
    # Two softmax outputs: one probability per class.
    Dense(2, activation="softmax"),
])

model.summary()

In [None]:
# Configure training: Adam optimizer, cross-entropy on integer class labels,
# and accuracy as the reported metric.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',  # For integer labels (0, 1)
    metrics=['accuracy']
)

In [None]:
# Train the model
# train_gen and val_gen never stop yielding, so the number of batches that make
# up one epoch / one validation pass is passed explicitly.
history = model.fit(
    train_gen,
    steps_per_epoch=steps_per_epoch,
    validation_data=val_gen,
    validation_steps=validation_steps,
    epochs=20,
    verbose=1
)

# Step 4: Train the Model

# Step 5: Evaluating the Result

In [None]:
# Plot accuracy
# Drawn as a complete, standalone figure: the notebook's inline backend closes
# the current figure when a cell finishes, so a panel reserved here for a later
# cell would simply stay empty.
fig, ax = plt.subplots(figsize=(7, 5))
ax.plot(history.history['accuracy'], label='Training Accuracy', marker='o')
ax.plot(history.history['val_accuracy'], label='Validation Accuracy', marker='o')
ax.set_title('Model Accuracy Over Epochs', fontsize=14, fontweight='bold')
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Plot loss
# Drawn as a complete, standalone figure: the notebook's inline backend closes
# the current figure when a cell finishes, so this cell cannot add a panel to a
# figure created in an earlier cell.
fig, ax = plt.subplots(figsize=(7, 5))
ax.plot(history.history['loss'], label='Training Loss', marker='o')
ax.plot(history.history['val_loss'], label='Validation Loss', marker='o')
ax.set_title('Model Loss Over Epochs', fontsize=14, fontweight='bold')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# ========== 2. Print Final Metrics ==========
# Take the last-epoch value of each metric recorded by model.fit().
final_train_acc, final_val_acc, final_train_loss, final_val_loss = (
    history.history[key][-1]
    for key in ('accuracy', 'val_accuracy', 'loss', 'val_loss')
)

# Emit the whole report in one call; the lines are joined with newlines so the
# text is the same as printing them one by one.
print("\n".join([
    "\n" + "="*50,
    "FINAL TRAINING RESULTS",
    "="*50,
    f"Training Accuracy:   {final_train_acc:.4f} ({final_train_acc*100:.2f}%)",
    f"Validation Accuracy: {final_val_acc:.4f} ({final_val_acc*100:.2f}%)",
    f"Training Loss:       {final_train_loss:.4f}",
    f"Validation Loss:     {final_val_loss:.4f}",
    "="*50,
]))

In [None]:
# ========== 3. Evaluate on Validation Set ==========
# Scores the trained model on `validation_steps` batches drawn from val_gen.
# NOTE(review): val_gen is the same never-ending generator object that fit()
# consumed, so this evaluation starts wherever the previous consumer stopped —
# TODO confirm the batches line up with one full pass over the validation set.
print("\nEvaluating on validation set...")
val_loss, val_accuracy = model.evaluate(val_gen, steps=validation_steps, verbose=1)
print(f"\nValidation Loss: {val_loss:.4f}")
print(f"Validation Accuracy: {val_accuracy:.4f} ({val_accuracy*100:.2f}%)")


In [None]:
# ========== 4. Make Predictions on Test Set ==========
# test_gen is a single-pass generator: re-running this cell without re-creating
# test_gen first leaves nothing to predict on.
print("\nMaking predictions on test set...")
predictions = model.predict(test_gen, steps=test_steps, verbose=1)

# Get predicted classes
# Each row of `predictions` holds the two softmax outputs of the model: the
# index of the largest one is the predicted class, its value the confidence.
predicted_classes = np.argmax(predictions, axis=1)
confidence_scores = np.max(predictions, axis=1)

# NOTE(review): test_data_generator skips unreadable images, so the number of
# predictions can be smaller than len(test_paths) — verify before indexing
# predictions by test-path position.
print(f"\nTotal test predictions: {len(predicted_classes)}")
print(f"Predicted as AI_GENERATED: {np.sum(predicted_classes == 0)} ({np.sum(predicted_classes == 0)/len(predicted_classes)*100:.1f}%)")
print(f"Predicted as NON_AI_GENERATED: {np.sum(predicted_classes == 1)} ({np.sum(predicted_classes == 1)/len(predicted_classes)*100:.1f}%)")
print(f"Average confidence: {np.mean(confidence_scores):.2%}")

In [None]:
# ========== 5. Show Sample Predictions ==========
print("\n" + "="*50)
print("SAMPLE TEST PREDICTIONS")
print("="*50)

# Only positions that have BOTH a file path and a prediction may be sampled;
# drawing from len(test_paths) alone can index past the end of the predictions
# when some test images were skipped during loading.
n_available = min(len(test_paths), len(predicted_classes))
if len(predicted_classes) != len(test_paths):
    print(f"Warning: {len(predicted_classes)} predictions for {len(test_paths)} test files; "
          "file names below may not match their predictions.")

# Get a few random test images
sample_indices = np.random.choice(n_available, min(10, n_available), replace=False)

for idx in sample_indices:
    pred_class = predicted_classes[idx]
    confidence = confidence_scores[idx]
    print(f"\nImage: {os.path.basename(test_paths[idx])}")
    print(f"  Predicted: {labels[pred_class]}")
    print(f"  Confidence: {confidence:.2%}")


In [None]:
# ========== 5. Show Sample Predictions (another random draw) ==========
print("\n" + "="*50)
print("SAMPLE TEST PREDICTIONS")
print("="*50)

# Only positions that have BOTH a file path and a prediction may be sampled;
# drawing from len(test_paths) alone can index past the end of the predictions
# when some test images were skipped during loading.
n_available = min(len(test_paths), len(predicted_classes))
if len(predicted_classes) != len(test_paths):
    print(f"Warning: {len(predicted_classes)} predictions for {len(test_paths)} test files; "
          "file names below may not match their predictions.")

# Get a few random test images
sample_indices = np.random.choice(n_available, min(10, n_available), replace=False)

for idx in sample_indices:
    pred_class = predicted_classes[idx]
    confidence = confidence_scores[idx]
    print(f"\nImage: {os.path.basename(test_paths[idx])}")
    print(f"  Predicted: {labels[pred_class]}")
    print(f"  Confidence: {confidence:.2%}")

# This cell is text-only: it creates no figure, so it makes no matplotlib
# layout/show calls (with nothing drawn they would only render an empty figure).
