# CPE695 Final Project for Team 1: <br>
**Group Members**: Ronald "Joey" Rupert, Andrew Greensweight, Michael Salek <br><br>
**Problem Statement:** <br>
The quality of AI-generated images has rapidly increased, raising concerns about authenticity and trustworthiness. The aim of this project is to investigate whether computer vision techniques can effectively detect when images have been generated by AI. By addressing this problem, we can contribute to the development of algorithms that improve the authenticity verification of images.
<br>
<br>
**Information on Dataset:** <br>
 https://www.kaggle.com/datasets/birdy654/cifake-real-and-ai-generated-synthetic-images
The dataset contains two classes - REAL and FAKE. The REAL images are collected from Krizhevsky & Hinton's CIFAR-10 dataset. The FAKE images were generated with Stable Diffusion version 1.4 to be the equivalent of CIFAR-10. There are 100,000 images for training (50k per class) and 20,000 for testing (10k per class).
<br>


# Content in this .ipynb file:


*   Loading in the data
*   Pre-Processing the data
*   One of three algorithms (K-NN Classification)
*   Results of K-NN Classification
<br>




# Loading in the Data
Author: Andrew Greensweight

In [3]:
import os
import cv2
import numpy as np
from skimage.io import imread
from skimage.transform import resize
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from google.colab import drive

# Mount Google Drive when it is not already available, so that the dataset
# paths below resolve on a fresh runtime (Restart -> Run All).
if not os.path.isdir("/content/drive/MyDrive"):
    drive.mount("/content/drive")

# A reduced subset is used because the full dataset was too much data to upload to Colab:
# 5k real + 5k fake images for the training set, 1.5k real + 1.5k fake images for the test set.
dataset_dir = "/content/drive/MyDrive/CPE 695 - Summer 2023/Smaller Dataset for Final Project"

# Training folders (one per class)
real_folder_0 = dataset_dir + "/real_0"
fake_folder_0 = dataset_dir + "/fake_0"

# Test folders (one per class)
test_real_folder = dataset_dir + "/test_real_0"
test_fake_folder = dataset_dir + "/test_fake_0"

def load_images_from_folder(folder):
    """Load every ``.jpg`` file in `folder` as an RGB image and label it.

    Parameters
    ----------
    folder : str
        Directory containing the images. Its final path component decides the
        label: "real" if that component contains "real", otherwise "fake".

    Returns
    -------
    images : list
        Images as returned by ``cv2.imread`` after BGR -> RGB conversion, in
        sorted filename order. Files that cannot be decoded are skipped.
    labels : list of str
        One label per loaded image, in the same order as `images`.
    """
    import warnings

    # Look only at the last path component so that a parent directory whose
    # name happens to contain "real" cannot mislabel a fake folder.
    folder_name = os.path.basename(os.path.normpath(folder))
    label = "real" if "real" in folder_name else "fake"

    images = []
    labels = []
    # os.listdir returns entries in arbitrary order; sorting keeps the load
    # order (and therefore any seeded shuffle applied afterwards) reproducible.
    for filename in sorted(os.listdir(folder)):
        if not filename.lower().endswith(".jpg"):
            continue
        path = os.path.join(folder, filename)
        image = cv2.imread(path)
        if image is None:
            # cv2.imread signals an unreadable/corrupt file by returning None;
            # skip it instead of letting cvtColor fail on the None.
            warnings.warn(f"Skipping unreadable image: {path}")
            continue
        images.append(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))  # Convert BGR to RGB
        labels.append(label)
    return images, labels


# Load the real and fake training images together with their labels
real_images, real_labels = load_images_from_folder(real_folder_0)
fake_images, fake_labels = load_images_from_folder(fake_folder_0)

# Fail early with a clear message when a class folder produced no images,
# instead of hitting a less helpful NumPy error further down.
if len(real_images) == 0 or len(fake_images) == 0:
    raise ValueError(
        f"No .jpg images were loaded from {real_folder_0!r} and/or {fake_folder_0!r}"
    )

# Combine real and fake images and labels (real first, then fake) into NumPy arrays
training_images = np.concatenate((real_images, fake_images), axis=0)
training_labels = np.concatenate((real_labels, fake_labels), axis=0)

# Set a random seed for reproducibility
random_seed = 42

# Shuffle the training set; images and labels are indexed with the same
# permutation so every image keeps its own label
np.random.seed(random_seed)
shuffle_indices = np.random.permutation(len(training_images))
training_images = training_images[shuffle_indices]
training_labels = training_labels[shuffle_indices]

# (image, label) tuples of the shuffled training set.
# training_images / training_labels are already NumPy arrays at this point, so
# they are not unzipped and rebuilt from these tuples again.
combined_data = list(zip(training_images, training_labels))

# Pre-Processing
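Define geometry augmentations (horizontal flip and shifts), load a pre-trained CNN (MobileNetV2), and build the feature matrices. Note: in the code below the augmentation function is defined and MobileNetV2 is loaded, but neither is applied; the features passed to KNN are the flattened pixel values of the resized images.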
<br>
Author: Andrew Greensweight<br>

In [4]:
!pip install tensorflow




In [5]:
from skimage.transform import resize
from sklearn.neighbors import KNeighborsClassifier
import tensorflow.keras.applications as keras_applications
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.metrics import accuracy_score

# Spatial size every image is resized to
image_size = (32, 32)

# Neighbor count for KNN
n_neighbors = 5

# Geometry augmentation settings: whether to flip, and the shift range
augmentation_params = dict(flip=True, shift_range=0.2)

def apply_geometry_augmentations(images, labels, augmentation_params):
    """Expand a labelled image set with flipped and randomly transformed copies.

    For every (image, label) pair the output contains, in this order: the
    original image, a ``cv2.flip(image, 1)`` copy when
    ``augmentation_params["flip"]`` is truthy, and a ``random_transform`` copy
    from an ``ImageDataGenerator`` when ``augmentation_params["shift_range"]``
    is truthy. Every copy carries the label of its source image.

    Returns
    -------
    tuple of (list, list)
        The augmented images and their labels, index-aligned.
    """
    do_flip = augmentation_params["flip"]
    shift = augmentation_params["shift_range"]

    # The same shift range is used for both the width and the height direction
    generator = ImageDataGenerator(
        horizontal_flip=do_flip,
        width_shift_range=shift,
        height_shift_range=shift,
    )

    augmented_images, augmented_labels = [], []
    for sample, tag in zip(images, labels):
        variants = [sample]
        if do_flip:
            variants.append(cv2.flip(sample, 1))
        if shift:
            # NOTE(review): the generator is also configured with horizontal_flip,
            # so this transform may include a random flip on top of the shift -
            # confirm that is intended.
            variants.append(generator.random_transform(sample))
        augmented_images.extend(variants)
        augmented_labels.extend([tag] * len(variants))
    return augmented_images, augmented_labels



# --- Test set: load both classes and stack them (real first, then fake) ---
test_real_images, test_real_labels = load_images_from_folder(test_real_folder)
test_fake_images, test_fake_labels = load_images_from_folder(test_fake_folder)

test_images = np.concatenate((test_real_images, test_fake_images), axis=0)
test_labels = np.concatenate((test_real_labels, test_fake_labels), axis=0)

# --- Pre-trained MobileNetV2 with ImageNet weights and without the top classifier ---
# NOTE(review): `model` is not used anywhere else in this cell; the features
# built below are taken directly from the resized pixels.
model = keras_applications.MobileNetV2(
    include_top=False,
    weights='imagenet',
    input_shape=(image_size[0], image_size[1], 3),
)

# --- Resize every image to `image_size` and stack the results into one array per split ---
reshaped_training_images = np.array([resize(picture, image_size) for picture in training_images])
reshaped_test_images = np.array([resize(picture, image_size) for picture in test_images])

# --- Flatten each image into a single feature row for the KNN classifier ---
X_train = reshaped_training_images.reshape(reshaped_training_images.shape[0], -1)
X_test = reshaped_test_images.reshape(reshaped_test_images.shape[0], -1)
y_train = training_labels
y_test = test_labels



Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_224_no_top.h5


In [6]:
# Preview the first ten feature rows of each split
print("X_train:", X_train[:10], sep="\n")
print("X_test:", X_test[:10], sep="\n")

X_train:
[[0.11372549 0.08235294 0.         ... 0.23921569 0.21960784 0.        ]
 [1.         0.59215686 0.01568627 ... 0.21568627 0.1254902  0.09411765]
 [0.70588235 0.74509804 0.78039216 ... 0.38039216 0.44705882 0.50980392]
 ...
 [0.79215686 0.79215686 0.8        ... 0.48627451 0.48627451 0.49411765]
 [0.03137255 0.05490196 0.01568627 ... 0.37254902 0.27058824 0.14117647]
 [0.43137255 0.54901961 0.69803922 ... 0.29803922 0.39607843 0.42352941]]
X_test:
[[0.36470588 0.2627451  0.17254902 ... 0.52941176 0.41568627 0.28235294]
 [0.82352941 0.81176471 0.75294118 ... 0.43529412 0.54509804 0.38431373]
 [0.81960784 0.76470588 0.72941176 ... 0.58823529 0.65098039 0.54901961]
 ...
 [0.20392157 0.33333333 0.29803922 ... 0.20784314 0.25882353 0.15686275]
 [0.98823529 0.89803922 0.86666667 ... 0.97647059 0.85098039 0.70196078]
 [0.94901961 0.94117647 0.98431373 ... 0.08235294 0.09019608 0.07058824]]


# Create and fit the KNN classifier
Author: Andrew Greensweight

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# Hyper-parameter candidates explored by the grid search
param_grid = dict(
    n_neighbors=[1, 3, 5, 7, 9, 11, 13, 15, 31],
    weights=['uniform', 'distance'],
    p=[1, 2],  # Minkowski distance power: 1 = Manhattan distance, 2 = Euclidean distance
)

# Base estimator whose parameters are tuned
knn = KNeighborsClassifier()

# Search the whole grid with 3-fold cross-validation, scored by accuracy
grid_search = GridSearchCV(knn, param_grid, scoring='accuracy', cv=3)
grid_search.fit(X_train, y_train)

# Best parameter combination and the score the search reports for it
best_params, best_accuracy = grid_search.best_params_, grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Accuracy:", best_accuracy)

Best Parameters: {'n_neighbors': 31, 'p': 1, 'weights': 'uniform'}
Best Accuracy: 0.6860003336866379


# Parameter Tuning and Results
Next, a classifier built with the best parameters found by grid search is evaluated on the test set.<br>
Author: Andrew Greensweight<br>

In [9]:
# Build the "best" KNN classifier from the grid-search parameters and train it
# on the training set (fit returns the estimator itself)
knn_best = KNeighborsClassifier(**best_params).fit(X_train, y_train)

# Predicted labels for the test set
y_pred = knn_best.predict(X_test)

# Accuracy of those predictions against the true test labels
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Test Accuracy:", accuracy)


Test Accuracy: 0.6903333333333334


The accuracy measured on the test set with the best parameters is 69.03%.

# Confusion Matrix
Author: Andrew Greensweight<br>
In the table below, rows are the true labels and columns are the predicted labels.

In [12]:
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import pandas as pd

# Class labels taken from BOTH the true and the predicted labels, in sorted order.
# Using y_test alone would make the DataFrame below fail with a shape mismatch
# whenever the model predicts a class that does not occur in y_test.
class_labels = np.unique(np.concatenate((y_test, y_pred)))

# Confusion matrix with an explicit label order, so its rows and columns are
# guaranteed to line up with class_labels (rows: true labels, columns: predicted labels)
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

# Wrap the matrix in a DataFrame so it prints with class names on both axes
df_cm = pd.DataFrame(cm, index=class_labels, columns=class_labels)

# Display the confusion matrix as a table
print("Confusion Matrix:")
print(df_cm)

Confusion Matrix:
      fake  real
fake   913   587
real   342  1158
