<a href="https://colab.research.google.com/github/SaahilShaikh17/Forgery-Detection-Hackrx/blob/main/hackrx_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import cv2
import os
import numpy as np

from google.colab import drive

In [None]:
# Make Google Drive visible inside the Colab runtime
drive.mount('/content/drive')

# Root folder on Drive that holds every dataset variant used in this notebook
base_dir = '/content/drive/My Drive/Bajaj-HackRX/datasets'

# Raw input images, one folder per class
forged_dir = os.path.join(base_dir, 'forged')
genuine_dir = os.path.join(base_dir, 'genuine')

# Destinations for the preprocessed copies
preprocessed_forged_dir = os.path.join(base_dir, 'preprocessed_forged')
preprocessed_genuine_dir = os.path.join(base_dir, 'preprocessed_genuine')

# Destinations for the augmented copies
augmented_forged_dir = os.path.join(base_dir, 'augmented_forged')
augmented_genuine_dir = os.path.join(base_dir, 'augmented_genuine')

# Make sure every output folder exists (no-op when it is already there)
for out_dir in (
    preprocessed_forged_dir,
    preprocessed_genuine_dir,
    augmented_forged_dir,
    augmented_genuine_dir,
):
    os.makedirs(out_dir, exist_ok=True)

Mounted at /content/drive


In [None]:
# Parameters for resizing.
# Every image is resized to 2048x2048 so that all images have the same
# dimensions, making it easier for further preprocessing and model training.
# (Written as comments rather than a bare string literal: a string left as the
# last expression of a cell is evaluated and echoed as the cell's output.)
resize_dims = (2048, 2048)

'\nusing 2048x2048 so that all images have the same dimensions, making it easier\nfor further preprocessing and model training\n'

In [None]:
def preprocess_image(image_path):
    """Load an image, resize it to ``resize_dims`` and denoise it.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The resized, bilateral-filtered image produced by OpenCV, or ``None``
        (after printing a message) when the file could not be loaded.
    """
    loaded = cv2.imread(image_path, cv2.IMREAD_COLOR)
    if loaded is None:
        print(f"Failed to load image {image_path}")
        return None

    # Rescale to the shared target size defined by the module-level resize_dims.
    resized = cv2.resize(loaded, resize_dims, interpolation=cv2.INTER_LINEAR)

    # Denoise with a bilateral filter (Gaussian or median filtering are
    # possible alternatives). A contrast-adjustment step (grayscale histogram
    # equalisation followed by a conversion back to BGR) was tried after this
    # point and is currently disabled.
    return cv2.bilateralFilter(resized, d=9, sigmaColor=75, sigmaSpace=75)

In [None]:
def process_images(input_dir, output_dir, extensions=('.tif', '.png')):
    """Preprocess every matching image under ``input_dir`` into ``output_dir``.

    The directory tree below ``input_dir`` is walked recursively and mirrored
    under ``output_dir``. Each matching file is passed to ``preprocess_image``
    and the result is written next to its mirrored location with a ``p``
    appended to the file stem (``scan.tif`` -> ``scanp.tif``).

    Args:
        input_dir: Root folder containing the source images.
        output_dir: Root folder that receives the preprocessed images.
        extensions: File extensions (with leading dot) to process. Matching
            is case-insensitive, so ``scan.TIF`` is handled like ``scan.tif``.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    # Normalise once so the per-file check below is case-insensitive.
    wanted = tuple(ext.lower() for ext in extensions)

    for root, _dirs, files in os.walk(input_dir):
        # Mirror the current sub-folder inside output_dir.
        rel_path = os.path.relpath(root, input_dir)
        output_subdir = os.path.join(output_dir, rel_path)
        os.makedirs(output_subdir, exist_ok=True)

        for file in files:
            if not file.lower().endswith(wanted):
                continue

            input_image_path = os.path.join(root, file)

            # Append 'p' before the extension to mark the file as preprocessed.
            file_name, file_ext = os.path.splitext(file)
            output_image_path = os.path.join(output_subdir, f"{file_name}p{file_ext}")

            preprocessed_image = preprocess_image(input_image_path)
            if preprocessed_image is None:
                # preprocess_image has already reported the load failure.
                continue

            # cv2.imwrite signals failure through its return value, so only
            # claim success when the write actually happened.
            if cv2.imwrite(output_image_path, preprocessed_image):
                print(f"Saved preprocessed image: {output_image_path}")
            else:
                print(f"Failed to save preprocessed image: {output_image_path}")

# Preprocess both classes; each output tree mirrors the sub-folder structure
# of its input tree (forged -> preprocessed_forged, genuine -> preprocessed_genuine).
for source_dir, target_dir in (
    (forged_dir, preprocessed_forged_dir),
    (genuine_dir, preprocessed_genuine_dir),
):
    process_images(source_dir, target_dir)

# <b>Forged Augmentation</b>

In [None]:
# import os
# from PIL import Image
# import numpy as np

# def load_images_from_folder(folder_path):
#     images = []
#     for filename in os.listdir(folder_path):
#         img_path = os.path.join(folder_path, filename)
#         img = Image.open(img_path)
#         images.append(np.array(img))
#     return images

# preprocessed_images = load_images_from_folder('https://drive.google.com/drive/folders/1N-CVvCMGkkW4iA5qjo9hT3gFpq9Ala1H?usp=drive_link')


## Set Up Data Augmentation

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

## Apply Augmentation and Save Images

In [None]:
# # Create output directory if it doesn't exist
# def augment_preprocessed_forged(input_dir, output_dir):
#   # if not os.path.exists(output_folder):
#   #     os.makedirs(output_folder)
#   datagen = ImageDataGenerator(
#         rotation_range=40,
#         width_shift_range=0.2,
#         height_shift_range=0.2,
#         shear_range=0.2,
#         zoom_range=0.2,
#         horizontal_flip=True,
#         fill_mode='nearest'
#     )

#   for root, dirs, files in os.walk(input_dir):
#     rel_path = os.path.relpath(root, input_dir)  # Relative path from input_dir
#     output_subdir = os.path.join(output_dir, rel_path)  # Corresponding subdir in output_dir
#     os.makedirs(output_subdir, exist_ok=True)  # Create the subdir if it doesn't exist

#   for file in files:
#     file_path = os.path.join(root, file)
#     img = Image.open(file_path)
#     img_array = np.array(img)
#     img_array = np.expand_dims(img_array, 0)  # Add batch dimension

#     # Apply augmentation and save images
#     for batch in datagen.flow(img_array, batch_size=1):
#         augmented_image = batch[0].astype('uint8')  # Convert back to uint8
#         augmented_img = Image.fromarray(augmented_image)

#         # Save augmented image
#         augmented_img_path = os.path.join(output_subdir, f'augmented_{file}')
#         augmented_img.save(augmented_img_path)
#         break  # Stop after saving one augmented image per original image

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from PIL import Image
import numpy as np
import os

def augment_preprocessed_forged(input_dir, output_dir):
    """Write one randomly augmented copy of every image under ``input_dir``.

    The directory tree below ``input_dir`` is walked recursively and mirrored
    under ``output_dir``; each augmented image is saved as
    ``augmented_<original file name>`` in the mirrored sub-folder. Despite the
    name, nothing here is specific to forged images: the notebook also calls
    this function for the genuine set.

    Args:
        input_dir: Root folder containing the images to augment.
        output_dir: Root folder that receives the augmented images.

    Returns:
        None. Files that cannot be opened as images are reported with
        ``print`` and skipped instead of aborting the whole run.
    """
    datagen = ImageDataGenerator(
        rotation_range=40,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True,
        fill_mode='nearest'
    )

    for root, _dirs, files in os.walk(input_dir):
        # Mirror the current sub-folder inside output_dir.
        rel_path = os.path.relpath(root, input_dir)
        output_subdir = os.path.join(output_dir, rel_path)
        os.makedirs(output_subdir, exist_ok=True)

        for file in files:
            file_path = os.path.join(root, file)
            try:
                # The context manager releases the file handle even on error.
                with Image.open(file_path) as img:
                    # datagen.flow needs a rank-4 batch, i.e. an image with a
                    # channel axis; modes other than RGB/RGBA are converted
                    # so single-channel files do not break the run.
                    pixels = img if img.mode in ('RGB', 'RGBA') else img.convert('RGB')
                    img_array = np.array(pixels)
            except OSError as exc:
                # Covers stray non-image files as well as unreadable images.
                print(f"Skipping unreadable image {file_path}: {exc}")
                continue

            # Add the batch dimension and draw exactly one augmented sample.
            batch = next(iter(datagen.flow(np.expand_dims(img_array, 0), batch_size=1)))
            augmented_img = Image.fromarray(batch[0].astype('uint8'))  # back to uint8

            augmented_img_path = os.path.join(output_subdir, f'augmented_{file}')
            augmented_img.save(augmented_img_path)

# Augment the preprocessed forged images; both folders live under base_dir
# ('/content/drive/My Drive/Bajaj-HackRX/datasets', set in the setup cell).
input_dir = os.path.join(base_dir, 'preprocessed_forged')
augmented_forged_dir = os.path.join(base_dir, 'augmented_forged')

augment_preprocessed_forged(input_dir=input_dir, output_dir=augmented_forged_dir)


In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from PIL import Image
import numpy as np
import os

def augment_preprocessed_forged(input_dir, output_dir):
    """Write one randomly augmented copy of every image under ``input_dir``.

    The directory tree below ``input_dir`` is walked recursively and mirrored
    under ``output_dir``; each augmented image is saved as
    ``augmented_<original file name>`` in the mirrored sub-folder. Despite the
    name, nothing here is specific to forged images: this cell uses the
    function for the genuine set.

    NOTE(review): this notebook defines a function with this name in more
    than one cell; the definition executed last is the one in effect.

    Args:
        input_dir: Root folder containing the images to augment.
        output_dir: Root folder that receives the augmented images.

    Returns:
        None. Files that cannot be opened as images are reported with
        ``print`` and skipped instead of aborting the whole run.
    """
    datagen = ImageDataGenerator(
        rotation_range=40,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True,
        fill_mode='nearest'
    )

    for root, _dirs, files in os.walk(input_dir):
        # Mirror the current sub-folder inside output_dir.
        rel_path = os.path.relpath(root, input_dir)
        output_subdir = os.path.join(output_dir, rel_path)
        os.makedirs(output_subdir, exist_ok=True)

        for file in files:
            file_path = os.path.join(root, file)
            try:
                # The context manager releases the file handle even on error.
                with Image.open(file_path) as img:
                    # datagen.flow needs a rank-4 batch, i.e. an image with a
                    # channel axis; modes other than RGB/RGBA are converted
                    # so single-channel files do not break the run.
                    pixels = img if img.mode in ('RGB', 'RGBA') else img.convert('RGB')
                    img_array = np.array(pixels)
            except OSError as exc:
                # Covers stray non-image files as well as unreadable images.
                print(f"Skipping unreadable image {file_path}: {exc}")
                continue

            # Add the batch dimension and draw exactly one augmented sample.
            batch = next(iter(datagen.flow(np.expand_dims(img_array, 0), batch_size=1)))
            augmented_img = Image.fromarray(batch[0].astype('uint8'))  # back to uint8

            augmented_img_path = os.path.join(output_subdir, f'augmented_{file}')
            augmented_img.save(augmented_img_path)

# Augment the preprocessed genuine images; both folders live under base_dir
# ('/content/drive/My Drive/Bajaj-HackRX/datasets', set in the setup cell).
input_dir_genuine = os.path.join(base_dir, 'preprocessed_genuine')
augmented_genuine_dir = os.path.join(base_dir, 'augmented_genuine')

augment_preprocessed_forged(input_dir=input_dir_genuine, output_dir=augmented_genuine_dir)


# <b>Train, Test Split</b>

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Image data generator with a 70-30 train-test split; pixel values are
# rescaled to [0, 1].
datagen = ImageDataGenerator(rescale=1./255, validation_split=0.3)

# flow_from_directory treats EVERY sub-folder of the given directory as a
# class. base_dir also holds the raw, preprocessed and augmented folders, so
# without an explicit list the labels become pipeline stages rather than
# genuine/forged.
# NOTE(review): training on the preprocessed folders is an assumption -
# confirm which stage is intended. Mixing in the augmented folders would put
# variants of the same image on both sides of the split.
class_dirs = ['preprocessed_forged', 'preprocessed_genuine']

# Arguments shared by both subsets, so the two generators cannot drift apart.
flow_kwargs = dict(
    directory=base_dir,
    classes=class_dirs,
    target_size=(224, 224),    # Resize images if needed
    batch_size=32,
    class_mode='categorical',  # Use 'binary' if you have 2 classes
    seed=42,                   # reproducible shuffling order
)

# Load training set (70%)
train_generator = datagen.flow_from_directory(subset='training', **flow_kwargs)

# Load test set (30%); unshuffled so predictions stay aligned with
# test_generator.classes / test_generator.filenames.
test_generator = datagen.flow_from_directory(
    subset='validation', shuffle=False, **flow_kwargs
)


Found 177 images belonging to 8 classes.
Found 75 images belonging to 8 classes.


In [None]:
from tensorflow.keras import layers, models

# Baseline CNN: three Conv/MaxPool stages followed by a dense classifier.
# The input shape is declared with an explicit Input layer; Keras warns when
# `input_shape` is passed to the first layer of a Sequential model instead.
model = models.Sequential([
    layers.Input(shape=(224, 224, 3)),
    layers.Conv2D(32, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    # One softmax unit per class found by the training generator
    layers.Dense(train_generator.num_classes, activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',  # Use binary_crossentropy for binary classification
              metrics=['accuracy'])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
# Train for 20 epochs; test_generator supplies the per-epoch validation metrics.
history = model.fit(x=train_generator, validation_data=test_generator, epochs=20)


Epoch 1/20


  self._warn_if_super_not_called()


[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 12s/step - accuracy: 0.4599 - loss: 3.5252 - val_accuracy: 0.7067 - val_loss: 1.0407
Epoch 2/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 5s/step - accuracy: 0.6534 - loss: 1.1138 - val_accuracy: 0.7067 - val_loss: 1.1596
Epoch 3/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 5s/step - accuracy: 0.6819 - loss: 1.1560 - val_accuracy: 0.7067 - val_loss: 1.0136
Epoch 4/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 5s/step - accuracy: 0.6837 - loss: 0.9733 - val_accuracy: 0.7067 - val_loss: 0.9617
Epoch 5/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 5s/step - accuracy: 0.7250 - loss: 0.7339 - val_accuracy: 0.7067 - val_loss: 1.1176
Epoch 6/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 5s/step - accuracy: 0.7062 - loss: 0.6838 - val_accuracy: 0.5600 - val_loss: 1.1928
Epoch 7/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m

In [None]:
# evaluate() yields the loss followed by the compiled metric (accuracy).
baseline_scores = model.evaluate(test_generator)
test_loss, test_acc = baseline_scores
print(f"Test accuracy: {test_acc}")


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2s/step - accuracy: 0.6047 - loss: 0.9521
Test accuracy: 0.6000000238418579


In [None]:
from tensorflow.keras import regularizers

# Same CNN as the baseline, with dropout and L2 weight decay added to fight
# overfitting. The input shape is declared with an explicit Input layer;
# Keras warns when `input_shape` is passed to the first layer of a Sequential
# model instead.
model = models.Sequential([
    layers.Input(shape=(224, 224, 3)),
    layers.Conv2D(32, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),

    # Add dropout to prevent overfitting
    layers.Dropout(0.5),

    layers.Flatten(),

    layers.Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.001)),  # L2 regularization
    layers.Dropout(0.5),
    # One softmax unit per class found by the training generator
    layers.Dense(train_generator.num_classes, activation='softmax')
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])


In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop once val_loss has not improved for 5 consecutive epochs and roll the
# model back to the best weights seen.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

# Up to 50 epochs; early stopping decides the real length of the run.
history = model.fit(
    x=train_generator,
    validation_data=test_generator,
    epochs=50,
    callbacks=[early_stopping],
)

Epoch 1/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 3s/step - accuracy: 0.3849 - loss: 4.4068 - val_accuracy: 0.7067 - val_loss: 1.3376
Epoch 2/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 894ms/step - accuracy: 0.7066 - loss: 1.3061 - val_accuracy: 0.7067 - val_loss: 1.1992
Epoch 3/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 864ms/step - accuracy: 0.6763 - loss: 1.2420 - val_accuracy: 0.7067 - val_loss: 1.2383
Epoch 4/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 876ms/step - accuracy: 0.6490 - loss: 1.1830 - val_accuracy: 0.7067 - val_loss: 1.2853
Epoch 5/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 1s/step - accuracy: 0.6898 - loss: 0.9635 - val_accuracy: 0.7067 - val_loss: 1.3515
Epoch 6/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 929ms/step - accuracy: 0.6719 - loss: 0.9435 - val_accuracy: 0.7067 - val_loss: 1.3304
Epoch 7/50
[1m6/6[0m [32m━━━━━━━━━━━━

In [None]:
# Score the regularised model on the held-out generator: (loss, accuracy).
regularized_scores = model.evaluate(test_generator)
test_loss, test_acc = regularized_scores
print(f"Test accuracy: {test_acc}")

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2s/step - accuracy: 0.7205 - loss: 1.1876
Test accuracy: 0.7066666483879089


In [None]:
from tensorflow.keras.callbacks import ReduceLROnPlateau

# Cut the learning rate to a tenth after 3 epochs without val_loss
# improvement, never going below 1e-5.
lr_scheduler = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.1,
    patience=3,
    min_lr=0.00001,
)

# Continues training the current `model` (it is not re-created in this cell),
# now with both early stopping and the learning-rate schedule active.
training_callbacks = [early_stopping, lr_scheduler]
history = model.fit(
    x=train_generator,
    validation_data=test_generator,
    epochs=50,
    callbacks=training_callbacks,
)

Epoch 1/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2s/step - accuracy: 0.6785 - loss: 1.2255 - val_accuracy: 0.7067 - val_loss: 1.1351 - learning_rate: 0.0010
Epoch 2/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 1s/step - accuracy: 0.7288 - loss: 1.0487 - val_accuracy: 0.7067 - val_loss: 1.1533 - learning_rate: 0.0010
Epoch 3/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 1s/step - accuracy: 0.7168 - loss: 0.9501 - val_accuracy: 0.6133 - val_loss: 1.3502 - learning_rate: 0.0010
Epoch 4/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 933ms/step - accuracy: 0.7331 - loss: 0.7630 - val_accuracy: 0.7067 - val_loss: 1.1982 - learning_rate: 0.0010
Epoch 5/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 1s/step - accuracy: 0.7535 - loss: 0.8018 - val_accuracy: 0.7200 - val_loss: 1.1985 - learning_rate: 1.0000e-04
Epoch 6/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 1s/step -

In [None]:
# Score the model after the scheduled-learning-rate run: (loss, accuracy).
scheduled_scores = model.evaluate(test_generator)
test_loss, test_acc = scheduled_scores
print(f"Test accuracy: {test_acc}")

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1s/step - accuracy: 0.7479 - loss: 1.0507
Test accuracy: 0.7066666483879089


In [None]:
# model = models.Sequential([
#     layers.Conv2D(32, (3, 3), activation='relu', input_shape=(224, 224, 3)),
#     layers.BatchNormalization(),
#     layers.MaxPooling2D((2, 2)),
#     layers.Conv2D(64, (3, 3), activation='relu'),
#     layers.BatchNormalization(),
#     layers.MaxPooling2D((2, 2)),
#     layers.Conv2D(128, (3, 3), activation='relu'),
#     layers.BatchNormalization(),
#     layers.MaxPooling2D((2, 2)),

#     layers.Flatten(),
#     layers.Dense(128, activation='relu'),
#     layers.BatchNormalization(),
#     layers.Dense(train_generator.num_classes, activation='softmax')
# ])


In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import layers, models, optimizers
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input

# Define the base directory (one sub-folder per class)
b_dir = '/content/drive/My Drive/Bajaj-HackRX/datasets/Final_data'

# The ImageNet ResNet50 weights expect inputs prepared by the matching
# `preprocess_input` function, not pixels rescaled to [0, 1]. Applying it
# here keeps the generator output consistent with the pretrained backbone.
train_datagen = ImageDataGenerator(
    preprocessing_function=preprocess_input,
    validation_split=0.3,
)

# Training data generator
train_generator = train_datagen.flow_from_directory(
    b_dir,
    target_size=(224, 224),  # ResNet50 input size
    batch_size=32,
    class_mode='binary',     # Binary classification
    subset='training',
    shuffle=True,
    seed=42)                 # reproducible shuffling order

# Validation data generator (unshuffled so predictions stay aligned with
# validation_generator.classes)
validation_generator = train_datagen.flow_from_directory(
    b_dir,
    target_size=(224, 224),
    batch_size=32,
    class_mode='binary',
    subset='validation',
    shuffle=False)


Found 273 images belonging to 2 classes.
Found 116 images belonging to 2 classes.


In [None]:
# ImageNet-pretrained ResNet50 without its classification head, frozen so
# only the new layers added below are trained.
base_model = ResNet50(
    include_top=False,
    weights='imagenet',
    input_shape=(224, 224, 3),
)
base_model.trainable = False

# Stack a small binary-classification head on top of the backbone.
model = models.Sequential()
for head_layer in (
    base_model,
    layers.GlobalAveragePooling2D(),
    layers.Dense(256, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(1, activation='sigmoid'),  # For binary classification
):
    model.add(head_layer)


In [None]:
# Binary cross-entropy with Adam at 1e-4; accuracy, precision and recall are
# tracked in that order.
tracked_metrics = ['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()]
model.compile(
    loss='binary_crossentropy',
    optimizer=optimizers.Adam(learning_rate=1e-4),
    metrics=tracked_metrics,
)


In [None]:
# Train the new head for 10 epochs, validating on the 30% hold-out after each epoch.
history = model.fit(x=train_generator, validation_data=validation_generator, epochs=10)


Epoch 1/10


  self._warn_if_super_not_called()


[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m185s[0m 11s/step - accuracy: 0.9332 - loss: 0.3058 - precision: 0.9583 - recall: 0.9732 - val_accuracy: 0.9483 - val_loss: 0.2045 - val_precision: 0.9483 - val_recall: 1.0000
Epoch 2/10
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 973ms/step - accuracy: 0.9526 - loss: 0.2050 - precision: 0.9526 - recall: 1.0000 - val_accuracy: 0.9483 - val_loss: 0.2082 - val_precision: 0.9483 - val_recall: 1.0000
Epoch 3/10
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 968ms/step - accuracy: 0.9469 - loss: 0.2124 - precision: 0.9469 - recall: 1.0000 - val_accuracy: 0.9483 - val_loss: 0.2121 - val_precision: 0.9483 - val_recall: 1.0000
Epoch 4/10
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 981ms/step - accuracy: 0.9461 - loss: 0.2248 - precision: 0.9461 - recall: 1.0000 - val_accuracy: 0.9483 - val_loss: 0.2092 - val_precision: 0.9483 - val_recall: 1.0000
Epoch 5/10
[1m9/9[0m [32m━━━━━━━━━━━━

In [None]:
# Evaluate on validation data. evaluate() returns the loss followed by the
# metrics in compile order: accuracy, precision, recall.
validation_scores = model.evaluate(validation_generator)
val_loss, val_accuracy, val_precision, val_recall = validation_scores

print(f"Validation Accuracy: {val_accuracy}")
print(f"Validation Precision: {val_precision}")
print(f"Validation Recall: {val_recall}")


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1s/step - accuracy: 0.9106 - loss: 0.3135 - precision: 0.9106 - recall: 1.0000
Validation Accuracy: 0.9482758641242981
Validation Precision: 0.9482758641242981
Validation Recall: 1.0
