In [None]:
import fitz
import os

def convert_pdfs_to_images(pdf_folder, output_folder):
    """
    Converts the first page of every PDF file in the given folder to a PNG image.

    Each image is written to ``output_folder`` under the PDF's base name with a
    ``.png`` extension. A file that cannot be processed is reported with a
    printed message and skipped; it does not stop the remaining conversions.

    Args:
        pdf_folder (str): The path to the folder containing the PDF files.
        output_folder (str): The path to the folder where the images will be saved.
            It is created if it does not exist.
    """
    os.makedirs(output_folder, exist_ok=True)
    for filename in sorted(os.listdir(pdf_folder)):
        # Compare the extension case-insensitively so "scan.PDF" is not silently skipped.
        if not filename.lower().endswith(".pdf"):
            continue
        pdf_path = os.path.join(pdf_folder, filename)
        try:
            # The context manager closes the document even when rendering or saving fails.
            with fitz.open(pdf_path) as doc:
                page = doc.load_page(0)  # Convert the first page
                pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))  # 2x zoom for better quality
                img_path = os.path.join(output_folder, f"{os.path.splitext(filename)[0]}.png")
                pix.save(img_path)
        except Exception as e:
            print(f"Error processing {filename}: {e}")

# Define the input and output folders (Colab paths under /content)
pdf_certificates_folder = "/content/certificates_pdf"
image_certificates_folder = "/content/output_folder"

# Create the input folder if it doesn't exist (for testing purposes), so the
# directory listing inside convert_pdfs_to_images does not fail on a fresh runtime.
os.makedirs(pdf_certificates_folder, exist_ok=True)

# Placeholder: no dummy PDF is created here; the certificates have to be placed
# in the input folder before this cell is run.


# Render the first page of every PDF in the input folder to a PNG in the output folder.
convert_pdfs_to_images(pdf_certificates_folder, image_certificates_folder)

# NOTE: this message is printed unconditionally; per-file failures are only
# reported by the messages printed inside convert_pdfs_to_images.
print(f"PDFs in '{pdf_certificates_folder}' converted to images in '{image_certificates_folder}'")

PDFs in '/content/certificates_pdf' converted to images in '/content/output_folder'


In [None]:
import shutil
import os
import random

def organize_dataset(img_folder, dest_folder, train_ratio=0.8, seed=None):
    """
    Copy labelled images into a ``train``/``val`` directory tree.

    Files in ``img_folder`` whose names start with ``r_`` are labelled ``real``
    and those starting with ``fk_`` are labelled ``fake``; all other files are
    ignored. Each label is shuffled and split independently, then copied to
    ``dest_folder/<train|val>/<real|fake>/``. The source files are left in place.

    Args:
        img_folder (str): Folder containing the source images.
        dest_folder (str): Root of the dataset tree to create or update.
        train_ratio (float): Fraction of each label that goes to ``train``;
            the remainder goes to ``val``.
        seed (int | None): Seed for a reproducible split. With ``None`` the
            module-level ``random`` state is used, as before.
    """
    # List once and sort, so that for a given seed the split does not depend on
    # the arbitrary order in which os.listdir returns names.
    names = sorted(os.listdir(img_folder))
    real = [f for f in names if f.startswith('r_')]
    fake = [f for f in names if f.startswith('fk_')]

    shuffle = random.shuffle if seed is None else random.Random(seed).shuffle

    def copy_split(file_list, label):
        split = int(train_ratio * len(file_list))
        assignments = [('train', f) for f in file_list[:split]]
        assignments += [('val', f) for f in file_list[split:]]
        for subset, f in assignments:
            shutil.copy(os.path.join(img_folder, f), os.path.join(dest_folder, subset, label, f))
            # An earlier run with a different shuffle may have placed this file
            # in the other subset; drop that copy so train and val never overlap.
            other = 'val' if subset == 'train' else 'train'
            stale = os.path.join(dest_folder, other, label, f)
            if os.path.exists(stale):
                os.remove(stale)

    for subset in ('train', 'val'):
        for label in ('real', 'fake'):
            os.makedirs(os.path.join(dest_folder, subset, label), exist_ok=True)

    shuffle(real)
    shuffle(fake)
    copy_split(real, 'real')
    copy_split(fake, 'fake')

# Split the rendered PNGs into dataset/train and dataset/val using the default train_ratio (0.8).
organize_dataset("/content/output_folder", "dataset")


In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras import layers, models

# Spatial size every image is resized to before it reaches the network.
IMAGE_SIZE = (224, 224)

# Directory trees read by flow_from_directory (one sub-folder per class).
train_dir = 'dataset/train'
val_dir = 'dataset/val'

# Training images are scaled by 1/255 and lightly augmented;
# validation images are only scaled.
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=10,
    zoom_range=0.1,
)
val_datagen = ImageDataGenerator(rescale=1./255)

train_generator = train_datagen.flow_from_directory(
    train_dir,
    target_size=IMAGE_SIZE,
    batch_size=32,
    class_mode='binary',
)
val_generator = val_datagen.flow_from_directory(
    val_dir,
    target_size=IMAGE_SIZE,
    batch_size=32,
    class_mode='binary',
)

# ImageNet-pretrained MobileNetV2 without its classification head, kept frozen.
base_model = MobileNetV2(
    weights='imagenet',
    include_top=False,
    input_shape=(*IMAGE_SIZE, 3),
)
base_model.trainable = False

# Small binary-classification head stacked on top of the frozen backbone.
model = models.Sequential()
model.add(base_model)
model.add(layers.GlobalAveragePooling2D())
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dropout(0.3))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Last expression of the cell, so the returned History object is displayed.
model.fit(train_generator, epochs=10, validation_data=val_generator)


Found 799 images belonging to 2 classes.
Found 201 images belonging to 2 classes.
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_224_no_top.h5
[1m9406464/9406464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


  self._warn_if_super_not_called()


Epoch 1/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 3s/step - accuracy: 0.8123 - loss: 0.4511 - val_accuracy: 0.8209 - val_loss: 0.1993
Epoch 2/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 3s/step - accuracy: 0.9580 - loss: 0.1442 - val_accuracy: 0.9950 - val_loss: 0.0552
Epoch 3/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 2s/step - accuracy: 0.9790 - loss: 0.0684 - val_accuracy: 0.9950 - val_loss: 0.0328
Epoch 4/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 3s/step - accuracy: 0.9845 - loss: 0.0468 - val_accuracy: 1.0000 - val_loss: 0.0342
Epoch 5/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 2s/step - accuracy: 0.9936 - loss: 0.0367 - val_accuracy: 0.9950 - val_loss: 0.0182
Epoch 6/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 2s/step - accuracy: 0.9960 - loss: 0.0252 - val_accuracy: 1.0000 - val_loss: 0.0125
Epoch 7/10
[1m25/25[0m [32m━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7dc99f376bd0>

MAIN CODE



In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras import layers, models

# Spatial size every image is resized to before it reaches the network.
IMAGE_SIZE = (224, 224)

# Directory trees read by flow_from_directory (one sub-folder per class).
train_dir = 'dataset/train'
val_dir = 'dataset/val'

# Augment (rotation, zoom) only the training stream; both streams are scaled by 1/255.
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=10,
    zoom_range=0.1,
)
val_datagen = ImageDataGenerator(rescale=1./255)

train_generator = train_datagen.flow_from_directory(
    train_dir,
    target_size=IMAGE_SIZE,
    batch_size=32,
    class_mode='binary',
)
val_generator = val_datagen.flow_from_directory(
    val_dir,
    target_size=IMAGE_SIZE,
    batch_size=32,
    class_mode='binary',
)

# Frozen ImageNet-pretrained MobileNetV2 used purely as a feature extractor.
base_model = MobileNetV2(
    weights='imagenet',
    include_top=False,
    input_shape=(*IMAGE_SIZE, 3),
)
base_model.trainable = False

# Binary head: pooled features -> Dense(64) -> Dropout -> single sigmoid unit.
model = models.Sequential()
model.add(base_model)
model.add(layers.GlobalAveragePooling2D())
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dropout(0.3))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Last expression of the cell, so the returned History object is displayed.
model.fit(train_generator, epochs=10, validation_data=val_generator)



Found 852 images belonging to 2 classes.
Found 227 images belonging to 2 classes.


Epoch 1/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 3s/step - accuracy: 0.7912 - loss: 0.5297 - val_accuracy: 0.8722 - val_loss: 0.2702
Epoch 2/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 2s/step - accuracy: 0.9266 - loss: 0.2197 - val_accuracy: 0.9207 - val_loss: 0.2013
Epoch 3/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 2s/step - accuracy: 0.9466 - loss: 0.1440 - val_accuracy: 0.9471 - val_loss: 0.1469
Epoch 4/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 2s/step - accuracy: 0.9539 - loss: 0.1235 - val_accuracy: 0.9207 - val_loss: 0.1426
Epoch 5/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 3s/step - accuracy: 0.9530 - loss: 0.1028 - val_accuracy: 0.9648 - val_loss: 0.0897
Epoch 6/10
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 2s/step - accuracy: 0.9402 - loss: 0.1272 - val_accuracy: 0.9427 - val_loss: 0.1009
Epoch 7/10
[1m27/27[0m [32m━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7dc98dfee650>

In [None]:
# ------------------------
# STEP 2: Install Python Packages
# ------------------------
!pip install flask-ngrok
!pip install pdf2image
!apt-get install poppler-utils
!pip install tensorflow pillow

# ------------------------
# STEP 3: Load model
# ------------------------
from tensorflow.keras.models import load_model
model = load_model("/content/trained_model.h5")  # Make sure your model is uploaded to Colab
# ------------------------
# STEP 4: Create API
# ------------------------
import numpy as np
from flask import Flask, request, jsonify
from pdf2image import convert_from_bytes
from PIL import Image

app = Flask(__name__)

@app.route("/", methods=["GET"])
def home():
    """Landing endpoint: returns a short text confirming the API is up and how to call it."""
    status_message = "✅ API is running. Send POST to /predict with PDF file."
    return status_message

@app.route("/predict", methods=["POST"])
def predict():
    """
    Score an uploaded PDF certificate with the loaded model.

    Expects a multipart POST with the PDF under the form field ``file``. The
    first page is rasterised, resized to 224x224, scaled to [0, 1] and passed
    to ``model.predict``.

    Returns:
        JSON ``{"prediction_score": float, "is_valid": bool}`` on success, where
        ``is_valid`` is ``score >= 0.5``. Responds with HTTP 400 when no file is
        sent or the file name does not end in ``.pdf``, and HTTP 500 with the
        error text when conversion or prediction fails.
    """
    if 'file' not in request.files:
        return jsonify({"error": "No file uploaded"}), 400

    file = request.files['file']
    # The filename may be empty or None; compare case-insensitively so that
    # "CERT.PDF" is accepted as well.
    if not (file.filename or "").lower().endswith(".pdf"):
        return jsonify({"error": "Only PDF files supported"}), 400

    try:
        # Convert PDF to image: only page 1 is used, so only page 1 is rasterised.
        images = convert_from_bytes(file.read(), first_page=1, last_page=1)
        # Force three channels so the array shape always matches the model input.
        image = images[0].convert("RGB")

        # Resize and normalize image
        img = image.resize((224, 224))
        img_array = np.array(img) / 255.0
        img_array = np.expand_dims(img_array, axis=0)  # add the batch dimension

        # Predict
        prediction = model.predict(img_array)
        score = float(prediction[0][0])

        return jsonify({
            "prediction_score": score,
            "is_valid": score >= 0.5
        })

    except Exception as e:
        return jsonify({"error": str(e)}), 500
# ------------------------
# STEP 5: Start ngrok + Flask
# ------------------------

import subprocess
import time
import requests

# Kill old ngrok (if any)
!pkill -f ngrok

# Start new ngrok tunnel
ngrok_process = subprocess.Popen(['ngrok', 'http', '5000'])
time.sleep(3)

# Get public URL
public_url = requests.get("http://localhost:4040/api/tunnels").json()['tunnels'][0]['public_url']
print(f"🔗 Public API URL: {public_url}")

# Run Flask app
app.run(port=5000)


Collecting flask-ngrok
  Downloading flask_ngrok-0.0.25-py3-none-any.whl.metadata (1.8 kB)
Downloading flask_ngrok-0.0.25-py3-none-any.whl (3.1 kB)
Installing collected packages: flask-ngrok
Successfully installed flask-ngrok-0.0.25
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Installing collected packages: pdf2image
Successfully installed pdf2image-1.17.0
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 34 not upgraded.
Need to get 186 kB of archives.
After this operation, 696 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.7 [186 kB]
Fetched 186 kB in 1s (230 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 1



🔗 Public API URL: https://e3a1-34-121-71-112.ngrok-free.app
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m


In [None]:
# 🔧 Remove old ngrok and install latest v3.x
# Delete any ngrok binary already present at the install location used below.
!rm -f /usr/local/bin/ngrok
# Download the stable Linux amd64 v3 archive quietly (-q) as ngrok.zip.
!wget -q -O ngrok.zip https://bin.equinox.io/c/bNyj1mQVY4c/ngrok-v3-stable-linux-amd64.zip
# Extract it, overwriting an existing file without prompting (-o).
!unzip -o ngrok.zip
# Move the extracted binary into /usr/local/bin.
!mv ngrok /usr/local/bin


Archive:  ngrok.zip
  inflating: ngrok                   


In [None]:
# 🔑 Use your own authtoken from: https://dashboard.ngrok.com/get-started/setup
# The token is a credential and must not be pasted into the notebook: it is read
# from the NGROK_AUTHTOKEN environment variable, falling back to a hidden prompt.
# NOTE: the token that used to be hard-coded in this cell has been exposed and
# should be revoked in the ngrok dashboard.
import getpass
import os
import subprocess

ngrok_authtoken = os.environ.get("NGROK_AUTHTOKEN") or getpass.getpass("ngrok authtoken: ")
# Argument list (no shell) so the token is never interpolated into a command string.
ngrok_config_result = subprocess.run(
    ["ngrok", "config", "add-authtoken", ngrok_authtoken],
    capture_output=True,
    text=True,
    check=True,
)
print(ngrok_config_result.stdout.strip())


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
# prompt: Download the trained model

# Assuming the model is named 'model' as in the provided code.
# Saves the in-memory model to trained_model.h5 (relative to the current working
# directory), then asks the Colab files helper to download that file.
# NOTE(review): the API cell loads "/content/trained_model.h5", so on a fresh
# runtime this cell has to run before it — presumably the working directory is
# /content; confirm.
model.save('trained_model.h5')
from google.colab import files
files.download('trained_model.h5')




<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Final accuracy after introducing noise into the dataset: 96.88%.

In [None]:
!pip install fitz


Collecting fitz
  Downloading fitz-0.0.1.dev2-py2.py3-none-any.whl.metadata (816 bytes)
Collecting configobj (from fitz)
  Downloading configobj-5.0.9-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting configparser (from fitz)
  Downloading configparser-7.2.0-py3-none-any.whl.metadata (5.5 kB)
Collecting nipype (from fitz)
  Downloading nipype-1.10.0-py3-none-any.whl.metadata (7.1 kB)
Collecting pyxnat (from fitz)
  Downloading pyxnat-1.6.3-py3-none-any.whl.metadata (5.4 kB)
Collecting prov>=1.5.2 (from nipype->fitz)
  Downloading prov-2.0.1-py3-none-any.whl.metadata (3.6 kB)
Collecting rdflib>=5.0.0 (from nipype->fitz)
  Downloading rdflib-7.1.4-py3-none-any.whl.metadata (11 kB)
Collecting traits>=6.2 (from nipype->fitz)
  Downloading traits-7.0.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.8 kB)
Collecting acres (from nipype->fitz)
  Downloading acres-0.3.0-py3-none-any.whl.metadata (5.5 kB)
Collecting etelemetry>=0.3.1

In [None]:
# prompt: give me a code to test my model  my certificates are in pdf this is my image path /content/fk_cert189.pdf  /content/r_cert811.pdf do feed model the png

import fitz
import os
import shutil
import random
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras import layers, models
from tensorflow.keras.models import load_model
import numpy as np
from PIL import Image

# ... (Your existing code for PDF to image conversion and dataset organization) ...


# Load the trained model from the HDF5 file written by model.save('trained_model.h5').
# The relative path is resolved against the current working directory.
model = load_model('trained_model.h5')

def predict_certificate(image_path):
    """
    Classify a certificate image as real or fake with the loaded model.

    Args:
        image_path (str): Path to an image file readable by PIL.

    Returns:
        tuple: ``(label, probability)`` where ``label`` is "Real Certificate"
        when the model output is at least 0.5 and "Fake Certificate" otherwise,
        and ``probability`` is the raw model output for the image.
    """
    # Force three channels: an RGBA, palette or grayscale image would otherwise
    # produce an array whose shape does not match the (224, 224, 3) model input.
    img = Image.open(image_path).convert("RGB").resize((224, 224))
    img_array = np.array(img) / 255.0
    img_array = np.expand_dims(img_array, axis=0)  # Add batch dimension

    prediction = model.predict(img_array)
    probability = prediction[0][0]

    # Same cut-off as the /predict endpoint (score >= 0.5 counts as valid).
    if probability >= 0.5:
        return "Real Certificate", probability
    else:
        return "Fake Certificate", probability

# Example usage
pdf_files = ["/content/fk_cert189.pdf", "/content/r_cert811.pdf"]

for pdf_file in pdf_files:
    try:
        # Convert the first page of the PDF to a PNG next to it. The context
        # manager closes the document even when rendering or saving fails.
        with fitz.open(pdf_file) as doc:
            page = doc.load_page(0)
            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
            img_path = f"{os.path.splitext(pdf_file)[0]}.png"
            pix.save(img_path)
        # Make prediction
        result, probability = predict_certificate(img_path)
        print(f"File: {pdf_file}, Prediction: {result}, Probability: {probability}")
    except Exception as e:
        print(f"Error processing {pdf_file}: {e}")




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step
File: /content/fk_cert189.pdf, Prediction: Fake Certificate, Probability: 0.0404382087290287
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step
File: /content/r_cert811.pdf, Prediction: Real Certificate, Probability: 0.9988749027252197


In [None]:
!pip uninstall fitz  # removes the wrong fitz if installed
!pip install --upgrade pip
!pip install pymupdf


Found existing installation: fitz 0.0.1.dev2
Uninstalling fitz-0.0.1.dev2:
  Would remove:
    /usr/local/bin/fitz
    /usr/local/bin/log2design.py
    /usr/local/lib/python3.11/dist-packages/.DS_Store
    /usr/local/lib/python3.11/dist-packages/fitz-0.0.1.dev2.dist-info/*
    /usr/local/lib/python3.11/dist-packages/fitz/*
    /usr/local/lib/python3.11/dist-packages/scripts/*
  Would not remove (might be manually added):
    /usr/local/lib/python3.11/dist-packages/scripts/readme-gen/readme_gen.py
Proceed (Y/n)? y
Y
  Successfully uninstalled fitz-0.0.1.dev2
Collecting pip
  Downloading pip-25.0.1-py3-none-any.whl.metadata (3.7 kB)
Downloading pip-25.0.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m60.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfull

In [None]:
import pandas as pd

# Training configuration summary, written as (category, configuration) pairs so
# each setting sits next to its label.
config_rows = [
    ("Base Model", "MobileNetV2 (pretrained on ImageNet)"),
    ("Input Shape", "(224, 224, 3)"),
    ("Freezing", "Base model frozen"),
    ("Additional Layers", "GlobalAveragePooling2D → Dense(64, relu) → Dropout(0.3)"),
    ("Output Layer", "Dense(1, sigmoid)"),
    ("Loss Function", "Binary Crossentropy"),
    ("Optimizer", "Adam"),
    ("Metric", "Accuracy"),
    ("Augmentation (Train)", "Rotation=10°, Zoom=0.1"),
    ("Preprocessing", "Rescale = 1./255"),
    ("Batch Size", "32"),
    ("Epochs", "10"),
    ("Class Mode", "Binary"),
    ("Training Directory", "dataset/train"),
    ("Validation Directory", "dataset/val"),
]

# Column-oriented form of the same table, one list per column.
simple_config = {
    "Category": [category for category, _ in config_rows],
    "Configuration": [configuration for _, configuration in config_rows],
}

df_simple_config = pd.DataFrame(simple_config)

# Write the table to CSV without the index column.
csv_simple_path = "/content/data.csv"
df_simple_config.to_csv(csv_simple_path, index=False)

# Last expression of the cell: shows where the file was written.
csv_simple_path


'/content/data.csv'