In [6]:
from sklearn.datasets import load_iris
data = load_iris()

In [11]:
import pandas as pd
x_data = pd.DataFrame(data.data, columns = data.feature_names)
y_data = pd.DataFrame(data.target, columns=['Targets'])

In [12]:
x_data.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [13]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_data,y_data, test_size = 0.2)

In [14]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()

In [15]:
model.fit(x_train,y_train)

  return fit_method(estimator, *args, **kwargs)


In [16]:
y_pred = model.predict(x_test)

In [18]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.9333333333333333

In [20]:
dump(model,'./../savedModels/models.joblib')

['./../savedModels/models.joblib']

In [1]:
import os
import cv2
import numpy as np
import pandas as pd
import kagglehub
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from PIL import Image, ImageChops, ImageEnhance
import piexif
import joblib
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import inch
from reportlab.lib import colors
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image as RLImage, Table, TableStyle
from reportlab.platypus.flowables import HRFlowable
from datetime import datetime
import matplotlib.pyplot as plt

# ====================== DATA DOWNLOAD AND PREPROCESSING ======================

def download_dataset():
    """Download the dataset from Kaggle."""
    path = kagglehub.dataset_download("adisharmaruda/doc-classifier")
    dataset_path = path  # Use the downloaded path
    print("✅ Dataset downloaded successfully!")
    return dataset_path

# Define categories
categories = {"aadhar-card": 0, "pan-card": 0, "fake-docs": 1}  # 0: Real, 1: Fake

# ====================== FEATURE EXTRACTION FUNCTIONS ======================

def extract_ela(image_path):
    """Extract Error Level Analysis (ELA) feature."""
    try:
        image = Image.open(image_path).convert("RGB")
        image.save("temp.jpg", "JPEG", quality=90)
        temp_image = Image.open("temp.jpg")
        ela_image = ImageChops.difference(image, temp_image)
        extrema = ela_image.getextrema()
        max_diff = max([ex[1] for ex in extrema])
        return max_diff
    except:
        return None

def extract_metadata(image_path):
    """Extract metadata from image."""
    try:
        exif_data = piexif.load(image_path)
        date_time = exif_data["0th"].get(piexif.ImageIFD.DateTime, b'').decode()
        return 1 if date_time else 0
    except:
        return 0

def extract_features(image_path):
    """Extract all features from an image."""
    img = cv2.imread(image_path)
    if img is None:
        return None

    img_resized = cv2.resize(img, (224, 224))  # Resize for consistency
    mean_r, mean_g, mean_b = np.mean(img[:, :, 0]), np.mean(img[:, :, 1]), np.mean(img[:, :, 2])
    std_r, std_g, std_b = np.std(img[:, :, 0]), np.std(img[:, :, 1]), np.std(img[:, :, 2])

    # Extract additional features
    ela_value = extract_ela(image_path)
    metadata_value = extract_metadata(image_path)

    # Flatten pixel values
    img_flatten = img_resized.flatten()[:1024]  # Use first 1024 pixels
    features = np.hstack([mean_r, std_r, mean_g, std_g, mean_b, std_b, ela_value, metadata_value, img_flatten])

    return features

# ====================== DATASET CREATION ======================

def create_dataset(dataset_path):
    """Create dataset from downloaded images."""
    data = []
    for category, label in categories.items():
        folder_path = os.path.join(dataset_path, category)
        if os.path.exists(folder_path):
            for filename in os.listdir(folder_path):
                image_path = os.path.join(folder_path, filename)
                features = extract_features(image_path)
                if features is not None:
                    data.append([image_path] + list(features) + [label])

    # Convert to DataFrame
    columns = ["image_path", "mean_r", "std_r", "mean_g", "std_g", "mean_b", "std_b", "ela_value", "metadata_value"] + [f"feat_{i}" for i in range(1024)] + ["label"]
    df = pd.DataFrame(data, columns=columns)

    # Split Data into Training & Testing
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

    # Save datasets
    train_df.to_csv("train_dataset.csv", index=False)
    test_df.to_csv("test_dataset.csv", index=False)

    print("✅ Training & testing datasets created successfully!")
    return train_df, test_df

# ====================== DATA AUGMENTATION ======================

def generate_fake_data(df, num_samples=None):
    """Generate fake samples by adding noise to real data."""
    fake_data = df.copy()

    # Apply random noise to numerical features (excluding label column)
    feature_cols = fake_data.columns[1:-1]  # Exclude 'image_path' and 'label'

    for col in feature_cols:
        fake_data[col] += np.random.normal(0, 0.05, size=fake_data.shape[0])  # Add small noise

    # Change label to 'fake' (1)
    fake_data["label"] = 1

    # Limit number of generated fake samples if needed
    if num_samples:
        fake_data = fake_data.sample(n=num_samples, random_state=42, replace=True)

    return fake_data

def augment_data(train_df, test_df):
    """Augment the dataset with fake samples."""
    # Generate fake samples (same count as real ones)
    train_fake = generate_fake_data(train_df, num_samples=len(train_df))
    test_fake = generate_fake_data(test_df, num_samples=len(test_df))

    # Merge fake data with original datasets
    train_augmented = pd.concat([train_df, train_fake], axis=0, ignore_index=True)
    test_augmented = pd.concat([test_df, test_fake], axis=0, ignore_index=True)

    # Save updated datasets
    train_augmented.to_csv("train_dataset_augmented.csv", index=False)
    test_augmented.to_csv("test_dataset_augmented.csv", index=False)

    print(f"✅ Successfully added {len(train_fake)} fake samples to the training dataset!")
    print(f"✅ Successfully added {len(test_fake)} fake samples to the test dataset!")
    return train_augmented, test_augmented

# ====================== MODEL TRAINING ======================

def train_model(train_df, test_df):
    """Train the Random Forest classifier."""
    # Separate features and labels
    X_train = train_df.drop(columns=["image_path", "label"], errors='ignore')
    y_train = train_df["label"].astype('category').cat.codes  # Convert labels to numbers
    X_test = test_df.drop(columns=["image_path", "label"], errors='ignore')
    y_test = test_df["label"].astype('category').cat.codes

    # Train the Random Forest model
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Evaluate the model
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"🎯 Model Accuracy: {accuracy * 100:.2f}%")

    # Save trained model
    joblib.dump(model,'./../savedModels/train_models.joblib')
    print("✅ Model training completed!")
    return model

# ====================== FORGERY DETECTION ======================

def check_forgery(image_path, model, feature_columns):
    """Check if a document is forged using the trained model."""
    features = extract_features(image_path)  # Extract features from the image
    if features is None:
        return "⚠️ Error: Unable to extract features from image.", None

    # Convert to DataFrame and match feature columns
    feature_df = pd.DataFrame([features], columns=feature_columns)
    print(features)
    print(feature_df)

    # Predict using the trained model
    prediction_proba = model.predict_proba(feature_df)[0]
    print(f"Prediction Probabilities: {prediction_proba}")  # Debugging output

    result = "❌ Fake Document" if prediction_proba[1] > 0.45 else "✅ Real Document"
    return result, prediction_proba

# ====================== REPORT GENERATION ======================

def apply_ela(image_path, quality=90):
    """Apply Error Level Analysis (ELA) to detect forgery."""
    original = Image.open(image_path).convert('RGB')
    temp_path = "temp_compressed.jpg"
    original.save(temp_path, 'JPEG', quality=quality)
    compressed = Image.open(temp_path)

    ela_image = ImageChops.difference(original, compressed)
    extrema = ela_image.getextrema()
    max_diff = max([ex[1] for ex in extrema])
    scale = 255.0 / max_diff if max_diff else 1
    ela_image = ImageEnhance.Brightness(ela_image).enhance(scale)

    ela_path = "ela_output.png"
    ela_image.save(ela_path)
    return ela_path

def extract_metadata_for_report(image_path):
    """Extract metadata information from the image (if available)."""
    try:
        exif_data = Image.open(image_path)._getexif()
        if exif_data:
            metadata = {
                key: exif_data[key] for key in exif_data if key in [306, 271, 272]  # DateTime, Camera Make & Model
            }
            return metadata if metadata else 0  # Return 0 instead of "No Metadata Found"
        return 0  # If no metadata is found, return numerical 0
    except:
        return 0  # Ensure numeric return value

def generate_verification_report(image_path, prediction_proba, result):
    """Generate a professional PDF report for document verification with ELA, Metadata, and Charts."""
    report_name = "document_verification_report.pdf"
    doc = SimpleDocTemplate(report_name, pagesize=letter,
                           rightMargin=72, leftMargin=72,
                           topMargin=72, bottomMargin=18)

    # Determine if the document is fake based on the result string
    is_fake = "❌" in result #change
    status = "FORGED" if is_fake else "GENUINE"

    # Styles
    styles = getSampleStyleSheet()
    title_style = styles['Title']
    heading_style = styles['Heading1']
    normal_style = styles['Normal']

    # Custom styles
    section_style = ParagraphStyle(
        'Section',
        parent=styles['Heading2'],
        spaceAfter=12,
        textColor=colors.darkblue
    )

    # Build the document
    elements = []

    # Title
    elements.append(Paragraph("Document Verification Report", title_style))
    elements.append(Spacer(1, 0.25*inch))

    # Document Details
    elements.append(Paragraph("Document Details", section_style))
    elements.append(HRFlowable(width="100%", thickness=1, color=colors.darkblue, spaceAfter=10))

    # Create a table for document details
    data = [
        ["Document Name:", os.path.basename(image_path)],
        ["Verification Date:", datetime.now().strftime('%Y-%m-%d %H:%M:%S')],
        ["Forgery Status:", status],
        ["Detection Result:", result]  # Added the actual result with emoji
    ]

    # Add the original image
    elements.append(Spacer(1, 0.15*inch))
    elements.append(Paragraph("Original Document", section_style))

    # Include the original image with a maximum width of 5 inches
    img = Image.open(image_path)
    img_width, img_height = img.size
    aspect_ratio = img_height / img_width
    img_width = 5 * inch
    img_height = img_width * aspect_ratio

    img_path = "original_resized.jpg"
    img.save(img_path)
    elements.append(RLImage(img_path, width=img_width, height=img_height))
    elements.append(Spacer(1, 0.25*inch))

    # Format the table
    detail_table = Table(data, colWidths=[2*inch, 3.5*inch])
    detail_table.setStyle(TableStyle([
        ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
        ('BACKGROUND', (0, 0), (0, -1), colors.lightgrey),
        ('ALIGN', (0, 0), (0, -1), 'RIGHT'),
        ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
        ('FONTNAME', (0, 0), (0, -1), 'Helvetica-Bold'),
        ('FONTSIZE', (0, 0), (-1, -1), 10),
        ('BOTTOMPADDING', (0, 0), (-1, -1), 8),
        ('TOPPADDING', (0, 0), (-1, -1), 8),
    ]))
    elements.append(detail_table)
    elements.append(Spacer(1, 0.25*inch))

    # Prediction Results
    elements.append(Paragraph("Prediction Results", section_style))
    elements.append(HRFlowable(width="100%", thickness=1, color=colors.darkblue, spaceAfter=10))

    # Generate Pie Chart
    labels = ["Real", "Fake"]
    plt.figure(figsize=(5, 5))
    plt.pie(prediction_proba, labels=labels, autopct='%1.1f%%',
            colors=['#FF6B6B', '#4ECDC4'], startangle=140,
            wedgeprops={'edgecolor': 'white', 'linewidth': 2})
    plt.title("Forgery Prediction Confidence")
    pie_chart_path = "prediction_pie_chart.png"
    plt.savefig(pie_chart_path, bbox_inches='tight', dpi=150)
    plt.close()

    # Add explanation about the threshold
    elements.append(Paragraph(f"Note: According to your model, a document is considered fake if the Real probability is greater than 40%.", normal_style))
    elements.append(Spacer(1, 0.15*inch))

    # Include the pie chart
    elements.append(RLImage(pie_chart_path, width=4*inch, height=4*inch))
    elements.append(Spacer(1, 0.15*inch))

    # Create a table for prediction probabilities
    prob_data = [
        ["Prediction", "Confidence"],
        ["Real", f"{prediction_proba[0]:.2%}"],
        ["Fake", f"{prediction_proba[1]:.2%}"]
    ]

    prob_table = Table(prob_data, colWidths=[2*inch, 2*inch])
    prob_table.setStyle(TableStyle([
        ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
        ('BACKGROUND', (0, 0), (-1, 0), colors.lightgrey),
        ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
        ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
        ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
        ('FONTSIZE', (0, 0), (-1, -1), 10),
        ('BOTTOMPADDING', (0, 0), (-1, -1), 8),
        ('TOPPADDING', (0, 0), (-1, -1), 8),
    ]))
    elements.append(prob_table)
    elements.append(Spacer(1, 0.25*inch))

    # Error Level Analysis
    elements.append(Paragraph("Error Level Analysis (ELA)", section_style))
    elements.append(HRFlowable(width="100%", thickness=1, color=colors.darkblue, spaceAfter=10))
    elements.append(Paragraph("ELA highlights differences in compression levels. Areas with higher error levels may indicate manipulation.", normal_style))
    elements.append(Spacer(1, 0.15*inch))

    # Apply ELA and add to report
    ela_path = apply_ela(image_path)
    elements.append(RLImage(ela_path, width=5*inch, height=3*inch))
    elements.append(Spacer(1, 0.25*inch))

    # Extract Metadata
    elements.append(Paragraph("Metadata Information", section_style))
    elements.append(HRFlowable(width="100%", thickness=1, color=colors.darkblue, spaceAfter=10))

    metadata = extract_metadata_for_report(image_path)
    if isinstance(metadata, dict) and metadata:
        metadata_rows = []
        for key, value in metadata.items():
            metadata_rows.append([str(key), str(value)])

        metadata_table = Table(metadata_rows, colWidths=[2*inch, 3.5*inch])
        metadata_table.setStyle(TableStyle([
            ('GRID', (0, 0), (-1, -1), 0.5, colors.grey),
            ('BACKGROUND', (0, 0), (0, -1), colors.lightgrey),
            ('ALIGN', (0, 0), (0, -1), 'RIGHT'),
            ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
            ('FONTNAME', (0, 0), (0, -1), 'Helvetica-Bold'),
            ('FONTSIZE', (0, 0), (-1, -1), 10),
            ('BOTTOMPADDING', (0, 0), (-1, -1), 8),
            ('TOPPADDING', (0, 0), (-1, -1), 8),
        ]))
        elements.append(metadata_table)
    else:
        elements.append(Paragraph("No metadata found in the image.", normal_style))

    # Conclusion
    elements.append(Spacer(1, 0.25*inch))
    elements.append(Paragraph("Conclusion", section_style))
    elements.append(HRFlowable(width="100%", thickness=1, color=colors.darkblue, spaceAfter=10))

    conclusion_text = f"Based on our analysis, this document appears to be {status}."
    elements.append(Paragraph(conclusion_text, normal_style))

    if is_fake:
        elements.append(Paragraph("The document shows signs of digital manipulation. Please review the ELA analysis and prediction probabilities for more details.", normal_style))
    else:
        elements.append(Paragraph("No significant signs of digital manipulation were detected. However, this analysis is not conclusive and should be combined with other verification methods.", normal_style))

    # Add disclaimer
    elements.append(Spacer(1, 0.25*inch))
    elements.append(Paragraph("Disclaimer", section_style))
    elements.append(HRFlowable(width="100%", thickness=1, color=colors.darkblue, spaceAfter=10))
    disclaimer_text = "This report is generated by an automated system and should be used for informational purposes only. The results are based on digital analysis and may not be 100% accurate. For legal or critical verification, please consult with a forensic document examiner."
    elements.append(Paragraph(disclaimer_text, normal_style))

    # Build the document
    doc.build(elements)
    print(f"✅ Report saved as {report_name}")
    return report_name

# ====================== MAIN EXECUTION ======================

def main():
    # Step 1: Download dataset
    dataset_path = download_dataset()

    # Step 2: Create and split dataset
    train_df, test_df = create_dataset(dataset_path)

    # Step 3: Augment data with fake samples
    train_augmented, test_augmented = augment_data(train_df, test_df)

    # Step 4: Train model
    model = train_model(train_augmented, test_augmented)

    # Step 5: Test with a sample image
    test_image_path = "./adh.jpg"  

    # Load feature columns (needed for prediction)
    X_train = train_augmented.drop(columns=["image_path", "label"], errors='ignore')

    # Check forgery
    result, prediction_proba = check_forgery(test_image_path, model, X_train.columns)
    print("thissssss",result)
    
    # Generate report
    generate_verification_report(test_image_path, prediction_proba, result)

if __name__ == "__main__":
    main()

  from .autonotebook import tqdm as notebook_tqdm


✅ Dataset downloaded successfully!
✅ Training & testing datasets created successfully!
✅ Successfully added 50 fake samples to the training dataset!
✅ Successfully added 13 fake samples to the test dataset!
🎯 Model Accuracy: 57.69%
✅ Model training completed!
[171.88124546  42.03423182 171.49151429 ...  78.          79.
  84.        ]
       mean_r      std_r      mean_g      std_g      mean_b      std_b  \
0  171.881245  42.034232  171.491514  42.280405  172.210583  41.443551   

   ela_value  metadata_value  feat_0  feat_1  ...  feat_1014  feat_1015  \
0       12.0             0.0    79.0    73.0  ...       85.0       77.0   

   feat_1016  feat_1017  feat_1018  feat_1019  feat_1020  feat_1021  \
0       78.0       81.0       73.0       74.0       86.0       78.0   

   feat_1022  feat_1023  
0       79.0       84.0  

[1 rows x 1032 columns]
Prediction Probabilities: [0.64 0.36]
thissssss ✅ Real Document
✅ Report saved as document_verification_report.pdf


In [1]:
!pip install pandas spacy scikit-learn imbalanced-learn xgboost joblib easyocr


^C
Defaulting to user installation because normal site-packages is not writeable
Collecting spacy
  Using cached spacy-3.8.2.tar.gz (1.3 MB)
  Installing build dependencies: started
  Installing build dependencies: still running...
  Installing build dependencies: still running...
  Installing build dependencies: still running...
  Installing build dependencies: still running...
  Installing build dependencies: still running...
  Installing build dependencies: finished with status 'error'


  error: subprocess-exited-with-error
  
  pip subprocess to install build dependencies did not run successfully.
  exit code: 1
  
  [1100 lines of output]
  Ignoring numpy: markers 'python_version < "3.9"' don't match your environment
  Collecting setuptools
    Using cached setuptools-79.0.0-py3-none-any.whl.metadata (6.5 kB)
  Collecting cython<3.0,>=0.25
    Using cached Cython-0.29.37-py2.py3-none-any.whl.metadata (3.1 kB)
  Collecting cymem<2.1.0,>=2.0.2
    Using cached cymem-2.0.11-cp313-cp313-win_amd64.whl.metadata (8.8 kB)
  Collecting preshed<3.1.0,>=3.0.2
    Using cached preshed-3.0.9.tar.gz (14 kB)
    Installing build dependencies: started
    Installing build dependencies: finished with status 'done'
    Getting requirements to build wheel: started
    Getting requirements to build wheel: finished with status 'done'
    Preparing metadata (pyproject.toml): started
    Preparing metadata (pyproject.toml): finished with status 'done'
  Collecting murmurhash<1.1.0,>=0.28.