In [2]:
import cv2
import os
import numpy as np
import pytesseract
import re
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

base_dir = r"G:\fraud_document_ai\data\processed\thresholded"

X = []
y = []

def extract_features(img):
    edges = cv2.Canny(img, 100, 200)
    edge_density = np.sum(edges > 0) / edges.size

    contours, _ = cv2.findContours(
        img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )
    contour_count = len(contours)

    text = pytesseract.image_to_string(img)
    has_total = 1 if "Total" in text else 0
    amount_count = len(re.findall(r"\d+[.,]\d+", text))

    return [edge_density, contour_count, has_total, amount_count]

for label, class_id in [("genuine", 0), ("fraud", 1)]:
    folder = os.path.join(base_dir, label)
    for file in os.listdir(folder):
        img = cv2.imread(os.path.join(folder, file), cv2.IMREAD_GRAYSCALE)
        feats = extract_features(img)
        X.append(feats)
        y.append(class_id)

X = np.array(X)
y = np.array(y)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

model = LogisticRegression(class_weight="balanced")
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.62      0.69        16
           1       0.25      0.40      0.31         5

    accuracy                           0.57        21
   macro avg       0.51      0.51      0.50        21
weighted avg       0.65      0.57      0.60        21



**The system fuses textual inconsistencies from OCR with visual anomaly features to classify fraudulent documents.**

CLASSIFICATION REPORT TERMS

0 → Genuine documents  
1 → Fraud documents


PRECISION
What it means:

• Out of everything the model predicted as this class, how many were correct

Example:

Precision = 0.77 for class 0

• When the model says "Genuine", it is correct 77% of the time

Why it matters:

• Low precision → many false alarms

RECALL

What it means:

• Out of all actual documents of this class, how many were correctly found

Example:
Recall = 0.40 for class 1

• Out of all real fraud documents, 40% were detected

Why it matters:

• Low recall → fraud is being missed (bad)

F1-SCORE

What it means:

• Single score combining precision and recall
• Balance between false alarms and missed cases

Formula:

F1 = 2 × (Precision × Recall) / (Precision + Recall)

Why it matters:

• Useful when data is imbalanced (like fraud detection)

SUPPORT

What it means:
• Number of true samples of that class in the test set

Example:
Support = 5 for class 1

• There were 5 fraud documents in the test data

Why it matters:
    
• Small support → metrics can fluctuate


HOW TO READ YOUR OUTPUT

Class 0 (Genuine):

• Precision 0.77 → genuine predictions are mostly correct
• Recall 0.62 → some genuine docs flagged as fraud (acceptable)

Class 1 (Fraud):

• Precision 0.25 → some false alarms exist
• Recall 0.40 → 2 out of 5 frauds detected (success)

Final meaning:

• Model detects fraud
• Accepts some mistakes on genuine docs
• Correct behavior for fraud detection systems
