### Extraction and verification of information from semi-categorized data. 

In [4]:
import os
import pikepdf
import PyPDF2
import tkinter as tk
from tkinter import filedialog, messagebox, scrolledtext
from pdf2image import convert_from_path
import re
import cv2
import numpy as np

# ✅ Try importing TensorFlow (optional)
USE_CNN = True
try:
    import tensorflow as tf
    from tensorflow import keras
    from tensorflow.keras import layers
except ImportError:
    USE_CNN = False
    print("⚠ TensorFlow not installed! CNN training will be skipped.")

# ✅ Function to unlock a password-protected PDF
def unlock_pdf(input_pdf_path, password):
    output_pdf_path = input_pdf_path.replace('.pdf', '_unlocked.pdf')
    try:
        with pikepdf.Pdf.open(input_pdf_path, password=password) as pdf:
            pdf.save(output_pdf_path)
        return output_pdf_path
    except Exception as e:
        messagebox.showerror("Error", f"Error unlocking PDF: {e}")
        return None

# ✅ Function to extract text from Aadhaar PDF
def extract_text_from_pdf(pdf_path):
    text = ""
    try:
        with open(pdf_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                text += page.extract_text() + "\n"
    except Exception as e:
        messagebox.showerror("Error", f"Error extracting text: {e}")
    return text

# ✅ Function to remove Gujarati text (only keep English characters)
def filter_text_english_only(text):
    return re.sub(r'[^\x00-\x7F]+', '', text)  # Removes non-ASCII characters

# ✅ Function to convert PDF to images
def pdf_to_images(pdf_path, output_folder="aadhaar_images"):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    images = convert_from_path(pdf_path)
    image_paths = []
    
    for i, img in enumerate(images):
        img_path = os.path.join(output_folder, f"aadhaar_page_{i+1}.png")
        img.save(img_path, "PNG")
        image_paths.append(img_path)

    return image_paths

# ✅ Function to preprocess images for CNN
def preprocess_images(image_paths):
    processed_images = []
    
    for img_path in image_paths:
        img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
        img = cv2.resize(img, (128, 128))  # Resize for CNN
        img = img / 255.0  # Normalize pixel values
        processed_images.append(img)

    return np.array(processed_images).reshape(-1, 128, 128, 1)  # Reshape for CNN

# ✅ Function to delete temporary unlocked Aadhaar PDF
def delete_temp_pdf(pdf_path):
    try:
        if os.path.exists(pdf_path):
            os.remove(pdf_path)
            print(f"Deleted temporary file: {pdf_path}")
    except Exception as e:
        print(f"Error deleting temporary file: {e}")

# ✅ Function to build a CNN model (Only if TensorFlow is installed)
def build_cnn_model():
    if not USE_CNN:
        return None

    model = keras.Sequential([
        layers.Conv2D(32, (3, 3), activation='relu', input_shape=(128, 128, 1)),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(128, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.Dense(2, activation='softmax')  # 2 classes (Valid/Invalid Aadhaar)
    ])
    
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model
# ✅ GUI Setup
root = tk.Tk()
root.title("Aadhaar PDF Processor")
root.geometry("600x500")

tk.Label(root, text="Enter PDF Password (if required):").pack(pady=5)
password_entry = tk.Entry(root, show='*', width=30)
password_entry.pack(pady=5)

text_area = scrolledtext.ScrolledText(root, wrap=tk.WORD, width=70, height=10)
text_area.pack(pady=10)

def upload_pdf():
    file_path = filedialog.askopenfilename(filetypes=[("PDF files", "*.pdf")])
    if not file_path:
        messagebox.showwarning("Warning", "No file selected!")
        return

    password = password_entry.get()
    unlocked_pdf_path = unlock_pdf(file_path, password) if password else file_path

    extracted_text = extract_text_from_pdf(unlocked_pdf_path)
    extracted_text = filter_text_english_only(extracted_text)

    if extracted_text:
        text_area.delete(1.0, tk.END)
        text_area.insert(tk.END, "Extracted Text:\n")
        text_area.insert(tk.END, extracted_text + "\n")
        messagebox.showinfo("Success", "Text extracted successfully!")

        # ✅ Convert PDF to images
        image_paths = pdf_to_images(unlocked_pdf_path)
        
        # ✅ Preprocess images for CNN
        cnn_input_data = preprocess_images(image_paths)

        # ✅ If TensorFlow is installed, train CNN
        if USE_CNN:
            model = build_cnn_model()
            if model:
                model.fit(cnn_input_data, np.zeros(len(cnn_input_data)), epochs=3)  # Dummy labels for now
                messagebox.showinfo("Success", "CNN Model Trained Successfully!")
        else:
            print("⚠ Skipping CNN Training (TensorFlow not installed).")

        # ✅ Delete temporary files
        delete_temp_pdf(unlocked_pdf_path)

    else:
        messagebox.showerror("Error", "No text could be extracted from the PDF.")

upload_button = tk.Button(root, text="Upload Aadhaar PDF", command=upload_pdf)
upload_button.pack(pady=10)

root.mainloop()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/3
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 1.0000 - loss: 0.5381
Epoch 2/3
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 104ms/step - accuracy: 1.0000 - loss: 1.7355e-04
Epoch 3/3
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 91ms/step - accuracy: 1.0000 - loss: 0.0000e+00
Deleted temporary file: C:/Users/DC/Downloads/EAadhaar_unlocked.pdf


In [9]:
import os
import cv2
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import svm
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from tkinter import filedialog, Tk

# Path to your local image dataset
image_folder = r"C:/Users/DC/Downloads/archive (7)"

# Load images
image_data = []
labels = []  # We'll simulate labels: 0 or 1

print("Loading images...")
for idx, filename in enumerate(os.listdir(image_folder)):
    if filename.endswith((".jpg", ".jpeg", ".png")):
        img_path = os.path.join(image_folder, filename)
        img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
        if img is not None:
            img = cv2.resize(img, (128, 128))  # Resize to common size
            img = img.flatten()  # Flatten for SVM input
            image_data.append(img)

            # Simulated labels: alternate 0 and 1 for now
            labels.append(idx % 2)

image_data = np.array(image_data)
labels = np.array(labels)

# Check if images loaded
print(f"Loaded {len(image_data)} images.")
if len(image_data) == 0:
    raise ValueError("No images found in the folder. Check the path!")

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    image_data, labels, test_size=0.3, random_state=42
)

# Train + Hyperparameter tuning
print("Training SVM with hyperparameter tuning...")
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf']
}
svc = svm.SVC()
grid = GridSearchCV(svc, param_grid, cv=3)
grid.fit(X_train, y_train)

print(f"Best Parameters: {grid.best_params_}")
print("\nClassification Report:\n", classification_report(y_test, grid.predict(X_test)))


# =====================
# Predict New Image (GUI File Picker)
# =====================
def predict_new_image():
    root = Tk()
    root.withdraw()  # Hide the GUI
    file_path = filedialog.askopenfilename(
        filetypes=[("Image files", "*.jpg;*.jpeg;*.png")]
    )
    if file_path:
        test_img = cv2.imread(file_path, cv2.IMREAD_GRAYSCALE)
        test_img = cv2.resize(test_img, (128, 128)).flatten().reshape(1, -1)
        prediction = grid.predict(test_img)[0]
        print(f"✅ Prediction for uploaded image: {prediction} (label simulated)")
    else:
        print("No image selected.")


# Run prediction
predict_new_image()

valid = 0
invalid = 0

for filename in os.listdir(dataset_dir):
    if "valid" in filename.lower():
        valid += 1
    else:
        invalid += 1

print(f"Valid images: {valid}")
print(f"Invalid images: {invalid}")

import os
import cv2
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
import joblib

# Path to dataset
dataset_dir = os.path.expanduser("~/Downloads/archive (7)")

X = []
y = []

# Assuming naming convention like: valid_1.jpg / invalid_2.jpg
for filename in os.listdir(dataset_dir):
    file_path = os.path.join(dataset_dir, filename)
    img = cv2.imread(file_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        continue
    img = cv2.resize(img, (128, 128))
    img = img / 255.0
    X.append(img.flatten())
    
    if "valid" in filename.lower():
        y.append(1)
    else:
        y.append(0)

X = np.array(X)
y = np.array(y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train & hyper-tune SVM
params = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
grid = GridSearchCV(SVC(), params, cv=3)
grid.fit(X_train, y_train)

# Save model
joblib.dump(grid.best_estimator_, "svm_model.joblib")
print("✅ SVM model trained and saved as 'svm_model.joblib'")


Loading images...
Loaded 21 images.
Training SVM with hyperparameter tuning...
Best Parameters: {'C': 0.1, 'kernel': 'linear'}

Classification Report:
               precision    recall  f1-score   support

           0       0.33      0.50      0.40         2
           1       0.75      0.60      0.67         5

    accuracy                           0.57         7
   macro avg       0.54      0.55      0.53         7
weighted avg       0.63      0.57      0.59         7

✅ Prediction for uploaded image: 0 (label simulated)
Valid images: 0
Invalid images: 21


ValueError: 
All the 18 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
18 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\DC\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\DC\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\DC\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\svm\_base.py", line 207, in fit
    y = self._validate_targets(y)
        ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\DC\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\svm\_base.py", line 751, in _validate_targets
    raise ValueError(
ValueError: The number of classes has to be greater than one; got 1 class


In [6]:
import os
import cv2
import numpy as np
from sklearn import svm
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report
from joblib import dump
import warnings
warnings.filterwarnings('ignore')

# ✅ Path to image folder
dataset_path = os.path.expanduser("~/Downloads/archive (7)")

def load_images_with_simulated_invalid(folder):
    valid_images, invalid_images, labels = [], [], []
    for filename in os.listdir(folder):
        if filename.lower().endswith(('.png', '.jpg', '.jpeg')):
            path = os.path.join(folder, filename)
            img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
            if img is None:
                continue
            img = cv2.resize(img, (128, 128))
            img_flat = img.flatten() / 255.0
            valid_images.append(img_flat)
            labels.append(1)  # valid

            # Simulate invalid image by flipping
            invalid_img = cv2.flip(img, 1)
            invalid_flat = invalid_img.flatten() / 255.0
            invalid_images.append(invalid_flat)
            labels.append(0)  # simulated invalid

    all_images = np.array(valid_images + invalid_images)
    all_labels = np.array(labels)
    return all_images, all_labels

# ✅ Load with simulated invalids
X, y = load_images_with_simulated_invalid(dataset_path)

# ✅ Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# ✅ Train with GridSearchCV
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf']
}
svc = svm.SVC()
grid = GridSearchCV(svc, param_grid, cv=3)
grid.fit(X_train, y_train)

# ✅ Save model
dump(grid, "svm_model.joblib")

# ✅ Predict and report
y_pred = grid.predict(X_test)
report = classification_report(y_test, y_pred)

# ✅ Output training summary
print(f"🧠 Model trained on {len(X_train)} samples")
print(f"🧪 Model tested on {len(X_test)} samples")
print(f"🏆 Best Parameters: {grid.best_params_}")
print("\n📊 Classification Report:\n", report)


🧠 Model trained on 29 samples
🧪 Model tested on 13 samples
🏆 Best Parameters: {'C': 1, 'kernel': 'rbf'}

📊 Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.57      0.53         7
           1       0.40      0.33      0.36         6

    accuracy                           0.46        13
   macro avg       0.45      0.45      0.45        13
weighted avg       0.45      0.46      0.46        13



In [None]:
import tkinter as tk
from tkinter import filedialog, messagebox
from PIL import Image, ImageTk
import cv2
import numpy as np
from joblib import load

# Load trained model
model = load("svm_model.joblib")

# Image preprocessing function
def preprocess_image(path):
    img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    img = cv2.resize(img, (128, 128))
    img_flat = img.flatten() / 255.0
    return img_flat.reshape(1, -1)

# Prediction function
def predict_image():
    filepath = filedialog.askopenfilename(
        filetypes=[("Image Files", "*.jpg *.jpeg *.png")]
    )
    if not filepath:
        return

    img_array = preprocess_image(filepath)
    prediction = model.predict(img_array)[0]

    # Show image and prediction
    img = Image.open(filepath)
    img = img.resize((250, 250))
    img_tk = ImageTk.PhotoImage(img)
    image_label.configure(image=img_tk)
    image_label.image = img_tk

    # Display result
    if prediction == 1:
        result_label.config(text="✅ This is a VALID Aadhaar card", fg="green")
    else:
        result_label.config(text="❌ This is an INVALID Aadhaar card", fg="red")

# Tkinter GUI setup
root = tk.Tk()
root.title("Aadhaar Card Validator")
root.geometry("400x400")
root.configure(bg="white")

title_label = tk.Label(root, text="Aadhaar Card Validator", font=("Arial", 16, "bold"), bg="white")
title_label.pack(pady=10)

image_label = tk.Label(root, bg="white")
image_label.pack()

upload_btn = tk.Button(root, text="Upload Aadhaar Image", command=predict_image, font=("Arial", 12))
upload_btn.pack(pady=10)

result_label = tk.Label(root, text="", font=("Arial", 14), bg="white")
result_label.pack(pady=10)

root.mainloop()
