# In [None]:
import os
import re
import tempfile
from flask import Flask, render_template, request, redirect, url_for, send_from_directory
from werkzeug.utils import secure_filename
import cv2
import easyocr
from pdf2image import convert_from_path
from PIL import Image

# Flask app setup
app = Flask(__name__)

# Configurations
UPLOAD_FOLDER = "uploads"
ALLOWED_EXTENSIONS = {'pdf', 'jpg', 'jpeg', 'png', 'bmp', 'gif', 'tiff'}
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER

# Ensure the upload directory exists
os.makedirs(UPLOAD_FOLDER, exist_ok=True)

# Poppler path handed to pdf2image.
# Resolution order:
#   1. the POPPLER_PATH environment variable, when set and non-empty;
#   2. on Windows, the historical default install directory below;
#   3. otherwise None, which lets pdf2image fall back to the Poppler
#      binaries available on PATH instead of a Windows-only directory.
_DEFAULT_WINDOWS_POPPLER_PATH = r"C:\Program Files\poppler-24.08.0\Library\bin"
poppler_path = os.environ.get("POPPLER_PATH") or (
    _DEFAULT_WINDOWS_POPPLER_PATH if os.name == "nt" else None
)

# Initialize OCR reader (created once at import time and shared by all requests)
reader = easyocr.Reader(['en'])

# List of valid country codes.
# Only MRZ lines whose characters 2-5 appear in this list are accepted by
# split_mrz_line; anything else yields an empty result.
country_codes = [
    "AFG", "ALA", "ALB", "DZA", "AND", "AGO", "ARG", "AUS", "AUT", "BGD", "BEL", "BOL",
    "BRA", "CAN", "CHN", "COL", "EGY", "FRA", "DEU", "GHA", "IND", "IDN", "ITA", "JPN",
    "KEN", "MEX", "NGA", "NOR", "PAK", "PHL", "POL", "PRT", "RUS", "SAU", "SGP", "ZAF",
    "ESP", "LKA", "SWE", "CHE", "THA", "TUR", "UKR", "ARE", "GBR", "USA", "VNM", "ZWE"
]

# Keywords for name extraction (tried in this order by extract_name)
keywords = ['surname', 'first name', 'given names', 'full name']

# Unwanted words to be removed
unwanted_words = ['first', 'names', 'surname', 'name', 'full']


def allowed_file(filename):
    """Tell whether *filename* carries an extension listed in ALLOWED_EXTENSIONS.

    The extension is whatever follows the last dot, compared
    case-insensitively. A name without any dot is rejected.
    """
    if '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[1]
    return extension.lower() in ALLOWED_EXTENSIONS


def preprocess_image(image_path):
    """Load an image and binarise it for OCR.

    The image is read with OpenCV, converted to grayscale and thresholded at
    150 (pixels above become 255, the rest 0).

    Args:
        image_path: Path of the image file to load.

    Returns:
        The thresholded single-channel image produced by ``cv2.threshold``.

    Raises:
        ValueError: If OpenCV cannot decode the file (missing file,
            unsupported or corrupt format).
    """
    image = cv2.imread(image_path)
    # cv2.imread signals failure by returning None instead of raising, which
    # would otherwise surface as an opaque cv2.error inside cvtColor.
    if image is None:
        raise ValueError(f"Could not read image file: {image_path!r}")
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY)
    return thresh


def extract_text(image_path):
    """Run OCR on the image at *image_path* and return the text as one string.

    The image is first binarised by ``preprocess_image``; the fragments
    returned by the shared EasyOCR ``reader`` are joined with newlines.
    """
    binarised = preprocess_image(image_path)
    fragments = reader.readtext(binarised, detail=0)
    separator = "\n"
    return separator.join(fragments)


def parse_mrz_lines(text):
    """Return the lines of *text* that look like a passport MRZ name line.

    A line qualifies when it begins with 'P' and contains '<<' somewhere.
    The original line order is preserved.
    """
    candidates = []
    for row in text.splitlines():
        if not row.startswith('P'):
            continue
        if '<<' in row:
            candidates.append(row)
    return candidates


def split_mrz_line(mrz_line):
    """Return the name field of an MRZ line, or "" for an unknown country.

    '*' and '-' characters are stripped first. Characters 2-5 of the cleaned
    line are then looked up in ``country_codes``; when found, everything
    from position 5 onwards is returned, otherwise an empty string.
    """
    cleaned = mrz_line.translate(str.maketrans("", "", "*-"))
    issuing_code = cleaned[2:5]
    if issuing_code not in country_codes:
        return ""
    return cleaned[5:]


def _clean_mrz_name_part(part):
    """Drop digits, turn '<' fillers into spaces and collapse whitespace."""
    without_digits = re.sub(r'\d', '', part)
    return ' '.join(without_digits.replace('<', ' ').split())


def split_surname_given_name(sentence):
    """Split an MRZ name field into surname and given names.

    In an MRZ name field the surname and the given names are separated by
    '<<', while a single '<' only separates components of the same name
    (e.g. ``VAN<DER<BERG<<ANNA<MARIA``). The field is therefore split on the
    first '<<'. If that leaves no given name (for example because the OCR
    dropped one '<' and the only '<<' is trailing filler), the split falls
    back to the first single '<'.

    Args:
        sentence: The MRZ text following the country code.

    Returns:
        A ``(surname, given_name)`` tuple with digits removed, '<' replaced
        by single spaces and surrounding whitespace stripped. Either element
        may be an empty string.
    """
    surname_part, _, given_part = sentence.partition('<<')
    if not _clean_mrz_name_part(given_part):
        # No usable given name after '<<': treat the first single '<' as the
        # separator instead.
        surname_part, _, given_part = sentence.partition('<')
    return _clean_mrz_name_part(surname_part), _clean_mrz_name_part(given_part)


def extract_name(text):
    """Find a name that follows one of the label keywords in OCR text.

    The text is lower-cased and stripped of everything except the letters
    a-z and whitespace. Each entry of ``keywords`` is then tried in order;
    for the first keyword that is followed by at least one word, the first
    two words after it are returned.

    Args:
        text: Raw OCR text.

    Returns:
        Up to two lower-case words joined by a single space, or the string
        "Name not found" when no keyword is followed by any word.
    """
    clean_text = re.sub(r'[^a-z\s]', '', text.lower())
    for keyword in keywords:
        pattern = r'{}[\s:]*([a-z\s]+)'.format(re.escape(keyword.lower()))
        match = re.search(pattern, clean_text)
        if not match:
            continue
        # split() also normalises newlines and repeated spaces left by the OCR,
        # so one- and two-word names are formatted like longer ones.
        name_parts = match.group(1).split()
        if not name_parts:
            # Keyword followed only by whitespace: try the next keyword rather
            # than reporting an empty name.
            continue
        return ' '.join(name_parts[:2])
    return "Name not found"


def process_pdf(pdf_path):
    """Render every page of a PDF to a JPEG file and return the file paths.

    Pages are rendered at 300 dpi and written as ``page_<n>.jpg`` into a
    freshly created, uniquely named sub-directory of ``UPLOAD_FOLDER``, so
    that pages of different PDFs (or concurrent requests) never overwrite
    each other. The files are not removed by this function.

    Args:
        pdf_path: Path of the PDF file to convert.

    Returns:
        List of image paths, one per page, in page order. Empty when the
        PDF produced no pages.
    """
    images = convert_from_path(pdf_path, dpi=300, poppler_path=poppler_path)
    if not images:
        return []

    pdf_stem = os.path.splitext(os.path.basename(pdf_path))[0]
    # mkdtemp guarantees a directory name that did not exist before.
    output_dir = tempfile.mkdtemp(prefix=f"{pdf_stem}_", dir=UPLOAD_FOLDER)

    image_paths = []
    for page_number, img in enumerate(images, start=1):
        img_path = os.path.join(output_dir, f"page_{page_number}.jpg")
        img.save(img_path, 'JPEG')
        image_paths.append(img_path)
    return image_paths


def _extract_entries(image_path):
    """Run OCR on one image and return the list of name entries found in it."""
    text = extract_text(image_path)
    mrz_lines = parse_mrz_lines(text)
    if not mrz_lines:
        # No MRZ detected: fall back to keyword-based name extraction.
        return [{"Extracted Name": extract_name(text)}]

    entries = []
    for line in mrz_lines:
        sentence = split_mrz_line(line)
        surname, given_name = split_surname_given_name(sentence)
        entries.append({"Surname": surname, "Given Name": given_name})
    return entries


@app.route("/", methods=["GET", "POST"])
def index():
    """Serve the upload form and process an uploaded document.

    GET renders ``index.html``. POST expects a ``file`` form field with an
    allowed extension; a missing, unnamed or disallowed file redirects back
    to the same URL. Otherwise the file is saved into the upload folder,
    OCR is run on it (on every page for a PDF) and ``index.html`` is
    rendered with ``extracted_data``, a list of dicts holding either
    "Surname"/"Given Name" or "Extracted Name".
    """
    if request.method != "POST":
        return render_template("index.html")

    file = request.files.get("file")
    if file is None or not file.filename or not allowed_file(file.filename):
        return redirect(request.url)

    # Take the extension from the name that was validated above:
    # secure_filename() may drop the dot or return an empty string
    # (e.g. for non-ASCII names), which would break a later rsplit.
    file_extension = file.filename.rsplit('.', 1)[1].lower()
    upload_dir = app.config["UPLOAD_FOLDER"]

    filename = secure_filename(file.filename)
    if allowed_file(filename):
        file_path = os.path.join(upload_dir, filename)
    else:
        # Sanitising destroyed the name; store under a generated unique one.
        handle, file_path = tempfile.mkstemp(
            suffix=f".{file_extension}", dir=upload_dir
        )
        os.close(handle)
    file.save(file_path)

    if file_extension == "pdf":
        image_paths = process_pdf(file_path)
    else:
        image_paths = [file_path]

    extracted_data = []
    for image_path in image_paths:
        extracted_data.extend(_extract_entries(image_path))

    return render_template("index.html", extracted_data=extracted_data)


if __name__ == "__main__":
    # The interactive debugger lets anyone who can reach the server execute
    # arbitrary Python, so it is only enabled on explicit opt-in
    # (FLASK_DEBUG=1 / true / yes / on) instead of unconditionally.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in {
        "1", "true", "yes", "on"
    }
    app.run(debug=debug_enabled)