# In [None]:
import fitz
import pytesseract
from PIL import Image, ImageEnhance
import cv2
import numpy as np
import re

def preprocess_pdf(pdf_path):
    """Render every page of a PDF to a smoothed PIL image.

    Each page is rasterised with ``page.get_pixmap()`` (called with its
    default arguments), wrapped in a PIL image and passed through
    ``smooth_image``.

    Args:
        pdf_path: Path of the PDF file handed to ``fitz.open``.

    Returns:
        A list with one smoothed PIL image per page, in page order.
    """
    images = []
    # Use the document as a context manager so the file handle is released
    # even when rendering or smoothing a page raises.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            pix = page.get_pixmap()
            # "RGB" assumes the pixmap samples are 3-channel without alpha,
            # as the original code did -- TODO confirm for unusual PDFs.
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

            # Contrast enhancement (enhance_contrast) is intentionally not
            # applied here; only Gaussian smoothing is.
            images.append(smooth_image(img))
    return images


def enhance_contrast(image, enhancement_factor=1.5):
    """Apply PIL's contrast enhancer to *image*.

    Args:
        image: Image passed to ``ImageEnhance.Contrast``.
        enhancement_factor: Factor forwarded to the enhancer's ``enhance``
            call. Defaults to 1.5.

    Returns:
        The image returned by ``enhance``.
    """
    return ImageEnhance.Contrast(image).enhance(enhancement_factor)


def smooth_image(image, sigma=0.5):
    """Blur *image* with OpenCV's Gaussian filter and return a PIL image.

    Args:
        image: Image to blur; it is converted with ``np.array`` first.
        sigma: Value passed to ``cv2.GaussianBlur`` as the sigma argument.
            Defaults to 0.5.

    Returns:
        A PIL image built from the blurred array.
    """
    pixels = np.array(image)
    # The kernel size is given as (0, 0); sigma alone controls the blur.
    blurred = cv2.GaussianBlur(pixels, (0, 0), sigma)
    return Image.fromarray(blurred)



# Step 2: Text Extraction
def extract_text_from_images(images):
    """Run OCR on each image and return the concatenated text.

    Args:
        images: Iterable of images accepted by
            ``pytesseract.image_to_string``.

    Returns:
        The per-image OCR results joined in order, with no separator added.
        An empty string when *images* is empty.
    """
    return "".join(pytesseract.image_to_string(page_img) for page_img in images)

# Step 3: Material Detection
def detect_materials(text):
    """Collect material entries from OCR text.

    For every line containing "Customer material no.", the first all-digit
    token on that line is taken as the material number. The next five lines
    (at most) are then scanned for the first one containing a quantity
    keyword; that line is tokenised with ``preprocess_qty_price_info``.
    An entry is recorded only when a material number was found and the
    tokenised line has at least four tokens.

    Progress is reported on stdout via ``print``.

    Args:
        text: OCR output; lines are separated by "\\n".

    Returns:
        A list of dicts with keys "material_number", "Qty", "Price",
        "Price_unit" and "Value". All values are raw string tokens, except
        "Price_unit", which is None when the line has fewer than six tokens.
    """
    found = []
    all_lines = text.split('\n')

    for idx, header in enumerate(all_lines):
        if "Customer material no." not in header:
            continue
        print("Found line containing 'Customer material no.':", header)

        # First purely numeric token on the header line, if any.
        material_number = next(
            (token for token in header.split() if token.isdigit()), None
        )
        if material_number is not None:
            print("Extracted material number:", material_number)

        # Scan at most the five lines following the header.
        qty_price_info = None
        for candidate in all_lines[idx + 1:idx + 6]:
            lowered = candidate.lower()
            # NOTE(review): "bur" may be a typo for "eur" or a known OCR
            # misread -- confirm before changing.
            if "pcs" in lowered or "bur" in lowered:
                print("Found line containing quantity and price information:", candidate)
                qty_price_info = preprocess_qty_price_info(candidate)
                print("Split qty and price information:", qty_price_info)
                break

        if material_number is None or qty_price_info is None:
            continue
        if len(qty_price_info) < 4:
            continue

        # Token positions: 0 = quantity, 2 = price, last = value. Position 4
        # is used as the price unit only when there are six or more tokens.
        entry = {
            "material_number": material_number,
            "Qty": qty_price_info[0],
            "Price": qty_price_info[2],
            "Price_unit": qty_price_info[4] if len(qty_price_info) >= 6 else None,
            "Value": qty_price_info[-1],
        }
        found.append(entry)
        print("Extracted material information:", entry)

    return found

def preprocess_qty_price_info(line):
    """Tokenise a quantity/price line after rejoining split numbers.

    Whitespace between a "." or "," and a following digit is removed first,
    so "1. 50" becomes "1.50". The result is then split on whitespace.

    Args:
        line: A single line of text.

    Returns:
        The list of whitespace-separated tokens.
    """
    rejoined = re.sub(r'([.,])\s+(\d)', r'\1\2', line)
    return rejoined.split()


# Step 4: Invoice Price Extraction
def extract_invoice_price(text):
    """Return the token that follows "amount" on a "Final ... amount" line.

    Only lines containing both the substrings "Final" and "amount"
    (case-sensitive) are considered. Within such a line, the token right
    after a word equal to "amount" is taken as the price. The word is
    compared case-insensitively, and a trailing colon is tolerated.

    Args:
        text: OCR output; lines are separated by "\\n".

    Returns:
        The price token as an unparsed string, or None when no qualifying
        line has a token after "amount". If several candidates exist, the
        last one in the text wins.
    """
    invoice_price = None
    for line in text.split('\n'):
        if "Final" not in line or "amount" not in line:
            continue
        words = line.split()
        # Stop one short of the end so words[i + 1] always exists.
        for i, word in enumerate(words[:-1]):
            # The original compared the lower-cased word with "Final", which
            # could never match; only "amount" marks the price. Stripping a
            # trailing ":" lets "Final amount: 123" be recognised too.
            if word.lower().rstrip(":") == "amount":
                invoice_price = words[i + 1]
    return invoice_price



# Step 5: Example usage
# Runs the whole pipeline at module level on one hard-coded local PDF:
# render pages -> OCR -> material detection -> invoice price lookup.
pdf_path = "C:\\Users\\emy7u\\Downloads\\Lear Corporation Invoice PDF.pdf"
images = preprocess_pdf(pdf_path)
text = extract_text_from_images(images)
# Uncomment to inspect the raw OCR output:
#print(text)
materials = detect_materials(text)
invoice_price = extract_invoice_price(text)

# Step 6: Output
# Print one detected material dict per line, then the invoice price
# (None when no matching line was found).
print("Detected Materials:")
for material in materials:
    print(material)
print("Invoice Price:", invoice_price)