In [None]:
# Folder configuration for the PDF -> image conversion.
#   folder_path   : folder holding the input PDF file(s)
#   output_folder : folder receiving the converted page images
#                   (the conversion cell writes PNG files, despite the
#                   folder being called "jpgs_converted")
folder_path, output_folder = (
    '/Users/klausgarridotenorio/Library/CloudStorage/OneDrive-ErasmusUniversityRotterdam/Experiments/Data_cleaning/pdfs_to convert',
    '/Users/klausgarridotenorio/Library/CloudStorage/OneDrive-ErasmusUniversityRotterdam/Experiments/Data_cleaning/jpgs_converted',
)

In [334]:
import os
import fitz  # PyMuPDF
import pandas as pd
from PIL import Image
import ollama


# Convert every page of every PDF in `folder_path` into a PNG inside
# `output_folder`, rotated 90 degrees clockwise.

# Make sure the destination exists so saving does not fail on a fresh machine
os.makedirs(output_folder, exist_ok=True)

# Get a list of all PDF files in the folder. The extension is matched
# case-insensitively (same convention as the later '.png' listing) and the
# list is sorted so runs are processed in a stable order.
pdf_files = sorted(
    file for file in os.listdir(folder_path) if file.lower().endswith('.pdf')
)

# Convert each PDF file to PNG
for pdf_file in pdf_files:
    # Full path of the PDF and its base name without the extension
    pdf_path = os.path.join(folder_path, pdf_file)
    pdf_stem = os.path.splitext(pdf_file)[0]

    # Open the PDF; the context manager closes the document even when
    # rendering or saving one of its pages raises
    with fitz.open(pdf_path) as pdf_document:
        # Loop through all the pages in the PDF
        for page_num in range(pdf_document.page_count):
            # Render the page to an image (resolution 150 dpi)
            page = pdf_document.load_page(page_num)
            pix = page.get_pixmap(dpi=150)

            # 1-based page suffix gives each page a unique file name
            output_file = f'{pdf_stem}_page{page_num + 1}.png'
            output_path = os.path.join(output_folder, output_file)

            # Save the image as PNG
            pix.save(output_path)

            # Rotate the saved PNG 90 degrees clockwise. rotate() returns a
            # new, fully loaded image, so the source file handle can be
            # released before the same path is overwritten.
            with Image.open(output_path) as img:
                rotated_img = img.rotate(-90, expand=True)
            rotated_img.save(output_path)



In [335]:
# Detailed extraction prompt for payment slips: it lists the fields to
# extract, shows one raw OCR example and the dictionary expected for it.
# NOTE(review): the OCR/LLM loop visible later in this notebook builds its own
# lowercase `system_prompt` and does not read this variable — confirm which
# prompt is meant to be sent to the model.
System_prompt="""You are an expert at interpreting information such as IBAN numbers, amounts, names, addresses, house numbers, postal codes, cities, and dates of birth from a payment slip.

You have been provided with a payment slip in raw text format. Your task is to extract and organize the following information from the payment slip into a structured format:

- **IBAN_Number**: The IBAN (International Bank Account Number) is a unique identifier for a bank account and should contain 18 to 34 characters depending on the country.
- **Amount**: The monetary amount in EUR, which should be in numeric format with two decimal places (e.g., '3.46').
- **Name**: The full name of the individual.
- **Street**: The street address including street name (e.g., 'Mariniersweg') do not ever nest the address with other address details respective sub-keys: `House number`, `Postal code`, and `City`.
- **House Number**: The specific house or apartment number, which could include spaces or letters (e.g., '32B').
- **Postal Code**: The postal code of the payer, which is normally 4 numbers and two letters (e.g., '3014 NP').
- **City**: The city where the payer resides.
- **Date of Birth**: The payers date of birth in the format 'DD/MM/YYYY'.

You will receive the information in raw form, and your task is to parse it into dictionary structure as in the following example:

Example of raw data:
THIS IS A RECEIPT CONFIRMATION FOR THE EUR’S FI...
INFORMATION WILL NEVER BE COUPLED TO YOUR RESPO...
                                              Name
                                           Address
                                      House number
                                       Postal code
                                              City
                                     Klaus Garrido
                                      Mariniersweg
                                           6723 TL
                                         Rotterdam
                                             532 B
                                            Amount
                               IBAN (Bank account)
                                        BSN number
                                     Date of birth
                                         Signature
                                        (optional)
                          NL32 ABNA 7330 6416 7385
                                              5,99
                                        16/06/2005  
                                  Date experiment:

Example of dictionary structure (Expected Output):
```python
{
  'IBAN_Number': 'NL32 ABNA 7330 6416 7385',
  'Amount': '5.99',
  'Name': 'Klaus Garrido',
  'Street': 'Mariniersweg',
  'House Number': '532 B',
  'Postal Code': '6723 TL',
  'City': 'Rotterdam',
  'Date of Birth': '16/06/2005'
}
```

Please note that:
1. The **IBAN_Number** must be a valid IBAN format of the appropriate length.
2. The **Amount** must be formatted as a float with two decimal places.
3. The **Date of Birth** should follow the 'DD/MM/YYYY' format.




The user will provide the raw format, and you will respond with the structured format only. Do **not** include any additional text such as 'Here is the extracted data in a structured format:' or 'Let me know if you need anything else!'. Simply output the structured dictionary.
"""

In [None]:
import os
from PIL import Image
from surya.ocr import run_ocr
from surya.model.detection.model import load_model as load_det_model, load_processor as load_det_processor
from surya.model.recognition.model import load_model as load_rec_model
from surya.model.recognition.processor import load_processor as load_rec_processor
import re
import json
import pandas as pd

# OCR every PNG in `output_folder`, ask the local LLM to extract the payment
# fields from the recognised text, and collect one row per image in
# `Final_data_processed` (plus a `complete` flag and the source file name).

# Keys every output row must carry. Defined before the loop so the error
# handler can always build a placeholder row, even when the very first image
# fails before any response has been parsed.
required_keys = ['IBAN', 'Amount', 'Name', 'Address', 'House number', 'Postal code', 'City', 'Date of birth']

# The system prompt does not depend on the image, so it is built once
system_prompt = (
    "You are a helpful assistant that extracts structured information from text. "
    "When given text data, you will extract the following fields if present: "
    "IBAN, Amount, Name, Address, House number, Postal code, City, Date of birth. "
    "Provide the extracted information in a JSON format with the specified keys."
)

# Start from an empty result so a stale frame from an earlier run is never
# mistaken for the output of this run
Final_data_processed = pd.DataFrame()

# Get a list of all .png files in the output folder
image_files = [file for file in os.listdir(output_folder) if file.lower().endswith('.png')]

# Load the models and processors once before the loop
langs = ["en"]  # Replace with your languages - optional but recommended
det_processor, det_model = load_det_processor(), load_det_model()
rec_model, rec_processor = load_rec_model(), load_rec_processor()

# One dict per image; the DataFrame is built once after the loop instead of
# being re-copied by pd.concat on every iteration. If the loop is interrupted,
# the rows gathered so far are still available in `records`.
records = []

# Process each image file
for i, image_file in enumerate(image_files):
    # Get the full path of the image file
    image_path = os.path.join(output_folder, image_file)

    try:
        # Load the image and run OCR on it; the context manager releases the
        # file handle as soon as OCR is done
        with Image.open(image_path) as image:
            predictions = run_ocr(
                images=[image],
                langs=[langs],  # one language list per image
                det_model=det_model,
                det_processor=det_processor,
                rec_model=rec_model,
                rec_processor=rec_processor
            )

        # Extract the OCR result of the (single) image into a DataFrame
        text_lines = predictions[0].text_lines
        df = pd.DataFrame({
            'polygon': [text_line.polygon for text_line in text_lines],
            'confidence': [text_line.confidence for text_line in text_lines],
            'text': [text_line.text for text_line in text_lines],
            'bbox': [text_line.bbox for text_line in text_lines]
        })

        # Prepare the user prompt: the recognised lines, one per row
        user_prompt = (
            "Please extract the required information from the following text:\n\n"
            + "\n".join(df['text'].tolist())
            + "\n\n"
            "Provide the output strictly in JSON format with the keys: "
            "'IBAN', 'Amount', 'Name', 'Address', 'House number', 'Postal code', 'City', 'Date of birth'."
        )

        # Get the response from the language model
        response = ollama.chat(
            model='llama3',
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ]
        )
        response_content = response['message']['content']

        # The model may wrap the JSON in extra prose, so grab the span from
        # the first '{' to the last '}' before decoding
        json_match = re.search(r'(\{.*\})', response_content, re.DOTALL)
        if json_match is None:
            raise ValueError("No JSON object found in the response.")

        try:
            data_dict = json.loads(json_match.group(1))
        except json.JSONDecodeError as e:
            # Report the decoding problem and keep it as the cause of the
            # error handled by the outer except
            print(f"JSON decoding error: {e}")
            raise ValueError("Invalid JSON format in the response.") from e

        # Ensure all required keys are present; missing ones become None
        for key in required_keys:
            data_dict.setdefault(key, None)

        # Complete only if every required value is present (non-empty, not None)
        is_complete = all(data_dict.get(key) for key in required_keys)

        # Record the row: extracted fields, "complete" flag (1/0) and the
        # image file name for reference
        records.append({**data_dict, 'complete': int(is_complete), 'image_file': image_file})

    except Exception as e:
        # Best effort: report the failure and keep a placeholder row so every
        # image is represented in the final table
        print(f"Error processing image {image_file}: {e}")
        failed_dict = {key: None for key in required_keys}
        failed_dict['complete'] = 0  # Indicate failure
        failed_dict['image_file'] = image_file
        records.append(failed_dict)

    # Print the progress
    print(f"Progress ={(i+1)/len(image_files)} aka detail {i+1}/{len(image_files)}: {image_file}")

# Build the result table once from all collected rows
Final_data_processed = pd.DataFrame(records)

In [None]:
# Show the collected extraction results as the cell output
Final_data_processed