In [1]:
import json
import os
import zipfile
import openai
from PIL import Image
from io import BytesIO
import matplotlib.pyplot as plt
import pypdfium2 as pdfium
import base64
import requests
import re
import fitz 
from time import sleep

In [2]:
API_KEY = open("../../Desktop/OPEN_API_KEY.txt", "r").read()
API_URL = "https://api.openai.com/v1/chat/completions"

In [3]:
# Headers for OpenAI API
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {API_KEY}"
}

In [4]:
def encode_image_to_base64(image):
    """
    Encode image data to base64.
    """
    image_bytes = BytesIO()
    image.save(image_bytes, format='PNG')  # Correctly save as PNG
    image_bytes = image_bytes.getvalue()
    return base64.b64encode(image_bytes).decode('utf-8')

def query_openai_with_image(base64_image):
    """
    Send a base64 encoded image to OpenAI for transcription.
    """
    example_json = {
        "Number": "1",
        "Question": "What is the decimal representation of the hexadecimal number 123.4?",
        "With_figure": "boolen_value",
        "figure_path": "",
        "Choices": {
            "a":  "83.25", 
            "b": "83.5", 
            "c": "291.25", 
            "d": "291.5"
        },
        "Answer": "",
    }

    example_question_format = """Which of the following is the design for securing safety and reliability known as
                                “foolproof”?

                                (choices start here this, NOTE is not part of the format)
                                a) A redundant configuration, such as duplication, can be used so that even when a device
                                failure occurs, the functions of the overall system are not affected.
                                b) Even when the user performs an incorrect operation, a problem does not occur in the
                                system.
                                c) When a device failure occurs, damage can be minimized by enabling the system to stop
                                safely.
                                d) When a device failure occurs, the system runs even if the usable functions are restricted,
                                and the processing efficiency is reduced. """
    
    payload = {
        "model": "chatgpt-4o-latest",
        "messages": [
            {
                "role": "system",
                "content": f"Provide output in valid json format. The data schema should be {json.dumps(example_json)}"
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": f"Extract the questions and their corresponding choices in the image. All texts are part of the question as long as the choices which (a)statement... does not appear like this format {example_question_format}. If the image sent is an instruction, there is no need to extract the text, just return an empty string. A hint to know the page is not an instructions page is Q1. question as the first text when parsing the text in the image. A page can have more than 1 question does take note of it, so extract all question in a dictionary format"
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/png;base64,{base64_image}",
                            "detail": "low"
                        }
                    }
                ]
            }
        ],
        "response_format" : {"type":"json_object"}
    }
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json()

In [5]:
def convert_pdf_to_images(file_path, scale=300/72):
    
    pdf_file = pdfium.PdfDocument(file_path)  
    page_indices = [i for i in range(len(pdf_file))]
    
    renderer = pdf_file.render(
        pdfium.PdfBitmap.to_pil,
        page_indices = page_indices, 
        scale = scale,
    )
    
    list_final_images = [] 
    
    for i, image in zip(page_indices, renderer):
        
        image_byte_array = BytesIO()
        image.save(image_byte_array, format='jpeg', optimize=True)
        image_byte_array = image_byte_array.getvalue()
        list_final_images.append(dict({i:image_byte_array}))
    
    return list_final_images

def display_images(list_dict_final_images):
    
    all_images = [list(data.values())[0] for data in list_dict_final_images]

    for index, image_bytes in enumerate(all_images):

        image = Image.open(BytesIO(image_bytes))
        figure = plt.figure(figsize = (image.width / 100, image.height / 100))

        plt.title(f"----- Page Number {index+1} -----")
        plt.imshow(image)
        plt.axis("off")
        plt.show()

In [6]:
# pdf_file = "questions_pdf/2024S_FE-A_Questions.pdf"  # Specify your zip file

# images = convert_pdf_to_images (pdf_file)

In [7]:
# display_images(images)

In [8]:
def extract_json_from_response(response_text):
    """
    Extract and parse JSON content from the response text, handling non-standard formatting.
    """
    # Clean the response to remove common artifacts and formatting issues
    cleaned_text = re.sub(r'```json|```|\\n|\\t|\\r', '', response_text, flags=re.DOTALL).strip()

    # Use a simplified regex pattern to find JSON objects or arrays
    json_match = re.search(r'(\{.*?\}|\[.*?\])', cleaned_text, re.DOTALL)
    
    if json_match:
        json_content = json_match.group(0)
        try:
            # Attempt to parse the extracted JSON content
            parsed_json = json.loads(json_content)
            return parsed_json
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON: {e}")
            print("Attempting additional cleanup...")

            # Further cleanup for common issues, such as unescaped quotes or incorrect commas
            json_content = json_content.replace('\n', '').replace('\\"', '"').replace("\\'", "'")
            json_content = re.sub(r',\s*([}\]])', r'\1', json_content)  # Remove trailing commas
            json_content = re.sub(r'([{[])\s*,', r'\1', json_content)  # Remove leading commas

            # Attempt parsing again with further cleaned content
            try:
                parsed_json = json.loads(json_content)
                return parsed_json
            except json.JSONDecodeError as e:
                print(f"Failed again to parse JSON: {e}")
                print(f"Final cleaned content: {json_content}")

                # Log problematic JSON content and errors for debugging
                with open("failed_json_debug.log", "a") as log_file:
                    log_file.write(f"Failed JSON:\n{json_content}\nError: {e}\n\n")

                return None
    else:
        print("No valid JSON found in the response.")
        return None

In [9]:
def save_missed_image(image, pdf_basename, page_number):
    """
    Save the missed image to a specified directory structure.
    """
    missed_pages_dir = "missed_pages"
    # Create the subdirectory named after the source file
    sub_dir = os.path.join(missed_pages_dir, pdf_basename)
    os.makedirs(sub_dir, exist_ok=True)

    # Save the image with the page number as the filename
    image_path = os.path.join(sub_dir, f"page{page_number}.png")
    image.save(image_path, format='PNG')
    print(f"Missed image for page {page_number} saved to {image_path}.")


In [10]:
def transcribe_with_retry(base64_image, retries=5, delay=2):
    """
    Attempt to transcribe the image with retries on failure.
    """
    for attempt in range(retries):
        try:
            response = query_openai_with_image(base64_image)
            response_text = response['choices'][0]['message']['content']
            parsed_json = extract_json_from_response(response_text)
            if parsed_json is not None:
                return parsed_json
        except (KeyError, requests.exceptions.RequestException) as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            sleep(delay)  # Wait before retrying
    return None

In [11]:
def main():
    START_PAGE = 1  # Define start page
    PDF_FILE = "questions_pdf/2024S_FE-A_Questions.pdf"  # Replace with your actual PDF file path
    JSON_DIR = "json_data"  # Directory to save JSON files

    # Get the total number of pages in the PDF to set END_PAGE dynamically
    with fitz.open(PDF_FILE) as pdf:
        END_PAGE = pdf.page_count  # Set END_PAGE to the last page number of the PDF

    # Ensure the JSON directory exists
    if not os.path.exists(JSON_DIR):
        os.makedirs(JSON_DIR)
    
    # Convert the PDF to low-quality images once, outside the loop
    images = convert_pdf_to_images(PDF_FILE)  # Extract images for the pages

    # Get the base name of the PDF file (without extension) for use in saving files
    pdf_basename = os.path.basename(PDF_FILE).replace('.pdf', '')
    
    for page_number in range(START_PAGE, END_PAGE + 1):
        print(f"Processing page {page_number}...")
        
        if page_number - 1 < len(images):
            # Load the image from the list using the correct page index
            image = Image.open(BytesIO(images[page_number - 1][page_number - 1]))
            base64_image = encode_image_to_base64(image)

            # Attempt to transcribe with retry logic
            parsed_json = transcribe_with_retry(base64_image)

            # Check if the parsed JSON has valid questions
            has_valid_questions = False
            if isinstance(parsed_json, list):
                # If parsed_json is a list, check each item
                has_valid_questions = any(
                    question.get("Question") for question in parsed_json if isinstance(question, dict)
                )
            elif isinstance(parsed_json, dict):
                # If parsed_json is a dict, check its values
                has_valid_questions = any(
                    question.get("Question") for question in parsed_json.values() if isinstance(question, dict)
                )

            if has_valid_questions:
                # Construct the filename using the source file name and page number
                json_filename = f"{pdf_basename}-page{page_number}.json"
                json_path = os.path.join(JSON_DIR, json_filename)

                # Save the parsed JSON to a file
                with open(json_path, 'w', encoding='utf-8') as json_file:
                    json.dump(parsed_json, json_file, indent=4)
                
                print(f"Transcribed text for page {page_number} saved to {json_filename}.")
            else:
                print(f"No valid questions found for page {page_number}. Skipping save.")
                save_missed_image(image, pdf_basename, page_number)  # Save the missed image

In [12]:
if __name__ == "__main__":
    main()

Processing page 1...
No valid questions found for page 1. Skipping save.
Missed image for page 1 saved to missed_pages/2024S_FE-A_Questions/page1.png.
Processing page 2...
No valid JSON found in the response.
Error parsing JSON: Expecting ',' delimiter: line 11 column 6 (char 178)
Attempting additional cleanup...
Failed again to parse JSON: Expecting ',' delimiter: line 1 column 169 (char 168)
Final cleaned content: {    "Number": "",    "Question": "",    "With_figure": false,    "figure_path": "",    "Choices": {        "a": "",        "b": "",        "c": "",        "d": ""    }
No valid JSON found in the response.
No valid questions found for page 2. Skipping save.
Missed image for page 2 saved to missed_pages/2024S_FE-A_Questions/page2.png.
Processing page 3...
No valid questions found for page 3. Skipping save.
Missed image for page 3 saved to missed_pages/2024S_FE-A_Questions/page3.png.
Processing page 4...
Error parsing JSON: Expecting ',' delimiter: line 11 column 4 (char 238)