In [1]:
import os
import time
import json  # Import JSON library
from openai import OpenAI
from pdf2image import convert_from_path  # pdf2image for extracting images
from PIL import Image  # Python Imaging Library to handle images
import base64
from io import BytesIO
import fitz  # PyMuPDF for extracting text from PDFs


# Function to list new PDF files
def get_new_pdfs(directory, processed_files):
    """Return the paths of PDF files under ``directory`` that are not yet processed.

    The directory tree is walked recursively with ``os.walk``. A file counts as
    a PDF when its name ends in ``.pdf`` in any letter case (``.pdf``, ``.PDF``,
    ``.Pdf`` ...).

    Args:
        directory: Root directory to scan. A missing directory yields an
            empty result because ``os.walk`` ignores listing errors by default.
        processed_files: Container of already-processed paths; membership is
            tested with ``in`` against the joined ``root/filename`` path.

    Returns:
        list[str]: Paths (``os.path.join(root, filename)``) of new PDFs, in
        ``os.walk`` order.
    """
    new_pdfs = []
    for root, _dirs, files in os.walk(directory):
        for name in files:
            # Compare case-insensitively so "SCAN.PDF" is not silently skipped.
            if not name.lower().endswith('.pdf'):
                continue
            pdf_path = os.path.join(root, name)
            if pdf_path not in processed_files:
                new_pdfs.append(pdf_path)
    return new_pdfs

# Function to extract text and images from a PDF file
def extract_text_from_pdf(pdf_path):
    """Extract the text of a PDF, falling back to page images when it has none.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        tuple: ``(text, images)``. ``text`` is the concatenation of
        ``get_text()`` over all pages. ``images`` is the result of pdf2image's
        ``convert_from_path(pdf_path)`` when ``text`` is exactly the empty
        string, and an empty list otherwise.

    Raises:
        Any exception raised by PyMuPDF or pdf2image propagates to the caller;
        the document handle is closed first in every case.
    """
    document = fitz.open(pdf_path)
    try:
        # join() avoids repeatedly re-copying the accumulated string per page.
        text = "".join(
            document.load_page(page_num).get_text()
            for page_num in range(document.page_count)
        )
    finally:
        # Always release the file handle, even if a page fails to load/extract.
        document.close()

    # Only PDFs with no extractable text at all (e.g. scans) are converted to
    # images; NOTE(review): whitespace-only text still counts as text here.
    images = convert_from_path(pdf_path) if text == '' else []
    return text, images

def summarize_text(text, api_key):
    """Request a summary of ``text`` from the OpenAI chat completions API.

    Args:
        text: The text to summarize; it is embedded verbatim in the prompt.
        api_key: API key handed to the ``OpenAI`` client.

    Returns:
        str: The content of the first choice in the response. If the request
        (or reading the response) raises, the string ``"Error: <exception>"``
        is returned instead of propagating the exception.
    """
    openai_client = OpenAI(api_key=api_key)

    # Single user turn carrying the instruction followed by the raw text.
    user_turn = {
        "role": "user",
        "content": f"Summarize the following text:\n\n{text}",
    }

    try:
        completion = openai_client.chat.completions.create(
            model="gpt-4o",
            temperature=0.7,
            messages=[user_turn],
        )
        first_choice = completion.choices[0]
        return first_choice.message.content
    except Exception as exc:
        # Best-effort contract: failures are reported in-band as a string.
        return f"Error: {exc}"


def encode_image(image):
    """Return ``image`` serialized as PNG and encoded as a base64 text string.

    Args:
        image: Object exposing a PIL-style ``save(file_obj, format=...)`` method.

    Returns:
        str: Base64 encoding of the PNG bytes written by ``image.save``.
    """
    # Render into memory rather than touching the filesystem.
    with BytesIO() as png_buffer:
        image.save(png_buffer, format="PNG")
        png_bytes = png_buffer.getvalue()
    return str(base64.b64encode(png_bytes), 'utf-8')


def summarize_images(image, api_key):
    """Ask the OpenAI chat completions API for a brief description of ``image``.

    Args:
        image: Image object accepted by ``encode_image``.
        api_key: API key handed to the ``OpenAI`` client.

    Returns:
        str: The content of the first choice in the response. If the request
        (or reading the response) raises, the string
        ``"Error processing image: <exception>"`` is returned instead.
    """
    openai_client = OpenAI(api_key=api_key)

    encoded_png = encode_image(image)
    mime_type = "image/png"

    try:
        # The image travels inline as a base64 data URL next to the text prompt.
        content_parts = [
            {"type": "text", "text": "give a brief description of this image"},
            {
                "type": "image_url",
                "image_url": {"url": f"data:{mime_type};base64,{encoded_png}"},
            },
        ]
        completion = openai_client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": content_parts}],
        )
        first_choice = completion.choices[0]
        return first_choice.message.content
    except Exception as exc:
        # Best-effort contract: failures are reported in-band as a string.
        return f"Error processing image: {exc}"


# Function to manage the processed file paths in a JSON file
def get_processed_files(file_path):
    """Load the processed-files record from a JSON state file.

    Args:
        file_path: Path of the JSON state file.

    Returns:
        The decoded JSON document, or an empty dict when the file does not
        exist or contains nothing but whitespace (e.g. a zero-length file).

    Raises:
        json.JSONDecodeError: If the file has content that is not valid JSON.
    """
    # EAFP: open directly instead of exists()+open(), which can race with a
    # concurrent delete between the two calls.
    try:
        with open(file_path, 'r') as file:
            raw_state = file.read()
    except FileNotFoundError:
        return {}

    # An empty file carries no state; treat it like a missing one rather than
    # crashing on it.
    if not raw_state.strip():
        return {}
    return json.loads(raw_state)

def set_processed_files(file_path, processed_files):
    """Persist the processed-files record to ``file_path`` as JSON.

    The data is first written to ``<file_path>.tmp`` and then moved over the
    target with ``os.replace``, so a failure while serializing or writing
    leaves any previously saved state file untouched.

    Args:
        file_path: Destination path of the JSON state file.
        processed_files: JSON-serializable object to store.

    Raises:
        TypeError: If ``processed_files`` is not JSON-serializable.
        OSError: If the temporary file cannot be written or moved into place.
    """
    tmp_path = f"{file_path}.tmp"
    try:
        with open(tmp_path, 'w') as file:
            json.dump(processed_files, file)
        # Swap the complete file into place in a single step.
        os.replace(tmp_path, file_path)
    except Exception:
        # Do not leave a partial temp file behind; then re-raise the real error.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise

# Main function to process new PDFs
def summarize_new_pdfs(directory, state_file_path, api_key, polling_interval=2):
    """Summarize every not-yet-processed PDF found under ``directory``.

    The loop scans for PDFs that are absent from the state file, processes each
    one, sleeps ``polling_interval`` seconds and scans again. It returns as
    soon as a scan finds no new PDFs.

    For each PDF, outputs are written next to it:
      * ``<name>_summary.txt`` when the PDF has extractable text;
      * ``<name>_images/image_<i>.png`` plus ``image_<i>_summary.txt`` for each
        image returned by ``extract_text_from_pdf``.

    Args:
        directory: Root directory scanned by ``get_new_pdfs``.
        state_file_path: JSON file recording processed PDF paths.
        api_key: OpenAI API key forwarded to the summarizers.
        polling_interval: Seconds to sleep between scans.

    Note:
        ``summarize_text`` / ``summarize_images`` report API failures as
        "Error..." strings; those are written to the summary files as-is and
        the PDF is still marked as processed.
    """
    processed_files = get_processed_files(state_file_path)

    while True:
        pdf_files = get_new_pdfs(directory, processed_files)
        if not pdf_files:
            print("No new PDFs found.")
            break

        for pdf_path in pdf_files:
            print(f"Processing: {pdf_path}")
            _process_pdf(pdf_path, api_key)

            # Persist after every file so an interruption does not redo
            # PDFs that were already finished.
            processed_files[pdf_path] = True
            set_processed_files(state_file_path, processed_files)

        # Sleep before checking again
        time.sleep(polling_interval)


def _process_pdf(pdf_path, api_key):
    """Write the text summary and/or per-image summaries for a single PDF."""
    text, images = extract_text_from_pdf(pdf_path)
    output_dir = os.path.dirname(pdf_path)
    base_name = os.path.splitext(os.path.basename(pdf_path))[0]  # filename without extension

    if text:
        summary_file_path = os.path.join(output_dir, f"{base_name}_summary.txt")
        _write_text(summary_file_path, summarize_text(text, api_key))
        print(f"Saved summary to: {summary_file_path}\n")
    else:
        print("No text found in the PDF.")

    if images:
        images_dir = os.path.join(output_dir, f"{base_name}_images")
        _summarize_and_save_images(images, images_dir, api_key)


def _summarize_and_save_images(images, images_dir, api_key):
    """Save each image as PNG in ``images_dir`` together with its summary."""
    os.makedirs(images_dir, exist_ok=True)

    for i, img in enumerate(images):
        image_summary = summarize_images(img, api_key)

        img_path = os.path.join(images_dir, f"image_{i}.png")
        img.save(img_path)
        print(f"Saved image to: {img_path}")

        summary_path = os.path.join(images_dir, f"image_{i}_summary.txt")
        _write_text(summary_path, image_summary)
        print(f"Saved summary to: {summary_path}")


def _write_text(path, content):
    """Write ``content`` to ``path`` as UTF-8, replacing any existing file."""
    # Explicit UTF-8: summaries may contain characters that the platform's
    # default encoding cannot represent, which would raise UnicodeEncodeError.
    with open(path, 'w', encoding='utf-8') as out_file:
        out_file.write(content)

In [2]:
from dotenv import load_dotenv

# Load variables from a .env file (if present) into the environment.
load_dotenv()

api_key = os.getenv('OPENAI_API_KEY')
if not api_key:
    # Fail fast with an actionable message instead of handing None to the
    # OpenAI client once the first PDF is being summarized.
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in the environment or in a .env file."
    )

directory_path = 'attachments'
state_file_path = 'last_processed_file.json'  # JSON record of every PDF path already processed

print("Monitoring for new PDFs...")
summarize_new_pdfs(directory_path, state_file_path, api_key)

Monitoring for new PDFs...
Processing: attachments/hassaan.naeem@gosaas.io/mic check/sample.pdf
Saved summary to: attachments/hassaan.naeem@gosaas.io/mic check/sample_summary.txt

Processing: attachments/arezshahid@gosaas.io/aaa/degree & transcript.pdf
No text found in the PDF.
Saved image to: attachments/arezshahid@gosaas.io/aaa/degree & transcript_images/image_0.png
Saved summary to: attachments/arezshahid@gosaas.io/aaa/degree & transcript_images/image_0_summary.txt
Saved image to: attachments/arezshahid@gosaas.io/aaa/degree & transcript_images/image_1.png
Saved summary to: attachments/arezshahid@gosaas.io/aaa/degree & transcript_images/image_1_summary.txt
No new PDFs found.
