In [1]:
import os
import io

import yaml
import json
from collections import OrderedDict

import boto3
from botocore.exceptions import NoCredentialsError, ClientError

import base64  # Added base64 import
from PIL import Image
import fitz  # PyMuPDF

from openai import OpenAI
from anthropic import Anthropic

In [2]:
print("Current working directory:", os.getcwd())

Current working directory: C:\github_repos\BIU_LLM_Project


In [3]:
def load_config():
    """Load configuration from YAML file"""
    with open('config/config.yaml', 'r') as file:
        return yaml.safe_load(file)

In [4]:
def init_aws_client(config):
    """Initialize AWS S3 client"""
    return boto3.client(
        's3',
        aws_access_key_id=config['aws']['aws_access_key_id'],
        aws_secret_access_key=config['aws']['aws_secret_access_key'],
        region_name=config['aws']['region_name']
    )

In [5]:
def get_s3_public_url(bucket_name, file_key, region):
    """Generate S3 public URL"""
    return f"https://{bucket_name}.s3.{region}.amazonaws.com/{file_key}"

In [6]:
def ensure_tmp_dir():
    """Create and clean temporary directory"""
    tmp_dir = os.path.join(os.getcwd(), 'tmp')
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)
    else:
        for filename in os.listdir(tmp_dir):
            file_path = os.path.join(tmp_dir, filename)
            try:
                if os.path.isfile(file_path):
                    os.unlink(file_path)
            except Exception as e:
                print(f'Error deleting {file_path}: {e}')
    return tmp_dir

In [7]:
def extract_text_from_file(file_path):
    """
    Extract both regular text and text from images in an input file.
    """
    
    # Open the PDF document with PyMuPDF (fitz)
    doc = fitz.open(file_path)
    text_content = []  # List to store extracted text and image placeholders
    
    # Loop through each page in the document
    for page_num in range(len(doc)):
        page = doc[page_num]  # Access a specific page in the PDF
        
        # Extract regular text from the page and add to text content list
        text_content.append(page.get_text())
        
        # Extract all images on the page
        image_list = page.get_images()
        for img_index, img in enumerate(image_list):
            try:
                # Get the image reference number (xref) to retrieve the image data
                xref = img[0]
                base_image = doc.extract_image(xref)  # Extracts image data (binary format)
                image_bytes = base_image["image"]  # Retrieves raw image bytes
                
                # Convert the image bytes to a PIL Image object (useful for OCR or further processing)
                image = Image.open(io.BytesIO(image_bytes))  # Assumes PIL.Image and io are imported
                
                # Placeholder text to indicate an embedded image in the output
                text_content.append(f"[Embedded Image {page_num + 1}-{img_index + 1}]")
                
            except Exception as e:
                # Print error message if image extraction fails
                print(f"Error extracting image {img_index} from page {page_num}: {str(e)}")
    
    # Close the document to free up resources
    doc.close()
    
    # Join all text content (including image placeholders) into a single string for output
    return "\n".join(text_content)

In [8]:
def init_openai(config):
    """Initialize OpenAI API key from configuration."""
    openai.api_key = config['openai']['chat_gpt_key']

In [9]:
def analyze_with_openai_eng(text):

    """Analyze document content using OpenAI's GPT-4 and return structured JSON data."""
    
    # Initial processing prompt for the full text extraction and transformation
    try:
        chat_history = [
            {"role": "user",
             "content": """
                Analyze the following text started after @@@@.
                This text is a course syllabus document that contains the course information.
                Your mission is to process and extract the information in JSON file format.
        
                Required JSON structure: { "full_text": "The complete text after processing", "full_text_no_tokens": "Number tokens that GPT-4 returns" }
        
                HOW TO PROCESS THE FULL TEXT
                1. Read the text very carefully.
                2. Text may be in (Hebrew, English or mixed).
                3. Preserve all language terms (Hebrew, English, or mixed) in their original language.
                4. You can use all available response tokens for output.
            
                5. For the full_text field, apply these transformations:
                   - Add topic sentences to each major section.
                   - Add clear section headers in capital letters.
                   - Normalize spacing and formatting.
                   - End all sentences with proper punctuation.
                   - Transform bullet points into complete sentences.
                   - Create proper paragraphs from lists.
                   - Add contextual transitions between sections.
                   - Ensure hierarchical content structure.
                   - Preserve important dates, numbers, and contact information.
                   - Remove duplicate headers/footers.
                   - Remove any text that says: "[Embedded Image ]" or similar.
                   - Maintain consistent formatting throughout.
    
                6. Structure the text in sections as follows:
                   - COURSE NAME
                   - COURSE OVERVIEW
                   - TARGET AUDIENCE
                   - COURSE LEADERSHIP
                   - PREREQUISITES
                   - CERTIFICATIONS AWARDED
                   - COURSE SCHEDULE
                   - LEARNING FORMAT
                   - TUITION AND FEES
                   - COURSE TOPICS
                   - REGISTRATION AND CONTACT INFORMATION
                   - INSTITUTIONAL INFORMATION
    
                7. Writing style guidelines:
                   - Use complete sentences.
                   - Add context to technical terms.
                   - Maintain consistent tense and voice.
                   - Ensure clarity and readability.
                   - Preserve all relevant information from the original.
    
                8. Formatting requirements:
                   - Use consistent paragraph spacing.
                   - Ensure proper nesting of information.
                   - Keep important identifiers (emails, URLs, phone numbers) in their original format.
            
                Text to analyze after this @@@@
                """
            }
        ]
   
        client = OpenAI()
        
        # Send prompt01 to OpenAI GPT-4 for initial processing
        response = client.chat.completion.create(
            model="gpt-4",
            messages=[
                {f"{chat_history} \n {text}"}
            ],
            max_tokens=4000,
            temperature=0  # Set temperature to 0 for deterministic output
        )
        
        # Extract response text from prompt01
        response_text1 = response.choices[0].message.content.strip()
        usage = response['usage']
        # Extract JSON structure from the response of prompt01
        start = response_text1.find('{')
        end = response_text1.rfind('}') + 1
        
        if start != -1 and end != 0:
            json_str1 = response_text1[start:end]
            initial_data = json.loads(json_str1)
            response_tokens = usage['completion_tokens']
            
        else:
            raise ValueError("No JSON found in response")

        # Second prompt for detailed extraction based on the initial processed text

        chat_history = [
            {"role": "user",
             "content": """           
                From the following course text after @@@@, extract these specific details:
                1. The complete course name exactly as it appears
                
                2. From COURSE LEADERSHIP section:
                   - Create a LIST of ALL instructors, including every teaching staff member mentioned
                   - For each instructor in the list, extract:
                     * Full name as written
                     * Their role/position (e.g., יועץ מקצועי, מרצה, מרצה בכיר, מדריך)
                     * Professional title (if mentioned)
                     * Any additional description or background information
                   - Make sure to capture EVERY instructor mentioned, even if some details are missing
                   - List format should handle multiple instructors
                
                3. From the entire text:
                   - Create a comprehensive course summary
                   - Do not use text from INSTITUTIONAL INFORMATION or motto.
                
                Return the information in this exact JSON format:
                {
                    "course_name": "Complete course name as it appears",
                    "instructors": [
                        {
                            "name": "First instructor name",
                            "role": "Role (e.g., יועץ מקצועי, מרצה, מרצה בכיר, מדריך)",
                            "title": "Professional title if available",
                            "description": "Additional description"
                        }
                        // Continue for all instructors found
                    ],
                    "summary": "Comprehensive course summary"
                }
        
                Important notes:
                1. Keep all text in its original language (Hebrew and English)
                2. Use null for missing fields
                3. Use empty array [] if no instructors are found
                4. Include ALL instructors mentioned in the text
                5. Maintain list format even if only one instructor is found
        
                Text to analyze after this @@@@
                """
            }
        ]        
        # Send prompt02 to OpenAI GPT-4 for detailed extraction
        response = client.chat.completion.create(
            model="gpt-4",
            messages=[
                {f"{chat_history} \n {initial_data.get('full_text', '')}"}
            ],
            max_tokens=4000,
            temperature=0
        )
        
        # Extract response text from prompt02
        response_text2 = response.choices[0].message.content.strip()
        extract_start = response_text2.find('{')
        extract_end = response_text2.rfind('}') + 1
        
        if extract_start != -1 and extract_end != 0:
            extracted_data = json.loads(response_text2[extract_start:extract_end])
        else:
            extracted_data = {
                "course_name": None,
                "instructors": [],
                "summary": None
            }

        # Create final JSON output
        final_json = {
            "course_name": extracted_data.get("course_name"),
            "instructors": extracted_data.get("instructors", []),
            "summary": extracted_data.get("summary"),
            "full_text": initial_data.get("full_text", text),
            "full_text_no_tokens": response_tokens
        }

        return final_json

    except (json.JSONDecodeError, ValueError) as e:
        print(f"JSON Parse Error: {str(e)}")
        print(f"Response text: {response_text1 if 'response_text1' in locals() else response_text2}")
        return {
            "course_name": None,
            "instructors": [],
            "summary": None,
            "full_text": text,
            "full_text_no_tokens": 0
        }
    
    except Exception as e:
        print(f"Error analyzing with OpenAI: {str(e)}")
        return {
            "course_name": None,
            "instructors": [],
            "summary": None,
            "full_text": text,
            "full_text_no_tokens": 0
        }


In [10]:
def init_claude(config):
    return Anthropic(
        api_key=config['anthropic']['claud_key']
    )

In [11]:
def process_files(config):
    """Main processing function to handle file operations using AWS S3 and the Claude service."""
    
    # Initialize AWS S3 client using provided configuration
    s3_client = init_aws_client(config)

    # Ensure the existence of a temporary directory for intermediate file storage
    tmp_dir = ensure_tmp_dir()  
           
    # Set up the JSON output directory path in the 'data/documents' folder relative to the current working directory
    json_output_dir_eng = os.path.join(os.getcwd(), 'data', 'documents', 'eng')
    os.makedirs(json_output_dir_eng, exist_ok=True)  

    # Set up the JSON output directory path in the 'data/documents' folder relative to the current working directory
    json_output_dir_heb = os.path.join(os.getcwd(), 'data', 'documents', 'heb')
    os.makedirs(json_output_dir_heb, exist_ok=True) 
    
    # Initialize Claude client using provided configuration
    claude_client = init_claude(config)

    # Initialize Claude client using provided configuration
    init_openai(config)

    # Define supported file types to process
    supported_file_types = tuple(config['local']['supported_file_types'])
    
    # List all files in AWS S3 upload bucket/path using the S3 paginator
    paginator = s3_client.get_paginator('list_objects_v2')
    pages = paginator.paginate(
        Bucket=config['aws']['upload_bucket_name'],
        Prefix=config['aws']['upload_path']
    )

    # Loop through all files in each page from the S3 bucket
    for page in pages:
        for obj in page.get('Contents', []):
            s3_file_key = obj['Key']
            file_name = os.path.basename(s3_file_key)
            
            # Process only supported file types
            if not file_name.lower().endswith(supported_file_types):
                continue

            print(f"\nProcessing file: {file_name}")
            
            # Create paths for the temporary PDF and output JSON
            tmp_download_file_path = os.path.join(tmp_dir, file_name)

            # Create output JSON english
            json_filename = f"{os.path.splitext(file_name)[0]}.json"
            local_json_path_eng = os.path.join(json_output_dir_eng, json_filename)

            # Create output JSON hebrew
            json_filename = f"{os.path.splitext(file_name)[0]}.json"
            local_json_path_heb = os.path.join(json_output_dir_heb, json_filename)
            
            try:
                # Download file to temporary location
                s3_client.download_file(
                    config['aws']['upload_bucket_name'], s3_file_key,
                    tmp_download_file_path
                )
                
                # Extract text from the file (including image references)
                extracted_text = extract_text_from_file(tmp_download_file_path)
                print(f"\nExtracted Text:\n{extracted_text}")

                # Analyze text with Claude - english
                # analysis_eng = analyze_with_claude_eng(claude_client, extracted_text)

                # Analyze text with OpenAi - english
                analysis_eng = analyze_with_openai_eng(extracted_text)
                
                print(f"\nLLM Analysis eng 1:\n{analysis_eng}")                      

                # Save analysis to JSON output file
                with open(local_json_path_eng, 'w') as json_file:
                    json.dump(analysis_eng, json_file)

                print(f"Saved analysis to {local_json_path_eng}")

                '''
                # Analyze text with Claude - hebrew
                analysis_heb = analyze_with_claude_heb(claude_client, extracted_text)
                print(f"\nLLM Analysis heb:\n{analysis_heb}")

                # Save analysis to JSON output file
                with open(local_json_path_heb, 'w') as json_file:
                    json.dump(analysis_heb, json_file)

                print(f"Saved analysis to {local_json_path_heb}")
                '''

            except Exception as e:
                print(f"Error processing {file_name}: {str(e)}")



In [12]:
def main():
    """Main entry point"""
    try:
        config = load_config()
        process_files(config)
    
    except Exception as e:
        print(f"Main process error: {str(e)}")
        raise

if __name__ == "__main__":
    main()

Main process error: name 'openai' is not defined


NameError: name 'openai' is not defined