In [28]:
import os
import json
import yaml
import boto3
import fitz  # PyMuPDF
import io
import shutil
from PIL import Image
from anthropic import Anthropic
from collections import OrderedDict

def load_config():
    with open('config/config.yaml', 'r') as file:
        return yaml.safe_load(file)

def init_aws_client(config):
    return boto3.client(
        's3',
        aws_access_key_id=config['aws']['aws_access_key_id'],
        aws_secret_access_key=config['aws']['aws_secret_access_key'],
        region_name=config['aws']['region_name']
    )

def init_claude(config):
    return Anthropic(
        api_key=config['anthropic']['claud_key']
    )

def get_s3_public_url(bucket_name, file_key, region):
    return f"https://{bucket_name}.s3.{region}.amazonaws.com/{file_key}"

def ensure_tmp_dir():
    """Create tmp directory in the working folder if it doesn't exist"""
    tmp_dir = os.path.join(os.getcwd(), 'tmp')
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)
    else:
        # Clean existing files in tmp directory
        for filename in os.listdir(tmp_dir):
            file_path = os.path.join(tmp_dir, filename)
            try:
                if os.path.isfile(file_path):
                    os.unlink(file_path)
            except Exception as e:
                print(f'Error deleting {file_path}: {e}')
    return tmp_dir

In [27]:
main()

Found 23 PDF files to process

Processing: ds/rag01/sources/PDF/AWS Practitioner Sylabus.pdf
Downloading ds/rag01/sources/PDF/AWS Practitioner Sylabus.pdf
Extracting text...
Error processing AWS Practitioner Sylabus.pdf:
Error type: AssertionError
Error message: Invalid item number: i='all'.

Processing: ds/rag01/sources/PDF/AWS Solution Architect Sylabus.pdf
Downloading ds/rag01/sources/PDF/AWS Solution Architect Sylabus.pdf
Extracting text...
Error processing AWS Solution Architect Sylabus.pdf:
Error type: AssertionError
Error message: Invalid item number: i='all'.

Processing: ds/rag01/sources/PDF/Advanced Project Management Sylabus.pdf
Downloading ds/rag01/sources/PDF/Advanced Project Management Sylabus.pdf
Extracting text...
Error processing Advanced Project Management Sylabus.pdf:
Error type: AssertionError
Error message: Invalid item number: i='all'.

Processing: ds/rag01/sources/PDF/BI Fin Ops Sylabus.pdf
Downloading ds/rag01/sources/PDF/BI Fin Ops Sylabus.pdf
Extracting text..

In [30]:
import os
import json
import yaml
import boto3
import io
from PIL import Image
from anthropic import Anthropic
from collections import OrderedDict
from botocore.exceptions import NoCredentialsError, ClientError

def load_config():
    """Load configuration from YAML file"""
    with open('config/config.yaml', 'r') as file:
        return yaml.safe_load(file)

def init_aws_client(config):
    """Initialize AWS S3 client"""
    return boto3.client(
        's3',
        aws_access_key_id=config['aws']['aws_access_key_id'],
        aws_secret_access_key=config['aws']['aws_secret_access_key'],
        region_name=config['aws']['region_name']
    )

def init_claude(config):
    """Initialize Claude client"""
    return Anthropic(
        api_key=config['anthropic']['claud_key']
    )

def get_s3_public_url(bucket_name, file_key, region):
    """Generate S3 public URL"""
    return f"https://{bucket_name}.s3.{region}.amazonaws.com/{file_key}"

def ensure_tmp_dir():
    """Create and clean temporary directory"""
    tmp_dir = os.path.join(os.getcwd(), 'tmp')
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)
    else:
        for filename in os.listdir(tmp_dir):
            file_path = os.path.join(tmp_dir, filename)
            try:
                if os.path.isfile(file_path):
                    os.unlink(file_path)
            except Exception as e:
                print(f'Error deleting {file_path}: {e}')
    return tmp_dir

In [31]:
def analyze_with_claude(client, file_path, file_type):
    """Send file to Claude for analysis"""
    prompt = """Analyze this document and extract the following information in JSON format. The document contains course syllabus information, likely in both Hebrew and English. Required JSON structure: { "course_name": "Course name in original language", "program_manager": "Look for 'מנהל התוכנית' or program manager", "instructors": [ { "name": "Instructor name", "role": "Role (e.g., יועץ מקצועי, מרצה, מרצה בכיר, מדריך)", "title": "Professional title if available", "description": "Additional description or background" } ], "summary": "A comprehensive summary of the course content", "embedded_images": ["List of image references"], "full_text": "The complete text from the document" } Keep all text in its original language (Hebrew and English). If a field is not found, use null or empty array []."""

    try:
        with open(file_path, 'rb') as file:
            file_content = file.read()

        if file_type == 'pdf':
            # For PDFs
            message = client.messages.create(
                model="claude-3.5-sonnet",
                max_tokens=100000,
                temperature=0,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": prompt
                            },
                            {
                                "type": "image",
                                "source": {
                                    "type": "base64",
                                    "media_type": "application/pdf",
                                    "data": base64.b64encode(file_content).decode('utf-8')
                                }
                            }
                        ]
                    }
                ]
            )
        else:
            # For images (jpg, jpeg, png)
            message = client.messages.create(
                model="claude-3.5-sonnet",
                max_tokens=100000,
                temperature=0,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": prompt
                            },
                            {
                                "type": "image",
                                "source": {
                                    "type": "base64",
                                    "media_type": f"image/{file_type}",
                                    "data": base64.b64encode(file_content).decode('utf-8')
                                }
                            }
                        ]
                    }
                ]
            )
        
        print("Raw Claude Response:", message.content[0].text[:500] + "...")
        response_text = message.content[0].text.strip()
        
        try:
            start = response_text.find('{')
            end = response_text.rfind('}') + 1
            if start != -1 and end != 0:
                json_str = response_text[start:end]
                print("Extracted JSON string:", json_str[:500] + "...")
                claude_response = json.loads(json_str)
            else:
                raise ValueError("No JSON found in response")
                
        except json.JSONDecodeError as e:
            print(f"JSON Parse Error: {str(e)}")
            print(f"Response text: {response_text}")
            raise
        
        return OrderedDict([
            ("course_name", claude_response.get("course_name", "")),
            ("program_manager", claude_response.get("program_manager", "")),
            ("instructors", claude_response.get("instructors", [])),
            ("summary", claude_response.get("summary", "")),
            ("embedded_images", claude_response.get("embedded_images", [])),
            ("full_text", claude_response.get("full_text", ""))
        ])
        
    except Exception as e:
        print(f"Error analyzing with Claude: {str(e)}")
        print(f"Full error context: ", e)
        return OrderedDict([
            ("course_name", ""),
            ("program_manager", ""),
            ("instructors", []),
            ("summary", f"Error analyzing content: {str(e)}"),
            ("embedded_images", []),
            ("full_text", "")
        ])

In [32]:
def process_files(config):
    """Main processing function"""
    s3_client = init_aws_client(config)
    claude_client = init_claude(config)
    
    tmp_dir = ensure_tmp_dir()
    json_output_dir = os.path.join(os.getcwd(), 'output', 'json')
    os.makedirs(json_output_dir, exist_ok=True)
    
    def list_all_files(bucket, prefix):
        """Recursively list all supported files"""
        files = []
        paginator = s3_client.get_paginator('list_objects_v2')
        
        for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
            if 'Contents' in page:
                for obj in page['Contents']:
                    file_ext = os.path.splitext(obj['Key'])[1].lower()
                    if file_ext in ['.pdf', '.jpg', '.jpeg', '.png']:
                        files.append(obj)
        
        return files

    files = list_all_files(
        config['aws']['upload_bucket_name'],
        config['aws']['upload_path']
    )
    
    print(f"Found {len(files)} files to process")
    
    for file_obj in files:
        file_key = file_obj['Key']
        file_name = os.path.basename(file_key)
        file_ext = os.path.splitext(file_name)[1].lower()[1:]
        
        print(f"\nProcessing: {file_key}")
        
        tmp_file_path = os.path.join(tmp_dir, file_name)
        json_filename = f"{os.path.splitext(file_name)[0]}.json"
        local_json_path = os.path.join(json_output_dir, json_filename)
        
        try:
            print(f"Downloading {file_key}")
            s3_client.download_file(
                config['aws']['upload_bucket_name'],
                file_key,
                tmp_file_path
            )

            print(f"Analyzing {file_ext.upper()} with Claude...")
            analysis = analyze_with_claude(claude_client, tmp_file_path, file_ext)
            
            final_output = OrderedDict([
                ("file_url", get_s3_public_url(
                    config['aws']['upload_bucket_name'],
                    file_key,
                    config['aws']['region_name']
                )),
                ("file_path", file_key),
                ("file_name", file_name),
                ("file_type", file_ext)
            ])
            final_output.update(analysis)
            
            # Save locally
            with open(local_json_path, 'w', encoding='utf-8') as f:
                json.dump(final_output, f, ensure_ascii=False, indent=2)
            
            # Upload to S3
            json_key = os.path.join(
                config['aws']['extract_txt_path'],
                json_filename
            ).replace('\\', '/')
            
            s3_client.put_object(
                Bucket=config['aws']['txt_extract_bucket_name'],
                Key=json_key,
                Body=json.dumps(final_output, ensure_ascii=False, indent=2).encode('utf-8')
            )

            print(f"Successfully processed: {file_name}")
            print(f"JSON saved: {local_json_path}")
            print(f"Uploaded to: s3://{config['aws']['txt_extract_bucket_name']}/{json_key}")

        except Exception as e:
            print(f"Error processing {file_name}:")
            print(f"Error type: {type(e).__name__}")
            print(f"Error message: {str(e)}")
        
        finally:
            if os.path.exists(tmp_file_path):
                try:
                    os.remove(tmp_file_path)
                except Exception as e:
                    print(f"Error cleaning up {tmp_file_path}: {e}")

In [33]:
def main():
    """Main entry point"""
    try:
        config = load_config()
        process_files(config)
    except Exception as e:
        print(f"Main process error: {str(e)}")
        raise

if __name__ == "__main__":
    main()

Found 32 files to process

Processing: ds/rag01/sources/JPG/AWS Practitioner Sylabus.jpg
Downloading ds/rag01/sources/JPG/AWS Practitioner Sylabus.jpg
Analyzing JPG with Claude...
Error analyzing with Claude: name 'base64' is not defined
Full error context:  name 'base64' is not defined
Successfully processed: AWS Practitioner Sylabus.jpg
JSON saved: C:\github_repos\BIU_LLM_Project\output\json\AWS Practitioner Sylabus.json
Uploaded to: s3://ct-external-sources/ds/rag01/extract/AWS Practitioner Sylabus.json

Processing: ds/rag01/sources/JPG/AWS Solution Architect Sylabus.jpg
Downloading ds/rag01/sources/JPG/AWS Solution Architect Sylabus.jpg
Analyzing JPG with Claude...
Error analyzing with Claude: name 'base64' is not defined
Full error context:  name 'base64' is not defined
Successfully processed: AWS Solution Architect Sylabus.jpg
JSON saved: C:\github_repos\BIU_LLM_Project\output\json\AWS Solution Architect Sylabus.json
Uploaded to: s3://ct-external-sources/ds/rag01/extract/AWS Solu

In [None]:
if __name__ == "__main__":
    main()

'\ndef main():\n    config = load_config()\n    process_files(config)\n\nif __name__ == "__main__":\n    main()\n'