# In [None]:
import os
import json
import yaml
import boto3
from langchain.llms import Anthropic
from langchain_community.chat_models import ChatAnthropic
import pytesseract
import pdf2image
import fitz  # PyMuPDF
from PIL import Image
import io
import tempfile
import cv2
import numpy as np

def load_config():
    """Load the pipeline configuration from ``config/config.yaml``.

    The path is relative to the current working directory.

    Returns:
        The parsed YAML document, as produced by ``yaml.safe_load``.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    with open('config/config.yaml', 'r', encoding='utf-8') as file:
        return yaml.safe_load(file)

def init_aws_client(config):
    """Create a boto3 S3 client from the ``aws`` section of *config*.

    Args:
        config: Mapping with an ``aws`` entry providing
            ``aws_access_key_id``, ``aws_secret_access_key`` and
            ``region_name``.

    Returns:
        The client object returned by ``boto3.client('s3', ...)``.
    """
    aws_settings = config['aws']
    client_kwargs = {
        option: aws_settings[option]
        for option in (
            'aws_access_key_id',
            'aws_secret_access_key',
            'region_name',
        )
    }
    return boto3.client('s3', **client_kwargs)

def init_claude(config):
    """Create the ``ChatAnthropic`` chat model used for document analysis.

    Args:
        config: Mapping whose ``anthropic`` section holds the API key under
            ``claud_key`` (spelled exactly like that in the config file).

    Returns:
        A ``ChatAnthropic`` instance bound to ``claude-3-sonnet-20240229``.
    """
    api_key = config['anthropic']['claud_key']
    model_name = "claude-3-sonnet-20240229"
    return ChatAnthropic(anthropic_api_key=api_key, model=model_name)

def download_file(s3_client, bucket, key):
    """Download an S3 object into a new temporary file and return its path.

    The caller owns the returned file and is responsible for deleting it.

    Args:
        s3_client: Object exposing ``download_file(bucket, key, filename)``.
        bucket: Name of the source bucket.
        key: Key of the object to download.

    Returns:
        Path of the temporary file holding the downloaded content.

    Raises:
        Exception: Whatever ``s3_client.download_file`` raises; the
            temporary file is removed before the error propagates.
    """
    # The file is created only to reserve a unique name. Close our handle
    # before downloading so nothing writes to a path that is still open here.
    with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
        temp_path = tmp_file.name

    try:
        s3_client.download_file(bucket, key, temp_path)
    except BaseException:
        # delete=False means nobody else cleans this up on failure.
        if os.path.exists(temp_path):
            os.remove(temp_path)
        raise
    return temp_path

def extract_text_from_image(image):
    """Extract text from a PIL Image object using Tesseract with Hebrew support.

    The image is converted to grayscale, denoised and binarised with an
    adaptive threshold before being handed to Tesseract.

    Args:
        image: A PIL ``Image`` in any mode that ``Image.convert('RGB')``
            accepts.

    Returns:
        The recognised text with leading/trailing whitespace stripped.
    """
    # Normalise to 3-channel RGB first. Grayscale, palette or 1-bit images
    # yield single-channel arrays, which the RGB colour conversion rejects.
    rgb_pixels = np.array(image.convert('RGB'))

    # Image preprocessing
    gray = cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2GRAY)
    denoised = cv2.fastNlMeansDenoising(gray)

    # Adaptive thresholding
    thresh = cv2.adaptiveThreshold(
        denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY, 11, 2
    )

    # Convert back to PIL Image
    processed_img = Image.fromarray(thresh)

    # Extract text with Tesseract, supporting both Hebrew and English
    text = pytesseract.image_to_string(
        processed_img,
        lang='eng+heb',  # Use both English and Hebrew languages
        config='--psm 3'  # Fully automatic page segmentation
    )

    return text.strip()

def extract_text_from_pdf(pdf_path):
    """Extract the text layer and OCR'd embedded images from a PDF.

    For every page, the page's own text is collected first, followed by the
    OCR output of each image embedded in that page. A failure on a single
    image is reported and skipped.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        All collected text fragments joined with newlines.
    """
    all_text = []

    # Open PDF with PyMuPDF
    doc = fitz.open(pdf_path)
    try:
        for page_num, page in enumerate(doc):
            # Extract text directly from PDF
            all_text.append(page.get_text())

            # Extract images from PDF
            for img_index, img in enumerate(page.get_images()):
                try:
                    # First item of each image entry is its xref number
                    xref = img[0]
                    base_image = doc.extract_image(xref)

                    # Decode the raw bytes with PIL and OCR the result;
                    # the context manager releases the decoded image.
                    with Image.open(io.BytesIO(base_image["image"])) as image:
                        image_text = extract_text_from_image(image)
                except Exception as e:
                    print(f"Error extracting image {img_index} from page {page_num}: {str(e)}")
                    continue

                if image_text:
                    all_text.append(image_text)
    finally:
        # Always release the document, even if a page fails to parse.
        doc.close()

    return "\n".join(all_text)

def extract_text_from_image_file(image_path):
    """Run OCR on the image stored at *image_path*.

    Any error while opening or processing the file is printed, and an empty
    string is returned instead of raising.
    """
    recognised = ""
    try:
        with Image.open(image_path) as source_image:
            recognised = extract_text_from_image(source_image)
    except Exception as err:
        print(f"Error processing image file: {str(err)}")
        recognised = ""
    return recognised

def analyze_with_claude(llm, text):
    """Ask the chat model to extract structured syllabus data from *text*.

    Args:
        llm: Chat model exposing ``invoke(prompt)``; the reply is read from
            the result's ``content`` attribute (or used directly if it has
            no such attribute).
        text: Extracted document text. Only the first 15000 characters are
            sent to the model.

    Returns:
        The JSON object parsed from the model's reply. On any failure
        (model error, no JSON object in the reply, invalid JSON) the error
        is printed and a fallback dict with empty fields and the summary
        ``"Error analyzing content"`` is returned.
    """
    prompt_template = """
    Analyze this document and extract the following information in JSON format.
    The text contains course syllabus information, likely in both Hebrew and English.
    
    Required JSON structure:
    {
        "course_name": "Course name in original language",
        "program_manager": "Look for 'מנהל התוכנית' or program manager",
        "instructors": [
            {
                "name": "Instructor name",
                "role": "Role (e.g., יועץ מקצועי, מרצה, מרצה בכיר, מדריך)",
                "title": "Professional title if available",
                "description": "Additional description or background"
            }
        ],
        "summary": "A comprehensive summary of the course content"
    }

    Keep all text in its original language (Hebrew or English).
    Ensure proper extraction of Hebrew text and names.
    If a field is not found, use null or empty array [].
    
    Document text:
    {text}
    """

    try:
        # The template contains literal JSON braces, so str.format() would
        # raise on it; substitute the single placeholder directly instead.
        prompt = prompt_template.replace("{text}", text[:15000])  # Limit text length

        response = llm.invoke(prompt)
        content = getattr(response, "content", response)
        if isinstance(content, list):
            # Some replies arrive as a list of content parts; keep the text.
            content = "".join(
                part.get("text", "") if isinstance(part, dict) else str(part)
                for part in content
            )

        # The model may wrap the JSON in prose or a code fence, so parse only
        # the span from the first "{" to the last "}".
        start = content.find("{")
        end = content.rfind("}")
        if start == -1 or end < start:
            raise ValueError("no JSON object found in model response")
        return json.loads(content[start:end + 1])
    except Exception as e:
        print(f"Error analyzing with Claude: {str(e)}")
        return {
            "course_name": "",
            "program_manager": "",
            "instructors": [],
            "summary": "Error analyzing content"
        }

def process_files(config):
    """Process every supported file under the configured S3 upload prefix.

    For each ``pdf``/``jpg``/``jpeg``/``png`` object the file is downloaded,
    its text extracted, analysed with Claude, and the combined result is
    written as ``<file stem>.json`` to the extraction bucket. A failure on
    one file (including its download) is printed and does not stop the
    remaining files.

    Args:
        config: Mapping whose ``aws`` section provides
            ``upload_bucket_name``, ``upload_path``, ``extract_txt_path``
            and ``txt_extract_bucket_name`` (plus whatever the client
            initialisers read).
    """
    # S3 keys always use "/" separators, independent of the host OS.
    import posixpath

    supported_extensions = {'pdf', 'jpg', 'jpeg', 'png'}

    s3_client = init_aws_client(config)
    llm = init_claude(config)

    # List all files in the upload bucket/path
    paginator = s3_client.get_paginator('list_objects_v2')
    pages = paginator.paginate(
        Bucket=config['aws']['upload_bucket_name'],
        Prefix=config['aws']['upload_path']
    )

    for page in pages:
        for obj in page.get('Contents', []):
            file_key = obj['Key']
            file_name = os.path.basename(file_key)
            file_stem, dotted_ext = os.path.splitext(file_name)
            file_ext = dotted_ext[1:].lower()

            if file_ext not in supported_extensions:
                continue

            print(f"Processing file: {file_name}")

            # Assigned inside the try so a failed download is handled like
            # any other per-file error instead of aborting the whole run.
            temp_path = None
            try:
                # Download file
                temp_path = download_file(
                    s3_client,
                    config['aws']['upload_bucket_name'],
                    file_key
                )

                # Extract text based on file type
                if file_ext == 'pdf':
                    extracted_text = extract_text_from_pdf(temp_path)
                else:
                    extracted_text = extract_text_from_image_file(temp_path)

                # Analyze with Claude
                analysis = analyze_with_claude(llm, extracted_text)

                # Create result JSON. The prompt allows null for missing
                # fields, so a null "instructors" must become an empty list
                # before it is iterated below.
                result = {
                    "file_name": file_name,
                    "course_name": analysis.get("course_name", ""),
                    "program_manager": analysis.get("program_manager", ""),
                    "instructors": analysis.get("instructors") or [],
                    "summary": analysis.get("summary", ""),
                    "text": extracted_text
                }

                # Ensure proper instructor structure
                for instructor in result["instructors"]:
                    if isinstance(instructor, dict):
                        instructor.setdefault("name", "")
                        instructor.setdefault("role", "")
                        instructor.setdefault("title", None)
                        instructor.setdefault("description", None)

                # Save to S3
                json_key = posixpath.join(
                    config['aws']['extract_txt_path'],
                    f"{file_stem}.json"
                )

                s3_client.put_object(
                    Bucket=config['aws']['txt_extract_bucket_name'],
                    Key=json_key,
                    Body=json.dumps(result, ensure_ascii=False, indent=2).encode('utf-8')
                )

                print(f"Successfully processed and saved: {file_name}")

            except Exception as e:
                print(f"Error processing file {file_name}: {str(e)}")

            finally:
                # Cleanup
                if temp_path and os.path.exists(temp_path):
                    os.remove(temp_path)

def main():
    """Load the configuration and run the file-processing pipeline."""
    process_files(load_config())

# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()