# In [None]:  (Jupyter notebook cell marker left over from export)
import base64
import json
import os

import anthropic

class DirectCourseExtractor:
    """Extract structured course-syllabus data from a PDF via the Anthropic API.

    The PDF is sent inline (base64-encoded) together with an instruction
    prompt, and the JSON object found in the model's reply is returned as a
    ``dict``.
    """

    # Instruction prompt sent alongside the PDF.  The text (including its
    # internal indentation) is kept exactly as originally written so the
    # request content does not change.
    _EXTRACTION_PROMPT = """Analyze this document and extract the following information in JSON format. 
                                The document contains course syllabus information, likely in both Hebrew and English. 
                                Required JSON structure: {
                                    "course_name": "Course name in original language",
                                    "program_manager": "Look for 'מנהל התוכנית' or program manager",
                                    "instructors": [{
                                        "name": "Instructor name",
                                        "role": "Role (e.g., יועץ מקצועי, מרצה, מרצה בכיר, מדריך)",
                                        "title": "Professional title if available",
                                        "description": "Additional description or background"
                                    }],
                                    "summary": "A comprehensive summary of the course content",
                                    "embedded_images": ["List of image references"],
                                    "full_text": "The complete text from the document"
                                }
                                Keep all text in its original language (Hebrew and English).
                                If a field is not found, use null or empty array []."""

    def __init__(self, api_key: str, model: str = "claude-3-sonnet-20240229"):
        """Create the API client.

        Args:
            api_key: Anthropic API key used to authenticate requests.
            model: Model identifier passed to the Messages API.  Defaults to
                the value this class has always used.
        """
        self.client = anthropic.Client(api_key=api_key)
        # NOTE(review): PDF document input is only available on some models;
        # confirm the default below supports it or pass a newer model id.
        self.model = model

    def extract_course_info(self, pdf_path: str) -> dict:
        """Send the PDF at *pdf_path* to the model and return the parsed JSON.

        Args:
            pdf_path: Path of the PDF file to analyze.

        Returns:
            The JSON object found in the model's reply, as a ``dict``.

        Raises:
            Exception: On any failure (file read, API call, or unparseable
                reply).  The message is prefixed with
                ``"Error extracting course information: "`` and the original
                exception is attached as ``__cause__``.
        """
        # The broad catch is deliberate: callers rely on every failure
        # surfacing as a single Exception with a uniform message prefix.
        try:
            with open(pdf_path, 'rb') as file:
                pdf_bytes = file.read()

            message = self.client.messages.create(
                model=self.model,
                max_tokens=4096,
                temperature=0,
                messages=[
                    {
                        "role": "user",
                        "content": self._build_content(pdf_bytes),
                    }
                ],
            )

            # Concatenate the text blocks of the reply; non-text blocks
            # carry no JSON payload.
            response_text = "".join(
                block.text
                for block in message.content
                if getattr(block, "type", None) == "text"
            )
            return self._parse_json_response(response_text)
        except Exception as e:
            raise Exception(f"Error extracting course information: {str(e)}") from e

    def _build_content(self, pdf_bytes: bytes) -> list:
        """Build the user-message content: the PDF document plus the prompt."""
        # The API expects real base64 text here (the previous ``bytes.hex()``
        # output is a different encoding and would not decode to the PDF).
        encoded_pdf = base64.standard_b64encode(pdf_bytes).decode("ascii")
        return [
            {
                "type": "document",
                "source": {
                    "type": "base64",
                    "media_type": "application/pdf",
                    "data": encoded_pdf,
                },
            },
            {
                "type": "text",
                "text": self._EXTRACTION_PROMPT,
            },
        ]

    @staticmethod
    def _parse_json_response(response_text: str) -> dict:
        """Parse the outermost ``{...}`` span of *response_text* as JSON.

        Raises:
            ValueError: If no brace-delimited span exists or it is not valid
                JSON (``json.JSONDecodeError`` is chained as the cause).
        """
        # The model may wrap the JSON in prose, so take everything from the
        # first '{' to the last '}'.
        json_start = response_text.find('{')
        json_end = response_text.rfind('}')
        if json_start == -1 or json_end < json_start:
            raise ValueError(
                "Error parsing JSON response: no JSON object found in model reply"
            )
        try:
            return json.loads(response_text[json_start:json_end + 1])
        except json.JSONDecodeError as e:
            raise ValueError(f"Error parsing JSON response: {str(e)}") from e

def main(pdf_path: str = "course_syllabus.pdf", output_file: str = "course_info.json") -> None:
    """Extract course info from a PDF and write it to a JSON file.

    Reads the API key from the ``ANTHROPIC_API_KEY`` environment variable,
    runs the extraction, saves the result to *output_file* and echoes it to
    stdout.  Extraction or write failures are reported with a printed
    ``Error: ...`` line rather than raised.

    Args:
        pdf_path: Path of the syllabus PDF to process.
        output_file: Path of the JSON file to write the result to.

    Raises:
        ValueError: If ``ANTHROPIC_API_KEY`` is unset or empty.
    """
    # Get API key from environment variable
    api_key = os.getenv('ANTHROPIC_API_KEY')
    if not api_key:
        raise ValueError("Please set the ANTHROPIC_API_KEY environment variable")

    # Initialize extractor
    extractor = DirectCourseExtractor(api_key)

    try:
        course_info = extractor.extract_course_info(pdf_path)

        # ensure_ascii=False keeps Hebrew text readable instead of \u escapes.
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(course_info, f, ensure_ascii=False, indent=2)

        print(f"Course information extracted and saved to {output_file}")
        print("\nExtracted information:")
        print(json.dumps(course_info, ensure_ascii=False, indent=2))

    except Exception as e:
        # Best-effort script: report the failure and return normally.
        print(f"Error: {str(e)}")

# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()