In [1]:
import base64  # Added base64 import
import io
import json
import os
from collections import OrderedDict
from urllib.parse import quote

import boto3
import fitz  # PyMuPDF
import pytesseract
import yaml
from anthropic import Anthropic
from bidi.algorithm import get_display  # For Hebrew RTL text display
from botocore.exceptions import NoCredentialsError, ClientError
from openai import OpenAI
from PIL import Image

In [2]:
# Show the process working directory: the relative locations used later
# ('config/config.yaml', 'tmp', 'data/documents') are resolved against it.
print("Current working directory:", os.getcwd())

Current working directory: C:\github_repos\BIU_LLM_Project


In [3]:
def load_config(config_file):
    """Load the pipeline configuration from a YAML file.

    Args:
        config_file (str): Path to the YAML configuration file.

    Returns:
        The object produced by ``yaml.safe_load`` for the file's content.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Decode explicitly as UTF-8. Without an explicit encoding, open() uses the
    # platform's locale codec, so the same file could be read differently (or
    # fail to decode) depending on the machine it runs on.
    with open(config_file, 'r', encoding='utf-8') as file:
        return yaml.safe_load(file)

In [4]:
def init_aws_client(config):
    """Build a boto3 S3 client from the ``aws`` section of the configuration.

    Args:
        config (dict): Configuration mapping whose ``aws`` entry provides
            ``aws_access_key_id``, ``aws_secret_access_key`` and ``region_name``.

    Returns:
        The object returned by ``boto3.client('s3', ...)``.
    """
    aws_settings = config['aws']
    client_kwargs = {
        'aws_access_key_id': aws_settings['aws_access_key_id'],
        'aws_secret_access_key': aws_settings['aws_secret_access_key'],
        'region_name': aws_settings['region_name'],
    }
    return boto3.client('s3', **client_kwargs)

In [5]:
def get_s3_public_url(bucket_name, file_key, region):
    """Build the virtual-hosted-style HTTPS URL for an object in an S3 bucket.

    Args:
        bucket_name (str): Name of the bucket.
        file_key (str): Object key inside the bucket (raw, not URL-encoded).
        region (str): AWS region name used in the host part of the URL.

    Returns:
        str: ``https://<bucket>.s3.<region>.amazonaws.com/<key>`` with the key
        percent-encoded.

    Note:
        This only formats the address; whether the object is actually
        reachable depends on the bucket's access settings, which are not
        checked here.
    """
    # Keys may contain spaces or non-ASCII characters (e.g. Hebrew file names);
    # percent-encode them so the result is a valid URL. '/' is kept literal so
    # the key's path-like structure is preserved.
    encoded_key = quote(file_key, safe='/')
    return f"https://{bucket_name}.s3.{region}.amazonaws.com/{encoded_key}"

In [6]:
def ensure_tmp_dir():
    """Return the path of a ``tmp`` folder under the current working directory.

    The folder is created when it does not exist. When it already exists, the
    regular files directly inside it are deleted; sub-directories and their
    contents are left in place. A failure to delete one file is reported on
    stdout and does not stop the clean-up.

    Returns:
        str: Path of the ``tmp`` directory.
    """
    scratch_dir = os.path.join(os.getcwd(), 'tmp')

    # Fresh directory: nothing to clean.
    if not os.path.exists(scratch_dir):
        os.makedirs(scratch_dir)
        return scratch_dir

    for entry_name in os.listdir(scratch_dir):
        file_path = os.path.join(scratch_dir, entry_name)
        try:
            if not os.path.isfile(file_path):
                continue
            os.unlink(file_path)
        except Exception as err:
            print(f'Error deleting {file_path}: {err}')
    return scratch_dir

In [7]:
def extract_text_from_file(file_path):
    """Extract the text of a document, including text recognized inside its images.

    The file is opened with PyMuPDF (``fitz``). For every page the regular
    text is collected first, then each image listed on the page is run through
    Tesseract OCR (Hebrew + English) and the recognized text is appended. A
    failure on a single image is reported on stdout and that image is skipped.

    Args:
        file_path (str): Path of the document to read.

    Returns:
        str: All collected pieces joined with newlines. The same string is
        also printed to stdout.

    Raises:
        Exception: Errors from opening the document or reading a page are not
        caught here; the document is still closed before they propagate.
    """
    doc = fitz.open(file_path)
    text_content = []  # page texts and OCR results, in reading order

    try:
        for page_num in range(len(doc)):
            page = doc[page_num]

            # Regular (selectable) text of the page
            text_content.append(page.get_text())

            for img_index, img in enumerate(page.get_images()):
                try:
                    # The first item of an image entry is its reference number (xref)
                    xref = img[0]
                    base_image = doc.extract_image(xref)
                    image_bytes = base_image["image"]  # raw encoded image bytes

                    image = Image.open(io.BytesIO(image_bytes))
                    ocr_text = pytesseract.image_to_string(image, lang="heb+eng")

                    text_content.append(f"\n{ocr_text}")

                except Exception as e:
                    # Best effort: one unreadable image must not discard the rest of the file
                    print(f"Error processing image {img_index} from page {page_num}: {str(e)}")
    finally:
        # Always release the document handle, even when a page-level call raises
        doc.close()

    extracted_output = "\n".join(text_content)

    print(f"\n Extracted text and images contetnt \n {extracted_output}")

    return extracted_output
    
   

In [8]:
def init_openai(config):
    """Create an OpenAI client from the API key stored in the configuration.

    Args:
        config (dict): Configuration mapping; the key is read from
            ``config['openai']['chat_gpt_key']``.

    Returns:
        The ``OpenAI`` client instance.
    """
    return OpenAI(api_key=config['openai']['chat_gpt_key'])

In [9]:
def analyze_with_openai_eng_new1(config, client, text):
    """Analyze syllabus text with two OpenAI chat calls and return the result as a JSON string.

    The first call asks the model to reorganize the document into a single
    ``full_text`` field. The second call asks for the course name, the list of
    instructors and a summary. Both calls receive the same raw ``text`` as the
    user message. The two results are merged into one object.

    Args:
        config (dict): Configuration; reads ``config['openai']['gpt_model']``,
            ``config['rag']['max_output_tokens_rag']`` and
            ``config['rag']['temperature_rag']``.
        client: OpenAI client instance (``client.chat.completions.create`` is called).
        text (str): Input text to analyze.

    Returns:
        str: A JSON document (non-ASCII characters kept as-is, 2-space indent)
        with the keys ``course_name``, ``instructors``, ``summary``,
        ``full_text`` and ``full_text_no_tokens`` (the completion-token count
        of the first call). If anything fails - invalid input, an API error,
        or a reply that cannot be parsed - the error is printed and a fallback
        document is returned instead: empty fields, ``full_text`` set to the
        original ``text`` and ``full_text_no_tokens`` set to 0.
    """
    # Defined before the try block so the error handler can always refer to them,
    # even when the failure happens before any reply was received.
    response_text1 = None
    response_text2 = None

    try:
        # Validate input text as UTF-8
        if not isinstance(text, str):
            raise ValueError("Input text must be a UTF-8 encoded string.")
        # Round-trip check: raises UnicodeEncodeError (a ValueError) for text
        # that cannot be encoded as UTF-8, e.g. lone surrogates.
        text = text.encode('utf-8').decode('utf-8')

        system_prompt = {
            "role": "system",
            "content": """
            You are a text extractor and analyzer top expert.
            The text is a course syllabus document that contains the course information.
            
            Your mission is to extract as much information as you can in the given text, and process it into a JSON format.
            Required JSON structure: {"full_text": "The syllabus complete extracted text after you organize it"}           
               
            HOW TO PROCESS THE FULL TEXT
            1. The text will start after @@@@
            2. Read the text very carefully.
            3. Text may be in (Hebrew, English or mixed).
            4. Preserve all language terms (Hebrew, English, or mixed) in their original language.
            5. DO NOT LIMIT RESPONSE LENGTH. WE NEED ALL AVAILABLE INFORMATION FROM THE SOURCE TEXT.
        
            6. For the full_text field, apply these transformations
               - Add topic sentences to each major section.
               - Add clear section headers in capital letters.
               - Normalize spacing and formatting.
               - End all sentences with proper punctuation.
               - Transform bullet points into complete sentences.
               - Create proper paragraphs from lists.
               - Add contextual transitions between sections.
               - Ensure hierarchical content structure.
               - Preserve important dates, numbers, and contact information.
               - Remove duplicate headers/footers.
               - Remove any text that says: "[Embedded Image ]" or similar.
               - Maintain consistent formatting throughout.

            7. Structure the text in these sections
               - COURSE NAME: extract the Course Name that may be in English or Hebrew, beginning with words like: ["Course", "Workshop", "Track Name", "קורס", "סדנה", "סדנא", "מסלול", "לימודי תעודה"]
               - COURSE OVERVIEW: course overview, or the program description.
               - TARGET AUDIENCE: if there are. If not found, skip this section
               - COURSE LEADERSHIP: Include all lecturers and leaders biography and their descriptions. Note for: Academic Director, Academic Leaders and their roles, Academic Advisors and Lecturers etc. If not found, skip this section.
               - PREREQUISITES: if there are. If not found, skip this section
               - CERTIFICATIONS: Create a comma delimited list for all certifications, awards achievements or recognitions. Look for certification terms like: ["certificates", "Certified", "cc"] if there are.
               - COURSE SCHEDULE: look for start date, end date, duration, academic hours, schedules. Be bullet focused.
               - LEARNING FORMAT: look for things like location, in class, remote or hybrid learning options, laboratories, practicum, staj, etc. - if there are: be bullet focused. If not found, skip this section.
               - TUITION AND FEES: look for registration fees, tuition, parking fees. If there are: be bullet focused. If not found, skip this section
               - COURSE TOPICS: Bring ALL learning topics that are in text. BE EXTREMELY DETAILED. DO NOT OMIT ANYTHING. Organize into topics and their bullets.
               - REGISTRATION AND CONTACT INFORMATION: if there are: be bullet focused. If not found, skip this section
               - INSTITUTIONAL INFORMATION: Focused institutional information. If there are: be focused. If not found, skip this section            

            8. Writing style guidelines
               - Use complete sentences.
               - Add context to technical terms.
               - Maintain consistent tense and voice.
               - Ensure clarity and readability.
               - Preserve all information from the given text.

            9. Formatting requirements
               - Use consistent paragraph spacing.
               - Ensure proper nesting of information.
               - Keep important identifiers (emails, URLs, phone numbers etc.) in their original format.
        
            Text to analyze after this @@@@
            """
        }        

        # First call: reorganize the whole document into "full_text"
        response = client.chat.completions.create(
            model=config['openai']['gpt_model'],
            messages=[system_prompt, {"role": "user", "content": text}],
            max_tokens=config['rag']['max_output_tokens_rag'],
            temperature=config['rag']['temperature_rag']
        )

        response_text1 = response.choices[0].message.content.strip()
        response_tokens1 = response.usage.completion_tokens

        print(f"\nLLM response_text1 {response_text1}")

        # The model may wrap the JSON in extra prose: keep only the span from
        # the first '{' to the last '}'.
        start = response_text1.find('{')
        end = response_text1.rfind('}') + 1

        if start != -1 and end != 0:
            json_str1 = response_text1[start:end]
            initial_data = json.loads(json_str1)
            initial_data_full_text = initial_data.get("full_text", text)
        else:
            raise ValueError("No JSON found in response")

        # Second prompt for detailed extraction
        system_prompt = {
            "role": "system",
            "content": """           
            You are a text extractor and analyzer top expert.
            Your mission is to extract as much information as you can in the given text process it into a JSON format.
            The text is a course syllabus document that contains the course information.
            
            HOW TO PROCESS THE TEXT
            1. The text will start after @@@@
            2. Read the text very carefully.
            3. Text may be in (Hebrew, English or mixed).
            4. Preserve all language terms (Hebrew, English, or mixed) in their original language.
            5. DO NOT LIMIT RESPONSE LENGTH. WE NEED ALL AVAILABLE INFORMATION FROM THE SOURCE TEXT.
            
            WHAT INFORMATION TO EXTRACT
            1. The complete course name exactly as it appears
            
            2. From COURSE LEADERSHIP section:
               - Create a LIST of ALL instructors, including every teaching staff member mentioned in the text
               - For each instructor in the list, extract:
                 * Full name as written
                 * Their role/position (e.g., יועץ מקצועי, מרצה, מרצה בכיר, מדריך, מנהל אקדמי, מוביל אקדמי)
                 * Professional title (if mentioned)
                 * Any additional description or background information
               - Make sure to capture EVERY instructor mentioned, even if some details are missing
               - List format should handle multiple instructors
            
            3. From the entire text:
               - Create a comprehensive course summary
               - Do not use text from INSTITUTIONAL INFORMATION or motto.
            
            4. Return the information in this exact JSON format:
            {
                "course_name": "Complete course name as extracted",
                "instructors": [
                    {
                        "name": "First instructor name",
                        "role": "Role (e.g., יועץ מקצועי, מרצה, מרצה בכיר, מדריך)",
                        "title": "Professional title if available",
                        "description": "Additional description"
                    }
                    // Continue for all instructors found
                ],
                "summary": "Comprehensive course summary"
            }
    
                Important notes for JSON output:
                1. Use null for missing fields
                2. Use empty array [] if no instructors are found          
                3. Maintain JSON list format even if only one instructor is found
    
            Text to analyze after this @@@@
            """
        }              

        # Second call: course name, instructors and summary
        response = client.chat.completions.create(
            model=config['openai']['gpt_model'],
            messages=[system_prompt, {"role": "user", "content": text}],
            max_tokens=config['rag']['max_output_tokens_rag'],
            temperature=config['rag']['temperature_rag']
        )

        response_text2 = response.choices[0].message.content.strip()
        print(f"\nLLM response_text2 {response_text2}")

        extract_start = response_text2.find('{')
        extract_end = response_text2.rfind('}') + 1

        if extract_start != -1 and extract_end != 0:
            extracted_data = json.loads(response_text2[extract_start:extract_end])
        else:
            # No JSON at all in the second reply: keep the first call's result
            # and leave the detailed fields empty.
            extracted_data = {
                "course_name": None,
                "instructors": [],
                "summary": None
            }

        # Create merged JSON output
        merged_json = {
            "course_name": extracted_data.get("course_name"),
            "instructors": extracted_data.get("instructors", []),
            "summary": extracted_data.get("summary"),
            "full_text": initial_data_full_text,
            "full_text_no_tokens": response_tokens1
        }

        # ensure_ascii=False keeps Hebrew readable instead of \uXXXX escapes
        return json.dumps(merged_json, ensure_ascii=False, indent=2)

    except (json.JSONDecodeError, ValueError) as e:
        print(f"JSON Parse Error: {str(e)}")
        # Show the most recent reply: that is the one whose parsing failed.
        # It is None when the failure happened before any reply was received.
        latest_response = response_text2 if response_text2 is not None else response_text1
        print(f"LLM Response text: {latest_response}")

    except Exception as e:
        print(f"Error analyzing with OpenAI: {str(e)}")

    # Reached only after a handled failure: return the fallback document.
    # default=str keeps the fallback itself from failing on a non-serializable `text`.
    return json.dumps({
        "course_name": None,
        "instructors": [],
        "summary": None,
        "full_text": text,
        "full_text_no_tokens": 0
    }, ensure_ascii=False, indent=2, default=str)

In [10]:
def analyze_with_openai_eng(config, client, text):
    """Analyze syllabus text with two OpenAI chat calls and return the merged result as a dict.

    The first call asks the model to reorganize the document into a single
    ``full_text`` field. The second call asks for the course name, the list of
    instructors and a summary. Both calls receive the same raw ``text`` as the
    user message.

    Args:
        config (dict): Configuration; reads ``config['openai']['gpt_model']``,
            ``config['rag']['max_output_tokens_rag']`` and
            ``config['rag']['temperature_rag']``.
        client: OpenAI client instance (``client.chat.completions.create`` is called).
        text (str): Input text to analyze.

    Returns:
        dict: The keys ``course_name``, ``instructors``, ``summary``,
        ``full_text`` and ``full_text_no_tokens`` (the completion-token count
        of the first call). If anything fails - invalid input, an API error,
        or a reply that cannot be parsed - the error is printed and a fallback
        dict is returned instead: empty fields, ``full_text`` set to the
        original ``text`` and ``full_text_no_tokens`` set to 0.
    """
    # Defined before the try block so the error handler can always refer to them,
    # even when the failure happens before any reply was received.
    response_text1 = None
    response_text2 = None

    try:
        # Validate input text as UTF-8
        if not isinstance(text, str):
            raise ValueError("Input text must be a UTF-8 encoded string.")
        # Round-trip check: raises UnicodeEncodeError (a ValueError) for text
        # that cannot be encoded as UTF-8, e.g. lone surrogates.
        text = text.encode('utf-8').decode('utf-8')

        system_prompt = {
            "role": "system",
            "content": """
            You are a text extractor and analyzer top expert.
            The text is a course syllabus document that contains the course information.
            
            Your mission is to extract as much information as you can in the given text, and process it into a JSON format.
            Required JSON structure: {"full_text": "The sylabus complete extracted text after you orgenize it"}           
               
            HOW TO PROCESS THE FULL TEXT
            1. The text will start after @@@@
            2. Read the text very carefully.
            3. Text may be in (Hebrew, English or mixed).
            4. Preserve all language terms (Hebrew, English, or mixed) in their original language.
            5. DO NOT LIMIT RESPONSE LENGTH. WE NEED ALL AVAILABLE INFORMATION FROM THE SOURCE TEXT.
        
            6. For the full_text field, apply these transformations
               - Add topic sentences to each major section.
               - Add clear section headers in capital letters.
               - Normalize spacing and formatting.
               - End all sentences with proper punctuation.
               - Transform bullet points into complete sentences.
               - Create proper paragraphs from lists.
               - Add contextual transitions between sections.
               - Ensure hierarchical content structure.
               - Preserve important dates, numbers, and contact information.
               - Remove duplicate headers/footers.
               - Remove any text that says: "[Embedded Image ]" or similar.
               - Maintain consistent formatting throughout.

            7. Structure the text in these sections
               - COURSE NAME: extract the Course Name that may in English or Hebrew, begining with words like: ["Course" , "Workshop" , "Track Name", "קורס" , "סדנה" , "סדנא", "מסלול" , "לימודי תעודה"]
               - COURSE OVERVIEW: course overview, or the program description.
               - TARGET AUDIENCE: if there are.if not found, skip this section
               - COURSE LEADERSHIP: Include all lecturers and leaders biography and their descriptions. Note for: Academic Director, Academic Leaders and their roles, Academic Advisors and Lecturers etc. if not found, skip this section.
               - PREREQUISITES: if there are. if not found, skip this section
               - CERTIFICATIONS: Create a comma delimited list for all certifications, awards acheivments or recognitions. Look for certification terms like: ["certificates" , "Certified" "cc"] if there are.
               - COURSE SCHEDULE: look for start date, end date, duration, accademic hours, schedules. be bullet focused.
               - LEARNING FORMAT: look for things like location, in class, remote or hybrid learning options, laboratories, practicum, staj, etc. - if there are: be bullet focused. if not found, skip this section.
               - TUITION AND FEES: look for registration fees, tuition, parking fess. if there are: be bullet focused. if not found, skip this section
               - COURSE TOPICS: Bring ALL learning topics that in text. BE EXTREAMLY DETAILED. DO NOT OMIT ANYTHING. Orgenize to topics and their bullets.
               - REGISTRATION AND CONTACT INFORMATION: if there are: be bullet focused. if not found, skip this section
               - INSTITUTIONAL INFORMATION: Focused institutional information. if there are: be focused. if not found, skip this section            

            8. Writing style guidelines
               - Use complete sentences.
               - Add context to technical terms.
               - Maintain consistent tense and voice.
               - Ensure clarity and readability.
               - Preserve all information from the given text.

            9. Formatting requirements
               - Use consistent paragraph spacing.
               - Ensure proper nesting of information.
               - Keep important identifiers (emails, URLs, phone numbers etc.) in their original format.
        
            Text to analyze after this @@@@
            """
        }        

        # First call: reorganize the whole document into "full_text"
        response = client.chat.completions.create(
            model=config['openai']['gpt_model'],
            messages=[system_prompt, {"role": "user", "content": text}],
            max_tokens=config['rag']['max_output_tokens_rag'],
            temperature=config['rag']['temperature_rag']
        )

        response_text1 = response.choices[0].message.content.strip()
        response_tokens1 = response.usage.completion_tokens

        print (f"\n LLM response_text1 {response_text1}")

        # The model may wrap the JSON in extra prose: keep only the span from
        # the first '{' to the last '}'.
        start = response_text1.find('{')
        end = response_text1.rfind('}') + 1

        if start != -1 and end != 0:
            json_str1 = response_text1[start:end]
            initial_data = json.loads(json_str1)
            initial_data_full_text = initial_data.get("full_text", text)
        else:
            raise ValueError("No JSON found in response")

        # Second prompt for detailed extraction
        system_prompt = {
            "role": "system",
            "content": """           
            You are a text extractor and analyzer top expert.
            Your mission is to extract as much information as you can in the given text process it into a JSON format.
            The text is a course syllabus document that contains the course information.
            
            HOW TO PROCESS THE TEXT
            1. The text will start after @@@@
            2. Read the text very carefully.
            3. Text may be in (Hebrew, English or mixed).
            4. Preserve all language terms (Hebrew, English, or mixed) in their original language.
            5. DO NOT LIMIT RESPONSE LENGTH. WE NEED ALL AVAILABLE INFORMATION FROM THE SOURCE TEXT.
            
            WHAT INFORMATION TO EXTRACT
            1. The complete course name exactly as it appears
            
            2. From COURSE LEADERSHIP section:
               - Create a LIST of ALL instructors, including every teaching staff member mentioned in the text
               - For each instructor in the list, extract:
                 * Full name as written
                 * Their role/position (e.g., יועץ מקצועי, מרצה, מרצה בכיר, מדריך, מנהל אקדמי, מוביל אקדמי)
                 * Professional title (if mentioned)
                 * Any additional description or background information
               - Make sure to capture EVERY instructor mentioned, even if some details are missing
               - List format should handle multiple instructors
            
            3. From the entire text:
               - Create a comprehensive course summary
               - Do not use text from INSTITUTIONAL INFORMATION or motto.
            
            4. Return the information in this exact JSON format:
            {
                "course_name": "Complete course name as extracted",
                "instructors": [
                    {
                        "name": "First instructor name",
                        "role": "Role (e.g., יועץ מקצועי, מרצה, מרצה בכיר, מדריך)",
                        "title": "Professional title if available",
                        "description": "Additional description"
                    }
                    // Continue for all instructors found
                ],
                "summary": "Comprehensive course summary"
            }
    
                Important notes for JSON output:
                1. Use null for missing fields
                2. Use empty array [] if no instructors are found          
                3. Maintain JSON list format even if only one instructor is found
    
            Text to analyze after this @@@@
            """
        }              

        # Second call: course name, instructors and summary
        response = client.chat.completions.create(
            model=config['openai']['gpt_model'],
            messages=[system_prompt, {"role": "user", "content": text}],
            max_tokens=config['rag']['max_output_tokens_rag'],
            temperature=config['rag']['temperature_rag']
        )

        response_text2 = response.choices[0].message.content.strip()
        print (f"\n LLM response_text2 {response_text2}")

        extract_start = response_text2.find('{')
        extract_end = response_text2.rfind('}') + 1

        if extract_start != -1 and extract_end != 0:
            extracted_data = json.loads(response_text2[extract_start:extract_end])
        else:
            # No JSON at all in the second reply: keep the first call's result
            # and leave the detailed fields empty.
            extracted_data = {
                "course_name": None,
                "instructors": [],
                "summary": None
            }

        # Merge both results; the caller is responsible for serializing the dict.
        return {
            "course_name": extracted_data.get("course_name"),
            "instructors": extracted_data.get("instructors", []),
            "summary": extracted_data.get("summary"),
            "full_text": initial_data_full_text,
            "full_text_no_tokens": response_tokens1
        }

    except (json.JSONDecodeError, ValueError) as e:
        print(f"JSON Parse Error: {str(e)}")
        # Show the most recent reply: that is the one whose parsing failed.
        # It is None when the failure happened before any reply was received.
        latest_response = response_text2 if response_text2 is not None else response_text1
        print(f"LLM Response text: {latest_response}")

    except Exception as e:
        print(f"Error analyzing with OpenAI: {str(e)}")

    # Reached only after a handled failure: return the fallback result.
    return {
        "course_name": None,
        "instructors": [],
        "summary": None,
        "full_text": text,
        "full_text_no_tokens": 0
    }

In [11]:
def process_files(config):
    """Download each supported source file from S3, analyze it with OpenAI and save the result as JSON.

    Steps:
      1. Create the S3 client, the temporary download folder and the output
         folders ``data/documents/eng`` and ``data/documents/heb`` under the
         current working directory.
      2. List the objects under ``config['aws']['sources_path']`` in the bucket
         ``config['aws']['sources_bucket_name']``.
      3. For every object whose file name ends with one of
         ``config['local']['supported_file_types']`` (compared
         case-insensitively): download it, extract its text, analyze the text
         and write the analysis to ``data/documents/eng/<file name>.json``.

    Args:
        config (dict): Pipeline configuration (``aws``, ``local`` and the
            sections read by the helpers this function calls).

    Raises:
        Exception: Any error while handling a file is printed and re-raised,
        which stops the run at that file.
    """
    # Initialize AWS S3 client using provided configuration
    s3_client = init_aws_client(config)

    # Temporary directory for the downloaded source files
    tmp_dir = ensure_tmp_dir()

    # Output folders, relative to the current working directory
    documents_root = os.path.join(os.getcwd(), 'data', 'documents')
    json_output_dir_eng = os.path.join(documents_root, 'eng')
    os.makedirs(json_output_dir_eng, exist_ok=True)

    # Nothing is written to the 'heb' folder by this function; it is still
    # created so the on-disk layout stays the same as before.
    json_output_dir_heb = os.path.join(documents_root, 'heb')
    os.makedirs(json_output_dir_heb, exist_ok=True)

    # Initialize OpenAi client using provided configuration
    openai_client = init_openai(config)

    # File names are lower-cased before matching, so the configured suffixes
    # must be lower-cased too; otherwise an entry such as ".PDF" never matches.
    supported_file_types = tuple(
        suffix.lower() for suffix in config['local']['supported_file_types']
    )

    # List all files in AWS S3 upload bucket/path using the S3 paginator
    paginator = s3_client.get_paginator('list_objects_v2')
    pages = paginator.paginate(
        Bucket=config['aws']['sources_bucket_name'],
        Prefix=config['aws']['sources_path']
    )

    for page in pages:
        # A page without matches has no 'Contents' entry
        for obj in page.get('Contents', []):
            s3_file_key = obj['Key']
            file_name = os.path.basename(s3_file_key)

            # Process only supported file types
            if not file_name.lower().endswith(supported_file_types):
                continue

            print("\n=================================================================================")
            print(f"Start Processing file: {file_name}")

            # Local paths: downloaded source file and its JSON analysis
            tmp_download_file_path = os.path.join(tmp_dir, file_name)
            json_filename = f"{os.path.splitext(file_name)[0]}.json"
            local_json_path_eng = os.path.join(json_output_dir_eng, json_filename)

            try:
                # Download file to temporary location
                s3_client.download_file(
                    config['aws']['sources_bucket_name'], s3_file_key,
                    tmp_download_file_path
                )

                # Extract text from the file (regular text plus OCR of images)
                extracted_text = extract_text_from_file(tmp_download_file_path)

                # Analyze text with OpenAi
                analysis_eng = analyze_with_openai_eng(config, openai_client, extracted_text)

                print(f"\nOpenAi Final Analysis Eng:\n{analysis_eng}")

                # Save analysis as UTF-8 JSON; ensure_ascii=False keeps Hebrew readable
                with open(local_json_path_eng, 'w', encoding="utf-8") as json_file:
                    json.dump(analysis_eng, json_file, ensure_ascii=False, indent=2)

                print(f"Saved analysis to {local_json_path_eng}")

            except Exception as e:
                print(f"Error processing {file_name}: {str(e)}")
                raise



In [12]:
def main():
    """Entry point: read ``config/config.yaml`` and run the file-processing pipeline.

    Any exception is reported on stdout and then re-raised unchanged.
    """
    config_file = 'config/config.yaml'
    try:
        process_files(load_config(config_file))
    except Exception as err:
        print(f"Main process error: {str(err)}")
        raise

# Run the pipeline only when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    main()


Start Processing file: AWS Practitioner Sylabus.pdf

 Extracted text and images contetnt 
 b i u . a c . i l
.משפיעים על המחר, היום
 AWS Cloud
Practitioner
:נושאי הקורס
Define what the AWS Cloud is and 
the basic global infrastructure
Describe basic AWS Cloud 
architectural principles
Describe the AWS Cloud value 
proposition
Describe key services on the AWS 
platform and their common use 
cases (for example, compute and 
analytics)
Describe basic security and 
compliance aspects of the AWS 
platform and the shared security 
model
Define the billing, account 
management, and pricing models
Identify sources of documentation 
or technical assistance
Describe basic/core characteristics 
of deploying and operating in the 
AWS Cloud
29.11.2024  :תאריך פתיחה
21:30-17:30 בימי שלישי בין השעות
13:00-09:00 בימי שישי בין השעות
 שעות אקדמיות)50(  מפגשים10 במשך
 בית הספר להייטק וסייבר
אילן מזמין אתכם -של אוניברסיטת בר
AWS ללמוד את טכנולוגיות הענן של
 הקורס היברידי: הלימודים יתקיימו בקמפוס
אילן בר״