#### **1. IMPORTING LIBRARIES**

In [1]:
# !pip install -q -U google-genai
import base64
import json
import os
import time

import ollama
import pandas as pd
import PIL.Image
import requests
from google import genai
from google.genai import types
from openai import OpenAI
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.options import Options
from selenium.webdriver.edge.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

#### **2. IMAGE GATHERING**

In [None]:
# Initialize the WebDriver with the provided download directory
def setup_driver(download_dir, driver_path="msedgedriver.exe"):
    """
    Sets up an Edge WebDriver with a specified download directory.

    Parameters:
    - download_dir (str): Directory the browser should save downloads to.
    - driver_path (str): Path to the EdgeDriver executable. Defaults to
      "msedgedriver.exe", the value that was previously hard-coded.

    Returns:
    - The started Edge WebDriver. The caller is responsible for calling
      quit() on it when finished.
    """
    edge_options = Options()
    prefs = {
        "download.default_directory": download_dir,  # Set custom download directory
        "download.prompt_for_download": False,       # Disable download prompts
        "safebrowsing.enabled": True                 # Keep Safe Browsing checks enabled
    }
    edge_options.add_experimental_option("prefs", prefs)

    # The executable location is a parameter so the notebook can run on
    # machines where the driver is not next to the working directory.
    service = Service(executable_path=driver_path)
    driver = webdriver.Edge(service=service, options=edge_options)
    return driver

# Named download_dir (not `dir`) so the builtin dir() is not shadowed.
download_dir = r"D:\Work\Courses_Analyzer"
driver = setup_driver(download_dir)

# Everything that uses the browser runs inside try/finally so the Edge
# process is always shut down, even if scraping or downloading fails.
try:
    # Open the webpage
    driver.get("https://uitu.edu.pk/short-courses")

    # Load all courses on the webpage: keep clicking "load more" until the
    # button can no longer be found or clicked within the timeout.
    load_more_xpath = '//*[@id="eael-filter-gallery-wrapper-e5b5a9b"]/div[3]/a'
    while True:
        try:
            # Wait for the button to be clickable, not merely present in the
            # DOM, before clicking it.
            load_more = WebDriverWait(driver, 15).until(
                EC.element_to_be_clickable((By.XPATH, load_more_xpath))
            )
            load_more.click()
            time.sleep(5)  # give the newly requested courses time to render
        except WebDriverException:
            # Timeout or any other Selenium failure ends the loop; unlike a
            # bare `except`, Ctrl+C and programming errors still propagate.
            break

    # Locate the parent div that contains all the course images
    parent_div = driver.find_element(
        By.XPATH, '//*[@id="eael-filter-gallery-wrapper-e5b5a9b"]/div[2]'
    )

    # Find all image elements within the parent div
    image_elements = parent_div.find_elements(By.XPATH, './/div/div/div/img')

    # Create a directory to save the images (no error if it already exists)
    os.makedirs('course_images', exist_ok=True)

    # Loop through each image element, extract the src attribute, and download the image
    for i, img in enumerate(image_elements, start=1):
        img_url = img.get_attribute('src')
        if not img_url:
            print(f'No src attribute found for image {i}')
            continue

        try:
            # A timeout prevents one stalled request from hanging the run.
            response = requests.get(img_url, timeout=30)
        except requests.RequestException as e:
            # One unreachable or invalid URL should not abort the remaining downloads.
            print(f'Failed to download image {i}: {e}')
            continue

        if response.status_code == 200:
            with open(f'course_images/course_{i}.jpg', 'wb') as file:
                file.write(response.content)
            print(f'Downloaded course_{i}.jpg')
        else:
            print(f'Failed to download image {i}')
finally:
    # Close the WebDriver
    driver.quit()

#### **3. GEMINI INITIALIZATION AND PROMPT ENGINEERING**

In [None]:
# Read the API keys from the environment so real secrets never have to be
# pasted into the notebook. The original "YOUR_API_KEY" placeholder is kept
# as the fallback when the variable is not set.
client_openai = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "YOUR_API_KEY"))
client_gemini = genai.Client(api_key=os.environ.get("GEMINI_API_KEY", "YOUR_API_KEY"))

In [5]:
import json

# Function to encode the image
def encode_image(image_path):
    """Return the contents of the file at image_path as a Base64 text string."""
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    # b64encode yields bytes; decode so the result can be embedded in text.
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

def _empty_course_record():
    """Return a record with every expected course field set to None."""
    return {
        "Course name": None,
        "Course duration": None,
        "Course start date": None,
        "Class day": None,
        "Course fee": None,
        "Fee type": None,
        "Course Outline / Summary": None
    }


def _parse_course_reply(raw_text):
    """Parse a model reply into a dict, or return None if it is not a JSON object."""
    if not raw_text:
        # No text at all: nothing to parse.
        return None

    # Strip Markdown code fences that the model may wrap around the JSON.
    cleaned = raw_text.replace('```json', '').replace('```', '').strip()
    try:
        parsed = json.loads(cleaned)
    except json.JSONDecodeError:
        # The reply may contain prose around the JSON; retry on the span
        # between the first "{" and the last "}".
        start, end = cleaned.find('{'), cleaned.rfind('}')
        if start == -1 or end <= start:
            return None
        try:
            parsed = json.loads(cleaned[start:end + 1])
        except json.JSONDecodeError:
            return None

    # Callers use .get() on the result, so only a JSON object is acceptable.
    return parsed if isinstance(parsed, dict) else None


def extract_course_data(image_path, model="gemini-1.5-flash"):
    """
    Extract structured course information from a course advertisement image.

    The image is sent together with an extraction prompt to the Gemini client
    (`client_gemini`) and the reply is parsed as JSON.

    Parameters:
    - image_path (str): Path to the course advertisement image.
    - model (str): Model name passed to generate_content.

    Returns:
    - dict: The parsed JSON object. If the reply is empty, cannot be parsed
      as JSON, or is not a JSON object, a dict with every expected key set
      to None is returned instead.
    """
    prompt = """**Role**: You are a Senior Academic Data Extraction Specialist with 20+ years of experience in educational content analysis.
    
    **Task**: Analyze the following course advertisement image text and extract structured information with 99.9% accuracy. Do not infer or hallucinate any data beyond what is explicitly provided. If a field is ambiguous or missing, set its value to null.
    **Required Fields**:
    1. "Course name": Extract the official program title only (exclude any institution names).
    2. "Course duration": Extract the total length and its unit exactly as given. 
    3. "Course start date": Extract the start date in YYYY-MM-DD format.  
        - Otherwise, if no clear start date is provided, return null.
    4. "Class day": Extract the weekday names (e.g., "Monday, Wednesday") from the schedule grid as a comma-separated string.
    5. "Course fee": Extract only the numerical value of the fee. Remove any currency symbols or non-numeric characters.
    6. "Fee type": Extract exactly one of the following values based on the text: "One-time", "Monthly", "Weekly", or "Free". If none is explicitly mentioned, return null.
    7. "Course Outline / Summary": All relevent key modules/topics from curriculum section/outline/headlines. iF no bullet points or module list are given, but there is descriptive text about the course's goals or content, include that text as the outline. If there is no relevant descriptive text, return null.


    **Additional Format Rules**:
    - Output only a valid JSON object with the keys listed above.
    - If a field is ambiguous or not present, return null for that field.
    - Clean any text artifacts, watermarks, or logos from the image.
    - Remove any irrelevant information not related to the course content.
    - Dont include any information that is not explicitly mentioned in the image.
    - Remove any special characters or symbols from the extracted text.

    **Image Context**:
    - Academic brochure/flyer format.
    - Common sections: Title, Schedule, Fees, Curriculum.


    **Output**:
    """

    # Open the image in a context manager so its file handle is released
    # once the request has been sent.
    with PIL.Image.open(image_path) as img:
        response = client_gemini.models.generate_content(
            model=model,
            contents=[prompt, img]
        )
    raw_text = response.text

    # Alternative backend (OpenAI), kept for reference:
    #
    # base64_image = encode_image(image_path)
    # response = client_openai.chat.completions.create(
    #     model="gpt-4o-mini",
    #     messages=[
    #         { "role": "system", "content": prompt },
    #         {
    #             "role": "user",
    #             "content": [
    #                 {
    #                     "type": "text",
    #                     "text": "Please extract the following information from the course advertisement image:",
    #                 },
    #                 {
    #                     "type": "image_url",
    #                     "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
    #                 },
    #             ],
    #         }
    #     ],
    # )
    # raw_text = response.choices[0].message.content

    course_data = _parse_course_reply(raw_text)
    if course_data is None:
        print("Error parsing JSON, implementing fallback...")
        return _empty_course_record()
    return course_data

#### **4. DATA INSERTION TO DATAFRAME**

In [6]:
def process_course_images_to_df(image_folder='course_images', save_df=None, request_delay=2):
    """
    Process course images to DataFrame with structured course information.

    Every .jpg/.jpeg/.png file in image_folder is passed to
    extract_course_data; one row is produced per successfully processed image.

    Parameters:
    - image_folder (str): Path to directory containing course images
    - save_df (str): Optional filename to save final DataFrame (e.g., 'courses.csv')
    - request_delay (float): Seconds to wait between consecutive extraction
      calls to avoid rate limiting. No wait happens for skipped files or
      before the first call.

    Returns:
    - pd.DataFrame: DataFrame containing extracted course information plus a
      'Source Image' column. The columns are the same even when no image
      could be processed (the frame is then empty).
    """

    # Fields expected from the extraction step
    columns = [
        "Course name",
        "Course duration",
        "Course start date",
        "Class day",
        "Course fee",
        "Fee type",
        "Course Outline / Summary"
    ]
    all_columns = columns + ['Source Image']

    # Supported image formats
    valid_extensions = {'.jpg', '.jpeg', '.png'}

    try:
        # Sorted so the row order does not depend on the filesystem.
        filenames = sorted(os.listdir(image_folder))
    except OSError as e:
        print(f"Critical error: {str(e)}")
        return pd.DataFrame(columns=all_columns)

    # Rows are collected in a list and turned into a DataFrame once, instead
    # of re-concatenating the whole frame for every image.
    records = []
    api_calls = 0

    for filename in filenames:
        if os.path.splitext(filename)[1].lower() not in valid_extensions:
            continue

        filepath = os.path.join(image_folder, filename)

        # Throttle between extraction calls only. Waiting before each call
        # (except the first) also covers the case where the previous call failed.
        if api_calls and request_delay > 0:
            time.sleep(request_delay)
        api_calls += 1

        try:
            # Get JSON data from the extraction function
            course_data = extract_course_data(filepath)

            # Ensure all keys exist in case of partial extraction
            complete_data = {key: course_data.get(key) for key in columns}

            # Add source image path
            complete_data['Source Image'] = filepath

            print(complete_data)
            records.append(complete_data)
            print(f"Processed: {filename}")
        except Exception as e:
            # One failing image must not abort the whole batch.
            print(f"Error processing {filename}: {str(e)}")

    df = pd.DataFrame(records, columns=all_columns)

    try:
        # Post-processing
        if not df.empty:
            # Convert date column; unparseable values become NaT
            df['Course start date'] = pd.to_datetime(
                df['Course start date'],
                errors='coerce'
            )

            # Convert fee to numeric; unparseable values become NaN
            df['Course fee'] = pd.to_numeric(
                df['Course fee'],
                errors='coerce'
            )

            # Save if requested
            if save_df:
                df.to_csv(save_df, index=False)
                print(f"Data saved to {save_df}")
    except Exception as e:
        # Conversion or saving problems are reported but the collected
        # rows are still returned to the caller.
        print(f"Critical error: {str(e)}")

    return df

# df = process_course_images_to_df(image_folder='course_images', save_df='courses.csv')
# Run the extraction over the test image folder and write the result to CSV.
df = process_course_images_to_df(
    image_folder='test_courses',
    save_df='courses_extracted_data.csv',
)
df

{'Course name': 'WOMEN LEADERSHIP DEVELOPMENT PROGRAM', 'Course duration': '5 Days', 'Course start date': '2025-02-17', 'Class day': None, 'Course fee': None, 'Fee type': None, 'Course Outline / Summary': 'A women in leadership is a powerful force for change empowering others to thrive and driving organizations towards greater equity innovation and success', 'Source Image': 'test_courses\\course_1.jpg'}
Processed: course_1.jpg
{'Course name': 'Certified Mental Health Counselor', 'Course duration': '3-Month Duration', 'Course start date': '2024-10-12', 'Class day': 'Saturday', 'Course fee': '20000', 'Fee type': 'One-time', 'Course Outline / Summary': None, 'Source Image': 'test_courses\\course_10.jpg'}
Processed: course_10.jpg
{'Course name': 'amazon WHOLESALE FBA', 'Course duration': '2-Month', 'Course start date': '2024-10-13', 'Class day': 'Sunday', 'Course fee': '12000', 'Fee type': 'One-time', 'Course Outline / Summary': 'Amazon Basics, Product Analysis, How to source products, Shi

Unnamed: 0,Course name,Course duration,Course start date,Class day,Course fee,Fee type,Course Outline / Summary,Source Image
0,WOMEN LEADERSHIP DEVELOPMENT PROGRAM,5 Days,2025-02-17,,,,A women in leadership is a powerful force for ...,test_courses\course_1.jpg
1,Certified Mental Health Counselor,3-Month Duration,2024-10-12,Saturday,20000.0,One-time,,test_courses\course_10.jpg
2,amazon WHOLESALE FBA,2-Month,2024-10-13,Sunday,12000.0,One-time,"Amazon Basics, Product Analysis, How to source...",test_courses\course_11.jpg
3,GRAPHIC DESIGNER,,NaT,,,Free,"Poster Making, Flyers Design, Logo Design, Bus...",test_courses\course_12.jpg
4,DIGITAL MARKETING (with AI Tools),3-Month,NaT,Sunday,15000.0,One-time,"Social Media Marketing, Google Ads, YouTube Ma...",test_courses\course_13.jpg
5,GENERATIVE AI PROMPT ENGINEERING,2-Month,NaT,,16000.0,One-time,"Dive into the world of Generative AI,Master th...",test_courses\course_14.jpg
6,Data Visualization and Analysis in Tableau,2-Month,NaT,,16000.0,One-time,Master the art of data visualization & analysi...,test_courses\course_15.jpg
7,SOCIAL MEDIA MARKETING,2-Month,2024-09-07,Saturday,10000.0,One-time,"Facebook Ads, Instagram Ads, Linkedin Optimiza...",test_courses\course_16.jpg
8,PYTHON PROGRAMMING BASIC TO ADVANCE,3-Month,NaT,Saturday,18000.0,One-time,"Python Fundamentals, Python Intermediate Level...",test_courses\course_17.jpg
9,PROFESSIONAL CYBER SECURITY JOB TRAINING - Lev...,,2024-08-11,Sunday,,,,test_courses\course_18.jpg
