### Gemini 2.0 flash lite VS Tuned Gemini 2.0 flash lite

In [13]:
# Cell 1: Imports and Setup
import os
import sys
import logging
import time
import json
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display, Markdown # For displaying DataFrames and Markdown
import vertexai # Import base vertexai
from vertexai.generative_models import GenerativeModel, Part, FinishReason, Image as VertexImage # Specific imports
from google.api_core import exceptions as google_exceptions # For error handling
import inspect # To inspect function signature
# Import List and Dict for type hinting if needed, though often optional in notebooks
from typing import List, Dict, Any

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logging.getLogger("google.api_core").setLevel(logging.WARNING)
logging.getLogger("google.auth").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)
logging.getLogger("PIL").setLevel(logging.WARNING)

# --- Project Path Setup ---
# Notebook is in the project root directory
project_root = os.path.abspath('.')
src_path = os.path.join(project_root, 'src')

print(f"DEBUG: Project root: {project_root}")
print(f"DEBUG: Src path: {src_path}")

if os.path.isdir(src_path):
    if src_path not in sys.path:
        sys.path.insert(0, src_path)
        print(f"Added {src_path} to sys.path")
    else:
        print(f"{src_path} already in sys.path")
else:
    print(f"ERROR: src directory not found at {src_path}. Cannot import project modules.")
    raise FileNotFoundError(f"src directory not found at {src_path}")

# --- Import project modules ---
try:
    import config
    import utils
    # Import the specific functions needed from vllm_handler
    from vllm_handler import analyze_content, initialize_vertex_ai
    print("Project modules (config, utils, vllm_handler) imported successfully.")

    # Verify the handler has the modified function signature
    sig = inspect.signature(analyze_content)
    if 'model_id_override' not in sig.parameters:
         print("\nCRITICAL WARNING: vllm_handler.analyze_content is missing the 'model_id_override' parameter!")
         print("Please ensure src/vllm_handler.py is saved with the correct version and RESTART THE KERNEL.")
         # raise AttributeError("analyze_content function signature is incorrect.")
    else:
         print("Verified: vllm_handler.analyze_content has 'model_id_override' parameter.")

except ImportError as e:
    print(f"Error importing project modules from {src_path}: {e}")
    raise
except Exception as e:
    print(f"An unexpected error occurred loading config/utils: {e}")
    raise

print("\nSetup cell execution complete.")

DEBUG: Project root: /home/harishi/common_drive/Downloads/projects/GDG_HACKATHON
DEBUG: Src path: /home/harishi/common_drive/Downloads/projects/GDG_HACKATHON/src
/home/harishi/common_drive/Downloads/projects/GDG_HACKATHON/src already in sys.path
Project modules (config, utils, vllm_handler) imported successfully.
Verified: vllm_handler.analyze_content has 'model_id_override' parameter.

Setup cell execution complete.


In [14]:
# Cell 2: Configuration & Helper Functions (for Base vs Tuned Comparison)

# --- Model Configuration ---
BASE_MODEL_ID = "gemini-2.0-flash-lite-001"
# Your specific fine-tuned model ID
TUNED_MODEL_ID = "projects/248124319532/locations/europe-west4/models/8219698240602243072"

# --- Input/Output Configuration ---
# Construct paths relative to project_root (defined in Cell 1)
OUTPUT_DIR_PATH = os.path.join(project_root, 'outputs')
INPUT_IMAGE_DIR_PATH = os.path.join(project_root, 'inputs', 'jpeg') # Use JPEG input path
IMAGE_EXTENSIONS = {".jpg", ".jpeg"} # Focus on jpeg/jpg for this run

COMPARISON_OUTPUT_FILENAME = "comparison_base_vs_tuned_results.json" # Specific filename
GRAPH_OUTPUT_FILENAME = "comparison_base_vs_tuned_graph.png"
COMPARISON_TABLE_FILENAME = "comparison_base_vs_tuned_table.csv"

# --- Test Prompt ---
TEST_PROMPT = "Describe this image briefly, focusing on any text present."

# --- Debugging Print ---
print(f"DEBUG: Using Input Image Directory: {INPUT_IMAGE_DIR_PATH}")
# ---

# Create output directory if it doesn't exist
os.makedirs(OUTPUT_DIR_PATH, exist_ok=True)

# Store results for this specific comparison run
comparison_results_list = [] # Use this list within this notebook run
base_model_times = []
tuned_model_times = []

# --- Helper Functions ---

def get_image_files_in_dir(image_input_dir: str) -> List[str]:
    """Gets a list of absolute paths for specified IMAGE files in the directory."""
    files = []
    abs_image_input_dir = os.path.abspath(image_input_dir)
    print(f"DEBUG: [get_image_files_in_dir] Checking for images in: {abs_image_input_dir}")

    if not os.path.isdir(abs_image_input_dir):
        logging.error(f"Image input directory not found: {abs_image_input_dir}")
        print(f"ERROR: [get_image_files_in_dir] Image input directory not found: {abs_image_input_dir}")
        return files

    logging.info(f"Scanning for image files in: {abs_image_input_dir}")
    found_count = 0
    try:
        for filename in os.listdir(abs_image_input_dir):
            if filename.startswith('.'): continue
            _, file_extension = os.path.splitext(filename.lower())
            if file_extension in IMAGE_EXTENSIONS: # Check against specified extensions
                full_path = os.path.join(abs_image_input_dir, filename)
                if os.path.isfile(full_path):
                    files.append(os.path.abspath(full_path))
                    found_count += 1
                    logging.debug(f"Found image file: {full_path}")
    except Exception as e:
        logging.error(f"Error listing directory {abs_image_input_dir}: {e}")
        print(f"ERROR: [get_image_files_in_dir] Error listing directory {abs_image_input_dir}: {e}")
        return []

    logging.info(f"Found {found_count} image files in {abs_image_input_dir}.")
    print(f"DEBUG: [get_image_files_in_dir] Found {found_count} image files.")
    return files

# --- analyze_content_wrapper (uses the imported vllm_handler.analyze_content) ---
# This wrapper simplifies calling the function from the main loop and handles timing
def analyze_content_wrapper(file_path: str, model_to_use: str) -> tuple[str, float | None]:
    """
    Calls the main analysis function from vllm_handler with timing.
    Returns the result string and duration in seconds (or None if error).
    """
    start_time = time.time()
    try:
        # Ensure Vertex AI is initialized (should be done in Cell 4, but safe to check)
        if not initialize_vertex_ai():
            return "Error: Vertex AI Not Initialized", None

        # Call the imported analyze_content function from vllm_handler
        result_str = analyze_content(file_path, model_id_override=model_to_use)
        end_time = time.time()
        duration = end_time - start_time
        # Check if the result indicates an error from within analyze_content
        if result_str.startswith("Error:"):
             return result_str, None # Return error string, no valid duration
        else:
             return result_str, duration # Return success string and duration
    except Exception as e:
        logging.error(f"Unexpected error calling analyze_content for {os.path.basename(file_path)} with model {model_to_use}: {e}", exc_info=True)
        return f"Error: Wrapper exception - {e}", None


print("\nConfiguration and Helper Functions cell execution complete.")

DEBUG: Using Input Image Directory: /home/harishi/common_drive/Downloads/projects/GDG_HACKATHON/inputs/jpeg

Configuration and Helper Functions cell execution complete.


In [15]:
# --- Cell 3: Get Image Files ---

# Use the IMAGE helper function and the specific IMAGE input path (defined in Cell 2)
image_files_to_test = get_image_files_in_dir(INPUT_IMAGE_DIR_PATH)

if not image_files_to_test:
    raise FileNotFoundError(f"No JPEG/JPG image files found in {INPUT_IMAGE_DIR_PATH}. Please add files to the '{INPUT_IMAGE_DIR_PATH}' directory.")
else:
    print(f"\nFound {len(image_files_to_test)} image files for comparison.")

2025-05-03 13:50:55,382 - INFO - Scanning for image files in: /home/harishi/common_drive/Downloads/projects/GDG_HACKATHON/inputs/jpeg
2025-05-03 13:50:55,383 - INFO - Found 5 image files in /home/harishi/common_drive/Downloads/projects/GDG_HACKATHON/inputs/jpeg.


DEBUG: [get_image_files_in_dir] Checking for images in: /home/harishi/common_drive/Downloads/projects/GDG_HACKATHON/inputs/jpeg
DEBUG: [get_image_files_in_dir] Found 5 image files.

Found 5 image files for comparison.


In [16]:
# --- Cell 4: Initialize Vertex AI ---

# Initialize Vertex AI SDK using the function from vllm_handler (imported in Cell 1)
if initialize_vertex_ai():
    print("Vertex AI SDK initialized successfully for comparison.")
else:
    print("ERROR: Vertex AI SDK initialization failed. Cannot proceed.")
    # raise RuntimeError("Vertex AI SDK failed to initialize.")

Vertex AI SDK initialized successfully for comparison.


In [17]:
# --- Cell 5: Run Comparison Analysis ---

# Use the lists defined in Cell 2
# comparison_results_list = [] # Already defined in Cell 2
# base_model_times = []
# tuned_model_times = []
files_processed_base = 0
files_processed_tuned = 0
files_error_base = 0
files_error_tuned = 0

if image_files_to_test: # Proceed only if files were found
    # --- Run Base Model ---
    logging.info(f"\n--- Running Base Model: {BASE_MODEL_ID} ---")
    total_start_time_base = time.time()
    for file_path in image_files_to_test: # file_path is absolute
        relative_path = os.path.relpath(file_path, project_root) # Get path relative to project root
        logging.info(f"Processing {relative_path} with BASE model...")

        # Use the wrapper function defined in Cell 2
        result_str_base, duration_base = analyze_content_wrapper(file_path, model_to_use=BASE_MODEL_ID)
        files_processed_base += 1

        result_entry = {
            "file": relative_path,
            "base_model_output": result_str_base,
            "base_model_time_sec": None,
            "tuned_model_output": "N/A", # Placeholder
            "tuned_model_time_sec": None, # Placeholder
        }

        if result_str_base.startswith("Error:") or duration_base is None:
            logging.error(f"Base model error for {relative_path}: {result_str_base}")
            files_error_base += 1
        else:
            base_model_times.append(duration_base)
            result_entry["base_model_time_sec"] = duration_base
            logging.info(f"Base model success for {relative_path} in {duration_base:.2f}s")

        comparison_results_list.append(result_entry) # Add entry even if error occurred
        time.sleep(0.5) # Small delay

    total_end_time_base = time.time()
    logging.info(f"Base model run finished. Total time: {total_end_time_base - total_start_time_base:.2f}s")
    logging.info(f"Base model: {files_processed_base - files_error_base} successful, {files_error_base} errors.")

    # --- Run Tuned Model ---
    logging.info(f"\n--- Running Tuned Model: {TUNED_MODEL_ID} ---")
    total_start_time_tuned = time.time()

    # Iterate through the existing results list to update entries
    for i in range(len(comparison_results_list)):
        entry_to_update = comparison_results_list[i]
        relative_path = entry_to_update["file"]
        # Reconstruct absolute path - ensure project_root is correct
        file_path = os.path.join(project_root, relative_path)

        logging.info(f"Processing {relative_path} with TUNED model...")

        # Use the wrapper function defined in Cell 2
        result_str_tuned, duration_tuned = analyze_content_wrapper(file_path, model_to_use=TUNED_MODEL_ID)
        files_processed_tuned += 1

        # Update the dictionary in the list
        entry_to_update["tuned_model_output"] = result_str_tuned
        if result_str_tuned.startswith("Error:") or duration_tuned is None:
            logging.error(f"Tuned model error for {relative_path}: {result_str_tuned}")
            files_error_tuned += 1
        else:
            tuned_model_times.append(duration_tuned)
            entry_to_update["tuned_model_time_sec"] = duration_tuned
            logging.info(f"Tuned model success for {relative_path} in {duration_tuned:.2f}s")
        time.sleep(0.5) # Small delay


    total_end_time_tuned = time.time()
    logging.info(f"Tuned model run finished. Total time: {total_end_time_tuned - total_start_time_tuned:.2f}s")
    logging.info(f"Tuned model: {files_processed_tuned - files_error_tuned} successful, {files_error_tuned} errors.")

    logging.info("\nComparison processing finished.")
else:
    print("Skipping analysis run as no input image files were found.")

2025-05-03 13:50:55,396 - INFO - 
--- Running Base Model: gemini-2.0-flash-lite-001 ---
2025-05-03 13:50:55,398 - INFO - Processing inputs/jpeg/1.jpeg with BASE model...
2025-05-03 13:50:55,398 - INFO - Analyzing file: /home/harishi/common_drive/Downloads/projects/GDG_HACKATHON/inputs/jpeg/1.jpeg
2025-05-03 13:50:55,399 - INFO - Processing as MIME type: image/jpeg
2025-05-03 13:50:55,418 - ERROR - Failed to load or invalid image file /home/harishi/common_drive/Downloads/projects/GDG_HACKATHON/inputs/jpeg/1.jpeg: 'NoneType' object has no attribute 'close'
Traceback (most recent call last):
  File "/home/harishi/common_drive/Downloads/projects/GDG_HACKATHON/src/vllm_handler.py", line 107, in analyze_content
    mime_type = "text/plain"
  File "/home/harishi/.local/lib/python3.10/site-packages/PIL/ImageFile.py", line 172, in verify
    self.fp.close()
AttributeError: 'NoneType' object has no attribute 'close'
2025-05-03 13:50:55,419 - ERROR - Base model error for inputs/jpeg/1.jpeg: Error

In [18]:
# --- Cell 6: Save Full Results to JSON ---

# Save the detailed comparison results
comparison_output_path = os.path.join(OUTPUT_DIR_PATH, COMPARISON_OUTPUT_FILENAME)
if comparison_results_list: # Only save if there are results
    try:
        with open(comparison_output_path, 'w', encoding='utf-8') as f:
            json.dump(comparison_results_list, f, indent=4, ensure_ascii=False)
        print(f"Full comparison results saved to: {comparison_output_path}")
    except Exception as e:
        print(f"Error saving comparison JSON: {e}")
else:
    print("No comparison results generated to save.")

Full comparison results saved to: /home/harishi/common_drive/Downloads/projects/GDG_HACKATHON/outputs/comparison_base_vs_tuned_results.json


In [19]:
# --- Cell 7: Create and Display Pandas DataFrame ---

if comparison_results_list:
    # Convert the list of dictionaries to a Pandas DataFrame
    comparison_df = pd.DataFrame(comparison_results_list)

    # Save the DataFrame to CSV
    comparison_table_path = os.path.join(OUTPUT_DIR_PATH, COMPARISON_TABLE_FILENAME)
    try:
        comparison_df.to_csv(comparison_table_path, index=False)
        print(f"Comparison table saved to: {comparison_table_path}")
    except Exception as e:
        print(f"Error saving comparison CSV: {e}")


    # Display the first few rows of the DataFrame in the notebook
    print("\n--- Comparison Results Table (First 5 Rows) ---")
    pd.set_option('display.max_colwidth', 100)
    pd.set_option('display.max_rows', 10)
    display(comparison_df.head())

    # Display basic stats for timing columns (ignoring NaNs)
    print("\n--- Timing Statistics (seconds per successful file) ---")
    timing_stats = comparison_df[['base_model_time_sec', 'tuned_model_time_sec']].dropna().describe()
    if not timing_stats.empty:
         display(timing_stats)
    else:
         print("No successful timing data available for statistics.")

else:
    print("No comparison results to display in DataFrame.")

Comparison table saved to: /home/harishi/common_drive/Downloads/projects/GDG_HACKATHON/outputs/comparison_base_vs_tuned_table.csv

--- Comparison Results Table (First 5 Rows) ---


Unnamed: 0,file,base_model_output,base_model_time_sec,tuned_model_output,tuned_model_time_sec
0,inputs/jpeg/1.jpeg,Error: Could not load or invalid image file 1.jpeg.,,Error: Could not load or invalid image file 1.jpeg.,
1,inputs/jpeg/2.jpeg,Error: Could not load or invalid image file 2.jpeg.,,Error: Could not load or invalid image file 2.jpeg.,
2,inputs/jpeg/3.jpeg,Error: Could not load or invalid image file 3.jpeg.,,Error: Could not load or invalid image file 3.jpeg.,
3,inputs/jpeg/4.jpeg,Error: Could not load or invalid image file 4.jpeg.,,Error: Could not load or invalid image file 4.jpeg.,
4,inputs/jpeg/5.jpeg,Error: Could not load or invalid image file 5.jpeg.,,Error: Could not load or invalid image file 5.jpeg.,



--- Timing Statistics (seconds per successful file) ---


Unnamed: 0,base_model_time_sec,tuned_model_time_sec
count,0.0,0.0
unique,0.0,0.0
top,,
freq,,


In [20]:
# --- Cell 8: Generate and Display Comparison Graph ---

# Calculate average times only from successful runs
avg_time_base_ms = None
avg_time_tuned_ms = None

if base_model_times:
    avg_time_base_ms = (sum(base_model_times) / len(base_model_times)) * 1000 # Convert to ms
    print(f"Average Time Base Model (Successful Runs): {avg_time_base_ms:.2f} ms ({len(base_model_times)} files)")
else:
    print("No successful runs recorded for the base model.")

if tuned_model_times:
    avg_time_tuned_ms = (sum(tuned_model_times) / len(tuned_model_times)) * 1000 # Convert to ms
    print(f"Average Time Tuned Model (Successful Runs): {avg_time_tuned_ms:.2f} ms ({len(tuned_model_times)} files)")
else:
    print("No successful runs recorded for the tuned model.")


# Plotting only if both models had successful runs
if avg_time_base_ms is not None and avg_time_tuned_ms is not None:
    # Use more descriptive labels
    models = [f'Base\n({BASE_MODEL_ID})', f'Tuned\n(...{TUNED_MODEL_ID[-12:]})'] # Show last part of tuned ID
    avg_times = [avg_time_base_ms, avg_time_tuned_ms]

    fig, ax = plt.subplots(figsize=(8, 6))
    bars = ax.bar(models, avg_times, color=['skyblue', 'lightgreen'])
    ax.set_ylabel('Average Time per Image (ms)')
    ax.set_title('Base vs. Tuned Model Speed Comparison (Avg. Time for Successful Analyses)')
    ax.set_ylim(0, max(avg_times) * 1.2)

    # Add text labels to bars
    for bar in bars:
        yval = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2.0, yval, f'{yval:.0f} ms', va='bottom', ha='center', fontsize=9)

    # Save the graph
    graph_path = os.path.join(OUTPUT_DIR_PATH, GRAPH_OUTPUT_FILENAME)
    try:
        plt.savefig(graph_path)
        print(f"\nComparison graph saved to: {graph_path}")
    except Exception as e:
        print(f"\nError saving comparison graph: {e}")

    # Display the plot inline in the notebook
    plt.show()

else:
    print("\nCannot generate time comparison graph: Insufficient successful runs for one or both models.")

No successful runs recorded for the base model.
No successful runs recorded for the tuned model.

Cannot generate time comparison graph: Insufficient successful runs for one or both models.
