In [9]:
import google.generativeai as genai
from dotenv import load_dotenv
import os

# Pull settings such as GEMINI_API_KEY from the local .env file into the environment
load_dotenv()

# Authenticate the Gemini client, failing fast when no key is available
api_key = os.getenv("GEMINI_API_KEY")
if api_key:
    genai.configure(api_key=api_key)
else:
    raise ValueError("GEMINI_API_KEY not found in .env file")

# Print every model returned for this key together with the methods it supports
for model in genai.list_models():
    print(f"Model: {model.name}, Supported Methods: {model.supported_generation_methods}")

Model: models/chat-bison-001, Supported Methods: ['generateMessage', 'countMessageTokens']
Model: models/text-bison-001, Supported Methods: ['generateText', 'countTextTokens', 'createTunedTextModel']
Model: models/embedding-gecko-001, Supported Methods: ['embedText', 'countTextTokens']
Model: models/gemini-1.0-pro-vision-latest, Supported Methods: ['generateContent', 'countTokens']
Model: models/gemini-pro-vision, Supported Methods: ['generateContent', 'countTokens']
Model: models/gemini-1.5-pro-latest, Supported Methods: ['generateContent', 'countTokens']
Model: models/gemini-1.5-pro-001, Supported Methods: ['generateContent', 'countTokens', 'createCachedContent']
Model: models/gemini-1.5-pro-002, Supported Methods: ['generateContent', 'countTokens', 'createCachedContent']
Model: models/gemini-1.5-pro, Supported Methods: ['generateContent', 'countTokens']
Model: models/gemini-1.5-flash-latest, Supported Methods: ['generateContent', 'countTokens']
Model: models/gemini-1.5-flash-001, Su

In [3]:
import os
import json
from google.generativeai import GenerativeModel
import google.generativeai as genai
from PIL import Image
from dotenv import load_dotenv
import base64
from io import BytesIO
import time

# Load environment variables from the .env file so that os.getenv("GEMINI_API_KEY")
# in configure_api() can find the key
load_dotenv()

# Step 1: Setup API Configuration
def configure_api(model_name='gemini-2.0-flash'):
    """Configure the Gemini client from the environment and return a model handle.

    Args:
        model_name: Name of the model to instantiate. Defaults to
            'gemini-2.0-flash', the value that used to be hard-coded, so
            existing zero-argument calls behave as before.

    Returns:
        A GenerativeModel created for ``model_name``.

    Raises:
        ValueError: If GEMINI_API_KEY is unset, empty, or only whitespace.
    """
    # Treat a whitespace-only value the same as a missing key instead of
    # sending it to the API and failing later with a less helpful error.
    api_key = (os.getenv("GEMINI_API_KEY") or "").strip()
    if not api_key:
        raise ValueError("GEMINI_API_KEY not found in .env file")
    genai.configure(api_key=api_key)
    return GenerativeModel(model_name)

# Step 2: Load and Process Image
def load_image(image_path):
    """Open an image, decode it fully, and report its pixel dimensions.

    Args:
        image_path: Path of the image file to open.

    Returns:
        A tuple ``(img, width, height)`` on success, or ``(None, None, None)``
        if the file cannot be opened or decoded (an error line is printed).
    """
    try:
        # Image.open() only reads the header. Calling load() here forces the
        # pixel data to be decoded now, so a truncated or corrupt file is
        # caught by this handler instead of failing later during encoding.
        # Leaving the ``with`` block releases the underlying file handle while
        # the decoded image stays usable.
        with Image.open(image_path) as img:
            img.load()
        width, height = img.size
        return img, width, height
    except Exception as e:
        print(f"Error loading image {image_path}: {e}")
        return None, None, None

# Step 3: Convert Image to Base64
def image_to_base64(image):
    """Encode an image as JPEG and return the bytes as a base64 text string.

    Args:
        image: A PIL-style image exposing ``mode``, ``convert`` and ``save``.

    Returns:
        The JPEG bytes, base64-encoded and decoded to a ``str``.

    Note:
        Images whose mode the JPEG encoder cannot write (for example RGBA, LA
        or palette images) are converted to RGB first; any alpha channel is
        dropped by that conversion.
    """
    # Saving e.g. an RGBA image as JPEG raises OSError, so normalise
    # unsupported modes up front and leave supported ones untouched.
    if image.mode not in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr"):
        image = image.convert("RGB")
    buffered = BytesIO()
    image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")

# Step 4: Generate Scene Graph using Gemini API
def generate_scene_graph(model, image, width, height, image_name):
    """Ask the model for a Thai-language scene graph of ``image`` and parse it.

    Args:
        model: Object exposing ``generate_content(parts)`` whose result has a
            ``text`` attribute.
        image: Image handed to ``image_to_base64`` for JPEG/base64 encoding.
        width: Pixel width of the image; included in the prompt when both
            ``width`` and ``height`` are not None.
        height: Pixel height of the image; see ``width``.
        image_name: Name used only in the printed progress/error messages.

    Returns:
        The parsed JSON value, or None if encoding, the request, or JSON
        parsing fails (a diagnostic line is printed in each case).
    """
    prompt = """
    Generate a detailed image scene graph of this image using the Thai language based on this template

    {
       "width": 640,
       "height": 480,
       "location": "living room",
       "weather": "none",
       "objects": {
           "271881": {
               "name": "chair",
               "x": 220,
               "y": 310,
               "w": 50,
               "h": 80,
               "attributes": ["brown", "wooden", "small"],
               "relations": {
                   "32452": {
                       "name": "on",
                       "object": "275312"
                   },
                   "32453": {
                       "name": "near",
                       "object": "279472"
                   }
               }
           }
       }
    }
    """

    # The 640x480 in the template is only an example. Tell the model the real
    # size so it does not copy the example values.
    if width is not None and height is not None:
        prompt += (
            f"\n    The actual image is {width} pixels wide and {height} pixels high. "
            "Use these values for the width and height fields, and give every "
            "object's x, y, w and h in pixels of this image.\n"
        )

    try:
        # Encode inside the try block so an image that cannot be encoded is
        # reported and skipped like every other failure.
        image_base64 = image_to_base64(image)

        start_time = time.time()
        response = model.generate_content([
            prompt,
            {"mime_type": "image/jpeg", "data": image_base64}
        ])
        duration = time.time() - start_time
        print(f"Scene graph generation for {image_name} took {duration:.2f} seconds")

        raw_response = response.text.strip()
        if not raw_response:
            print(f"Error: Empty response for scene graph {image_name}")
            return None

        # Drop the Markdown code fences that are often wrapped around the JSON.
        raw_response = raw_response.replace("```json", "").replace("```", "")
        try:
            return json.loads(raw_response)
        except json.JSONDecodeError as e:
            # The reply may carry prose before/after the JSON; retry on the
            # outermost {...} span before giving up.
            start = raw_response.find("{")
            end = raw_response.rfind("}")
            if 0 <= start < end:
                try:
                    return json.loads(raw_response[start:end + 1])
                except json.JSONDecodeError:
                    pass  # fall through and report the original parse error
            print(f"JSON parsing error for scene graph {image_name}: {e}")
            print(f"Raw response length: {len(raw_response)}")
            print(f"Raw response (first 100 chars): {raw_response[:100]}")
            return None
    except Exception as e:
        print(f"Error generating scene graph for {image_name}: {e}")
        return None

# Step 5: Generate GQA Questions
def generate_gqa_questions(model, scene_graph, image, image_name, num_questions=20):
    """Ask the model for GQA-style questions about an image and its scene graph.

    Args:
        model: Object exposing ``generate_content(parts)`` whose result has a
            ``text`` attribute.
        scene_graph: JSON-serialisable scene graph embedded in the prompt.
        image: Image handed to ``image_to_base64`` for JPEG/base64 encoding.
        image_name: Name used only in the printed progress/error messages.
        num_questions: Number of questions requested in the prompt. Defaults
            to 20, the value that used to be hard-coded.

    Returns:
        The parsed JSON value (the prompt asks for an array), or None if
        encoding, the request, or JSON parsing fails (a diagnostic line is
        printed in each case).
    """
    prompt = f"""
    Based on the following scene graph, generate {num_questions} GQA-style questions in English with this format:
    
        question: Is there a red apple on the table?,
        answer: no,
        fullAnswer: No, there is an apple but it is green.
    but in json format
    
    Scene graph: {json.dumps(scene_graph, ensure_ascii=False)}
    
    Return the questions as a JSON array.
    """

    try:
        # Encode inside the try block so an image that cannot be encoded is
        # reported and skipped like every other failure.
        image_base64 = image_to_base64(image)

        start_time = time.time()
        response = model.generate_content([
            prompt,
            {"mime_type": "image/jpeg", "data": image_base64}
        ])
        duration = time.time() - start_time
        print(f"GQA question generation for {image_name} took {duration:.2f} seconds")

        raw_response = response.text.strip()
        if not raw_response:
            print(f"Error: Empty response for GQA questions {image_name}")
            return None

        # Drop the Markdown code fences that are often wrapped around the JSON.
        raw_response = raw_response.replace("```json", "").replace("```", "")
        try:
            return json.loads(raw_response)
        except json.JSONDecodeError as e:
            # The reply may carry prose before/after the array; retry on the
            # outermost [...] span before giving up.
            start = raw_response.find("[")
            end = raw_response.rfind("]")
            if 0 <= start < end:
                try:
                    return json.loads(raw_response[start:end + 1])
                except json.JSONDecodeError:
                    pass  # fall through and report the original parse error
            print(f"JSON parsing error for GQA questions {image_name}: {e}")
            print(f"Raw response length: {len(raw_response)}")
            print(f"Raw response (first 100 chars): {raw_response[:100]}")
            return None
    except Exception as e:
        print(f"Error generating GQA questions for {image_name}: {e}")
        return None

# Step 6: Save Scene Graph and GQA Questions
def save_output(scene_graph, gqa_questions, output_path_base):
    """Write the scene graph, and the questions when present, as JSON files.

    The scene graph always goes to ``<output_path_base>_scene_graph.json``.
    The questions go to ``<output_path_base>_gqa_questions.json`` only when
    ``gqa_questions`` is truthy. Files are UTF-8 with non-ASCII text kept
    unescaped and a 4-space indent. Any exception is printed, not raised.
    """
    # (label used in the log line, payload, file suffix, always written?)
    outputs = (
        ("Scene graph", scene_graph, "_scene_graph.json", True),
        ("GQA questions", gqa_questions, "_gqa_questions.json", False),
    )
    try:
        for label, payload, suffix, always in outputs:
            if not always and not payload:
                continue
            destination = f"{output_path_base}{suffix}"
            with open(destination, 'w', encoding='utf-8') as handle:
                json.dump(payload, handle, ensure_ascii=False, indent=4)
            print(f"{label} saved to {destination}")
    except Exception as err:
        print(f"Error saving output to {output_path_base}: {err}")

# Main Pipeline
def run_pipeline(image_folder=os.path.join('data', 'sea-vqa', 'thailand'), output_folder='output'):
    """Generate and save a scene graph and GQA questions for every .jpeg image.

    Args:
        image_folder: Folder scanned (non-recursively) for files whose name
            ends in ``.jpeg`` (case-insensitive). The default is built with
            ``os.path.join`` so it matches the previous Windows-style default
            on Windows and is a valid relative path elsewhere.
        output_folder: Folder that receives the JSON outputs; created if
            missing.

    Returns:
        None. A failed API configuration or an unreadable ``image_folder`` is
        printed and ends the run; images that fail to load or to produce a
        scene graph are skipped.
    """
    os.makedirs(output_folder, exist_ok=True)

    try:
        model = configure_api()
    except ValueError as e:
        print(e)
        return

    try:
        # os.listdir() order is arbitrary; sort so repeated runs visit the
        # images in the same order.
        image_names = sorted(os.listdir(image_folder))
    except OSError as e:
        print(f"Error listing images in {image_folder}: {e}")
        return

    for image_name in image_names:
        if not image_name.lower().endswith('.jpeg'):
            continue

        image_path = os.path.join(image_folder, image_name)
        img, width, height = load_image(image_path)
        if img is None:
            continue

        try:
            # Generate scene graph
            scene_graph = generate_scene_graph(model, img, width, height, image_name)
            if scene_graph is None:
                continue

            # Generate GQA questions
            gqa_questions = generate_gqa_questions(model, scene_graph, img, image_name)

            # Strip only the final extension: split('.')[0] would collapse
            # "a.v1.jpeg" and "a.v2.jpeg" onto the same output name.
            stem = os.path.splitext(image_name)[0]
            save_output(scene_graph, gqa_questions, os.path.join(output_folder, stem))
        finally:
            # Release the image's resources once this file has been handled.
            img.close()

# Run the pipeline with its default input and output folders, but only when this
# code is executed as the main module (not when it is imported from elsewhere)
if __name__ == "__main__":
    run_pipeline()

Scene graph generation for thailand_12006.jpeg took 8.18 seconds
GQA question generation for thailand_12006.jpeg took 7.82 seconds
Scene graph saved to output\thailand_12006_scene_graph.json
GQA questions saved to output\thailand_12006_gqa_questions.json
Scene graph generation for thailand_12007.jpeg took 7.01 seconds
GQA question generation for thailand_12007.jpeg took 7.59 seconds
Scene graph saved to output\thailand_12007_scene_graph.json
GQA questions saved to output\thailand_12007_gqa_questions.json
Scene graph generation for thailand_12008.jpeg took 6.87 seconds


KeyboardInterrupt: 