In [1]:
import os
import json

from langchain_openai import ChatOpenAI
from ragas.llms import LangchainLLMWrapper
from datasets import load_dataset

from document_processor import TextProcessor, ImageProcessor, PageImageProcessor, ImageTextualSummaryProcessor, ImageTextualSummaryProcessorLarge
from multimodal_rag import MultimodalRAG
from embedder import OpenAIEmbedder, ColPaliEmbedder
from pdf_to_qa import generate_qa_for_pdf, generate_chartQA_pdf_and_json
from evaluation import evaluate_generation, evaluate_generation_chartQA, compute_mrr_at_k, compute_recall_at_k, compute_precision_at_k, compute_f1_score, compute_map_at_k

first gen of gold answers no batching

In [None]:
from common_utils import call_gpt_4
import base64
from openai import OpenAI
import os
import json
import base64
from typing import List
from ragas.evaluation import evaluate
from ragas.metrics import AnswerCorrectness, MultiModalRelevance, MultiModalFaithfulness
from ragas import evaluate, EvaluationDataset
from datasets import Dataset
from tqdm import tqdm

# Load and format gold dataset
def load_gold_dataset(gold_path) -> List[dict]:
    with open(gold_path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Load your dataset (assume it's already a list of dicts)
gold_entries = load_gold_dataset("QA_coinqa_gold_final.json")

# Process each entry
for entry in tqdm(gold_entries, desc="Processing entries"):
    base64_str = entry["image_base64"]
    question = entry["question"]

    user_prompt = [
        {"type": "text", "text": question},
        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_str}"}}
    ]

    try:
        generated_answer = call_gpt_4(user_prompt)
        entry["generated_answer"] = generated_answer
    except Exception as e:
        entry["generated_answer"] = f"Error: {str(e)}"

# Optionally, save back to a file
import json
with open("QA_coinqa_with_generated_answers.json", "w") as f:
    json.dump(gold_entries, f, indent=2)

second with batching

In [None]:
from common_utils import call_gpt_4
import json
import os
import time
from typing import List
from tqdm import tqdm

# Load gold dataset
def load_gold_dataset(gold_path) -> List[dict]:
    with open(gold_path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Paths
input_path = "json_files/ChartQA_QA_Mapping.json"
output_path = "json_files/ChartQA_QA_Mapping_with_generated_answers.json"


gold_entries = load_gold_dataset(input_path)

# Load existing results if they exist
if os.path.exists(output_path):
    with open(output_path, "r", encoding="utf-8") as f:
        saved_entries = json.load(f)
    completed_questions = {
        (e["page_number"], e["question"]) for e in saved_entries if "generated_answer" in e
    }
else:
    saved_entries = []
    completed_questions = set()

# Process and save
for i, entry in enumerate(tqdm(gold_entries, desc="Processing entries")):
    key = (entry["page_number"], entry["question"])
    
    if key in completed_questions:
        continue  # Skip already processed

    base64_str = entry["image_base64"]
    question = entry["question"]

    user_prompt = [
        {"type": "text", "text": question},
        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_str}"}}
    ]

    # Retry loop for rate limits
    max_retries = 5
    for attempt in range(max_retries):
        try:
            generated_answer = call_gpt_4(user_prompt)
            entry["generated_answer"] = generated_answer
            break  # Success, exit retry loop
        except Exception as e:
            error_message = str(e).lower()
            if "rate limit" in error_message or "429" in error_message:
                wait_time = 20 * (attempt + 1)  # Increasing delay on each retry
                print(f"Rate limit hit. Sleeping for {wait_time} seconds (attempt {attempt + 1}/{max_retries})...")
                time.sleep(wait_time)
            else:
                entry["generated_answer"] = f"Error: {str(e)}"
                break

    saved_entries.append(entry)

    # Save every 10 entries
    if len(saved_entries) % 10 == 0:
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(saved_entries, f, indent=2)

    # Fixed delay after each entry to prevent hitting rate limits
    time.sleep(20)

# Final save
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(saved_entries, f, indent=2)


Processing entries: 100%|██████████| 55/55 [23:43<00:00, 25.88s/it]


In [None]:
gold_entries = load_gold_dataset("QA_coinqa_with_generated_answers.json")
for item in gold_entries:
    print(f"Generated Answer: {item['generated_answer']}")
    print(f"Reference Answer: {item['answer']}")
    print()
    

## Evaluate generation

In [10]:
import os
import json
import base64
from typing import List
from ragas.evaluation import evaluate
from ragas.metrics import AnswerCorrectness, MultiModalRelevance, MultiModalFaithfulness
from ragas import evaluate, EvaluationDataset
from datasets import Dataset


evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini")) # For Ragas evaluation

# Load and format gold dataset
def load_gold_dataset(gold_path) -> List[dict]:
    with open(gold_path, 'r', encoding='utf-8') as f:
        return json.load(f)


# Format entries for Ragas
def format_for_ragas(gold_entries: List[dict]) -> List[dict]:
    formatted = []
    for item in gold_entries:
        formatted.append({
            "user_input": item["question"],
            #"retrieved_contexts": [item["image_base64"]],
            "response": item["generated_answer"],
            "reference": item["answer"]
        })
    return formatted

# Define the file path for storing the results
results_folder = "resultsbatch"
os.makedirs(results_folder, exist_ok=True)

def evaluate_generation_chartQA(rag_answers: List[dict], evaluator_llm):
    evaluation_dataset = EvaluationDataset.from_list(rag_answers)
    result = evaluate(
        dataset=evaluation_dataset, 
        metrics=[AnswerCorrectness()], 
        llm=evaluator_llm,
    )
    return result  # Returning raw result for debugging

# Function to evaluate in batches
def evaluate_in_batches(rag_generated_answers, evaluator_llm, batch_size=10):
    total_metrics = {"faithful_rate": 0.0, "relevance_rate": 0.0, "answer_correctness": 0.0}
    total_batches = len(rag_generated_answers) // batch_size + (1 if len(rag_generated_answers) % batch_size > 0 else 0)
    
    for batch_index in range(total_batches):
        # Define the batch's results file path
        batch_file_path = os.path.join(results_folder, f"batch_{batch_index+1}.json")
        
        # Check if the batch result already exists
        if os.path.exists(batch_file_path):
            print(f"Batch {batch_index+1} results already exist: {batch_file_path}")
            with open(batch_file_path, 'r', encoding='utf-8') as file:
                batch_results = json.load(file)
                print(f"Loaded batch {batch_index+1} results: {batch_results}")
        else:
            # Create a subset of the answers for this batch
            batch_start = batch_index * batch_size
            batch_end = min((batch_index + 1) * batch_size, len(rag_generated_answers))
            subset_rag_generated_answers = rag_generated_answers[batch_start:batch_end]
            
            # Evaluate the batch
            faithfulness_and_relevance = str(evaluate_generation_chartQA(subset_rag_generated_answers, evaluator_llm))
            
            # Replace single quotes with double quotes
            json_string = faithfulness_and_relevance.replace("'", '"')
            
            # Save the batch results
            with open(batch_file_path, 'w', encoding='utf-8') as f:
                f.write(json_string)  # Write string instead of using json.dump()
            print(f"Results for batch {batch_index+1} saved to: {batch_file_path}")
            
            # Load the batch results into the dictionary
            batch_results = json.loads(json_string)
        
        # Accumulate metrics for averaging
        total_metrics["faithful_rate"] += batch_results.get("faithful_rate", 0)
        total_metrics["relevance_rate"] += batch_results.get("relevance_rate", 0)
        total_metrics["answer_correctness"] += batch_results.get("answer_correctness", 0)
    
    # Calculate the average of the metrics
    total_batches = max(1, total_batches)  # Avoid division by zero in case of empty input
    average_metrics = {
        "average_faithful_rate": total_metrics["faithful_rate"] / total_batches,
        "average_relevance_rate": total_metrics["relevance_rate"] / total_batches,
        "average_answer_correctness": total_metrics["answer_correctness"] / total_batches
    }

    return average_metrics

# Example usage of the function
if __name__ == "__main__":
    gold_entries = load_gold_dataset("QA_coinqa_with_generated_answers.json")
    #gold_entries = load_gold_dataset("json_files/ChartQA_QA_Mapping.json")

    # Format for evaluation
    formatted = format_for_ragas(gold_entries)

    # Assuming rag_generated_answers and evaluator_llm are already defined
    average_metrics = evaluate_in_batches(formatted, evaluator_llm)
    print("Final average metrics:", average_metrics)



Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Results for batch 1 saved to: resultsbatch/batch_1.json


Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Results for batch 2 saved to: resultsbatch/batch_2.json


Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Results for batch 3 saved to: resultsbatch/batch_3.json


Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Results for batch 4 saved to: resultsbatch/batch_4.json


Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Results for batch 5 saved to: resultsbatch/batch_5.json


Evaluating:   0%|          | 0/5 [00:00<?, ?it/s]

Results for batch 6 saved to: resultsbatch/batch_6.json
Final average metrics: {'average_faithful_rate': 0.0, 'average_relevance_rate': 0.0, 'average_answer_correctness': 0.5320666666666667}


dont know what the code is below

In [None]:
import os
import json
import base64
import time
from typing import List
from ragas.evaluation import evaluate
from ragas.metrics import AnswerCorrectness, MultiModalRelevance, MultiModalFaithfulness
from ragas import evaluate, EvaluationDataset
from datasets import Dataset
from openai import RateLimitError
from ragas.llms import LangchainLLMWrapper
from langchain.chat_models import ChatOpenAI

# Initialize LLM wrapper
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))

# Load and format gold dataset
def load_gold_dataset(gold_path) -> List[dict]:
    with open(gold_path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Format entries for Ragas
def format_for_ragas(gold_entries: List[dict]) -> List[dict]:
    formatted = []
    for item in gold_entries:
        formatted.append({
            "user_input": item["question"],
            "retrieved_contexts": [item["image_base64"]],
            "response": item["answer"],
            "reference": item["answer"]
        })
    return formatted

# Define the file path for storing the results
results_folder = "resultsbatch"
os.makedirs(results_folder, exist_ok=True)

# Evaluation function
def evaluate_generation_chartQA(rag_answers: List[dict], evaluator_llm):
    evaluation_dataset = EvaluationDataset.from_list(rag_answers)
    result = evaluate(
        dataset=evaluation_dataset,
        metrics=[MultiModalFaithfulness(), MultiModalRelevance()],
        llm=evaluator_llm,
    )
    return result

# Batch evaluation function with retry and sleep
def evaluate_in_batches(rag_generated_answers, evaluator_llm, batch_size=50):
    total_metrics = {"faithful_rate": 0.0, "relevance_rate": 0.0, "answer_correctness": 0.0}
    total_batches = len(rag_generated_answers) // batch_size + (1 if len(rag_generated_answers) % batch_size > 0 else 0)

    for batch_index in range(total_batches):
        batch_file_path = os.path.join(results_folder, f"batch_{batch_index+1}.json")

        if os.path.exists(batch_file_path):
            print(f"Batch {batch_index+1} results already exist: {batch_file_path}")
            with open(batch_file_path, 'r', encoding='utf-8') as file:
                batch_results = json.load(file)
                print(f"Loaded batch {batch_index+1} results: {batch_results}")
        else:
            batch_start = batch_index * batch_size
            batch_end = min((batch_index + 1) * batch_size, len(rag_generated_answers))
            subset_rag_generated_answers = rag_generated_answers[batch_start:batch_end]

            # Retry logic for rate limiting
            max_retries = 5
            retry_delay = 2  # seconds
            for attempt in range(max_retries):
                try:
                    faithfulness_and_relevance = str(evaluate_generation_chartQA(subset_rag_generated_answers, evaluator_llm))
                    break
                except RateLimitError as e:
                    print(f"Rate limit hit on batch {batch_index+1} (attempt {attempt+1}/{max_retries}). Retrying in {retry_delay}s...")
                    time.sleep(retry_delay)
                    retry_delay *= 2
            else:
                raise RuntimeError(f"Failed to evaluate batch {batch_index+1} after {max_retries} retries due to rate limits.")

            # Convert to JSON string and save
            json_string = faithfulness_and_relevance.replace("'", '"')
            with open(batch_file_path, 'w', encoding='utf-8') as f:
                f.write(json_string)
            print(f"Results for batch {batch_index+1} saved to: {batch_file_path}")

            batch_results = json.loads(json_string)

            # Sleep between batches to prevent hitting limits
            time.sleep(1)

        # Accumulate metrics
        total_metrics["faithful_rate"] += batch_results.get("faithful_rate", 0)
        total_metrics["relevance_rate"] += batch_results.get("relevance_rate", 0)
        total_metrics["answer_correctness"] += batch_results.get("answer_correctness", 0)

    # Compute averages
    total_batches = max(1, total_batches)
    average_metrics = {
        "average_faithful_rate": total_metrics["faithful_rate"] / total_batches,
        "average_relevance_rate": total_metrics["relevance_rate"] / total_batches,
        "average_answer_correctness": total_metrics["answer_correctness"] / total_batches
    }

    return average_metrics

# # Example usage
# if __name__ == "__main__":
#     gold_entries = load_gold_dataset("json_files/ChartQA_QA_Mapping.json")
#     formatted = format_for_ragas(gold_entries)
#     average_metrics = evaluate_in_batches(formatted, evaluator_llm)
#     print("Final average metrics:", average_metrics)


In [None]:
import os
import json

def calculate_average_metrics(results_folder):
    total_metrics = {"faithful_rate": 0.0, "relevance_rate": 0.0, "answer_correctness": 0.0}
    file_count = 0

    for filename in os.listdir(results_folder):
        if filename.endswith(".json"):
            file_path = os.path.join(results_folder, filename)
            with open(file_path, 'r', encoding='utf-8') as f:
                metrics = json.load(f)
                total_metrics["faithful_rate"] += metrics.get("faithful_rate", 0)
                total_metrics["relevance_rate"] += metrics.get("relevance_rate", 0)
                total_metrics["answer_correctness"] += metrics.get("answer_correctness", 0)
                file_count += 1

    if file_count == 0:
        print("No result files found in the folder.")
        return None

    average_metrics = {
        "average_faithful_rate": total_metrics["faithful_rate"] / file_count,
        "average_relevance_rate": total_metrics["relevance_rate"] / file_count,
        "average_answer_correctness": total_metrics["answer_correctness"] / file_count
    }

    return average_metrics

# Calculate and print the average metrics
average_metrics = calculate_average_metrics(results_folder)
print("Average Metrics:", average_metrics)

In [None]:
import base64
import json
from tqdm import tqdm
from PIL import Image
import io
import os

def generate_chartQA_json_batched(dataset, output_path='json_files/ChartQA_QA_Mapping.json', batch_size=50, max_entries=200):
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    qa_list = []
    current_page = 0
    processed_count = 0
    batch_progress = 1

    for data in tqdm(dataset, desc="Processing Charts", unit="chart"):
        if data.get('type') != 'human_test':
            continue

        image = data['image']

        # Convert image to bytes and encode as base64
        image_bytes = io.BytesIO()
        image.save(image_bytes, format="PNG")
        image_bytes = image_bytes.getvalue()
        image_base64 = base64.b64encode(image_bytes).decode('utf-8')

        current_page += 1

        qa_list.append({
            'page_number': current_page,
            'question': data['question'],
            'answer': data['answer'],
            'image_base64': image_base64,
            'type': data['type']
        })

        processed_count += 1

        if processed_count % batch_size == 0:
            print(f'📦 Processed batch {batch_progress} ({processed_count} entries so far)')
            batch_progress += 1

        if processed_count >= max_entries:
            break

    # Save all results to one JSON file
    with open(output_path, 'w') as f:
        json.dump(qa_list, f, indent=4)

    print(f'✅ Done! Saved {len(qa_list)} entries to: {output_path}')



# Example usage (assuming the dataset is already loaded)
from datasets import load_dataset
chartqa = load_dataset('lmms-lab/ChartQA', split='test')
generate_chartQA_json_batched(chartqa, output_path='json_files/ChartQA_QA_Mapping.json', batch_size=50, max_entries=1300)


In [None]:
from PIL import Image as PILImage
import base64
import io
import json
from typing import List

def format_for_ragas(gold_entries: List[dict]) -> List[dict]:
    formatted = []
    for item in gold_entries:
        formatted.append({
            "user_input": item["question"],
            "retrieved_contexts": [item["image_base64"]],
            "response": item["answer"],
            "reference": item["answer"]
        })
    return formatted

# Resize base64 image to a reasonable width (e.g. 400px)
def resize_base64_image(base64_str: str, target_width: int = 400) -> str:
    image_data = base64.b64decode(base64_str)
    with PILImage.open(io.BytesIO(image_data)) as img:
        # Maintain aspect ratio
        ratio = target_width / float(img.width)
        target_height = int(img.height * ratio)
        resized_img = img.resize((target_width, target_height), PILImage.Resampling.LANCZOS)

        # Convert back to base64
        buffer = io.BytesIO()
        resized_img.save(buffer, format="PNG")
        return base64.b64encode(buffer.getvalue()).decode('utf-8')

# Display and evaluate selected images
# def evaluate_selected_images(gold_path: str, image_indices: List[int], evaluator_llm):
#     gold_entries = load_gold_dataset(gold_path)
    
#     # Check if indices are valid
#     max_index = len(gold_entries) - 1
#     for idx in image_indices:
#         if idx > max_index or idx < 0:
#             raise IndexError(f"Index {idx} is out of range. Dataset has {len(gold_entries)} entries.")
        
#     selected_entries = [gold_entries[i] for i in image_indices]

#     # Resize images before formatting
#     for entry in selected_entries:
#         if "image_base64" in entry:
#             entry["image_base64"] = resize_base64_image(entry["image_base64"])

#     formatted_entries = format_for_ragas(selected_entries)

#     # Print all retrieved context images
#     for entry in formatted_entries:
#         print("Context Images:")
#         for idx, context in enumerate(entry["retrieved_contexts"]):
#             print(f"Image {idx + 1}:")
#             print(f"Quetsion : {entry['user_input']}")
#             print(f"Answer : {entry['response']}")
#             display(PILImage.open(io.BytesIO(base64.b64decode(context))))

#     # Evaluate
#     results = evaluate_generation_chartQA(formatted_entries, evaluator_llm)
#     print("\nEvaluation Metrics:")
#     print(results)
#     return results

def evaluate_selected_images(gold_path: str, image_indices: List[int], evaluator_llm):
    gold_entries = load_gold_dataset(gold_path)

    # Check if indices are valid
    max_index = len(gold_entries) - 1
    for idx in image_indices:
        if idx > max_index or idx < 0:
            raise IndexError(f"Index {idx} is out of range. Dataset has {len(gold_entries)} entries.")

    # Process each image independently
    for i in image_indices:
        entry = gold_entries[i]

        # Resize image
        if "image_base64" in entry:
            entry["image_base64"] = resize_base64_image(entry["image_base64"])

        # Format entry for evaluation
        formatted_entry = format_for_ragas([entry])

        # Display context image and related info
        print(f"\n--- Evaluation for Entry {i} ---")
        print("Context Image:")
        print(f"Question: {formatted_entry[0]['user_input']}")
        print(f"Answer: {formatted_entry[0]['response']}")
        display(PILImage.open(io.BytesIO(base64.b64decode(formatted_entry[0]['retrieved_contexts'][0]))))

        # Evaluate this entry
        result = evaluate_generation_chartQA(formatted_entry, evaluator_llm)

        # Print evaluation result
        print("Evaluation Metrics:")
        print(result)


# Example: Evaluate image 3 and 7 from the dataset
#results = evaluate_selected_images("json_files/ChartQA_QA_Mapping.json", [1,2], evaluator_llm)

with open("QA_coinqa_gold_final.json", 'r', encoding='utf-8') as f:
    qa_data = json.load(f)
#image_indices = list(range(1,)))

# Process the first third of the images
#image_indices = list(range(len(qa_data) // 3))

# Code for the second third of the images (commented out)
#image_indices = list(range(len(qa_data) // 3, 2 * len(qa_data) // 3))

# Code for the last third of the images (commented out)
#image_indices = list(range(2 * len(qa_data) // 3, len(qa_data)))

image_indices = list(range(10, 21))
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini")) # For Ragas evaluation
results = evaluate_selected_images("QA_coinqa_gold_final.json", list(range(52, 54)), evaluator_llm)


In [None]:
import random
import json
from typing import List
from json import JSONDecodeError

def evaluate_selected_images(gold_path: str, image_indices: List[int], evaluator_llm, batch_size: int = 10):
    gold_entries = load_gold_dataset(gold_path)
    zero_entries = []
    faithfulness_zeros = 0
    relevance_zeros = 0
    total_entries = 0
    skipped_entries = []

    # Check if indices are valid
    max_index = len(gold_entries) - 1
    for idx in image_indices:
        if idx > max_index or idx < 0:
            raise IndexError(f"Index {idx} is out of range. Dataset has {len(gold_entries)} entries.")

    # Process in batches, but evaluate individually
    for batch_start in range(0, len(image_indices), batch_size):
        batch_indices = image_indices[batch_start:batch_start + batch_size]

        for i in batch_indices:
            entry = gold_entries[i]
            total_entries += 1

            try:
                # Resize image if exists
                if "image_base64" in entry:
                    entry["image_base64"] = resize_base64_image(entry["image_base64"])

                # Format and evaluate this single entry
                formatted_entry = format_for_ragas([entry])
                raw_result = str(evaluate_generation_chartQA(formatted_entry, evaluator_llm))
                json_string = raw_result.replace("'", '"')
                result = json.loads(json_string)

                # Extract metrics
                faithfulness_score = result.get("faithful_rate")
                relevance_score = result.get("relevance_rate")

                if faithfulness_score == 0 or relevance_score == 0:
                    zero_entries.append({
                        "entry_index": i,
                        "faithfulness": faithfulness_score,
                        "relevance": relevance_score
                    })

                if faithfulness_score == 0:
                    faithfulness_zeros += 1
                if relevance_score == 0:
                    relevance_zeros += 1

            except (JSONDecodeError, KeyError) as e:
                print(f"⚠️ JSON decode or key error at entry {i}: {e}")
                skipped_entries.append(i)

            except Exception as e:
                print(f"❌ Error processing entry {i}: {e}")
                skipped_entries.append(i)

    # Summary
    print("\nEntries with 0 in either Faithfulness or Relevance:")
    for ze in zero_entries:
        print(f"Entry {ze['entry_index']}: Faithfulness = {ze['faithfulness']}, Relevance = {ze['relevance']}")

    print("\nShare of 0 scores:")
    print(f"Faithfulness: {faithfulness_zeros}/{total_entries} ({(faithfulness_zeros / total_entries * 100):.2f}%)")
    print(f"Relevance: {relevance_zeros}/{total_entries} ({(relevance_zeros / total_entries * 100):.2f}%)")

    if skipped_entries:
        print("\n⚠️ Skipped entries due to errors:", skipped_entries)


# Load QA data and choose indices
with open("json_files/ChartQA_QA_Mapping.json", 'r', encoding='utf-8') as f:
    qa_data = json.load(f)

image_indices = random.sample(range(len(qa_data)), 50)

# Evaluation
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))
evaluate_selected_images("json_files/ChartQA_QA_Mapping.json", image_indices, evaluator_llm)
