In [7]:
import json

def calculate_average_metrics(file_path):
    """
    Load per-video evaluation results from a JSON file and average each metric.

    The file is read as a JSON object whose values are per-video records; a
    record's optional "score" entry maps metric names to numeric values.

    Each metric is averaged over the records that actually report a numeric
    value for it, so a record with a missing, null, or partial "score" entry
    does not drag the remaining averages toward zero.

    Args:
        file_path: Path of the JSON results file to read.

    Returns:
        A dict mapping each metric name to its mean, rounded to 2 decimals,
        in order of first appearance. The dict is empty when no numeric
        scores are found. When the file is missing, is not valid JSON, or
        does not hold a JSON object at the top level, a dict with a single
        "Error" key describing the problem is returned instead.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        return {"Error": f"File '{file_path}' not found."}
    except json.JSONDecodeError:
        return {"Error": f"File '{file_path}' contains invalid JSON."}

    # A top-level list/string/number has no per-video mapping to iterate over.
    if not isinstance(data, dict):
        return {"Error": f"File '{file_path}' does not contain a JSON object."}

    totals = {}
    counts = {}

    for video_data in data.values():
        if not isinstance(video_data, dict):
            continue

        # "score" may be absent or explicitly null; both mean "nothing to add".
        score_data = video_data.get("score")
        if not isinstance(score_data, dict):
            continue

        for metric, value in score_data.items():
            # bool is a subclass of int, so exclude it explicitly; anything
            # else that is not a number cannot be summed.
            if isinstance(value, bool) or not isinstance(value, (int, float)):
                continue
            totals[metric] = totals.get(metric, 0.0) + value
            counts[metric] = counts.get(metric, 0) + 1

    # Divide by the per-metric count, not the number of videos.
    return {
        metric: round(total_sum / counts[metric], 2)
        for metric, total_sum in totals.items()
    }

In [8]:
# Each pair is (results file, heading printed above that file's averages).
for results_path, heading in (
    (
        "results/evaluation_QA_results.json",
        "Average Metrics: Llama 70B QA Baseline",
    ),
    (
        "results/evaluation_NQA_results.json",
        "Average Metrics: No QA (Includes visual summary, transcript passed to Llama 70B model)",
    ),
    (
        "results/evaluation_VS_results.json",
        "Average Metrics: VLM Only (Generate caption by passing transcript and video to VLM)",
    ),
):
    # Keep the `result` name so it still holds the last report after the cell runs.
    result = calculate_average_metrics(results_path)
    print(heading)
    print(json.dumps(result, indent=2))

Average Metrics: Llama 70B QA Baseline
{
  "Factual Accuracy": 1.64,
  "Completeness": 1.64,
  "Visual Enrichment": 1.73,
  "Clarity": 1.18,
  "Total Score": 30.91
}
Average Metrics: No QA (Includes visual summary, transcript passed to Llama 70B model)
{
  "Factual Accuracy": 1.55,
  "Completeness": 1.82,
  "Visual Enrichment": 2.0,
  "Clarity": 1.45,
  "Total Score": 34.09
}
Average Metrics: VLM Only (Generate caption by passing transcript and video to VLM)
{
  "Factual Accuracy": 1.55,
  "Completeness": 1.18,
  "Visual Enrichment": 1.73,
  "Clarity": 1.18,
  "Total Score": 28.18
}
