# Evaluation of SQLQueryChain for Shipment queries (COSC 304 from Dr. Ramon Lawrence)

In [29]:
import json
import pandas as pd
from datetime import timedelta

# Load your GPT-4 and GPT-3.5 result JSON files
with open("./results/12_sql_queries_gpt4_shipment.json", "r") as f:
    gpt4_data = json.load(f)["queries"]

with open("./results/9_sql_queries_gpt3.5_shipment.json", "r") as f:
    gpt3_data = json.load(f)["queries"]

def detailed_summary(data, experiment_name):
    type_counts = {"simple": 0, "medium": 0, "complex": 0}
    result_correct = {"simple": 0, "medium": 0, "complex": 0}
    
    # For averaging similarity and column match scores
    similarity_sum = {"simple": 0, "medium": 0, "complex": 0}
    column_match_sum = {"simple": 0, "medium": 0, "complex": 0}
    
    # Token and cost tracking
    input_tokens = output_tokens = total_tokens = total_cost = total_time = 0

    for q in data:
        q_type = q["type"]
        type_counts[q_type] += 1

        if q["result"]:
            result_correct[q_type] += 1

        # Add similarity and column match scores
        similarity_sum[q_type] += q.get("similarity", 0)
        column_match_sum[q_type] += q.get("column_matching_index", 0)

        # Token and cost metrics
        input_tokens += q.get("prompt_tokens", 0)
        output_tokens += q.get("completion_tokens", 0)
        total_tokens += q.get("total_tokens", 0)
        total_cost += q.get("total_cost", 0)
        total_time += q.get("time", 0)

    total_queries = sum(type_counts.values())
    correct_total = sum(result_correct.values())

    def safe_div(n, d):
        return round(n / d, 4) if d else 0

    # Generate the summary dictionary
    return {
        "experiment": experiment_name,
        
        # Query counts
        "simple": type_counts["simple"],
        "medium": type_counts["medium"],
        "complex": type_counts["complex"],
        "total": total_queries,

        # Accuracy per type
        "overall_simple": safe_div(result_correct["simple"], type_counts["simple"]),
        "overall_medium": safe_div(result_correct["medium"], type_counts["medium"]),
        "overall_complex": safe_div(result_correct["complex"], type_counts["complex"]),
        "overall_total": safe_div(correct_total, total_queries),

        # Average similarity scores
        "avg_similarity_simple": safe_div(similarity_sum["simple"], type_counts["simple"]),
        "avg_similarity_medium": safe_div(similarity_sum["medium"], type_counts["medium"]),
        "avg_similarity_complex": safe_div(similarity_sum["complex"], type_counts["complex"]),
        "avg_similarity_total": safe_div(sum(similarity_sum.values()), total_queries),

        # Average column match scores
        "avg_colmatch_simple": safe_div(column_match_sum["simple"], type_counts["simple"]),
        "avg_colmatch_medium": safe_div(column_match_sum["medium"], type_counts["medium"]),
        "avg_colmatch_complex": safe_div(column_match_sum["complex"], type_counts["complex"]),
        "avg_colmatch_total": safe_div(sum(column_match_sum.values()), total_queries),

        # Token and cost metrics
        "input_tokens": input_tokens,
        "output_tokens": output_tokens,
        "total_tokens": total_tokens,
        "total_cost": round(total_cost, 2),

        # Total time formatted as H:M:S
        "total_time": str(timedelta(seconds=round(total_time)))
    }

# Create summaries for both experiments
gpt4_summary = detailed_summary(gpt4_data, "SQLQueryChain - SHIPMENT - GPT-4")
gpt3_summary = detailed_summary(gpt3_data, "SQLQueryChain - SHIPMENT - GPT-3.5")

# Combine into a dataframe and display
summary_df = pd.DataFrame([gpt4_summary, gpt3_summary])

# Optional: sort by accuracy
summary_df = summary_df.sort_values(by="overall_total", ascending=False)

# Show the result
summary_df



Unnamed: 0,experiment,simple,medium,complex,total,overall_simple,overall_medium,overall_complex,overall_total,avg_similarity_simple,...,avg_similarity_total,avg_colmatch_simple,avg_colmatch_medium,avg_colmatch_complex,avg_colmatch_total,input_tokens,output_tokens,total_tokens,total_cost,total_time
0,SQLQueryChain - SHIPMENT - GPT-4,5,6,9,20,0.8,0.6667,0.7778,0.75,0.98,...,0.905,0.96,0.8833,0.9222,0.92,11443,1485,12928,0.43,0:01:15
1,SQLQueryChain - SHIPMENT - GPT-3.5,5,6,9,20,0.6,0.8333,0.4444,0.6,0.92,...,0.855,0.96,0.9167,0.9111,0.925,11443,1667,13110,0.04,0:00:24


In [None]:
inverted_df = summary_df.transpose()

inverted_df = inverted_df.round(2)

#Rename Columns

inverted_df.columns = ["GPT-4", "GPT-3.5"]

# Display
inverted_df

# Simple: Number of simple queries evaluated
# Medium: Number of medium queries evaluated
# Complex: Number of complex queries evaluated
# Total: Total number of queries evaluated
# overall_simple: Accuracy of simple queries (i.e. how many of the simple were true/correct)
# Overall_medium: Accuracy of medium queries (i.e. how many of the medium were true/correct)
# Overall_complex: Accuracy of complex queries (i.e. how many of the complex were true/correct)
# Overall_total: Accuracy of all queries (i.e. how many of the total were true/correct)
# Avg_similarity_simple: Average similarity score for simple queries
# Avg_similarity_medium: Average similarity score for medium queries
# Avg_similarity_complex: Average similarity score for complex queries
# Avg_similarity_total: Average similarity score for all queries
# Avg Colmatch Simple: Average column match score for simple queries
# Avg Colmatch Medium: Average column match score for medium queries
# Avg Colmatch Complex: Average column match score for complex queries
# Avg Colmatch Total: Average column match score for all queries
# Input Tokens: Total input tokens used
# Output Tokens: Total output tokens generated
# Total Tokens: Total tokens used (input + output)
# Total Cost: Total cost incurred for the queries
# Total Time: Total time taken for all queries in H:M:S format

Unnamed: 0,GPT-4,GPT-3.5
experiment,SQLQueryChain - SHIPMENT - GPT-4,SQLQueryChain - SHIPMENT - GPT-3.5
simple,5,5
medium,6,6
complex,9,9
total,20,20
overall_simple,0.8,0.6
overall_medium,0.6667,0.8333
overall_complex,0.7778,0.4444
overall_total,0.75,0.6
avg_similarity_simple,0.98,0.92


In [None]:
# Export to CSV
# inverted_df.to_csv("inverted_evaluation_summary.csv", index=False)
