## Creating Structured Data

In [3]:
import os, re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
# Root directory for run artifacts: per-problem results are read from
# `{RESULTS_DIR}/{problem}/{impl}/results.txt` and the per-difficulty JSON
# summaries are written to / read from `{RESULTS_DIR}/{difficulty}.json`.
RESULTS_DIR = 'results'
# JSON file that is loaded to enumerate the difficulties and their problems.
EVAL_CONFIG = 'eval.config.json'

In [9]:
import json

# Load the evaluation config; it is indexed below as difficulty -> {problem name: ...}.
with open(EVAL_CONFIG) as config_file:
    eval_config = json.loads(config_file.read())

# Keep only the problem names for each difficulty, in the order the config lists them.
problem_set = {
    difficulty: list(problems.keys())
    for difficulty, problems in eval_config.items()
}

In [10]:
# Display the difficulty -> problem-name mapping as a quick sanity check.
problem_set

{'EASY': ['translation',
  'essay_reviewer',
  'joke_gen',
  'expert_answer',
  'odd_word_out'],
 'MEDIUM': ['mcq_reason', 'personality_finder', 'template', 'text_to_type'],
 'HARD': ['rpg_level_gen', 'wikipedia']}

In [80]:
# Difficulty tier selected for analysis; the listed options match the keys of `problem_set`.
ANALYSIS_DIFFICULTY = 'EASY' # 'EASY', 'MEDIUM', 'HARD'

def get_results(difficulty):
    """Collect the parsed results of every problem in one difficulty tier.

    Args:
        difficulty: Key into ``problem_set``.

    Returns:
        dict mapping each problem name (in ``problem_set`` order) to whatever
        ``get_problem_results`` returns for it.
    """
    return {
        name: get_problem_results(name)
        for name in problem_set[difficulty]
    }

def get_problem_results(problem):
    """Return the parsed results of ``problem`` for both implementations.

    Args:
        problem: Problem name, forwarded to ``get_problem_result``.

    Returns:
        dict with the keys ``"jac"`` and ``"dspy"`` (in that order), each
        holding the value ``get_problem_result`` produced for that
        implementation.
    """
    parsed = {}
    for impl in ("jac", "dspy"):
        parsed[impl] = get_problem_result(problem, impl)
    return parsed

def get_problem_result(problem, impl="jac"):
    """Parse the logged LLM requests of one problem/implementation run.

    Reads ``{RESULTS_DIR}/{problem}/{impl}/results.txt`` and consumes it front
    to back as a sequence of records, each made of an ``Input Prompt:`` section,
    an ``Output:`` section and a token-usage line. Whatever text remains after
    the last usage line is returned as the run's final output.

    Args:
        problem: Problem directory name under ``RESULTS_DIR``.
        impl: Implementation sub-directory. ``"dspy"`` selects the dict-style
            usage line (``{'completion_tokens': ...}``); any other value selects
            the ``CompletionUsage...completion_tokens=...`` style.

    Returns:
        dict with two keys:
            ``"llm_requests"``: list of ``{"prompt", "output", "token_usage"}``
            dicts, where ``token_usage`` holds the integer
            ``completion_tokens``, ``prompt_tokens`` and ``total_tokens``.
            ``"output"``: the stripped text left after the last usage line.

    Raises:
        ValueError: if an ``Input Prompt:`` section is found but the matching
            output section or token-usage line is missing (e.g. a truncated
            log). Previously this surfaced as an ``AttributeError`` on ``None``.
    """
    file = f"{RESULTS_DIR}/{problem}/{impl}/results.txt"
    with open(file) as f:
        file_contents = f.read()

    # The patterns depend only on `impl`, so build them once outside the loop.
    input_prompt_re = re.compile(r'Input Prompt:\n(.*?)\nOutput:', re.DOTALL)
    if impl == "dspy":
        output_re = re.compile(r'Output:\n(.*?)\n\{', re.DOTALL)
        token_re = re.compile(
            r"'completion_tokens': (\d+), 'prompt_tokens': (\d+), 'total_tokens': (\d+)}\n"
        )
    else:
        output_re = re.compile(r'Output:\n(.*?)\nCompletionUsage', re.DOTALL)
        # The optional `\)` swallows the parenthesis closing the usage record so
        # it does not leak into the next chunk / the final output, mirroring how
        # the dspy pattern consumes its closing `}`.
        # NOTE(review): assumes the usage line looks like
        # `CompletionUsage(completion_tokens=.., prompt_tokens=.., total_tokens=..)`.
        token_re = re.compile(
            r"completion_tokens=(\d+), prompt_tokens=(\d+), total_tokens=(\d+)\)?"
        )

    llm_requests = []
    while True:
        input_prompt_match = input_prompt_re.search(file_contents)
        if input_prompt_match is None:
            # No further request records; the remainder is the final output.
            break
        input_prompt = input_prompt_match.group(1).strip()

        output_match = output_re.search(file_contents)
        if output_match is None:
            raise ValueError(
                f"{file}: found an 'Input Prompt:' section without a matching output section"
            )
        output = output_match.group(1).strip()

        # Drop everything up to the end of the output section before looking
        # for this request's usage line.
        file_contents = file_contents[output_match.end():]
        token_match = token_re.search(file_contents)
        if token_match is None:
            raise ValueError(
                f"{file}: no token usage line found after an output section"
            )
        completion_tokens, prompt_tokens, total_tokens = (
            int(value) for value in token_match.groups()
        )
        file_contents = file_contents[token_match.end():]

        llm_requests.append({
            "prompt": input_prompt,
            "output": output,
            "token_usage": {
                "completion_tokens": completion_tokens,
                "prompt_tokens": prompt_tokens,
                "total_tokens": total_tokens,
            }
        })

    return {
        "llm_requests": llm_requests,
        "output": file_contents.strip(),
    }

In [81]:
# Parse every difficulty tier and persist it as `{RESULTS_DIR}/{difficulty}.json`.
for difficulty in problem_set:
    _results = get_results(difficulty)
    output_path = f"{RESULTS_DIR}/{difficulty}.json"
    with open(output_path, "w") as f:
        f.write(json.dumps(_results, indent=4))

## Visualize Token Usage and Cost

In [172]:
def load_results(difficulty):
    """Load the structured results previously saved for one difficulty tier.

    Args:
        difficulty: Tier name; the file read is ``{RESULTS_DIR}/{difficulty}.json``.

    Returns:
        The decoded JSON content of that file.
    """
    results_path = f"{RESULTS_DIR}/{difficulty}.json"
    with open(results_path) as results_file:
        raw_json = results_file.read()
    return json.loads(raw_json)

In [173]:
# Load the tier chosen via ANALYSIS_DIFFICULTY instead of repeating the literal,
# so switching the analysed tier only requires editing that constant.
_results = load_results(ANALYSIS_DIFFICULTY)

In [174]:
# Price per single token, expressed as "amount per million tokens" / 1e6.
llm_costs = {
    "completion_tokens": 15/1e6,
    "prompt_tokens":5/1e6,
}

def get_results_df(results):
    """Summarise token usage and cost per problem for both implementations.

    Args:
        results: Mapping of problem name to ``{"jac": ..., "dspy": ...}``, where
            each implementation entry has an ``"llm_requests"`` list whose
            items carry a ``"token_usage"`` dict.

    Returns:
        pandas.DataFrame with one row per problem and the columns ``problem``,
        ``{impl}_completion_tokens``, ``{impl}_prompt_tokens``,
        ``{impl}_total_tokens`` (jac first, then dspy), followed by ``jac_cost``
        and ``dspy_cost``. Token columns are sums over all requests; cost is the
        sum of each request's completion and prompt tokens priced via
        ``llm_costs``.
    """
    impls = ("jac", "dspy")
    token_kinds = ("completion_tokens", "prompt_tokens", "total_tokens")

    records = []
    for problem, result in results.items():
        # Token-usage dicts of every request, grouped by implementation.
        usages = {
            impl: [request["token_usage"] for request in result[impl]["llm_requests"]]
            for impl in impls
        }

        record = {"problem": problem}
        for impl in impls:
            for kind in token_kinds:
                record[f"{impl}_{kind}"] = sum(usage[kind] for usage in usages[impl])
        # Costs are added after all token columns to keep the column order.
        for impl in impls:
            record[f"{impl}_cost"] = sum(
                usage["completion_tokens"]*llm_costs["completion_tokens"]
                + usage["prompt_tokens"]*llm_costs["prompt_tokens"]
                for usage in usages[impl]
            )
        records.append(record)

    return pd.DataFrame(records)

In [175]:
# One row per problem with summed token counts and cost for both implementations.
results_df = get_results_df(_results)

**TODO (@kugesan):** add graphs for token usage and cost.

## Time Taken Analysis

In [176]:
import pstats

def get_time(problem, impl):
    """Read timing information from a run's saved profile.

    Loads ``{RESULTS_DIR}/{problem}/{impl}/profile.prof`` with ``pstats`` and
    returns the overall time together with the cumulative time spent inside
    the ``wrapper`` function of ``openai/_utils/_utils.py``, which this
    notebook uses as the measure of time spent in LLM calls.

    Args:
        problem: Problem directory name under ``RESULTS_DIR``.
        impl: Implementation sub-directory (e.g. ``"jac"`` or ``"dspy"``).

    Returns:
        Tuple ``(total_time, llm_time)``: ``total_time`` is the profile's
        ``total_tt``; ``llm_time`` is field 3 (cumulative time) of the stats
        entry for the openai wrapper.

    Raises:
        KeyError: if the profile has no entry for the openai wrapper, or if
            the fallback lookup finds more than one candidate.
    """
    profile = pstats.Stats(f'{RESULTS_DIR}/{problem}/{impl}/profile.prof')
    total_time = profile.total_tt

    # Location recorded in the original evaluation environment; tried first so
    # existing profiles resolve exactly as before.
    exact_key = ('/opt/conda/envs/mtllm-eval/lib/python3.12/site-packages/openai/_utils/_utils.py', 243, 'wrapper')
    if exact_key in profile.stats:
        llm_key = exact_key
    else:
        # The install prefix and line number vary with the environment and the
        # openai version, so fall back to matching on file suffix + function name.
        candidates = [
            key for key in profile.stats
            if key[2] == 'wrapper'
            and key[0].replace('\\', '/').endswith('/openai/_utils/_utils.py')
        ]
        if len(candidates) != 1:
            raise KeyError(
                f"expected exactly one openai '_utils.py' wrapper entry in the "
                f"profile for {problem}/{impl}, found {len(candidates)}"
            )
        llm_key = candidates[0]

    # Stats values are (primitive calls, total calls, total time, cumulative time, callers).
    llm_time = profile.stats[llm_key][3]
    return total_time, llm_time

In [182]:
# Extend the token/cost table with timings read from each run's profile.prof.
results_df_with_time = results_df.copy()

for impl in ("jac", "dspy"):
    # get_time parses the profile from disk, so call it once per (problem, impl)
    # and split the (total_time, llm_time) pairs into their two columns.
    timings = [get_time(problem, impl) for problem in results_df_with_time["problem"]]
    results_df_with_time[f"{impl}_total_time"] = [timing[0] for timing in timings]
    results_df_with_time[f"{impl}_llm_time"] = [timing[1] for timing in timings]

# Time not attributed to the LLM call wrapper.
for impl in ("jac", "dspy"):
    results_df_with_time[f"{impl}_runtime"] = (
        results_df_with_time[f"{impl}_total_time"] - results_df_with_time[f"{impl}_llm_time"]
    )

# Despite the column name, this is stored as a fraction of total time (not multiplied by 100).
for impl in ("jac", "dspy"):
    results_df_with_time[f"{impl}_runtime_percent"] = (
        results_df_with_time[f"{impl}_runtime"] / results_df_with_time[f"{impl}_total_time"]
    )

In [183]:
# Display the combined token, cost and timing table.
results_df_with_time

Unnamed: 0,problem,jac_completion_tokens,jac_prompt_tokens,jac_total_tokens,dspy_completion_tokens,dspy_prompt_tokens,dspy_total_tokens,jac_cost,dspy_cost,jac_total_time,jac_llm_time,dspy_total_time,dspy_llm_time,jac_runtime,dspy_runtime,jac_runtime_percent,dspy_runtime_percent
0,translation,4,178,182,54,320,374,0.00095,0.00241,1.701438,0.788278,7.607343,5.113171,0.91316,2.494172,0.536699,0.327864
1,essay_reviewer,267,1512,1779,750,1407,2157,0.011565,0.018285,7.405499,6.968499,26.674179,22.303699,0.437,4.37048,0.05901,0.163847
2,joke_gen,19,176,195,59,191,250,0.001165,0.00184,1.300078,1.239698,2.800871,2.776441,0.06038,0.02443,0.046444,0.008722
3,expert_answer,113,290,403,159,111,270,0.003145,0.00294,4.345821,4.022049,3.917878,3.908516,0.323772,0.009362,0.074502,0.00239
4,odd_word_out,28,257,285,274,547,821,0.001705,0.006845,1.185508,1.114505,9.897317,7.919308,0.071003,1.978008,0.059892,0.199853
