## Creating Structured Data

In [3]:
import os, re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
# Directory that holds the per-problem result folders
# (`<problem>/<impl>/results.txt`) and the generated `<difficulty>.json` summaries.
RESULTS_DIR = 'results'
# JSON file that lists the evaluated problems, grouped by difficulty.
EVAL_CONFIG = 'eval.config.json'

In [9]:
import json

# Load the evaluation config; its top-level keys are difficulty names and each
# value is a mapping keyed by problem name.
with open(EVAL_CONFIG) as f:
    eval_config = json.load(f)

# Keep only the problem names, grouped by difficulty, in config order.
problem_set = {
    difficulty: list(problems.keys())
    for difficulty, problems in eval_config.items()
}

In [10]:
# Display the difficulty -> problem-name mapping built from the eval config.
problem_set

{'EASY': ['translation',
  'essay_reviewer',
  'joke_gen',
  'expert_answer',
  'odd_word_out'],
 'MEDIUM': ['mcq_reason', 'personality_finder', 'template', 'text_to_type'],
 'HARD': ['rpg_level_gen', 'wikipedia']}

In [80]:
ANALYSIS_DIFFICULTY = 'EASY' # 'EASY', 'MEDIUM', 'HARD'

def get_results(difficulty):
    '''Gather the parsed results of every problem listed under `difficulty`.

    Returns a dict mapping each problem name in `problem_set[difficulty]` to
    whatever `get_problem_results` returns for it. Raises KeyError when
    `difficulty` is not a key of `problem_set`.
    '''
    return {
        name: get_problem_results(name)
        for name in problem_set[difficulty]
    }

def get_problem_results(problem):
    '''Return the parsed results of `problem` for both implementations.

    The result is a dict with the keys "jac" and "dspy" (in that order); each
    value is what `get_problem_result` produces for that implementation, i.e.
    the input prompts, outputs and token usage of the logged LLM requests plus
    the final output.
    '''
    return {
        impl: get_problem_result(problem, impl)
        for impl in ("jac", "dspy")
    }

def get_problem_result(problem, impl="jac"):
    '''Parse the logged LLM requests of one problem/implementation pair.

    Reads `{RESULTS_DIR}/{problem}/{impl}/results.txt` and extracts, in file
    order, every record made of an "Input Prompt:" section, an "Output:"
    section and a token-usage line.

    Parameters:
        problem: name of the problem folder under RESULTS_DIR.
        impl: "dspy" selects the dict-style token-usage format; any other
            value (default "jac") selects the `CompletionUsage(...)` format.

    Returns:
        A dict with
        - "llm_requests": list of dicts with the keys "prompt", "output" and
          "token_usage" (itself a dict of the integer "completion_tokens",
          "prompt_tokens" and "total_tokens");
        - "output": the stripped text that follows the last parsed record
          (the whole stripped file when no record is found).

    Raises:
        ValueError: an "Input Prompt:" section is not followed by a complete
            output section and token-usage line. Previously this surfaced as
            an AttributeError on a `None` regex match.
        OSError: the results file cannot be opened.
    '''
    input_prompt_pattern = re.compile(r'Input Prompt:\n(.*?)\nOutput:', re.DOTALL)
    if impl == "dspy":
        output_pattern = re.compile(r'Output:\n(.*?)\n\{', re.DOTALL)
        token_pattern = re.compile(
            r"'completion_tokens': (\d+), 'prompt_tokens': (\d+), 'total_tokens': (\d+)}\n"
        )
    else:
        output_pattern = re.compile(r'Output:\n(.*?)\nCompletionUsage', re.DOTALL)
        # The optional trailing `\)` consumes the parenthesis that closes
        # `CompletionUsage(...)` so it does not leak into the final output.
        token_pattern = re.compile(
            r"completion_tokens=(\d+), prompt_tokens=(\d+), total_tokens=(\d+)\)?"
        )

    file = f"{RESULTS_DIR}/{problem}/{impl}/results.txt"
    with open(file) as f:
        file_contents = f.read()

    llm_requests = []
    cursor = 0  # everything before this offset has already been parsed
    while True:
        input_prompt_match = input_prompt_pattern.search(file_contents, cursor)
        if input_prompt_match is None:
            break

        # Resume exactly at the "Output:" marker that terminated the prompt so
        # the output always belongs to the prompt that was just matched.
        output_start = input_prompt_match.end() - len("Output:")
        output_match = output_pattern.search(file_contents, output_start)
        if output_match is None:
            raise ValueError(
                f"{file}: found an input prompt without a complete output section "
                f"(request #{len(llm_requests) + 1})"
            )

        token_match = token_pattern.search(file_contents, output_match.end())
        if token_match is None:
            raise ValueError(
                f"{file}: found an output without a token usage line "
                f"(request #{len(llm_requests) + 1})"
            )

        completion_tokens, prompt_tokens, total_tokens = (
            int(count) for count in token_match.groups()
        )
        llm_requests.append({
            "prompt": input_prompt_match.group(1).strip(),
            "output": output_match.group(1).strip(),
            "token_usage": {
                "completion_tokens": completion_tokens,
                "prompt_tokens": prompt_tokens,
                "total_tokens": total_tokens,
            }
        })
        cursor = token_match.end()

    return {
        "llm_requests": llm_requests,
        "output": file_contents[cursor:].strip(),
    }

In [81]:
# Parse the results of every difficulty tier and persist each tier as
# `<RESULTS_DIR>/<difficulty>.json`.
for difficulty in problem_set:
    _results = get_results(difficulty)
    with open(f"{RESULTS_DIR}/{difficulty}.json", "w") as f:
        f.write(json.dumps(_results, indent=4))

## Visualize Token Usage and Cost

In [77]:
def load_results(difficulty):
    '''Read back the JSON summary stored at `{RESULTS_DIR}/{difficulty}.json`.

    Returns the decoded JSON content of that file.
    '''
    summary_path = f"{RESULTS_DIR}/{difficulty}.json"
    with open(summary_path) as summary_file:
        loaded = json.load(summary_file)
    return loaded

In [129]:
# Use the difficulty selected via ANALYSIS_DIFFICULTY (currently 'EASY') instead
# of a second hard-coded literal, so the analysis tier is changed in one place.
_results = load_results(ANALYSIS_DIFFICULTY)

In [130]:
# Price per single token, by token kind.
llm_costs = {
    "completion_tokens": 15/1e6,
    "prompt_tokens": 5/1e6,
}

def get_results_df(results):
    '''Summarise token usage and cost per problem as a DataFrame.

    `results` maps a problem name to a dict with the keys "jac" and "dspy";
    each of those holds an "llm_requests" list whose items carry a
    "token_usage" dict.

    The returned frame has one row per problem with the columns "problem",
    "<impl>_completion_tokens", "<impl>_prompt_tokens", "<impl>_total_tokens"
    (jac first, then dspy) followed by "jac_cost" and "dspy_cost". Token
    columns are sums over all requests; cost is the sum over requests of
    completion and prompt tokens weighted by `llm_costs`.
    '''
    impl_names = ("jac", "dspy")
    token_kinds = ("completion_tokens", "prompt_tokens", "total_tokens")

    rows = []
    for problem, result in results.items():
        usages = {
            impl: [request["token_usage"] for request in result[impl]["llm_requests"]]
            for impl in impl_names
        }

        row = {"problem": problem}
        # Token totals first (all jac columns, then all dspy columns) ...
        for impl in impl_names:
            for kind in token_kinds:
                row[f"{impl}_{kind}"] = sum([usage[kind] for usage in usages[impl]])
        # ... then the cost columns, priced request by request.
        for impl in impl_names:
            row[f"{impl}_cost"] = sum([
                usage["completion_tokens"]*llm_costs["completion_tokens"]
                + usage["prompt_tokens"]*llm_costs["prompt_tokens"]
                for usage in usages[impl]
            ])
        rows.append(row)

    return pd.DataFrame(rows)



**TODO** (@kugesan): add graphs visualizing the token usage and cost per problem.

## Time Taken Analysis

In [146]:
import pstats

# Load the DSPy profile of the joke_gen problem. The path is resolved relative
# to RESULTS_DIR rather than a machine-specific absolute path so the notebook
# runs from any checkout.
p_dspy = pstats.Stats(f'{RESULTS_DIR}/joke_gen/dspy/profile.prof')

In [147]:
# Order the DSPy profile by cumulative time, then print the first 30 entries.
p_dspy.sort_stats('cumulative')
p_dspy.print_stats(30)

Thu Jun  6 01:12:09 2024    /Users/chandralegend/Desktop/Jaseci/mtllm-evaluation/results/joke_gen/dspy/profile.prof

         71020 function calls (66822 primitive calls) in 2.801 seconds

   Ordered by: cumulative time
   List reduced from 1130 to 30 due to restriction <30>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
    300/1    0.000    0.000    2.801    2.801 /opt/conda/envs/mtllm-eval/lib/python3.12/importlib/__init__.py:73(import_module)
    301/1    0.000    0.000    2.801    2.801 <frozen importlib._bootstrap>:1375(_gcd_import)
    301/1    0.000    0.000    2.801    2.801 <frozen importlib._bootstrap>:1349(_find_and_load)
      2/1    0.000    0.000    2.801    2.801 <frozen importlib._bootstrap>:1304(_find_and_load_unlocked)
        2    0.000    0.000    2.801    1.400 <frozen importlib._bootstrap>:911(_load_unlocked)
        1    0.000    0.000    2.800    2.800 <frozen importlib._bootstrap_external>:989(exec_module)
      5/4    0.000    0.000 

<pstats.Stats at 0x128da4b00>

In [148]:
# Load the Jac profile of the joke_gen problem. The path is resolved relative
# to RESULTS_DIR rather than a machine-specific absolute path so the notebook
# runs from any checkout.
p_jac = pstats.Stats(f'{RESULTS_DIR}/joke_gen/jac/profile.prof')

In [149]:
# Order the Jac profile by cumulative time, then print the first 30 entries.
p_jac.sort_stats('cumulative')
p_jac.print_stats(30)

Thu Jun  6 01:12:09 2024    /Users/chandralegend/Desktop/Jaseci/mtllm-evaluation/results/joke_gen/jac/profile.prof

         95852 function calls (90266 primitive calls) in 1.300 seconds

   Ordered by: cumulative time
   List reduced from 1206 to 30 due to restriction <30>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
      2/1    0.000    0.000    1.300    1.300 /opt/conda/envs/mtllm-eval/lib/python3.12/site-packages/jaclang/plugin/feature.py:74(jac_import)
      3/1    0.000    0.000    1.300    1.300 /opt/conda/envs/mtllm-eval/lib/python3.12/site-packages/jaclang/vendor/pluggy/_hooks.py:475(__call__)
      3/1    0.000    0.000    1.300    1.300 /opt/conda/envs/mtllm-eval/lib/python3.12/site-packages/jaclang/vendor/pluggy/_manager.py:106(_hookexec)
      3/1    0.000    0.000    1.300    1.300 /opt/conda/envs/mtllm-eval/lib/python3.12/site-packages/jaclang/vendor/pluggy/_callers.py:28(_multicall)
      2/1    0.000    0.000    1.300    1.300 /opt/conda/en

<pstats.Stats at 0x128d92d80>