# Imports

In [None]:
import os
from tools.basic import  map_label, SUBKEY_TO_PARENT, group_root_subkeys_under_correct_parents

from typing import Any, Dict
from definitions.coding_manuals import  system_prompt_whole
from definitions.instructions import inference_instructions_whole
from definitions.models import AttributionResponse
import json
from tqdm import tqdm
import time
import asyncio
from tqdm import tqdm
import nest_asyncio
import matplotlib.pyplot as plt
from src.dependencies import client
from src.reflexion import process_entry_with_reflect
from src.experts import process_entry
from src.utils import merge_multiple_ai_runs, comparison_mode, matrix, calculate_f1_scores, calculate_f1_scores_from_path

# Set up environment, client and data

In [None]:
# Patch asyncio so asyncio.run() can be called from inside the notebook's
# already-running event loop (used by the Run B/C/D cells).
nest_asyncio.apply()

# Root folder under which every run writes its per-subject JSON output.
output_folder = "ai_json_output"

# Attribution category codes. The comparison cell looks them up in the
# human / AI entries as "Human_<code>" and "AI_<code>".
categories = [
    'Int_U', 'Int_D',
    'Con_UU', 'Con_CR',
    'LOC_E', 'LOC_IGL', 'LOC_IBU',
    'Perm_FC', 'Perm_SU',
]

# Human-coded entries; later cells read 'subject_code' and 'transcript'
# from each entry.
with open("data/human_coding/human_coding_with_transcript.json", "r", encoding="utf-8") as transcript_file:
    human_coding_with_transcript = json.load(transcript_file)

# RUN A: Baseline

## Elaboration

In Run A, we run inference on each transcript using the PASS manual. Each transcript is processed 3 times at temperature 1 (the only temperature valid for o4-mini-2025-04-16) to account for the non-deterministic nature of the responses.


## Execution

In [None]:
# Run A (baseline): one whole-manual inference call per repetition,
# `runs` repetitions per transcript, saved as one JSON list per subject.
model_name = "o4-mini-2025-04-16"
temp = 1
run_segment = 'A'
runs = 3

print(f"Running with temperature: {temp}")

run_id = f"Run_{run_segment}_model_{model_name}_temp_{temp}"

# All subjects of this run share one output folder, so create it once
# instead of on every loop iteration.
file_folder = f"{output_folder}/{run_id}"
os.makedirs(file_folder, exist_ok=True)

for entry in tqdm(human_coding_with_transcript):
    file_name = f"{entry['subject_code']}.json"
    file_path = f"{file_folder}/{file_name}"

    # Resume support: skip subjects whose output already exists.
    # Same check as the Run B/C/D cells, and avoids listing the whole folder.
    if os.path.exists(file_path):
        continue

    run_compiled = []

    for _ in range(runs):
        # NOTE(review): the same seed is sent on every repetition; if the
        # backend honours it, the repetitions may be near-identical, which
        # would weaken the "3 runs" averaging. Confirm this is intended.
        completion = client.chat.completions.parse(
            model=model_name,
            messages=[
                {"role": "system",
                 "content": system_prompt_whole},
                {"role": "user",
                 "content": "Test Case Interview Transcript: \n" + entry['transcript']},
                {"role": "user",
                 "content": "INSTRUCTIONS: " + inference_instructions_whole},
            ],
            response_format=AttributionResponse,
            temperature=temp,
            top_p=1,
            presence_penalty=0,
            frequency_penalty=0,
            seed=42)

        # Short pause between consecutive API calls.
        time.sleep(1)

        message = completion.choices[0].message
        if message.content is None:
            # Fail with a readable message instead of json.loads(None)'s TypeError.
            raise RuntimeError(
                f"No content returned for {entry['subject_code']} "
                f"(refusal: {getattr(message, 'refusal', None)!r})"
            )
        run_compiled.append(json.loads(message.content))

    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(run_compiled, f, indent=2, ensure_ascii=False)

    # Longer pause between subjects; done after the file is closed so the
    # handle is not held open while sleeping.
    time.sleep(10)

## Calculate F1

In [None]:
# Rebuild the Run A identifier here so the saved outputs can be scored
# without re-running the execution cell.
run_segment = 'A'
model_name = "o4-mini-2025-04-16"
temp = 1
run_id = f"Run_{run_segment}_model_{model_name}_temp_{temp}"
path = f"{output_folder}/{run_id}"

# NOTE(review): the trailing 50 is presumably the match threshold (the
# comparison cell sweeps thresholds from 95 down to 50) - confirm against
# calculate_f1_scores_from_path.
run_A_f1 = calculate_f1_scores_from_path(
    human_coding_with_transcript,
    path,
    50,
)

In [None]:
# Display the Run A F1 scores computed in the previous cell.
run_A_f1

## Conclusion

The baseline run with o4-mini-2025-04-16 at temperature 1 yields 'micro_f1': 0.442.

# RUN B: Split by Experts

## Elaboration

In Run B, for each transcript, we run inference using the PASS manual 3 times at temperature 1 with o4-mini-2025-04-16.
- Instead of inferring all dimensions in a single call, each call focuses on only 1 dimension. The per-dimension results are then aggregated.


## Execution

In [None]:
# Run B configuration: one expert call per dimension, `runs` repetitions.
model_name = "o4-mini-2025-04-16"
temp = 1
runs = 3
run_id = f"Run_B_model_{model_name}_temp_{temp}_experts"


async def main():
    """Process, one at a time, every transcript that has no saved Run B output yet."""
    for entry in tqdm(human_coding_with_transcript):
        file_folder = f"{output_folder}/{run_id}"
        os.makedirs(file_folder, exist_ok=True)
        file_path = os.path.join(file_folder, f"{entry['subject_code']}.json")

        # Only subjects without an existing output file are processed,
        # which makes the cell safe to re-run after an interruption.
        if not os.path.exists(file_path):
            await process_entry(entry, run_id, model_name, temp, file_path, runs)
            # Pause for a minute after each processed subject.
            await asyncio.sleep(60)


# Works inside the notebook because nest_asyncio was applied in the setup cell.
asyncio.run(main())

## Calculate F1

In [None]:
# Rebuild the Run B identifier here so the saved outputs can be scored
# without re-running the execution cell.
run_segment = 'B'
model_name = "o4-mini-2025-04-16"
temp = 1
run_id = f"Run_{run_segment}_model_{model_name}_temp_{temp}_experts"
path = f"{output_folder}/{run_id}"

# NOTE(review): the trailing 50 is presumably the match threshold (the
# comparison cell sweeps thresholds from 95 down to 50) - confirm against
# calculate_f1_scores_from_path.
run_B_f1 = calculate_f1_scores_from_path(
    human_coding_with_transcript,
    path,
    50,
)

In [None]:
# Display the Run B F1 scores computed in the previous cell.
run_B_f1

## Conclusion

Splitting by experts improves 'micro_f1' to 0.536, compared to the baseline 'micro_f1' of 0.442.

# RUN C: Split by Experts + Reflexion

## Elaboration

In Run C, each expert's response on a transcript goes through 2 rounds of reflection and revision. Each reflection flags attribution misclassifications, suggests additions/removals, and ensures that the inclusion/exclusion criteria are met. The output of the 2nd revision is used as the final result.

## Execution

In [None]:
# Run C configuration: experts plus reflection rounds.
model_name = "o4-mini-2025-04-16"
temp = 1
run_id = f"Run_C_model_{model_name}_temp_{temp}_experts_reflexion"


async def main():
    """Process, one at a time, every transcript that has no saved Run C output yet."""
    for entry in tqdm(human_coding_with_transcript):
        file_folder = f"{output_folder}/{run_id}"
        os.makedirs(file_folder, exist_ok=True)
        file_path = os.path.join(file_folder, f"{entry['subject_code']}.json")

        # Only subjects without an existing output file are processed.
        # NOTE(review): file_path is not passed on, so this check assumes
        # process_entry_with_reflect writes to the same location - confirm.
        if not os.path.exists(file_path):
            # Last argument is False here and True in Run D; presumably it
            # toggles the retrieval of reference transcripts - confirm.
            await process_entry_with_reflect(entry, run_id, model_name, temp, False)
            # Pause for a minute after each processed subject.
            await asyncio.sleep(60)


# Works inside the notebook because nest_asyncio was applied in the setup cell.
asyncio.run(main())

## Calculate F1

In [None]:
# Rebuild the Run C identifier here so the saved outputs can be scored
# without re-running the execution cell.
model_name = "o4-mini-2025-04-16"
temp = 1
run_id = f"Run_C_model_{model_name}_temp_{temp}_experts_reflexion"
path = f"{output_folder}/{run_id}"

# NOTE(review): the trailing 50 is presumably the match threshold (the
# comparison cell sweeps thresholds from 95 down to 50) - confirm against
# calculate_f1_scores_from_path.
run_C_f1 = calculate_f1_scores_from_path(
    human_coding_with_transcript,
    path,
    50,
)

In [None]:
# Display the Run C F1 scores computed in the previous cell.
run_C_f1

## Conclusion

Reflexion improves 'micro_f1' to 0.565, compared to the previous best 'micro_f1' of 0.536.

# Run D: with retrieval of references

## Elaboration

Prior to the 1st reflection, retrieve the 3 most similar transcripts by embedding similarity and include their human codings as few-shot examples.

## Execution

In [None]:
# Run D configuration: experts plus reflection, with reference transcripts.
model_name = "o4-mini-2025-04-16"
temp = 1
run_id = f"Run_D_model_{model_name}_temp_{temp}_experts_reflexion_with_references"


async def main():
    """Process, one at a time, every transcript that has no saved Run D output yet."""
    for entry in tqdm(human_coding_with_transcript):
        file_folder = f"{output_folder}/{run_id}"
        os.makedirs(file_folder, exist_ok=True)
        file_path = os.path.join(file_folder, f"{entry['subject_code']}.json")

        # Only subjects without an existing output file are processed.
        # NOTE(review): file_path is not passed on, so this check assumes
        # process_entry_with_reflect writes to the same location - confirm.
        if not os.path.exists(file_path):
            # Last argument is True here and False in Run C; presumably it
            # enables the retrieval of reference transcripts - confirm.
            await process_entry_with_reflect(entry, run_id, model_name, temp, True)
            # Pause for a minute after each processed subject.
            await asyncio.sleep(60)


# Works inside the notebook because nest_asyncio was applied in the setup cell.
asyncio.run(main())

## Calculate F1

In [None]:
# Rebuild the Run D identifier here so the saved outputs can be scored
# without re-running the execution cell.
model_name = "o4-mini-2025-04-16"
temp = 1
run_id = f"Run_D_model_{model_name}_temp_{temp}_experts_reflexion_with_references"
path = f"{output_folder}/{run_id}"

# NOTE(review): the trailing 50 is presumably the match threshold (the
# comparison cell sweeps thresholds from 95 down to 50) - confirm against
# calculate_f1_scores_from_path.
run_D_f1 = calculate_f1_scores_from_path(
    human_coding_with_transcript,
    path,
    50,
)

In [None]:
# Display the Run D F1 scores computed in the previous cell.
run_D_f1

## Conclusion

References improve 'micro_f1' to 0.572, compared to the previous best 'micro_f1' of 0.565.

# Graph comparison with ChatGPT Interface results

In [None]:
# Sweep the match threshold and compute F1 scores for two sources of AI
# codings: the ChatGPT-interface baseline and the Run D pipeline output.
# Result shape: out[str(threshold)][source_name] = calculate_f1_scores(...)

# Subject codes left out of the comparison. Built once as a set rather than
# re-created as a list literal for every file.
# NOTE(review): the reason for excluding these subjects is not recorded in
# this notebook - confirm and document it.
excluded_subjects = {
    "C1012M", "C678M", "C642F", "EARC003M", "EARC004M", "C639M", "C616M",
    "EARC002M", "EARC006M", "C1000F", "C1000M", "C1036F", "C1036M", "C1031F",
    "C1031M", "C1009F", "C1057F", "C1057M", "C662M", "C629M", "C625M", "C613F",
    "C613M", "EARC011M", "EARC013F", "EARC013M", "EARC014M", "EARC027M",
    "U165F", "U165M", "EARC092M", "EARC116M",
}

# Index the human codings by subject code once, replacing a full list scan
# per file. setdefault keeps the first entry for a duplicated code, which is
# what the previous "[...][0]" lookup selected.
human_by_subject = {}
for human_record in human_coding_with_transcript:
    human_by_subject.setdefault(human_record["subject_code"], human_record)

out = {}
paths = [["chatgpt","ai_json_output/chatgpt_baseline"], ["o4-mini","ai_json_output/Run_D_model_o4-mini-2025-04-16_temp_1_experts_reflexion_with_references"]]
for threshold in [95, 90, 85, 80, 75, 70, 65, 60, 55, 50]:
    out[str(threshold)] = {}
    for path_info in paths:
        source_name, path = path_info
        all_matrixes = []
        for oneJson in os.listdir(path):
            # Only real JSON outputs are scored; a suffix test avoids picking
            # up names that merely contain ".json" (e.g. "x.json.bak").
            if not oneJson.endswith('.json'):
                continue

            subject_code = oneJson.split(".")[0]
            if subject_code in excluded_subjects:
                continue

            if subject_code not in human_by_subject:
                # Explicit failure instead of a bare IndexError on "[0]".
                raise KeyError(
                    f"No human coding found for subject {subject_code!r} "
                    f"(file {path}/{oneJson})"
                )
            human_entry = human_by_subject[subject_code]

            # The file is re-read for every threshold on purpose: the helpers
            # below are not visible here and might mutate their inputs.
            pathfile = f"{path}/{oneJson}"
            with open(pathfile, "r", encoding="utf-8") as f:
                inference = json.load(f)

            # Pipeline outputs are merged and passed through comparison_mode
            # first; the ChatGPT baseline files are used as loaded.
            if source_name == 'o4-mini':
                ai_entry = comparison_mode(merge_multiple_ai_runs(inference))
            else:
                ai_entry = inference

            # One matrix per category for this subject.
            one_entry_matrixes = {}
            for acat in categories:
                acat_human = 'Human_'+acat
                acat_ai = 'AI_'+acat
                one_entry_matrixes[acat] = matrix(human_entry[acat_human], ai_entry[acat_ai], threshold)

            all_matrixes.append(one_entry_matrixes)

        out[str(threshold)][source_name] = calculate_f1_scores(all_matrixes)

In [None]:
# Plot macro/micro F1 against the match threshold for both sources.
# Thresholds are ordered from highest to lowest (95 down to 50); they are
# string keys, so the x-axis follows exactly this order.
x_reversed = sorted(out, key=int, reverse=True)

# (legend label, source key in `out`, metric key, marker)
curve_specs = [
    ('Chat Interface - Macro F1', 'chatgpt', 'macro_f1', 'o'),
    ('Chat Interface - Micro F1', 'chatgpt', 'micro_f1', 'o'),
    ('Approach D - Macro F1', 'o4-mini', 'macro_f1', 's'),
    ('Approach D - Micro F1', 'o4-mini', 'micro_f1', 's'),
]

# Collect every series before touching the figure, so a missing key fails
# before any figure is created.
curves = []
for legend_label, source_key, metric_key, marker_style in curve_specs:
    scores = [out[k][source_key][metric_key] for k in x_reversed]
    curves.append((legend_label, scores, marker_style))

plt.figure(figsize=(12, 6))
for legend_label, scores, marker_style in curves:
    plt.plot(x_reversed, scores, label=legend_label, marker=marker_style)

plt.xlabel("Threshold")
plt.ylabel("F1 Score")
plt.title("Macro and Micro F1 Scores by Threshold (Descending)")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.xticks(rotation=45)

plt.show()