# LLM Workflow

### Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random

import os
import requests
import json
import glob

# import helper functions from helper functions notebook
%run helper_functions.ipynb

In [None]:
# Bearer token for the Amplify API, read from the environment (None when the variable is unset).
AUTHORIZATION_TOKEN = os.environ.get('AUTHORIZATION_TOKEN')

#### Define Global Variables

In [None]:
# Short engine keys mapped to the model identifiers placed in the request payload
engines = dict([
    ("sonnet-4", "us.anthropic.claude-sonnet-4-20250514-v1:0"),
    ("gpt-5", "gpt-5"),
    ("gpt-41-mini", "gpt-4.1-mini"),
])

# Rubric items, split into the two groups used throughout the notebook
reasoning_cols = [
    "Engagement with Evidence",
    "Goal Orientation",
    "Collaborative Decision-Making",
    "Strategic Planning",
]
hire_cols = [
    "Cultural Responsiveness",
    "Parent/Community Engagement",
    "Academic Achievement",
    "Candidate Experience/Expertise",
    "Evaluation",
    "School Culture Fit",
]

# Every rubric item: reasoning items first, then hiring items
cols = [*reasoning_cols, *hire_cols]

# Display name -> plot color; insertion order doubles as the model display order
color_map = {
    "GPT-5": "#426737",
    "GPT-4.1 mini": "#bab97d",
    "Claude Sonnet 4": "#f9b40d",
}
model_order = list(color_map)

## Step 1: Prepare Rubrics for Human Scorers and LLMs

In [None]:
# Rubric document keys (Amazon S3 bucket), one single-item list per codebook variant
rubric_map = {
    codebook: [document_key]
    for codebook, document_key in (
        ("zero-shot", "jacob.m.rubin@vanderbilt.edu/2025-12-16/c4eba566-840f-4ee1-9158-beb03b013b6f.json"),
        ("examples", "jacob.m.rubin@vanderbilt.edu/2025-12-17/da43579e-2f77-4aeb-90e3-4015ba9a75d8.json"),
        ("cot_examples", "jacob.m.rubin@vanderbilt.edu/2025-12-17/b28011bf-d14c-46d7-8584-be51360d4ecf.json"),
    )
}


In [None]:
# Chat endpoint of Vanderbilt Amplify
url = "https://prod-api.vanderbilt.ai/chat"


def build_payload(message_content, prompt="Case Study Scoring", model_engine=engines["gpt-5"], codebook_type="zero-shot"):
    """Return the JSON-encoded request body for one Amplify chat call.

    Args:
        message_content: Text sent as the single user message.
        prompt: Value stored under ``options.prompt``.
        model_engine: Model identifier, used both as ``model`` and as
            ``options.model.id``.
        codebook_type: Key into ``rubric_map`` selecting the rubric
            document(s) attached as ``dataSources``.

    Returns:
        The payload serialized with ``json.dumps``.

    Raises:
        KeyError: If ``codebook_type`` is not a key of ``rubric_map``.
    """
    user_message = {"role": "user", "content": message_content}

    request_options = {
        "ragOnly": False,
        "skipRag": True,
        "model": {"id": model_engine},
        "assistantId": "",
        "prompt": prompt,
    }

    request_body = {
        "model": model_engine,
        "temperature": 0,
        "max_tokens": 4096,
        "dataSources": rubric_map[codebook_type],
        "messages": [user_message],
        "options": request_options,
    }

    return json.dumps({"data": request_body})


### Verify Rubrics are Uploaded

In [None]:
# Smoke test: ask the model to identify the attached zero-shot rubric and quote from it
payload = build_payload(
    "What is the name of the document uploaded? Provide the full text that scores a 2 on Collaborative Decision-Making.",
    prompt="Testing",
    model_engine=engines["gpt-5"],
    codebook_type="zero-shot",
)

# JSON content type plus bearer authentication for the request
headers = {
    "Content-Type": "application/json",
    "Authorization": "Bearer {}".format(AUTHORIZATION_TOKEN),
}

# Uncomment to actually send the request:
# response = requests.request("POST", url, headers=headers, data=payload)
# response.text

In [None]:
# Check that the model can enumerate the rubric labels from the zero-shot codebook
content_message = "Provide the full list of labels that you will score with using the Rubric Word Document."

payload = build_payload(
    content_message,
    prompt="Testing file name",
    model_engine=engines["gpt-5"],
    codebook_type="zero-shot",
)

# Uncomment to send the request and inspect the "data" field of the reply:
# response = requests.request("POST", url, headers=headers, data=payload)

# response_json = response.json()
# raw_json = response_json["data"]
# raw_json

In [None]:
# Check that the model reports the allowed score values for two rubric items
# (codebook_type is left at build_payload's default)
content_message = "Using the Rubric document, what are the possible values to score for Cultural Responsiveness? What about Engagement with Evidence?"

payload = build_payload(
    content_message,
    prompt="Testing cultural responsiveness",
    model_engine=engines["gpt-41-mini"],
)

# Uncomment to send the request and inspect the "data" field of the reply:
# response = requests.request("POST", url, headers=headers, data=payload)

# response_json = response.json()
# raw_json = response_json["data"]
# raw_json

In [None]:
# Check that the cot_examples codebook exposes its worked examples to the model
content_message = "Using the Rubric document, give me the full text of the examples under the category Strategic Planning."

payload = build_payload(
    content_message,
    prompt="Testing",
    model_engine=engines["gpt-41-mini"],
    codebook_type="cot_examples",
)

# Uncomment to send the request and inspect the "data" field of the reply:
# response = requests.request("POST", url, headers=headers, data=payload)

# response_json = response.json()
# raw_json = response_json["data"]
# raw_json

## Step 2: Split the Dataset and Test LLMs' Basic Capabilities

### Split Dataset

In [None]:
# Load the case-study responses, keeping only the id/text columns of rows with non-blank text
df = pd.read_csv('../../data/Case Study/case_study_responses.csv')
has_text = df['response'].notna() & (df['response'].str.strip() != "")
df = df.loc[has_text, ["record_id", "response"]]
df.shape

In [None]:
# Fixed seed so the split is reproducible
SEED = 42

# A random 75% of the rows becomes the test set ...
test_df = df.sample(frac=0.75, random_state=SEED)

# ... and the remaining 25% becomes the training set (indices are reset on both)
train_df = df.drop(index=test_df.index).reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

### Calculate Number of Scoring Iterations

#### Define Prompt

In [None]:
# Output-format instructions appended to the scoring prompt.
# Both templates are filled with str.format(engine=...), so literal braces are doubled.

# Variant without confidence scores
content_noconf = """
    Output the assessments using JSON with the following format:
    [
        {{"record_id": "<unique id for response>", "Scorer Name": "{engine}",
            "Goal Orientation": "<response>", "Engagement with Evidence": "<response>", ..., "Cultural Responsiveness": "<response>", ...
        }}, 
        ...
    ]
    """

# Variant that additionally requests a 0-1 confidence score per predicted score
content_conf = """
    Please give a confidence score on a scale of 0 to 1 for each predicted score.
    
    Output the assessments using JSON with the following format:
    [
        {{"record_id": "<unique id for response>", "Scorer Name": "{engine}",
            "Goal Orientation": "<response>", "Confidence Score for Goal Orientation": "<response>", "Engagement with Evidence": "<response>", "Confidence Score for Engagement with Evidence": "<response>", ...  
        }}, 
        ...
    ]
    """

In [None]:
def build_content_message(response_list, engine="gpt-5", hasExamples="", content_output_structure=content_noconf):
    """Assemble the full scoring prompt sent to the LLM.

    Args:
        response_list: Responses to score. The prompt describes this as a
            dictionary of ``<record_id> : <response>``; it is interpolated
            into the prompt via its string representation.
        engine: Engine name substituted for ``{engine}`` in the output
            template (it ends up as the requested "Scorer Name").
        hasExamples: Optional extra instruction line inserted near the end of
            the prompt (empty string for none).
        content_output_structure: Output-format template containing an
            ``{engine}`` placeholder.

    Returns:
        The complete prompt text.
    """
    content_output_structure = content_output_structure.format(engine=engine)

    # The rubric label in the last instruction paragraph is spelled exactly like the
    # JSON key the model must emit ("Goal Orientation").
    content_message = f'''I will provide you with a series of responses to a question prompt, and you will use the instructions from the Rubric Word Document to score it with the provided labels. Here is the prompt:
    "It is the beginning of your first school year as principal of Booker T. Washington High/Elementary School, which has an enrollment of 1003/302 students. Below, we will ask you about your approach to leading in three different areas for this first school year. To help you answer these questions, the district has compiled some information about the school for you in an Excel spreadsheet. The spreadsheet contains the following: The first tab is a Staffing Report with summary information about your teachers. For simplicity, it only includes data on math and ELA / K-3 teachers. The next eight tabs, colored blue, contain short summaries from last year's evaluation information for each math and ELA / K-3 teacher. The remaining tabs summarize:
    Student achievement data, Student enrollment data, Data from recent teacher and student surveys, and Data on teacher turnover. 
    
    Two of your teachers have already indicated that they probably will leave at the end of this school year. Patrick Houser, a Geometry/1st grade teacher, has said he intends to retire. Barbara Nelson, an English III/3rd grade teacher, plans to move out of state to be closer to her aging parents. It's too early to know about others. 
    Given enrollment projections, you also anticipate needing an additional English/Kindergarten teacher in the following school year. 
    Given what you know now, describe how you would approach teacher retention this year. Then, begin mapping out an approach to teacher hiring and placement for the following year. Discuss the reasoning behind your approach: 
    What are your goals? How are you achieving those goals?"

    The responses are in a dictionary object with the format <record_id> : <response>. Here are the responses:
    {response_list}

    {content_output_structure}
    
    The labels to output are defined with "Label" in the rubric document. For example, "Label: Goal Orientation", "Label: Engagement with Evidence", etc.
    The instructions on how to score each label are provided in the uploaded codebook document. For example, for the "Goal Orientation" item, you will use the "Instructions: Only score 1 through 4. Score a 4 if respondent identifies goals...". So the value for the "Goal Orientation" item will only be a number between 1 and 4 (never 0 or greater than 4).
    {hasExamples}
    Only output JSON. Do not output anything else. 
    '''
    return content_message

#### Define API Call to Amplify

In [None]:
def _strip_code_fence(text):
    """Return ``text`` without a surrounding Markdown code fence (``` or ```json)."""
    text = text.strip()
    if text.startswith("```"):
        text = text[3:]
        if text[:4].lower() == "json":
            text = text[4:]
    if text.endswith("```"):
        text = text[:-3]
    return text.strip()


def call_amplify_api(df=train_df, engine_key="gpt-5", iterations=1, chunk_size=8, 
                     call_type="by_item", file_ext="05142025", codebook_type="zero-shot", 
                     hasConfidenceScores=False):
    """Score responses with an LLM through the Amplify API and save the results as CSV.

    For each iteration the rows are shuffled, split into chunks of
    ``chunk_size`` and sent one chunk per request. Every successfully parsed
    reply becomes a DataFrame tagged with ``call_number`` and ``prompt_name``.
    Failed requests and unparsable replies are reported with ``print`` and
    skipped.

    Args:
        df: DataFrame whose first column holds the record id and whose second
            column holds the response text. NOTE: the default is bound to
            ``train_df`` as it exists when this function is defined.
        engine_key: Key into ``engines`` selecting the model.
        iterations: Number of complete scoring passes over ``df``.
        chunk_size: Number of responses sent per request.
        call_type: Output directory prefix; files go to
            ``{call_type}/{engine_key}/``.
        file_ext: Suffix of the output file name
            (``{engine_key}_{file_ext}.csv``).
        codebook_type: Key into ``rubric_map``; any value other than
            ``"zero-shot"`` adds the "use the examples" instruction.
        hasConfidenceScores: When true, use the output template that also
            requests confidence scores.

    Returns:
        The list of per-chunk DataFrames (empty if nothing could be parsed,
        in which case no file is written).
    """
    # Define headers
    headers = {
      'Content-Type': 'application/json',
      'Authorization': f"Bearer {AUTHORIZATION_TOKEN}"
    }

    engine = engines[engine_key]

    # Remove rows where response is nan or an empty string (loop-invariant, so done once)
    df = df[df.iloc[:, 1].notna() & (df.iloc[:, 1].astype(str).str.strip() != "")]

    # Prompt pieces that do not change between chunks
    examples_hint = "Use the examples to help guide scoring." if codebook_type != "zero-shot" else ""
    output_structure = content_conf if hasConfidenceScores else content_noconf

    all_dfs = []

    # Number of scoring iterations
    for j in range(iterations):

        # Randomize row order
        df = df.sample(frac=1).reset_index(drop=True)

        # Randomize prompt name
        random_number = random.random()
        prompt = f'Testing Prompt: {random_number}'
        print("Prompt: ", prompt, "Call iteration: ", j)

        # Breaking up scoring because of JSON token limitations. Stepping by chunk_size
        # never yields an empty trailing chunk, even when len(df) is an exact multiple.
        for i, start in enumerate(range(0, len(df), chunk_size)):
            stop = min(start + chunk_size, len(df))
            print("Scoring responses: ", start, "through", stop)

            # API only allows max 4096 tokens, so only grade chunk_size responses at a time.
            sample_df = df[start:stop]
            data_dict = dict(zip(sample_df.iloc[:, 0], sample_df.iloc[:, 1]))

            # Build prompt message
            content = build_content_message(
                response_list=data_dict,
                engine=engine,
                hasExamples=examples_hint,
                content_output_structure=output_structure,
            )
            payload = build_payload(content, prompt=prompt, model_engine=engine, codebook_type=codebook_type)

            try:
                # Make API call
                response = requests.request("POST", url, headers=headers, data=payload)
                response_json = response.json()
                raw_json = response_json["data"]
            except Exception as e:
                print(f"Error on request or response parsing at call {j}, chunk {i}: {e}")
                continue

            # Check for an empty payload BEFORE calling string methods on it
            if not raw_json:
                print(f"Warning: Empty 'data' field in response on call {j}, chunk {i}")
                continue

            try:
                # Remove the Markdown fence as a prefix/suffix (str.strip(chars) would
                # instead strip any of those characters), then undo escaped underscores.
                cleaned_json = _strip_code_fence(raw_json).replace('\\_', '_')
                data = json.loads(cleaned_json)
                llm_df = pd.DataFrame(data)
            except Exception as e:
                print(f"Error parsing JSON on call number {j}: {e}")
                print("Response JSON: ", response_json)
                continue  # Skip this chunk

            llm_df['call_number'] = j  # Add call number column
            llm_df['prompt_name'] = prompt  # Add prompt name column
            all_dfs.append(llm_df)

    # pd.concat raises on an empty list, so bail out instead of crashing after all the calls
    if not all_dfs:
        print(f"Warning: no scores were parsed for {engine_key}; no file written")
        return all_dfs

    # Make the directory path if it doesn't exist
    output_dir = f"{call_type}/{engine_key}"
    os.makedirs(output_dir, exist_ok=True)

    # Build base file path
    file_path = os.path.join(output_dir, f"{engine_key}_{file_ext}.csv")

    # If file exists, add _1, _2, _3...
    counter = 1
    while os.path.exists(file_path):
        file_path = os.path.join(output_dir, f"{engine_key}_{file_ext}_{counter}.csv")
        counter += 1

    # Save file
    final_df = pd.concat(all_dfs, ignore_index=True)
    final_df.to_csv(file_path, index=False)

    return all_dfs

#### Call API for Training Data

In [None]:
# Codebook variants to score on the training set
codebook_types = ['zero-shot']

# Engines to run; every entry is commented out, so the loop below is a no-op
# until at least one is re-enabled
engines_to_run = {
    # "sonnet-4": "us.anthropic.claude-sonnet-4-20250514-v1:0",
    # "gpt-5": "gpt-5",
    # "gpt-41-mini": "gpt-4.1-mini",
}

# NOTE(review): results are written under scores/train/..., while the RMSE cells read
# from scores/training_data/... - confirm the files are moved or the paths are meant to differ.
for codebook_type in codebook_types:
    for engine_name, engine_key in engines_to_run.items():
        print("Running {} with {}...".format(engine_name, codebook_type))
        res = call_amplify_api(
            df=train_df,
            engine_key=engine_name,
            iterations=50,
            chunk_size=20,
            call_type='scores/train/' + codebook_type,
            file_ext="12162025",
            codebook_type=codebook_type,
        )

#### Create RMSE Curves

In [None]:
# Raw zero-shot training scores, one frame per model
gpt_5_df = concat_raw_scores("gpt-5", path='scores/training_data/zero-shot/')
sonnet_4_df = concat_raw_scores("sonnet-4", path='scores/training_data/zero-shot/')
gpt_41_mini_df = concat_raw_scores("gpt-41-mini", path='scores/training_data/zero-shot/')

# Stack the three frames, convert values to numerics, then handle invalid values
training_df = handle_invalid_values(
    convert_vals_numeric(
        pd.concat([gpt_5_df, sonnet_4_df, gpt_41_mini_df])
    )
)

In [None]:

# --- Configuration to create the RMSE curve ---

# Raw "Scorer Name" values -> display names
model_name_map = {
    "us.anthropic.claude-sonnet-4-20250514-v1:0": 'Claude Sonnet 4',
    "gpt-4.1-mini": "GPT-4.1 mini",
    "gpt-5": "GPT-5",
}

# Rubric items per group
reasoning_cols = [
    "Engagement with Evidence",
    "Goal Orientation",
    "Collaborative Decision-Making",
    "Strategic Planning",
]
hire_cols = [
    "Cultural Responsiveness",
    "Parent/Community Engagement",
    "Academic Achievement",
    "Candidate Experience/Expertise",
    "Evaluation",
    "School Culture Fit",
]
column_groups = {"Reasoning": reasoning_cols, "Hiring Priorities": hire_cols}

# Largest sub-sample size, and number of resamples drawn per size
max_sample = 50
n_iterations = 50

# Add a display-name column next to the raw scorer name
training_df["Scorer Name Clean"] = training_df["Scorer Name"].map(model_name_map)

# Models to analyse, in plotting order
llms = ['GPT-5', 'GPT-4.1 mini', 'Claude Sonnet 4']

# --- Function to compute RMSE curve for one column ---
def compute_rmse_curve(values, max_sample=50, n_iterations=50):
    """Compute RMSE vs. sub-sample size for a single column.

    For every sample size from 1 to ``max_sample``, draw ``n_iterations``
    samples with replacement (via the global NumPy random state), take each
    sample's mean, and report the root-mean-square deviation of those means
    from the mean of all non-NaN values.

    Args:
        values: Scores as an array-like of numbers (list, ndarray, ...);
            NaN entries are ignored.
        max_sample: Largest sub-sample size evaluated.
        n_iterations: Number of resamples drawn per sub-sample size.

    Returns:
        Array of length ``max_sample``; entry ``k`` is the RMSE for sample
        size ``k + 1``. All NaN when ``values`` has no non-NaN entries.
    """
    # asarray lets plain lists through the boolean mask below
    values = np.asarray(values, dtype=float)
    values = values[~np.isnan(values)]

    # Nothing to resample from: np.random.choice would raise on an empty array
    if values.size == 0:
        return np.full(max_sample, np.nan)

    overall_mean = values.mean()
    rmse_curve = []
    for sample_size in range(1, max_sample + 1):
        sample_means = np.array([
            np.random.choice(values, size=sample_size, replace=True).mean()
            for _ in range(n_iterations)
        ])
        rmse_curve.append(np.sqrt(np.mean((sample_means - overall_mean) ** 2)))
    return np.array(rmse_curve)

# --- Compute RMSE curves by LLM and group ---
# Maps group name -> {display model name -> averaged RMSE curve}
results_by_group = {}

# The loop variable is named `group_cols` (not `cols`) so that the notebook-level
# `cols` list holding ALL rubric items is not overwritten by the last group.
for group_name, group_cols in column_groups.items():
    group_results = {}

    for llm in llms:
        df_llm = training_df[training_df["Scorer Name Clean"] == llm]

        # Compute RMSE curve for each column, then average across columns
        item_curves = []
        for col in group_cols:
            if col in df_llm.columns:
                values = df_llm[col].dropna().values
                if len(values) > 0:
                    item_curves.append(compute_rmse_curve(values, max_sample, n_iterations))

        # Average RMSE curves (if at least one item present); otherwise an all-NaN curve
        if item_curves:
            mean_curve = np.nanmean(item_curves, axis=0)
            group_results[llm] = mean_curve
        else:
            group_results[llm] = np.full(max_sample, np.nan)

    results_by_group[group_name] = group_results

In [None]:
# Plot the averaged RMSE curves per rubric group and mark each model's knee point.
from kneed import KneeLocator

# --- Plot two subplots side-by-side (one per entry of results_by_group) ---
fig, axes = plt.subplots(1, 2, figsize=(18, 6), sharey=True)

elbows_summary = {}  # store elbow per group per LLM

for ax, (group_name, rmse_results) in zip(axes, results_by_group.items()):
    elbows_summary[group_name] = {}
    
    for llm in llms:
        y = rmse_results[llm]
        # x axis: sub-sample sizes 1..max_sample, matching the curve length
        x = np.arange(1, max_sample + 1)

        # Plot RMSE curve
        ax.plot(x, y, color=color_map[llm], label=llm, linewidth=2)

        # Detect elbow (only if no NaNs and curve is valid)
        if np.all(np.isfinite(y)) and len(np.unique(y)) > 3:
            kneedle = KneeLocator(
                x, y, curve="convex", direction="decreasing",
                
            )
            elbow = kneedle.knee

            if elbow is not None:
                elbows_summary[group_name][llm] = elbow
                # Dashed vertical line at the knee, labelled with its x value
                # just above the bottom of the curve's range
                ax.axvline(elbow, color=color_map[llm], linestyle="--", alpha=0.6)
                ax.text(
                    elbow, np.nanmin(y) + (np.nanmax(y) - np.nanmin(y)) * 0.05,
                    f"{elbow:.0f}", color=color_map[llm],
                    rotation=90, va="bottom", ha="right", fontsize=8
                )
            else:
                # No knee found for this curve
                elbows_summary[group_name][llm] = np.nan
        else:
            # Curve contains non-finite values or too few distinct values
            elbows_summary[group_name][llm] = np.nan
    
    ax.set_title(group_name)
    ax.set_xlabel("# of Scoring Iterations")
    ax.grid(True, alpha=0.3)

# Shared y axis: label only the first panel; legend goes outside the last panel
axes[0].set_ylabel("Average RMSE")
axes[-1].legend(title="Model", bbox_to_anchor=(1.05, 1), loc='upper left')

# plt.suptitle("Average RMSE vs. Sub-sample Size Across Rubric Categories", fontsize=14)
plt.tight_layout(rect=[0, 0, 0.95, 0.95])
# NOTE(review): savefig assumes ./output/training/ already exists - confirm
plt.savefig("./output/training/RMSE_iterations_plot_avg_rmse_by_group_with_kneedle.png", dpi=300, bbox_inches='tight')
plt.show()


# One row per rubric group, one column per model
elbows_df = pd.DataFrame(elbows_summary).T
display(elbows_df)

#### Concat Training Dataframe

In [None]:
# Folder under scores/ holding the raw training-set score files
dataset = 'training_data'

# Raw zero-shot scores, one frame per model
gpt_5_df = concat_raw_scores("gpt-5", path=f'scores/{dataset}/zero-shot/')
sonnet_4_df = concat_raw_scores("sonnet-4", path=f'scores/{dataset}/zero-shot/')
gpt_41_mini_df = concat_raw_scores("gpt-41-mini", path=f'scores/{dataset}/zero-shot/')

iters_df = pd.concat([gpt_5_df, sonnet_4_df, gpt_41_mini_df])
# Report on the combined training frame (the message previously named testing_df
# and counted only the GPT-5 frame)
print('Number of unique records in iters_df', len(iters_df['record_id'].unique()))

# # Convert values to numerics
iters_df = convert_vals_numeric(iters_df)
    
# # Handle invalid values
iters_df = handle_invalid_values(iters_df)

# filtered_df = concat_llm_human_df(iters_df, is_training_data=True)
# filtered_df.to_csv('./scores/filtered/training_data_df.csv', index=False)
# filtered_df.shape

## Step 3: Run Zero-Shot, Few-Shot, In-Context-Learning (ICL) Scoring

### Test Set Scoring

In [None]:
# Codebook variants to score on the test set
codebook_types = ['zero-shot', 'examples', 'cot_examples']

# Engines to run; every entry is commented out, so the loop below is a no-op
# until at least one is re-enabled
engines_to_run = {
    # "sonnet-4": "us.anthropic.claude-sonnet-4-20250514-v1:0",
    # "gpt-41-mini": "gpt-4.1-mini",
    # "gpt-5": "gpt-5",
}

# 9 scoring passes per (codebook, engine) pair, 20 responses per request
for codebook_type in codebook_types:
    for engine_name, engine_key in engines_to_run.items():
        print("Running {} with {}...".format(engine_name, codebook_type))
        res = call_amplify_api(
            df=test_df,
            engine_key=engine_name,
            iterations=9,
            chunk_size=20,
            call_type='scores/' + codebook_type,
            file_ext="12172025",
            codebook_type=codebook_type,
        )


### Generate Uncertainty Scores

In [None]:
# Codebook variants for which confidence scores are collected
codebook_types = ['zero-shot', 'examples', "cot_examples"]

# Engines to run; every entry is commented out, so the loop below is a no-op
# until at least one is re-enabled
engines_to_run = {
    # "sonnet-4": "us.anthropic.claude-sonnet-4-20250514-v1:0",
    # "gpt-41-mini": "gpt-4.1-mini",
    # "gpt-5": "gpt-5",
}

# A single pass per (engine, codebook) pair, 10 responses per request,
# with confidence scores requested in the output format
for engine_name, engine_key in engines_to_run.items():
    for codebook_type in codebook_types:
        print("Running {} with {}...".format(engine_name, codebook_type))
        res = call_amplify_api(
            df=test_df,
            engine_key=engine_name,
            iterations=1,
            chunk_size=10,
            call_type='scores/uncertainty/' + codebook_type,
            file_ext="12172025",
            codebook_type=codebook_type,
            hasConfidenceScores=True,
        )


## Step 4: Evaluate Compliance, Variation, Consistency, Uncertainty

In [None]:

def evaluation_workflow(codebook_type, valid_ids, calc_compliance=False):
    """Run the evaluation steps for one codebook variant.

    Loads the raw scores of the three models from ``scores/{codebook_type}``,
    drops records whose id is in ``training_ids``, then runs the compliance
    (optional), variation and uncertainty helpers and writes the combined
    LLM/human frame to ``scores/filtered/{codebook_type}_df.csv``.

    NOTE(review): ``training_ids`` is read as a global that is not defined in
    this notebook's visible cells - presumably provided by the helper
    notebook; confirm it exists before calling.

    Args:
        codebook_type: Codebook variant, used in the input path, passed to the
            compliance/variation helpers and used in the output file name.
        valid_ids: Record ids passed to ``run_compliance_checks``.
        calc_compliance: Whether to run the compliance checks.

    Returns:
        Dict with keys ``compliance_df`` (an empty list when
        ``calc_compliance`` is false), ``variation_df``, ``uncertainty_df``
        and ``filtered_df``.
    """
    # Concat Scores Data from all models
    gpt_5_df = concat_raw_scores("gpt-5", path=f'scores/{codebook_type}')
    sonnet_4_df = concat_raw_scores("sonnet-4", path=f'scores/{codebook_type}')
    gpt_41_mini_df = concat_raw_scores("gpt-41-mini", path=f'scores/{codebook_type}')

    # Combine all data into singular dataframe, excluding training records
    testing_df = pd.concat([gpt_5_df, sonnet_4_df,  gpt_41_mini_df])
    testing_df = testing_df[~testing_df['record_id'].isin(training_ids)]

    print('Number of unique records in testing_df', len(testing_df['record_id'].unique()))

    # Convert values
    testing_df = convert_vals_numeric(testing_df)

    # Evaluate Compliance (runs before invalid values are handled)
    compliance_df = []
    if calc_compliance:
        compliance_df = run_compliance_checks(testing_df, valid_ids, codebook_type)

    # Handle invalid values
    testing_df = handle_invalid_values(testing_df)

    # Evaluate Variation
    variation_df = save_variation_df(testing_df, codebook_type)

    # Evaluate Uncertainty. This call does not take codebook_type; it is always
    # built from all three codebook variants.
    uncertainty_df = compute_entropy_pivot(
        testset_df=build_testset_df(
            codebook_types=["zero-shot", "examples", "cot_examples"],
            training_ids=training_ids,
        ),
        rubric_items=cols,
        reasoning_cols=reasoning_cols,
        hire_cols=hire_cols,
        scorer_name_map={
            "us.anthropic.claude-sonnet-4-20250514-v1:0": "Claude 4 Sonnet",
            "gpt-4.1-mini": "GPT-41-mini",
            "gpt-5": "GPT-5",
        },
        scorer_order=["GPT-5", "GPT-41-mini", "Claude 4 Sonnet"],
    )

    # Create singular dataframe for calculating ICC/consistency later
    filtered_df = concat_llm_human_df(testing_df, rounded=False)
    # to_csv does not create missing directories
    os.makedirs("scores/filtered", exist_ok=True)
    filtered_df.to_csv(f"scores/filtered/{codebook_type}_df.csv")

    return {
        "compliance_df": compliance_df,
        "variation_df": variation_df,
        "uncertainty_df": uncertainty_df,
        "filtered_df": filtered_df
    }

# Evaluate every codebook variant; results are keyed by variant name
valid_ids = np.array(test_df["record_id"])
results = {}
for codebook_type in ("zero-shot", "examples", "cot_examples"):
    workflow_output = evaluation_workflow(codebook_type, valid_ids, calc_compliance=True)
    results[codebook_type] = workflow_output