In [3]:
import pandas as pd
import plotly.graph_objects as go
import numpy as np

In [4]:
CSV_PATH = "llm_judge_final_20251026_161301.csv"
ANALYSIS_FORMAT = 'plain'

# Load and filter data
df = pd.read_csv(CSV_PATH)
df_filtered = df.dropna(subset=['case_full_response_pass1_str', 'case_full_response_pass2_str']).copy()

print(f"Loaded {len(df_filtered)} rows with plain text responses")

Loaded 4800 rows with plain text responses


In [5]:
def shorten_model_name(name):
    name_mapping = {
        'meta-llama/Llama-3.2-3B-Instruct': 'Llama 3.2 3B',
        'meta-llama/Llama-3.2-1B-Instruct': 'Llama 3.2 1B',
        'anthropic/claude-sonnet-4': 'Claude Sonnet 4',
        'google/gemini-2.0-flash': 'Gemini 2.0',
        'google/gemini-2.5-flash': 'Gemini 2.5',
        'google/medgemma-27b-text-it': 'MedGemma 27B',
        'google/medgemma-4b-it': 'MedGemma 4B',
        'meta-llama/llama-3.1-8b-instruct': 'Llama 3.1 8B',
        'openai/gpt-4o': 'GPT-4o',
        'openai/gpt-3.5': 'GPT-3.5',
        'openai/gpt-5': 'GPT-5'
    }
    return name_mapping.get(name, name.split('/')[-1] if '/' in name else name)

In [25]:
def create_bar_chart(data, title, y_label, colors, figure_size=(1000, 600)):
    fig = go.Figure()

    # Sort data descending
    if isinstance(data, pd.Series):
        data = data.sort_values(ascending=False)
        y_values = data.values * 100 if data.max() <= 1.0 else data.values

        fig.add_trace(go.Bar(
            x=data.index,
            y=y_values,
            marker_color=colors if isinstance(colors, str) else colors[0],
            marker_line=dict(color='black', width=2),
            text=[f"{v:.1f}%" for v in y_values],
            textposition='outside',
            textfont=dict(size=14, color='black'),
            width=0.65
        ))
    else:
        data = data.sort_values(by=data.columns[0], ascending=False)
        max_val = data.max().max()

        for i, col in enumerate(data.columns):
            y_values = data[col].values * 100 if max_val <= 1.0 else data[col].values

            fig.add_trace(go.Bar(
                x=data.index,
                y=y_values,
                name=col,
                marker_color=colors[i] if isinstance(colors, list) else colors,
                marker_line=dict(color='black', width=2),
                text=[f"{v:.1f}%" for v in y_values],
                textposition='outside',
                textfont=dict(size=14, color='black'),
                width=0.4
            ))

    fig.update_layout(
        title=dict(text=title, font=dict(size=18, color='black'), x=0.5),
        xaxis=dict(
            title="",
            tickangle=0,
            tickfont=dict(size=14),
            showline=True,
            linecolor='black'
        ),
        yaxis=dict(
            title=dict(text=y_label + " (%)", font=dict(size=16)),
            range=[0, 100],
            showgrid=True,
            gridcolor='lightgrey',
            tickfont=dict(size=14),
            showline=True,
            linecolor='black'
        ),
        barmode='group',
        plot_bgcolor='white',
        paper_bgcolor='white',
        width=figure_size[0],
        height=figure_size[1],
        margin=dict(l=80, r=40, t=70, b=80),
        showlegend=(not isinstance(data, pd.Series)),
        legend=dict(font=dict(size=14))
    )

    return fig

In [7]:
COMMERCIAL_MODELS = {'GPT-5', 'GPT-4o', 'Gemini 2.0', 'Gemini 2.5', 'Claude Sonnet 4'}
OPEN_SOURCE_MODELS = {'Llama 3.2 3B', 'Llama 3.2 1B', 'Llama 3.1 8B', 'MedGemma 27B', 'MedGemma 4B'}

In [8]:
# model name shortening
df_filtered['model_short'] = df_filtered['case_model_name_cat'].apply(shorten_model_name)

In [9]:
df_filtered

Unnamed: 0,case_origin_cat,case_id_str,case_gender_cat,case_age_int,case_origtext_str,case_mesh_list,case_diagnosis_str,case_diagnosisother_list,case_icdchapter_int,exp_prepnotes_str,...,samples_per_model,json_extracted_diagnosis_pass1,json_extracted_diagnosis_pass2,case_json_llmjudge_flip_explanation,case_json_llmjudge_accuracy_pass1_explanation,case_json_llmjudge_accuracy_pass2_explanation,case_full_llmjudge_flip_explanation,case_full_llmjudge_accuracy_pass1_explanation,case_full_llmjudge_accuracy_pass2_explanation,model_short
0,Casestudy,PMC3558351_01,Male,42.0,A 42-year-old man was hospitalized with sudden...,"Fanconi Syndrome / chemically induced, Hypokal...",Fanconi Syndrome,Fanconi's syndrome,4,case presented as hypokalemic paralysis (neur/...,...,480,Guillain-Barré Syndrome (GBS),Guillain-Barré Syndrome (GBS),The primary diagnosis in Response 1 is Guillai...,The primary diagnosis in the AI response is Gu...,The primary diagnosis in the AI response is Gu...,The primary diagnosis in Response 1 is spinal ...,The primary diagnosis in the AI response is sp...,The primary diagnosis in the AI response is tu...,MedGemma 4B
1,Casestudy,PMC3558351_01,Male,42.0,A 42-year-old man was hospitalized with sudden...,"Fanconi Syndrome / chemically induced, Hypokal...",Fanconi Syndrome,Fanconi's syndrome,4,case presented as hypokalemic paralysis (neur/...,...,480,"Paraneoplastic Syndrome (specifically, a Lambe...","Paraneoplastic Syndrome (specifically, a Lambe...",The primary diagnosis in Response 1 is Paraneo...,The primary diagnosis in the AI response is Pa...,The primary diagnosis in the AI response is Pa...,The primary diagnosis in Response 1 is spinal ...,The primary diagnosis in the AI response is sp...,The primary diagnosis in the AI response is sp...,MedGemma 4B
2,Casestudy,PMC3558351_01,Male,42.0,A 42-year-old man was hospitalized with sudden...,"Fanconi Syndrome / chemically induced, Hypokal...",Fanconi Syndrome,Fanconi's syndrome,4,case presented as hypokalemic paralysis (neur/...,...,480,Guillain-Barré Syndrome (GBS),Guillain-Barré Syndrome (GBS),The primary diagnosis in Response 1 is Guillai...,The primary diagnosis in the AI response is Gu...,The primary diagnosis in the AI response is Gu...,The primary diagnosis in Response 1 is Guillai...,The primary diagnosis in the AI response is Gu...,The primary diagnosis in the AI response is Gu...,MedGemma 4B
3,Casestudy,PMC3558351_01,Male,42.0,A 42-year-old man was hospitalized with sudden...,"Fanconi Syndrome / chemically induced, Hypokal...",Fanconi Syndrome,Fanconi's syndrome,4,case presented as hypokalemic paralysis (neur/...,...,480,Adrenal Insufficiency (Addison's Disease) with...,Adrenal Insufficiency (Addison's Disease) with...,The primary diagnosis in Response 1 is Adrenal...,The primary diagnosis in the AI response is Ad...,The primary diagnosis in the AI response is Ad...,The primary diagnosis in Response 1 is spinal ...,The primary diagnosis in the AI response is sp...,The primary diagnosis in the AI response is sp...,MedGemma 4B
4,Casestudy,PMC3558351_01,Male,42.0,A 42-year-old man was hospitalized with sudden...,"Fanconi Syndrome / chemically induced, Hypokal...",Fanconi Syndrome,Fanconi's syndrome,4,case presented as hypokalemic paralysis (neur/...,...,480,Hypokalemic Periodic Paralysis,Hypokalemic Periodic Paralysis,The primary diagnosis in Response 1 is Hypokal...,The primary diagnosis in the AI response is Hy...,The primary diagnosis in the AI response is Hy...,The primary diagnosis in Response 1 is Hypokal...,The primary diagnosis in the AI response is Hy...,The primary diagnosis in the AI response is Re...,MedGemma 27B
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4795,MedMCQA_Train,f9fbafd4-304c-48a8-a0d1-2b099186c022,Male,23.0,A 23 year old male who is otherwise normal com...,,Ureteral calculus,,14,MLD text removed,...,480,,,,,,The primary diagnosis in both Response 1 and R...,The primary diagnosis in the AI response is ur...,The primary diagnosis in the AI response is ur...,Gemini 2.0
4796,MedMCQA_Train,09dd8663-2a1c-4d5b-92e0-7b5f53b5765e,Male,62.0,A 62-year-old man presents with several weeks ...,,Trigeminal neuralgia,,6,MLD text removed,...,480,,,,,,The primary diagnosis in Response 1 is Trigemi...,The primary diagnosis in the AI response is Tr...,The primary diagnosis in the AI response is Tr...,Gemini 2.0
4797,MedMCQA_Train,09dd8663-2a1c-4d5b-92e0-7b5f53b5765e,Male,62.0,A 62-year-old man presents with several weeks ...,,Trigeminal neuralgia,,6,MLD text removed,...,480,,,,,,The primary diagnosis in Response 1 is Trigemi...,The primary diagnosis in the AI response is Tr...,The primary diagnosis in the AI response is Tr...,Gemini 2.0
4798,MedMCQA_Train,09dd8663-2a1c-4d5b-92e0-7b5f53b5765e,Male,62.0,A 62-year-old man presents with several weeks ...,,Trigeminal neuralgia,,6,MLD text removed,...,480,,,,,,The primary diagnosis in Response 1 is Trigemi...,The primary diagnosis in the AI response is Tr...,The primary diagnosis in the AI response is Tr...,Gemini 2.0


In [10]:
# calculate metrics by model
flip_rate_by_model = df_filtered.groupby('model_short')['case_full_llmjudge_flip'].mean()
accuracy_pass1_by_model = df_filtered.groupby('model_short')['case_full_llmjudge_accuracy_pass1'].mean()
accuracy_pass2_by_model = df_filtered.groupby('model_short')['case_full_llmjudge_accuracy_pass2'].mean()


In [11]:
# separate by model type
os_flip = flip_rate_by_model[flip_rate_by_model.index.isin(OPEN_SOURCE_MODELS)]
comm_flip = flip_rate_by_model[flip_rate_by_model.index.isin(COMMERCIAL_MODELS)]

In [12]:
os_accuracy = pd.DataFrame({
    'Pass 1': accuracy_pass1_by_model[accuracy_pass1_by_model.index.isin(OPEN_SOURCE_MODELS)],
    'Pass 2': accuracy_pass2_by_model[accuracy_pass2_by_model.index.isin(OPEN_SOURCE_MODELS)]
})

comm_accuracy = pd.DataFrame({
    'Pass 1': accuracy_pass1_by_model[accuracy_pass1_by_model.index.isin(COMMERCIAL_MODELS)],
    'Pass 2': accuracy_pass2_by_model[accuracy_pass2_by_model.index.isin(COMMERCIAL_MODELS)]
})

In [26]:
# calculate metrics by case
casestudy_data = df_filtered[df_filtered['case_origin_cat'] == 'Casestudy']
medmcqa_data = df_filtered[df_filtered['case_origin_cat'] == 'MedMCQA_Train']

In [14]:
casestudy_flip = casestudy_data.groupby('model_short')['case_full_llmjudge_flip'].mean()
medmcqa_flip = medmcqa_data.groupby('model_short')['case_full_llmjudge_flip'].mean()

In [15]:
flip_by_origin = pd.DataFrame({
    'Casestudy': casestudy_flip,
    'MedMCQA': medmcqa_flip
}).fillna(0)

In [16]:
casestudy_acc1 = casestudy_data.groupby('model_short')['case_full_llmjudge_accuracy_pass1'].mean()
casestudy_acc2 = casestudy_data.groupby('model_short')['case_full_llmjudge_accuracy_pass2'].mean()
medmcqa_acc1 = medmcqa_data.groupby('model_short')['case_full_llmjudge_accuracy_pass1'].mean()
medmcqa_acc2 = medmcqa_data.groupby('model_short')['case_full_llmjudge_accuracy_pass2'].mean()

In [17]:
accuracy_by_origin = pd.DataFrame({
    'Casestudy Pass 1': casestudy_acc1,
    'Casestudy Pass 2': casestudy_acc2,
    'MedMCQA Pass 1': medmcqa_acc1,
    'MedMCQA Pass 2': medmcqa_acc2
}).fillna(0)

In [27]:
fig1 = create_bar_chart(os_flip, 'Open Source Models: Flip Rate', 'Flip Rate', '#accedc')
fig1.show()

In [30]:
fig2 = create_bar_chart(os_accuracy, 'Open Source Models: Accuracy Pass 1 vs Pass 2',
                        'Accuracy Rate', ['#accedc', '#3278af'])
fig2.show()

In [29]:
fig3 = create_bar_chart(comm_flip, 'Commercial Models: Flip Rate', 'Flip Rate', '#8e90b6')
fig3.show()

In [31]:
fig4 = create_bar_chart(comm_accuracy, 'Commercial Models: Accuracy Pass 1 vs Pass 2',
                        'Accuracy Rate', ['#8e90b6', '#906fb6'])
fig4.show()

In [32]:
fig5 = create_bar_chart(flip_by_origin, 'MedMCQA vs Casestudy: Flip Rate',
                        'Flip Rate', ['#afbdba', '#cbe093'], figure_size=(1400, 600))
fig5.show()

In [33]:
accuracy_pass1_by_origin = pd.DataFrame({
    'Casestudy': casestudy_acc1,
    'MedMCQA': medmcqa_acc1
}).fillna(0)
fig6 = create_bar_chart(accuracy_pass1_by_origin, 'MedMCQA vs Casestudy: Accuracy Pass 1',
                        'Accuracy Rate', ['#afbdba', '#cbe093'], figure_size=(1400, 600))
fig6.show()

In [34]:
accuracy_pass2_by_origin = pd.DataFrame({
    'Casestudy': casestudy_acc2,
    'MedMCQA': medmcqa_acc2
}).fillna(0)

fig7 = create_bar_chart(accuracy_pass2_by_origin, 'MedMCQA vs Casestudy: Accuracy Pass 2',
                        'Accuracy Rate', ['#afbdba', '#cbe093'], figure_size=(1400, 600))
fig7.show()

In [35]:
all_models = list(set(flip_rate_by_model.index))
analysis_data = []

for model in all_models:
    row = {
        'model': model,
        'model_type': 'Commercial' if model in COMMERCIAL_MODELS else 'Open Source',
        'flip_rate': round(flip_rate_by_model.get(model, np.nan) * 100, 2),
        'accuracy_pass1': round(accuracy_pass1_by_model.get(model, np.nan) * 100, 2),
        'accuracy_pass2': round(accuracy_pass2_by_model.get(model, np.nan) * 100, 2),
        'casestudy_flip_rate': round(casestudy_flip.get(model, np.nan) * 100, 2),
        'medmcqa_flip_rate': round(medmcqa_flip.get(model, np.nan) * 100, 2),
        'casestudy_accuracy_pass1': round(casestudy_acc1.get(model, np.nan) * 100, 2),
        'casestudy_accuracy_pass2': round(casestudy_acc2.get(model, np.nan) * 100, 2),
        'medmcqa_accuracy_pass1': round(medmcqa_acc1.get(model, np.nan) * 100, 2),
        'medmcqa_accuracy_pass2': round(medmcqa_acc2.get(model, np.nan) * 100, 2)
    }
    analysis_data.append(row)

In [36]:
analysis_df = pd.DataFrame(analysis_data)
analysis_df

Unnamed: 0,model,model_type,flip_rate,accuracy_pass1,accuracy_pass2,casestudy_flip_rate,medmcqa_flip_rate,casestudy_accuracy_pass1,casestudy_accuracy_pass2,medmcqa_accuracy_pass1,medmcqa_accuracy_pass2
0,Llama 3.2 3B,Open Source,82.92,32.5,17.71,89.38,79.69,5.0,5.0,46.25,24.06
1,Llama 3.1 8B,Open Source,74.38,43.75,24.38,89.38,66.88,8.75,5.0,61.25,34.06
2,GPT-5,Commercial,11.25,73.75,75.42,23.12,5.31,53.75,62.5,83.75,81.88
3,GPT-4o,Commercial,31.25,64.58,62.92,67.5,13.12,33.75,32.5,80.0,78.12
4,Claude Sonnet 4,Commercial,75.42,65.83,30.83,98.75,63.75,40.62,17.5,78.44,37.5
5,MedGemma 27B,Open Source,20.83,59.79,59.38,41.25,10.62,31.87,27.5,73.75,75.31
6,MedGemma 4B,Open Source,0.62,34.58,34.79,1.25,0.31,3.12,3.75,50.31,50.31
7,Gemini 2.0,Commercial,28.75,57.71,55.42,53.12,16.56,26.25,22.5,73.44,71.88
8,Llama 3.2 1B,Open Source,13.12,17.29,25.0,14.37,12.5,12.5,21.88,19.69,26.56
9,Gemini 2.5,Commercial,12.08,68.33,66.04,25.0,5.62,45.62,44.38,79.69,76.88


In [37]:
analysis_df = analysis_df.sort_values(['model_type', 'flip_rate'], ascending=[True, False])
analysis_df

Unnamed: 0,model,model_type,flip_rate,accuracy_pass1,accuracy_pass2,casestudy_flip_rate,medmcqa_flip_rate,casestudy_accuracy_pass1,casestudy_accuracy_pass2,medmcqa_accuracy_pass1,medmcqa_accuracy_pass2
4,Claude Sonnet 4,Commercial,75.42,65.83,30.83,98.75,63.75,40.62,17.5,78.44,37.5
3,GPT-4o,Commercial,31.25,64.58,62.92,67.5,13.12,33.75,32.5,80.0,78.12
7,Gemini 2.0,Commercial,28.75,57.71,55.42,53.12,16.56,26.25,22.5,73.44,71.88
9,Gemini 2.5,Commercial,12.08,68.33,66.04,25.0,5.62,45.62,44.38,79.69,76.88
2,GPT-5,Commercial,11.25,73.75,75.42,23.12,5.31,53.75,62.5,83.75,81.88
0,Llama 3.2 3B,Open Source,82.92,32.5,17.71,89.38,79.69,5.0,5.0,46.25,24.06
1,Llama 3.1 8B,Open Source,74.38,43.75,24.38,89.38,66.88,8.75,5.0,61.25,34.06
5,MedGemma 27B,Open Source,20.83,59.79,59.38,41.25,10.62,31.87,27.5,73.75,75.31
8,Llama 3.2 1B,Open Source,13.12,17.29,25.0,14.37,12.5,12.5,21.88,19.69,26.56
6,MedGemma 4B,Open Source,0.62,34.58,34.79,1.25,0.31,3.12,3.75,50.31,50.31


In [39]:
csv_filename = 'Final_evaluation_results_all_LLMs.csv'
analysis_df.to_csv(csv_filename, index=False)