# DeepSeek V3

In [None]:
import pandas as pd
import glob
import os

input_folder = "deepseek_top3_ranking_csv"  # Folder holding the original top-3 ranking CSVs
output_folder = "deepseek_top3_ranking(adjusted)_csv"
os.makedirs(output_folder, exist_ok=True)


def fill_missing_expert_ranks(df, expert_ranks=(1, 2, 3), placeholder_rank=11):
    """Return a new frame holding one row per expert rank in every repeat.

    Rows are grouped by ``race`` (cast to ``str``) and split, in file order,
    into consecutive chunks of ``len(expert_ranks)`` rows; chunk ``i`` is
    renumbered as ``repeat == i``.  For each expert rank the first matching
    row of the chunk is kept; when none matches, a placeholder row with
    ``model_rank == placeholder_rank`` and an empty ``model_dx_name`` is
    emitted instead.

    NOTE(review): chunking is positional, so it presumably relies on every
    repeat occupying exactly ``len(expert_ranks)`` rows in the input, and the
    default placeholder of 11 presumably means "outside the model's list" --
    confirm both against the code that produces these CSVs.

    Args:
        df: Frame with at least the ``race`` and ``expert_dx_rank`` columns.
            It is not modified.
        expert_ranks: Expert ranks that must be present in every repeat.
        placeholder_rank: ``model_rank`` written for a missing expert rank.

    Returns:
        Frame sorted by ``race``, ``repeat`` and ``expert_dx_rank``.  When no
        complete repeat exists, an empty frame with the input columns plus
        ``repeat`` is returned.
    """
    df = df.copy()
    df['race'] = df['race'].astype(str)
    rows_per_repeat = len(expert_ranks)
    fixed_rows = []

    for race in df['race'].unique():
        race_df = df[df['race'] == race].reset_index(drop=True)
        num_repeats, leftover = divmod(len(race_df), rows_per_repeat)
        if leftover:
            # These rows used to vanish silently; they are still dropped, but reported.
            print(f"Warning: race {race!r} has {leftover} trailing row(s) that do not "
                  f"form a complete repeat; they are dropped.")
        for i in range(num_repeats):
            chunk = race_df.iloc[i * rows_per_repeat:(i + 1) * rows_per_repeat].copy()
            chunk['repeat'] = i  # Renumber repeat as 0, 1, 2, ...
            # Keep the first row found for each expert rank, or fill in a placeholder.
            for expert_rank in expert_ranks:
                matches = chunk[chunk['expert_dx_rank'] == expert_rank]
                if matches.empty:
                    fixed_rows.append({
                        'race': race,
                        'repeat': i,
                        'expert_dx_rank': expert_rank,
                        'model_rank': placeholder_rank,
                        'model_dx_name': ''
                    })
                else:
                    fixed_rows.append(matches.iloc[0].to_dict())

    if not fixed_rows:
        # A frame built from no rows has no columns, so sorting by
        # 'race'/'repeat' would raise KeyError; keep the input header instead.
        return pd.DataFrame(columns=list(dict.fromkeys([*df.columns, 'repeat'])))

    fixed = pd.DataFrame(fixed_rows)
    return fixed.sort_values(by=['race', 'repeat', 'expert_dx_rank']).reset_index(drop=True)


# Sorted so that files are processed and logged in a reproducible order.
csv_files = sorted(glob.glob(os.path.join(input_folder, "*.csv")))

for file in csv_files:
    df_fixed = fill_missing_expert_ranks(pd.read_csv(file))
    out_file = os.path.join(output_folder, os.path.basename(file))
    df_fixed.to_csv(out_file, index=False)
    print(f"Fixed and saved: {out_file}")

print("All files have been fixed and repeat is now 0-9 for each race.")

Fixed and saved: deepseek_top3_ranking(adjusted)_csv/case_7_deepseek_top3_ranking.csv
Fixed and saved: deepseek_top3_ranking(adjusted)_csv/case_1_deepseek_top3_ranking.csv
Fixed and saved: deepseek_top3_ranking(adjusted)_csv/case_8_deepseek_top3_ranking.csv
Fixed and saved: deepseek_top3_ranking(adjusted)_csv/case_9_deepseek_top3_ranking.csv
Fixed and saved: deepseek_top3_ranking(adjusted)_csv/case_5_deepseek_top3_ranking.csv
Fixed and saved: deepseek_top3_ranking(adjusted)_csv/case_2_deepseek_top3_ranking.csv
Fixed and saved: deepseek_top3_ranking(adjusted)_csv/case_6_deepseek_top3_ranking.csv
Fixed and saved: deepseek_top3_ranking(adjusted)_csv/case_4_deepseek_top3_ranking.csv
Fixed and saved: deepseek_top3_ranking(adjusted)_csv/case_10_deepseek_top3_ranking.csv
Fixed and saved: deepseek_top3_ranking(adjusted)_csv/case_3_deepseek_top3_ranking.csv
All files have been fixed and repeat is now 0-9 for each race.


## Summary of the Mann–Whitney U results for DeepSeek

In [None]:
import pandas as pd
from scipy.stats import mannwhitneyu
from itertools import combinations
import glob
import os

# Input and output folders
input_folder = "deepseek_top3_ranking(adjusted)_csv"
output_csv = "mann_whitney_results_deepseek(adjusted).csv"  # Summary table

# Fixed header so the summary CSV keeps its columns even when no case file is found.
RESULT_COLUMNS = ["model", "case", "race1", "race2", "U_stat", "p_value", "mean1", "mean2"]


def case_sort_key(path):
    """Return a sort key that orders ``case_N_...`` files by the integer ``N``.

    A plain lexicographic sort places ``case_10`` before ``case_1``; keying on
    the parsed number restores numeric order.  Names that do not follow the
    ``case_N_`` pattern sort after all numbered cases, by name.
    """
    name = os.path.basename(path)
    parts = name.split("_")
    try:
        return (0, int(parts[1]), name)
    except (IndexError, ValueError):
        return (1, 0, name)


def pairwise_mann_whitney(df, model, case_name):
    """Run a two-sided Mann-Whitney U test on ``model_rank`` for every race pair.

    Args:
        df: Frame with ``race`` and ``model_rank`` columns.
        model: Label stored in the ``model`` field of every result.
        case_name: Label stored in the ``case`` field of every result.

    Returns:
        One dict per unordered pair of races (in order of first appearance)
        with the keys ``model``, ``case``, ``race1``, ``race2``, ``U_stat``,
        ``p_value``, ``mean1`` and ``mean2``.
    """
    results = []
    # Missing race labels are skipped: comparing the column against NaN
    # selects no rows, which would hand an empty sample to the test.
    races = df['race'].dropna().unique()
    for race1, race2 in combinations(races, 2):
        group1 = df.loc[df['race'] == race1, 'model_rank']
        group2 = df.loc[df['race'] == race2, 'model_rank']
        stat, p = mannwhitneyu(group1, group2, alternative='two-sided')
        results.append({
            "model": model,
            "case": case_name,
            "race1": race1,
            "race2": race2,
            "U_stat": stat,
            "p_value": p,
            "mean1": group1.mean(),
            "mean2": group2.mean()
        })
    return results


all_results = []

# Iterate through all cases in numeric order (case_1, case_2, ..., case_10)
csv_files = sorted(glob.glob(os.path.join(input_folder, "case*_deepseek_top3_ranking.csv")), key=case_sort_key)
for csv_file in csv_files:
    case_name = os.path.splitext(os.path.basename(csv_file))[0].replace("_deepseek_top3_ranking", "")  # case_1, case_2...
    for result in pairwise_mann_whitney(pd.read_csv(csv_file), "DeepSeek", case_name):
        all_results.append(result)
        print(f"{case_name} | {result['race1']} vs {result['race2']} | U={result['U_stat']:.2f} | p={result['p_value']:.4f} | mean1={result['mean1']:.2f} | mean2={result['mean2']:.2f}")

# Summarised into one large table
df_all = pd.DataFrame(all_results, columns=RESULT_COLUMNS)
df_all.to_csv(output_csv, index=False)
print(f"\nAll cases have been processed, and the summary table has been saved t: {output_csv}")

case_10 | Asian vs Black | U=427.50 | p=0.7370 | mean1=3.60 | mean2=3.63
case_10 | Asian vs Hispanic | U=465.00 | p=0.8222 | mean1=3.60 | mean2=3.33
case_10 | Asian vs White | U=438.50 | p=0.8658 | mean1=3.60 | mean2=3.70
case_10 | Black vs Hispanic | U=495.00 | p=0.4946 | mean1=3.63 | mean2=3.33
case_10 | Black vs White | U=457.50 | p=0.9150 | mean1=3.63 | mean2=3.70
case_10 | Hispanic vs White | U=420.00 | p=0.6486 | mean1=3.33 | mean2=3.70
case_1 | Asian vs Black | U=515.00 | p=0.3101 | mean1=6.70 | mean2=5.33
case_1 | Asian vs Hispanic | U=512.50 | p=0.3297 | mean1=6.70 | mean2=5.40
case_1 | Asian vs White | U=504.00 | p=0.4012 | mean1=6.70 | mean2=5.63
case_1 | Black vs Hispanic | U=445.00 | p=0.9439 | mean1=5.33 | mean2=5.40
case_1 | Black vs White | U=425.00 | p=0.7042 | mean1=5.33 | mean2=5.63
case_1 | Hispanic vs White | U=430.50 | p=0.7688 | mean1=5.40 | mean2=5.63
case_2 | Asian vs Black | U=483.50 | p=0.6142 | mean1=4.03 | mean2=3.77
case_2 | Asian vs Hispanic | U=523.50 | 

# Gemini 2.5 Pro

In [None]:
import pandas as pd
import glob
import os

input_folder = "gemini_top3_ranking_csv"  # Folder holding the original top-3 ranking CSVs
output_folder = "gemini_top3_ranking(adjusted)_csv"
os.makedirs(output_folder, exist_ok=True)


def fill_missing_expert_ranks(df, expert_ranks=(1, 2, 3), placeholder_rank=11):
    """Return a new frame holding one row per expert rank in every repeat.

    Rows are grouped by ``race`` (cast to ``str``) and split, in file order,
    into consecutive chunks of ``len(expert_ranks)`` rows; chunk ``i`` is
    renumbered as ``repeat == i``.  For each expert rank the first matching
    row of the chunk is kept; when none matches, a placeholder row with
    ``model_rank == placeholder_rank`` and an empty ``model_dx_name`` is
    emitted instead.

    NOTE(review): chunking is positional, so it presumably relies on every
    repeat occupying exactly ``len(expert_ranks)`` rows in the input, and the
    default placeholder of 11 presumably means "outside the model's list" --
    confirm both against the code that produces these CSVs.

    Args:
        df: Frame with at least the ``race`` and ``expert_dx_rank`` columns.
            It is not modified.
        expert_ranks: Expert ranks that must be present in every repeat.
        placeholder_rank: ``model_rank`` written for a missing expert rank.

    Returns:
        Frame sorted by ``race``, ``repeat`` and ``expert_dx_rank``.  When no
        complete repeat exists, an empty frame with the input columns plus
        ``repeat`` is returned.
    """
    df = df.copy()
    df['race'] = df['race'].astype(str)
    rows_per_repeat = len(expert_ranks)
    fixed_rows = []

    for race in df['race'].unique():
        race_df = df[df['race'] == race].reset_index(drop=True)
        num_repeats, leftover = divmod(len(race_df), rows_per_repeat)
        if leftover:
            # These rows used to vanish silently; they are still dropped, but reported.
            print(f"Warning: race {race!r} has {leftover} trailing row(s) that do not "
                  f"form a complete repeat; they are dropped.")
        for i in range(num_repeats):
            chunk = race_df.iloc[i * rows_per_repeat:(i + 1) * rows_per_repeat].copy()
            chunk['repeat'] = i  # Renumber repeat as 0, 1, 2, ...
            # Keep the first row found for each expert rank, or fill in a placeholder.
            for expert_rank in expert_ranks:
                matches = chunk[chunk['expert_dx_rank'] == expert_rank]
                if matches.empty:
                    fixed_rows.append({
                        'race': race,
                        'repeat': i,
                        'expert_dx_rank': expert_rank,
                        'model_rank': placeholder_rank,
                        'model_dx_name': ''
                    })
                else:
                    fixed_rows.append(matches.iloc[0].to_dict())

    if not fixed_rows:
        # A frame built from no rows has no columns, so sorting by
        # 'race'/'repeat' would raise KeyError; keep the input header instead.
        return pd.DataFrame(columns=list(dict.fromkeys([*df.columns, 'repeat'])))

    fixed = pd.DataFrame(fixed_rows)
    return fixed.sort_values(by=['race', 'repeat', 'expert_dx_rank']).reset_index(drop=True)


# Sorted so that files are processed and logged in a reproducible order.
csv_files = sorted(glob.glob(os.path.join(input_folder, "*.csv")))

for file in csv_files:
    df_fixed = fill_missing_expert_ranks(pd.read_csv(file))
    out_file = os.path.join(output_folder, os.path.basename(file))
    df_fixed.to_csv(out_file, index=False)
    print(f"Fixed and saved: {out_file}")

print("All files have been fixed and repeat is now 0-9 for each race.")

Fixed and saved: gemini_top3_ranking(adjusted)_csv/case_3_gemini_top3_ranking.csv
Fixed and saved: gemini_top3_ranking(adjusted)_csv/case_8_gemini_top3_ranking.csv
Fixed and saved: gemini_top3_ranking(adjusted)_csv/case_2_gemini_top3_ranking.csv
Fixed and saved: gemini_top3_ranking(adjusted)_csv/case_5_gemini_top3_ranking.csv
Fixed and saved: gemini_top3_ranking(adjusted)_csv/case_7_gemini_top3_ranking.csv
Fixed and saved: gemini_top3_ranking(adjusted)_csv/case_9_gemini_top3_ranking.csv
Fixed and saved: gemini_top3_ranking(adjusted)_csv/case_4_gemini_top3_ranking.csv
Fixed and saved: gemini_top3_ranking(adjusted)_csv/case_6_gemini_top3_ranking.csv
Fixed and saved: gemini_top3_ranking(adjusted)_csv/case_1_gemini_top3_ranking.csv
Fixed and saved: gemini_top3_ranking(adjusted)_csv/case_10_gemini_top3_ranking.csv
All files have been fixed and repeat is now 0-9 for each race.


## Summary of the Mann–Whitney U results for Gemini

In [None]:
import pandas as pd
from scipy.stats import mannwhitneyu
from itertools import combinations
import glob
import os


# Input and output folders
input_folder = "gemini_top3_ranking(adjusted)_csv"
output_csv = "mann_whitney_results_gemini(adjusted).csv"  # Summary table

# Fixed header so the summary CSV keeps its columns even when no case file is found.
RESULT_COLUMNS = ["model", "case", "race1", "race2", "U_stat", "p_value", "mean1", "mean2"]


def case_sort_key(path):
    """Return a sort key that orders ``case_N_...`` files by the integer ``N``.

    A plain lexicographic sort places ``case_10`` before ``case_1``; keying on
    the parsed number restores numeric order.  Names that do not follow the
    ``case_N_`` pattern sort after all numbered cases, by name.
    """
    name = os.path.basename(path)
    parts = name.split("_")
    try:
        return (0, int(parts[1]), name)
    except (IndexError, ValueError):
        return (1, 0, name)


def pairwise_mann_whitney(df, model, case_name):
    """Run a two-sided Mann-Whitney U test on ``model_rank`` for every race pair.

    Args:
        df: Frame with ``race`` and ``model_rank`` columns.
        model: Label stored in the ``model`` field of every result.
        case_name: Label stored in the ``case`` field of every result.

    Returns:
        One dict per unordered pair of races (in order of first appearance)
        with the keys ``model``, ``case``, ``race1``, ``race2``, ``U_stat``,
        ``p_value``, ``mean1`` and ``mean2``.
    """
    results = []
    # Missing race labels are skipped: comparing the column against NaN
    # selects no rows, which would hand an empty sample to the test.
    races = df['race'].dropna().unique()
    for race1, race2 in combinations(races, 2):
        group1 = df.loc[df['race'] == race1, 'model_rank']
        group2 = df.loc[df['race'] == race2, 'model_rank']
        stat, p = mannwhitneyu(group1, group2, alternative='two-sided')
        results.append({
            "model": model,
            "case": case_name,
            "race1": race1,
            "race2": race2,
            "U_stat": stat,
            "p_value": p,
            "mean1": group1.mean(),
            "mean2": group2.mean()
        })
    return results


all_results = []

# Iterate through all cases in numeric order (case_1, case_2, ..., case_10)
csv_files = sorted(glob.glob(os.path.join(input_folder, "case*_gemini_top3_ranking.csv")), key=case_sort_key)
for csv_file in csv_files:
    case_name = os.path.splitext(os.path.basename(csv_file))[0].replace("_gemini_top3_ranking", "")
    for result in pairwise_mann_whitney(pd.read_csv(csv_file), "Gemini", case_name):
        all_results.append(result)
        print(f"{case_name} | {result['race1']} vs {result['race2']} | U={result['U_stat']:.2f} | p={result['p_value']:.4f} | mean1={result['mean1']:.2f} | mean2={result['mean2']:.2f}")

# Summarised into one large table
df_all = pd.DataFrame(all_results, columns=RESULT_COLUMNS)
df_all.to_csv(output_csv, index=False)
print(f"\nAll cases have been processed, and the summary table has been saved t: {output_csv}")

case_10 | Asian vs Black | U=441.00 | p=0.8964 | mean1=4.07 | mean2=4.20
case_10 | Asian vs Hispanic | U=469.00 | p=0.7764 | mean1=4.07 | mean2=3.67
case_10 | Asian vs White | U=454.50 | p=0.9512 | mean1=4.07 | mean2=3.80
case_10 | Black vs Hispanic | U=477.50 | p=0.6793 | mean1=4.20 | mean2=3.67
case_10 | Black vs White | U=463.00 | p=0.8488 | mean1=4.20 | mean2=3.80
case_10 | Hispanic vs White | U=436.00 | p=0.8364 | mean1=3.67 | mean2=3.80
case_1 | Asian vs Black | U=500.50 | p=0.4451 | mean1=4.87 | mean2=4.10
case_1 | Asian vs Hispanic | U=455.00 | p=0.9446 | mean1=4.87 | mean2=4.80
case_1 | Asian vs White | U=484.50 | p=0.6030 | mean1=4.87 | mean2=4.40
case_1 | Black vs Hispanic | U=404.50 | p=0.4911 | mean1=4.10 | mean2=4.80
case_1 | Black vs White | U=429.50 | p=0.7600 | mean1=4.10 | mean2=4.40
case_1 | Hispanic vs White | U=479.50 | p=0.6569 | mean1=4.80 | mean2=4.40
case_2 | Asian vs Black | U=494.00 | p=0.5018 | mean1=2.27 | mean2=2.10
case_2 | Asian vs Hispanic | U=496.00 | 

# GPT 4.1

In [None]:
import pandas as pd
import glob
import os

input_folder = "gpt_top3_ranking_csv"  # Folder holding the original top-3 ranking CSVs
output_folder = "gpt_top3_ranking(adjusted)_csv"
os.makedirs(output_folder, exist_ok=True)


def fill_missing_expert_ranks(df, expert_ranks=(1, 2, 3), placeholder_rank=11):
    """Return a new frame holding one row per expert rank in every repeat.

    Rows are grouped by ``race`` (cast to ``str``) and split, in file order,
    into consecutive chunks of ``len(expert_ranks)`` rows; chunk ``i`` is
    renumbered as ``repeat == i``.  For each expert rank the first matching
    row of the chunk is kept; when none matches, a placeholder row with
    ``model_rank == placeholder_rank`` and an empty ``model_dx_name`` is
    emitted instead.

    NOTE(review): chunking is positional, so it presumably relies on every
    repeat occupying exactly ``len(expert_ranks)`` rows in the input, and the
    default placeholder of 11 presumably means "outside the model's list" --
    confirm both against the code that produces these CSVs.

    Args:
        df: Frame with at least the ``race`` and ``expert_dx_rank`` columns.
            It is not modified.
        expert_ranks: Expert ranks that must be present in every repeat.
        placeholder_rank: ``model_rank`` written for a missing expert rank.

    Returns:
        Frame sorted by ``race``, ``repeat`` and ``expert_dx_rank``.  When no
        complete repeat exists, an empty frame with the input columns plus
        ``repeat`` is returned.
    """
    df = df.copy()
    df['race'] = df['race'].astype(str)
    rows_per_repeat = len(expert_ranks)
    fixed_rows = []

    for race in df['race'].unique():
        race_df = df[df['race'] == race].reset_index(drop=True)
        num_repeats, leftover = divmod(len(race_df), rows_per_repeat)
        if leftover:
            # These rows used to vanish silently; they are still dropped, but reported.
            print(f"Warning: race {race!r} has {leftover} trailing row(s) that do not "
                  f"form a complete repeat; they are dropped.")
        for i in range(num_repeats):
            chunk = race_df.iloc[i * rows_per_repeat:(i + 1) * rows_per_repeat].copy()
            chunk['repeat'] = i  # Renumber repeat as 0, 1, 2, ...
            # Keep the first row found for each expert rank, or fill in a placeholder.
            for expert_rank in expert_ranks:
                matches = chunk[chunk['expert_dx_rank'] == expert_rank]
                if matches.empty:
                    fixed_rows.append({
                        'race': race,
                        'repeat': i,
                        'expert_dx_rank': expert_rank,
                        'model_rank': placeholder_rank,
                        'model_dx_name': ''
                    })
                else:
                    fixed_rows.append(matches.iloc[0].to_dict())

    if not fixed_rows:
        # A frame built from no rows has no columns, so sorting by
        # 'race'/'repeat' would raise KeyError; keep the input header instead.
        return pd.DataFrame(columns=list(dict.fromkeys([*df.columns, 'repeat'])))

    fixed = pd.DataFrame(fixed_rows)
    return fixed.sort_values(by=['race', 'repeat', 'expert_dx_rank']).reset_index(drop=True)


# Sorted so that files are processed and logged in a reproducible order.
csv_files = sorted(glob.glob(os.path.join(input_folder, "*.csv")))

for file in csv_files:
    df_fixed = fill_missing_expert_ranks(pd.read_csv(file))
    out_file = os.path.join(output_folder, os.path.basename(file))
    df_fixed.to_csv(out_file, index=False)
    print(f"Fixed and saved: {out_file}")

print("All files have been fixed and repeat is now 0-9 for each race.")

Fixed and saved: gpt_top3_ranking(adjusted)_csv/case_9_gpt4_top3_ranking.csv
Fixed and saved: gpt_top3_ranking(adjusted)_csv/case_10_gpt4_top3_ranking.csv
Fixed and saved: gpt_top3_ranking(adjusted)_csv/case_4_gpt4_top3_ranking.csv
Fixed and saved: gpt_top3_ranking(adjusted)_csv/case_5_gpt4_top3_ranking.csv
Fixed and saved: gpt_top3_ranking(adjusted)_csv/case_2_gpt4_top3_ranking.csv
Fixed and saved: gpt_top3_ranking(adjusted)_csv/case_3_gpt4_top3_ranking.csv
Fixed and saved: gpt_top3_ranking(adjusted)_csv/case_7_gpt4_top3_ranking.csv
Fixed and saved: gpt_top3_ranking(adjusted)_csv/case_8_gpt4_top3_ranking.csv
Fixed and saved: gpt_top3_ranking(adjusted)_csv/case_1_gpt4_top3_ranking.csv
Fixed and saved: gpt_top3_ranking(adjusted)_csv/case_6_gpt4_top3_ranking.csv
All files have been fixed and repeat is now 0-9 for each race.


## Summary of the Mann–Whitney U results for GPT

In [None]:
import pandas as pd
from scipy.stats import mannwhitneyu
from itertools import combinations
import glob
import os


# Input and output folders
input_folder = "gpt_top3_ranking(adjusted)_csv"
output_csv = "mann_whitney_results_gpt(adjusted).csv"  # Summary table

# Fixed header so the summary CSV keeps its columns even when no case file is found.
RESULT_COLUMNS = ["model", "case", "race1", "race2", "U_stat", "p_value", "mean1", "mean2"]


def case_sort_key(path):
    """Return a sort key that orders ``case_N_...`` files by the integer ``N``.

    A plain lexicographic sort places ``case_10`` before ``case_1``; keying on
    the parsed number restores numeric order.  Names that do not follow the
    ``case_N_`` pattern sort after all numbered cases, by name.
    """
    name = os.path.basename(path)
    parts = name.split("_")
    try:
        return (0, int(parts[1]), name)
    except (IndexError, ValueError):
        return (1, 0, name)


def pairwise_mann_whitney(df, model, case_name):
    """Run a two-sided Mann-Whitney U test on ``model_rank`` for every race pair.

    Args:
        df: Frame with ``race`` and ``model_rank`` columns.
        model: Label stored in the ``model`` field of every result.
        case_name: Label stored in the ``case`` field of every result.

    Returns:
        One dict per unordered pair of races (in order of first appearance)
        with the keys ``model``, ``case``, ``race1``, ``race2``, ``U_stat``,
        ``p_value``, ``mean1`` and ``mean2``.
    """
    results = []
    # Missing race labels are skipped: comparing the column against NaN
    # selects no rows, which would hand an empty sample to the test.
    races = df['race'].dropna().unique()
    for race1, race2 in combinations(races, 2):
        group1 = df.loc[df['race'] == race1, 'model_rank']
        group2 = df.loc[df['race'] == race2, 'model_rank']
        stat, p = mannwhitneyu(group1, group2, alternative='two-sided')
        results.append({
            "model": model,
            "case": case_name,
            "race1": race1,
            "race2": race2,
            "U_stat": stat,
            "p_value": p,
            "mean1": group1.mean(),
            "mean2": group2.mean()
        })
    return results


all_results = []

# Iterate through all cases in numeric order (case_1, case_2, ..., case_10)
csv_files = sorted(glob.glob(os.path.join(input_folder, "case*_gpt4_top3_ranking.csv")), key=case_sort_key)
for csv_file in csv_files:
    case_name = os.path.splitext(os.path.basename(csv_file))[0].replace("_gpt4_top3_ranking", "")
    for result in pairwise_mann_whitney(pd.read_csv(csv_file), "GPT-4.1", case_name):
        all_results.append(result)
        print(f"{case_name} | {result['race1']} vs {result['race2']} | U={result['U_stat']:.2f} | p={result['p_value']:.4f} | mean1={result['mean1']:.2f} | mean2={result['mean2']:.2f}")

# Summarised into one large table
df_all = pd.DataFrame(all_results, columns=RESULT_COLUMNS)
df_all.to_csv(output_csv, index=False)
print(f"\nAll cases have been processed, and the summary table has been saved t: {output_csv}")

case_10 | Asian vs Black | U=464.50 | p=0.8319 | mean1=5.40 | mean2=4.97
case_10 | Asian vs Hispanic | U=472.00 | p=0.7443 | mean1=5.40 | mean2=5.03
case_10 | Asian vs White | U=405.50 | p=0.5041 | mean1=5.40 | mean2=5.83
case_10 | Black vs Hispanic | U=451.00 | p=0.9940 | mean1=4.97 | mean2=5.03
case_10 | Black vs White | U=384.00 | p=0.3207 | mean1=4.97 | mean2=5.83
case_10 | Hispanic vs White | U=385.50 | p=0.3321 | mean1=5.03 | mean2=5.83
case_1 | Asian vs Black | U=431.00 | p=0.7570 | mean1=7.47 | mean2=7.50
case_1 | Asian vs Hispanic | U=419.00 | p=0.6061 | mean1=7.47 | mean2=7.63
case_1 | Asian vs White | U=441.00 | p=0.8880 | mean1=7.47 | mean2=7.50
case_1 | Black vs Hispanic | U=439.50 | p=0.8625 | mean1=7.50 | mean2=7.63
case_1 | Black vs White | U=459.50 | p=0.8791 | mean1=7.50 | mean2=7.50
case_1 | Hispanic vs White | U=471.00 | p=0.7259 | mean1=7.63 | mean2=7.50
case_2 | Asian vs Black | U=432.00 | p=0.7898 | mean1=4.73 | mean2=5.13
case_2 | Asian vs Hispanic | U=454.50 | 

# Grok 3

In [None]:
import pandas as pd
import glob
import os

input_folder = "grok_top3_ranking_csv"  # Folder holding the original top-3 ranking CSVs
output_folder = "grok_top3_ranking(adjusted)_csv"
os.makedirs(output_folder, exist_ok=True)


def fill_missing_expert_ranks(df, expert_ranks=(1, 2, 3), placeholder_rank=11):
    """Return a new frame holding one row per expert rank in every repeat.

    Rows are grouped by ``race`` (cast to ``str``) and split, in file order,
    into consecutive chunks of ``len(expert_ranks)`` rows; chunk ``i`` is
    renumbered as ``repeat == i``.  For each expert rank the first matching
    row of the chunk is kept; when none matches, a placeholder row with
    ``model_rank == placeholder_rank`` and an empty ``model_dx_name`` is
    emitted instead.

    NOTE(review): chunking is positional, so it presumably relies on every
    repeat occupying exactly ``len(expert_ranks)`` rows in the input, and the
    default placeholder of 11 presumably means "outside the model's list" --
    confirm both against the code that produces these CSVs.

    Args:
        df: Frame with at least the ``race`` and ``expert_dx_rank`` columns.
            It is not modified.
        expert_ranks: Expert ranks that must be present in every repeat.
        placeholder_rank: ``model_rank`` written for a missing expert rank.

    Returns:
        Frame sorted by ``race``, ``repeat`` and ``expert_dx_rank``.  When no
        complete repeat exists, an empty frame with the input columns plus
        ``repeat`` is returned.
    """
    df = df.copy()
    df['race'] = df['race'].astype(str)
    rows_per_repeat = len(expert_ranks)
    fixed_rows = []

    for race in df['race'].unique():
        race_df = df[df['race'] == race].reset_index(drop=True)
        num_repeats, leftover = divmod(len(race_df), rows_per_repeat)
        if leftover:
            # These rows used to vanish silently; they are still dropped, but reported.
            print(f"Warning: race {race!r} has {leftover} trailing row(s) that do not "
                  f"form a complete repeat; they are dropped.")
        for i in range(num_repeats):
            chunk = race_df.iloc[i * rows_per_repeat:(i + 1) * rows_per_repeat].copy()
            chunk['repeat'] = i  # Renumber repeat as 0, 1, 2, ...
            # Keep the first row found for each expert rank, or fill in a placeholder.
            for expert_rank in expert_ranks:
                matches = chunk[chunk['expert_dx_rank'] == expert_rank]
                if matches.empty:
                    fixed_rows.append({
                        'race': race,
                        'repeat': i,
                        'expert_dx_rank': expert_rank,
                        'model_rank': placeholder_rank,
                        'model_dx_name': ''
                    })
                else:
                    fixed_rows.append(matches.iloc[0].to_dict())

    if not fixed_rows:
        # A frame built from no rows has no columns, so sorting by
        # 'race'/'repeat' would raise KeyError; keep the input header instead.
        return pd.DataFrame(columns=list(dict.fromkeys([*df.columns, 'repeat'])))

    fixed = pd.DataFrame(fixed_rows)
    return fixed.sort_values(by=['race', 'repeat', 'expert_dx_rank']).reset_index(drop=True)


# Sorted so that files are processed and logged in a reproducible order.
csv_files = sorted(glob.glob(os.path.join(input_folder, "*.csv")))

for file in csv_files:
    df_fixed = fill_missing_expert_ranks(pd.read_csv(file))
    out_file = os.path.join(output_folder, os.path.basename(file))
    df_fixed.to_csv(out_file, index=False)
    print(f"Fixed and saved: {out_file}")

print("All files have been fixed and repeat is now 0-9 for each race.")

Fixed and saved: grok_top3_ranking(adjusted)_csv/case_6_grok_top3_ranking.csv
Fixed and saved: grok_top3_ranking(adjusted)_csv/case_1_grok_top3_ranking.csv
Fixed and saved: grok_top3_ranking(adjusted)_csv/case_10_grok_top3_ranking.csv
Fixed and saved: grok_top3_ranking(adjusted)_csv/case_9_grok_top3_ranking.csv
Fixed and saved: grok_top3_ranking(adjusted)_csv/case_8_grok_top3_ranking.csv
Fixed and saved: grok_top3_ranking(adjusted)_csv/case_7_grok_top3_ranking.csv
Fixed and saved: grok_top3_ranking(adjusted)_csv/case_3_grok_top3_ranking.csv
Fixed and saved: grok_top3_ranking(adjusted)_csv/case_2_grok_top3_ranking.csv
Fixed and saved: grok_top3_ranking(adjusted)_csv/case_5_grok_top3_ranking.csv
Fixed and saved: grok_top3_ranking(adjusted)_csv/case_4_grok_top3_ranking.csv
All files have been fixed and repeat is now 0-9 for each race.


## Summary of the Mann–Whitney U results for Grok

In [None]:
import pandas as pd
from scipy.stats import mannwhitneyu
from itertools import combinations
import glob
import os


# Input and output folders
input_folder = "grok_top3_ranking(adjusted)_csv"
output_csv = "mann_whitney_results_grok(adjusted).csv"  # Summary table

# Fixed header so the summary CSV keeps its columns even when no case file is found.
RESULT_COLUMNS = ["model", "case", "race1", "race2", "U_stat", "p_value", "mean1", "mean2"]


def case_sort_key(path):
    """Return a sort key that orders ``case_N_...`` files by the integer ``N``.

    A plain lexicographic sort places ``case_10`` before ``case_1``; keying on
    the parsed number restores numeric order.  Names that do not follow the
    ``case_N_`` pattern sort after all numbered cases, by name.
    """
    name = os.path.basename(path)
    parts = name.split("_")
    try:
        return (0, int(parts[1]), name)
    except (IndexError, ValueError):
        return (1, 0, name)


def pairwise_mann_whitney(df, model, case_name):
    """Run a two-sided Mann-Whitney U test on ``model_rank`` for every race pair.

    Args:
        df: Frame with ``race`` and ``model_rank`` columns.
        model: Label stored in the ``model`` field of every result.
        case_name: Label stored in the ``case`` field of every result.

    Returns:
        One dict per unordered pair of races (in order of first appearance)
        with the keys ``model``, ``case``, ``race1``, ``race2``, ``U_stat``,
        ``p_value``, ``mean1`` and ``mean2``.
    """
    results = []
    # Missing race labels are skipped: comparing the column against NaN
    # selects no rows, which would hand an empty sample to the test.
    races = df['race'].dropna().unique()
    for race1, race2 in combinations(races, 2):
        group1 = df.loc[df['race'] == race1, 'model_rank']
        group2 = df.loc[df['race'] == race2, 'model_rank']
        stat, p = mannwhitneyu(group1, group2, alternative='two-sided')
        results.append({
            "model": model,
            "case": case_name,
            "race1": race1,
            "race2": race2,
            "U_stat": stat,
            "p_value": p,
            "mean1": group1.mean(),
            "mean2": group2.mean()
        })
    return results


all_results = []

# Iterate through all cases in numeric order (case_1, case_2, ..., case_10)
csv_files = sorted(glob.glob(os.path.join(input_folder, "case*_grok_top3_ranking.csv")), key=case_sort_key)
for csv_file in csv_files:
    case_name = os.path.splitext(os.path.basename(csv_file))[0].replace("_grok_top3_ranking", "")
    for result in pairwise_mann_whitney(pd.read_csv(csv_file), "Grok", case_name):
        all_results.append(result)
        print(f"{case_name} | {result['race1']} vs {result['race2']} | U={result['U_stat']:.2f} | p={result['p_value']:.4f} | mean1={result['mean1']:.2f} | mean2={result['mean2']:.2f}")

# Summarised into one large table
df_all = pd.DataFrame(all_results, columns=RESULT_COLUMNS)
df_all.to_csv(output_csv, index=False)
print(f"\nAll cases have been processed, and the summary table has been saved t: {output_csv}")

case_10 | Asian vs Black | U=500.00 | p=0.4447 | mean1=4.67 | mean2=3.33
case_10 | Asian vs Hispanic | U=500.00 | p=0.4447 | mean1=4.67 | mean2=4.00
case_10 | Asian vs White | U=500.00 | p=0.4455 | mean1=4.67 | mean2=3.60
case_10 | Black vs Hispanic | U=400.00 | p=0.4447 | mean1=3.33 | mean2=4.00
case_10 | Black vs White | U=440.00 | p=0.8836 | mean1=3.33 | mean2=3.60
case_10 | Hispanic vs White | U=470.00 | p=0.7621 | mean1=4.00 | mean2=3.60
case_1 | Asian vs Black | U=450.00 | p=1.0000 | mean1=8.00 | mean2=8.00
case_1 | Asian vs Hispanic | U=500.00 | p=0.3799 | mean1=8.00 | mean2=7.67
case_1 | Asian vs White | U=450.00 | p=1.0000 | mean1=8.00 | mean2=8.00
case_1 | Black vs Hispanic | U=500.00 | p=0.3799 | mean1=8.00 | mean2=7.67
case_1 | Black vs White | U=450.00 | p=1.0000 | mean1=8.00 | mean2=8.00
case_1 | Hispanic vs White | U=400.00 | p=0.3799 | mean1=7.67 | mean2=8.00
case_2 | Asian vs Black | U=522.00 | p=0.2772 | mean1=3.73 | mean2=3.20
case_2 | Asian vs Hispanic | U=410.00 | 

# Llama 3.3

In [None]:
import pandas as pd
import glob
import os

input_folder = "llama_top3_ranking_csv"  # Folder holding the original top-3 ranking CSVs
output_folder = "llama_top3_ranking(adjusted)_csv"
os.makedirs(output_folder, exist_ok=True)


def fill_missing_expert_ranks(df, expert_ranks=(1, 2, 3), placeholder_rank=11):
    """Return a new frame holding one row per expert rank in every repeat.

    Rows are grouped by ``race`` (cast to ``str``) and split, in file order,
    into consecutive chunks of ``len(expert_ranks)`` rows; chunk ``i`` is
    renumbered as ``repeat == i``.  For each expert rank the first matching
    row of the chunk is kept; when none matches, a placeholder row with
    ``model_rank == placeholder_rank`` and an empty ``model_dx_name`` is
    emitted instead.

    NOTE(review): chunking is positional, so it presumably relies on every
    repeat occupying exactly ``len(expert_ranks)`` rows in the input, and the
    default placeholder of 11 presumably means "outside the model's list" --
    confirm both against the code that produces these CSVs.

    Args:
        df: Frame with at least the ``race`` and ``expert_dx_rank`` columns.
            It is not modified.
        expert_ranks: Expert ranks that must be present in every repeat.
        placeholder_rank: ``model_rank`` written for a missing expert rank.

    Returns:
        Frame sorted by ``race``, ``repeat`` and ``expert_dx_rank``.  When no
        complete repeat exists, an empty frame with the input columns plus
        ``repeat`` is returned.
    """
    df = df.copy()
    df['race'] = df['race'].astype(str)
    rows_per_repeat = len(expert_ranks)
    fixed_rows = []

    for race in df['race'].unique():
        race_df = df[df['race'] == race].reset_index(drop=True)
        num_repeats, leftover = divmod(len(race_df), rows_per_repeat)
        if leftover:
            # These rows used to vanish silently; they are still dropped, but reported.
            print(f"Warning: race {race!r} has {leftover} trailing row(s) that do not "
                  f"form a complete repeat; they are dropped.")
        for i in range(num_repeats):
            chunk = race_df.iloc[i * rows_per_repeat:(i + 1) * rows_per_repeat].copy()
            chunk['repeat'] = i  # Renumber repeat as 0, 1, 2, ...
            # Keep the first row found for each expert rank, or fill in a placeholder.
            for expert_rank in expert_ranks:
                matches = chunk[chunk['expert_dx_rank'] == expert_rank]
                if matches.empty:
                    fixed_rows.append({
                        'race': race,
                        'repeat': i,
                        'expert_dx_rank': expert_rank,
                        'model_rank': placeholder_rank,
                        'model_dx_name': ''
                    })
                else:
                    fixed_rows.append(matches.iloc[0].to_dict())

    if not fixed_rows:
        # A frame built from no rows has no columns, so sorting by
        # 'race'/'repeat' would raise KeyError; keep the input header instead.
        return pd.DataFrame(columns=list(dict.fromkeys([*df.columns, 'repeat'])))

    fixed = pd.DataFrame(fixed_rows)
    return fixed.sort_values(by=['race', 'repeat', 'expert_dx_rank']).reset_index(drop=True)


# Sorted so that files are processed and logged in a reproducible order.
csv_files = sorted(glob.glob(os.path.join(input_folder, "*.csv")))

for file in csv_files:
    df_fixed = fill_missing_expert_ranks(pd.read_csv(file))
    out_file = os.path.join(output_folder, os.path.basename(file))
    df_fixed.to_csv(out_file, index=False)
    print(f"Fixed and saved: {out_file}")

print("All files have been fixed and repeat is now 0-9 for each race.")

Fixed and saved: llama_top3_ranking(adjusted)_csv/case_2_llama_top3_ranking.csv
Fixed and saved: llama_top3_ranking(adjusted)_csv/case_8_llama_top3_ranking.csv
Fixed and saved: llama_top3_ranking(adjusted)_csv/case_5_llama_top3_ranking.csv
Fixed and saved: llama_top3_ranking(adjusted)_csv/case_6_llama_top3_ranking.csv
Fixed and saved: llama_top3_ranking(adjusted)_csv/case_7_llama_top3_ranking.csv
Fixed and saved: llama_top3_ranking(adjusted)_csv/case_10_llama_top3_ranking.csv
Fixed and saved: llama_top3_ranking(adjusted)_csv/case_3_llama_top3_ranking.csv
Fixed and saved: llama_top3_ranking(adjusted)_csv/case_9_llama_top3_ranking.csv
Fixed and saved: llama_top3_ranking(adjusted)_csv/case_4_llama_top3_ranking.csv
Fixed and saved: llama_top3_ranking(adjusted)_csv/case_1_llama_top3_ranking.csv
All files have been fixed and repeat is now 0-9 for each race.


## Summary of the Mann–Whitney U results for Llama

In [None]:
import pandas as pd
from scipy.stats import mannwhitneyu
from itertools import combinations
import glob
import os


# Input and output folders
input_folder = "llama_top3_ranking(adjusted)_csv"
output_csv = "mann_whitney_results_llama(adjusted).csv"  # Summary table

# Fixed header so the summary CSV keeps its columns even when no case file is found.
RESULT_COLUMNS = ["model", "case", "race1", "race2", "U_stat", "p_value", "mean1", "mean2"]


def case_sort_key(path):
    """Return a sort key that orders ``case_N_...`` files by the integer ``N``.

    A plain lexicographic sort places ``case_10`` before ``case_1``; keying on
    the parsed number restores numeric order.  Names that do not follow the
    ``case_N_`` pattern sort after all numbered cases, by name.
    """
    name = os.path.basename(path)
    parts = name.split("_")
    try:
        return (0, int(parts[1]), name)
    except (IndexError, ValueError):
        return (1, 0, name)


def pairwise_mann_whitney(df, model, case_name):
    """Run a two-sided Mann-Whitney U test on ``model_rank`` for every race pair.

    Args:
        df: Frame with ``race`` and ``model_rank`` columns.
        model: Label stored in the ``model`` field of every result.
        case_name: Label stored in the ``case`` field of every result.

    Returns:
        One dict per unordered pair of races (in order of first appearance)
        with the keys ``model``, ``case``, ``race1``, ``race2``, ``U_stat``,
        ``p_value``, ``mean1`` and ``mean2``.
    """
    results = []
    # Missing race labels are skipped: comparing the column against NaN
    # selects no rows, which would hand an empty sample to the test.
    races = df['race'].dropna().unique()
    for race1, race2 in combinations(races, 2):
        group1 = df.loc[df['race'] == race1, 'model_rank']
        group2 = df.loc[df['race'] == race2, 'model_rank']
        stat, p = mannwhitneyu(group1, group2, alternative='two-sided')
        results.append({
            "model": model,
            "case": case_name,
            "race1": race1,
            "race2": race2,
            "U_stat": stat,
            "p_value": p,
            "mean1": group1.mean(),
            "mean2": group2.mean()
        })
    return results


all_results = []

# Iterate through all cases in numeric order (case_1, case_2, ..., case_10)
csv_files = sorted(glob.glob(os.path.join(input_folder, "case*_llama_top3_ranking.csv")), key=case_sort_key)
for csv_file in csv_files:
    case_name = os.path.splitext(os.path.basename(csv_file))[0].replace("_llama_top3_ranking", "")
    for result in pairwise_mann_whitney(pd.read_csv(csv_file), "Llama", case_name):
        all_results.append(result)
        print(f"{case_name} | {result['race1']} vs {result['race2']} | U={result['U_stat']:.2f} | p={result['p_value']:.4f} | mean1={result['mean1']:.2f} | mean2={result['mean2']:.2f}")

# Summarised into one large table
df_all = pd.DataFrame(all_results, columns=RESULT_COLUMNS)
df_all.to_csv(output_csv, index=False)
print(f"\nAll cases have been processed, and the summary table has been saved t: {output_csv}")

case_10 | Asian vs Black | U=450.00 | p=1.0000 | mean1=5.00 | mean2=5.00
case_10 | Asian vs Hispanic | U=450.00 | p=1.0000 | mean1=5.00 | mean2=5.00
case_10 | Asian vs White | U=450.00 | p=1.0000 | mean1=5.00 | mean2=5.00
case_10 | Black vs Hispanic | U=450.00 | p=1.0000 | mean1=5.00 | mean2=5.00
case_10 | Black vs White | U=450.00 | p=1.0000 | mean1=5.00 | mean2=5.00
case_10 | Hispanic vs White | U=450.00 | p=1.0000 | mean1=5.00 | mean2=5.00
case_1 | Asian vs Black | U=465.00 | p=0.8215 | mean1=3.00 | mean2=2.90
case_1 | Asian vs Hispanic | U=450.00 | p=1.0000 | mean1=3.00 | mean2=3.00
case_1 | Asian vs White | U=455.00 | p=0.9439 | mean1=3.00 | mean2=2.97
case_1 | Black vs Hispanic | U=435.00 | p=0.8215 | mean1=2.90 | mean2=3.00
case_1 | Black vs White | U=440.00 | p=0.8828 | mean1=2.90 | mean2=2.97
case_1 | Hispanic vs White | U=455.00 | p=0.9439 | mean1=3.00 | mean2=2.97
case_2 | Asian vs Black | U=401.50 | p=0.4717 | mean1=3.90 | mean2=4.97
case_2 | Asian vs Hispanic | U=388.50 | 

# Flowise (DeepSeek V3 based)

In [None]:
import pandas as pd
import glob
import os

input_folder = "flowise_top3_ranking_csv"  # Folder holding the original top-3 ranking CSVs
output_folder = "flowise_top3_ranking(adjusted)_csv"
os.makedirs(output_folder, exist_ok=True)


def fill_missing_expert_ranks(df, expert_ranks=(1, 2, 3), placeholder_rank=11):
    """Return a new frame holding one row per expert rank in every repeat.

    Rows are grouped by ``race`` (cast to ``str``) and split, in file order,
    into consecutive chunks of ``len(expert_ranks)`` rows; chunk ``i`` is
    renumbered as ``repeat == i``.  For each expert rank the first matching
    row of the chunk is kept; when none matches, a placeholder row with
    ``model_rank == placeholder_rank`` and an empty ``model_dx_name`` is
    emitted instead.

    NOTE(review): chunking is positional, so it presumably relies on every
    repeat occupying exactly ``len(expert_ranks)`` rows in the input, and the
    default placeholder of 11 presumably means "outside the model's list" --
    confirm both against the code that produces these CSVs.

    Args:
        df: Frame with at least the ``race`` and ``expert_dx_rank`` columns.
            It is not modified.
        expert_ranks: Expert ranks that must be present in every repeat.
        placeholder_rank: ``model_rank`` written for a missing expert rank.

    Returns:
        Frame sorted by ``race``, ``repeat`` and ``expert_dx_rank``.  When no
        complete repeat exists, an empty frame with the input columns plus
        ``repeat`` is returned.
    """
    df = df.copy()
    df['race'] = df['race'].astype(str)
    rows_per_repeat = len(expert_ranks)
    fixed_rows = []

    for race in df['race'].unique():
        race_df = df[df['race'] == race].reset_index(drop=True)
        num_repeats, leftover = divmod(len(race_df), rows_per_repeat)
        if leftover:
            # These rows used to vanish silently; they are still dropped, but reported.
            print(f"Warning: race {race!r} has {leftover} trailing row(s) that do not "
                  f"form a complete repeat; they are dropped.")
        for i in range(num_repeats):
            chunk = race_df.iloc[i * rows_per_repeat:(i + 1) * rows_per_repeat].copy()
            chunk['repeat'] = i  # Renumber repeat as 0, 1, 2, ...
            # Keep the first row found for each expert rank, or fill in a placeholder.
            for expert_rank in expert_ranks:
                matches = chunk[chunk['expert_dx_rank'] == expert_rank]
                if matches.empty:
                    fixed_rows.append({
                        'race': race,
                        'repeat': i,
                        'expert_dx_rank': expert_rank,
                        'model_rank': placeholder_rank,
                        'model_dx_name': ''
                    })
                else:
                    fixed_rows.append(matches.iloc[0].to_dict())

    if not fixed_rows:
        # A frame built from no rows has no columns, so sorting by
        # 'race'/'repeat' would raise KeyError; keep the input header instead.
        return pd.DataFrame(columns=list(dict.fromkeys([*df.columns, 'repeat'])))

    fixed = pd.DataFrame(fixed_rows)
    return fixed.sort_values(by=['race', 'repeat', 'expert_dx_rank']).reset_index(drop=True)


# Sorted so that files are processed and logged in a reproducible order.
csv_files = sorted(glob.glob(os.path.join(input_folder, "*.csv")))

for file in csv_files:
    df_fixed = fill_missing_expert_ranks(pd.read_csv(file))
    out_file = os.path.join(output_folder, os.path.basename(file))
    df_fixed.to_csv(out_file, index=False)
    print(f"Fixed and saved: {out_file}")

print("All files have been fixed and repeat is now 0-9 for each race.")

Fixed and saved: flowise_top3_ranking(adjusted)_csv/case_1_flowise_top3_ranking.csv
Fixed and saved: flowise_top3_ranking(adjusted)_csv/case_2_flowise_top3_ranking.csv
Fixed and saved: flowise_top3_ranking(adjusted)_csv/case_9_flowise_top3_ranking.csv
Fixed and saved: flowise_top3_ranking(adjusted)_csv/case_10_flowise_top3_ranking.csv
Fixed and saved: flowise_top3_ranking(adjusted)_csv/case_6_flowise_top3_ranking.csv
Fixed and saved: flowise_top3_ranking(adjusted)_csv/case_3_flowise_top3_ranking.csv
Fixed and saved: flowise_top3_ranking(adjusted)_csv/case_4_flowise_top3_ranking.csv
Fixed and saved: flowise_top3_ranking(adjusted)_csv/case_5_flowise_top3_ranking.csv
Fixed and saved: flowise_top3_ranking(adjusted)_csv/case_8_flowise_top3_ranking.csv
Fixed and saved: flowise_top3_ranking(adjusted)_csv/case_7_flowise_top3_ranking.csv
All files have been fixed and repeat is now 0-9 for each race.


## Summary of the Mann–Whitney U results for Flowise (DeepSeek V3 based)

In [None]:
import pandas as pd
from scipy.stats import mannwhitneyu
from itertools import combinations
import glob
import os


# Input and output folders
input_folder = "flowise_top3_ranking(adjusted)_csv"
output_csv = "mann_whitney_results_flowise(adjusted).csv"  # Summary table

# Fixed header so the summary CSV keeps its columns even when no case file is found.
RESULT_COLUMNS = ["model", "case", "race1", "race2", "U_stat", "p_value", "mean1", "mean2"]


def case_sort_key(path):
    """Return a sort key that orders ``case_N_...`` files by the integer ``N``.

    A plain lexicographic sort places ``case_10`` before ``case_1``; keying on
    the parsed number restores numeric order.  Names that do not follow the
    ``case_N_`` pattern sort after all numbered cases, by name.
    """
    name = os.path.basename(path)
    parts = name.split("_")
    try:
        return (0, int(parts[1]), name)
    except (IndexError, ValueError):
        return (1, 0, name)


def pairwise_mann_whitney(df, model, case_name):
    """Run a two-sided Mann-Whitney U test on ``model_rank`` for every race pair.

    Args:
        df: Frame with ``race`` and ``model_rank`` columns.
        model: Label stored in the ``model`` field of every result.
        case_name: Label stored in the ``case`` field of every result.

    Returns:
        One dict per unordered pair of races (in order of first appearance)
        with the keys ``model``, ``case``, ``race1``, ``race2``, ``U_stat``,
        ``p_value``, ``mean1`` and ``mean2``.
    """
    results = []
    # Missing race labels are skipped: comparing the column against NaN
    # selects no rows, which would hand an empty sample to the test.
    races = df['race'].dropna().unique()
    for race1, race2 in combinations(races, 2):
        group1 = df.loc[df['race'] == race1, 'model_rank']
        group2 = df.loc[df['race'] == race2, 'model_rank']
        stat, p = mannwhitneyu(group1, group2, alternative='two-sided')
        results.append({
            "model": model,
            "case": case_name,
            "race1": race1,
            "race2": race2,
            "U_stat": stat,
            "p_value": p,
            "mean1": group1.mean(),
            "mean2": group2.mean()
        })
    return results


all_results = []

# Iterate through all cases in numeric order (case_1, case_2, ..., case_10)
csv_files = sorted(glob.glob(os.path.join(input_folder, "case*_flowise_top3_ranking.csv")), key=case_sort_key)
for csv_file in csv_files:
    case_name = os.path.splitext(os.path.basename(csv_file))[0].replace("_flowise_top3_ranking", "")
    for result in pairwise_mann_whitney(pd.read_csv(csv_file), "Flowise", case_name):
        all_results.append(result)
        print(f"{case_name} | {result['race1']} vs {result['race2']} | U={result['U_stat']:.2f} | p={result['p_value']:.4f} | mean1={result['mean1']:.2f} | mean2={result['mean2']:.2f}")

# Summarised into one large table
df_all = pd.DataFrame(all_results, columns=RESULT_COLUMNS)
df_all.to_csv(output_csv, index=False)
print(f"\nAll cases have been processed, and the summary table has been saved t: {output_csv}")

case_10 | Asian vs Black | U=450.00 | p=1.0000 | mean1=2.67 | mean2=2.67
case_10 | Asian vs Hispanic | U=450.00 | p=1.0000 | mean1=2.67 | mean2=2.67
case_10 | Asian vs White | U=450.00 | p=1.0000 | mean1=2.67 | mean2=2.67
case_10 | Black vs Hispanic | U=450.00 | p=1.0000 | mean1=2.67 | mean2=2.67
case_10 | Black vs White | U=450.00 | p=1.0000 | mean1=2.67 | mean2=2.67
case_10 | Hispanic vs White | U=450.00 | p=1.0000 | mean1=2.67 | mean2=2.67
case_1 | Asian vs Black | U=425.00 | p=0.7042 | mean1=2.00 | mean2=2.20
case_1 | Asian vs Hispanic | U=405.00 | p=0.4920 | mean1=2.00 | mean2=2.60
case_1 | Asian vs White | U=430.00 | p=0.7621 | mean1=2.00 | mean2=2.27
case_1 | Black vs Hispanic | U=412.00 | p=0.5633 | mean1=2.20 | mean2=2.60
case_1 | Black vs White | U=447.00 | p=0.9692 | mean1=2.20 | mean2=2.27
case_1 | Hispanic vs White | U=475.00 | p=0.7048 | mean1=2.60 | mean2=2.27
case_2 | Asian vs Black | U=450.00 | p=1.0000 | mean1=2.00 | mean2=2.00
case_2 | Asian vs Hispanic | U=450.00 | 