In [1]:
import json
import os
import random
import warnings

import pandas as pd


# Donor-kidney columns described to the model: CSV column name -> description.
kidney_features = dict(
    ABO_DON="DONOR BLOOD TYPE",
    DA1="DONOR A1 ANTIGEN",
    DA2="DONOR A2 ANTIGEN",
    DB1="DONOR B1 ANTIGEN",
    DB2="DONOR B2 ANTIGEN",
    DDR1="DONOR DR1 ANTIGEN",
    DDR2="DONOR DR2 ANTIGEN",
    AGE_DON="DONOR AGE (YRS)",
    HGT_CM_DON_CALC="CALCULATED DONOR HEIGHT (CM)",
    WGT_KG_DON_CALC="CALCULATED DONOR WEIGHT (KG)",
    GENDER_DON="DONOR GENDER",
    ETHCAT_DON="DONOR ETHNICITY CATEGORY",
    HOME_STATE_DON="DONOR HOME STATE",
)

# Demographic subset of the donor columns, derived from the full mapping so the
# descriptions cannot drift apart. Key order matches the order in the full dict.
_KIDNEY_DEMO_KEYS = ("GENDER_DON", "ETHCAT_DON", "HOME_STATE_DON")
kidney_features_demo = {key: kidney_features[key] for key in _KIDNEY_DEMO_KEYS}

# Candidate (recipient) columns described to the model: CSV column name -> description.
candidate_features = dict(
    ABO="CANDIDATE BLOOD TYPE",
    A1="CANDIDATE A1 ANTIGEN",
    A2="CANDIDATE A2 ANTIGEN",
    B1="CANDIDATE B1 ANTIGEN",
    B2="CANDIDATE B2 ANTIGEN",
    DR1="CANDIDATE DR1 ANTIGEN",
    DR2="CANDIDATE DR2 ANTIGEN",
    AGE="CANDIDATE AGE (YRS)",
    HGT_CM_TCR="CALCULATED CANDIDATE HEIGHT (CM)",
    WGT_KG_TCR="CALCULATED CANDIDATE WEIGHT (KG)",
    GENDER="CANDIDATE GENDER",
    ETHCAT="CANDIDATE ETHNICITY CATEGORY",
    EDUCATION="CANDIDATE HIGHEST EDUCATIONAL LEVEL",
    WORK_INCOME_TCR="CANDIDATE WORK FOR INCOME?",
    DAYSWAIT_CHRON_KI="CANDIDATE DAYS ON WAITLIST",
    PERM_STATE="CANDIDATE STATE OF RESIDENCY",
)

# Demographic subset of the candidate columns, derived from the full mapping.
# NOTE(review): the 'demo_only' column list inside get_candidates_ranked_info
# omits WORK_INCOME_TCR although it is described here -- confirm which is intended.
_CANDIDATE_DEMO_KEYS = (
    "GENDER",
    "ETHCAT",
    "EDUCATION",
    "WORK_INCOME_TCR",
    "DAYSWAIT_CHRON_KI",
    "PERM_STATE",
)
candidate_features_demo = {key: candidate_features[key] for key in _CANDIDATE_DEMO_KEYS}

In [2]:
def get_kidney_info(kidney_index, data_type="ranked"):
    """Return one kidney row of ``data/kidneys.csv`` as a two-line CSV string.

    The first CSV column is used as the DataFrame index and therefore does not
    appear in the output. The result is ``"<header line>\\n<value line>"``.

    Parameters
    ----------
    kidney_index : int
        Positional (``iloc``) index of the kidney row to export.
    data_type : {"ranked", "binary"}, default "ranked"
        ``"binary"`` removes the donor demographic columns (``GENDER_DON``,
        ``ETHCAT_DON``, ``HOME_STATE_DON``) before formatting; ``"ranked"``
        keeps every column. The previous default was the list
        ``["binary", "ranked"]``, which never compared equal to ``"binary"``
        and so kept every column; ``"ranked"`` is the explicit equivalent.

    Returns
    -------
    str
        Comma-separated header line, a newline, and the comma-separated values.
        Missing values become empty fields; values containing a comma or a
        double quote are wrapped in double quotes with inner quotes doubled.

    Raises
    ------
    ValueError
        If ``data_type`` is not ``"binary"`` or ``"ranked"``. Previously any
        other value (e.g. a typo) silently kept the demographic columns.
    IndexError
        If ``kidney_index`` is out of range (raised by ``iloc``).
    """
    allowed_types = ("binary", "ranked")
    if data_type not in allowed_types:
        raise ValueError(
            f"data_type must be one of {allowed_types}, got {data_type!r}"
        )

    kidneys_df = pd.read_csv("data/kidneys.csv", index_col=0)

    if data_type == "binary":
        demographic_columns = ["GENDER_DON", "ETHCAT_DON", "HOME_STATE_DON"]
        kidneys_df = kidneys_df.drop(columns=demographic_columns)

    def _csv_field(value):
        # Minimal CSV escaping: empty field for NaN, quote only when needed.
        if pd.isna(value):
            return ""
        text = str(value)
        if ',' in text or '"' in text:
            text = '"' + text.replace('"', '""') + '"'
        return text

    kidney = kidneys_df.iloc[kidney_index]
    header_line = ",".join(kidneys_df.columns.tolist())
    value_line = ",".join(_csv_field(value) for value in kidney.tolist())

    return header_line + "\n" + value_line

# Binary

In [None]:
def get_one_candidate_binary_info(candidate_index):
    """Return one candidate row of ``data/candidate-binary-raw.csv`` as a two-line CSV string.

    The demographic columns are dropped first, so only the remaining columns
    (including the candidate ID column) are exported.

    Parameters
    ----------
    candidate_index : int
        Positional (``iloc``) index of the candidate row to export.

    Returns
    -------
    str
        ``"<header line>\\n<value line>"``. Missing values become empty fields;
        values containing a comma or a double quote are wrapped in double
        quotes with inner quotes doubled.
    """
    demographic_columns = [
        'GENDER', 'ETHCAT', 'EDUCATION', 'WORK_INCOME_TCR', 'DAYSWAIT_CHRON_KI', 'PERM_STATE',
    ]

    # No index_col here: the candidate ID must stay a regular column so it is
    # included in the exported text.
    medical_df = pd.read_csv("data/candidate-binary-raw.csv").drop(columns=demographic_columns)
    candidate_row = medical_df.iloc[candidate_index]

    fields = []
    for cell in candidate_row:
        if pd.isna(cell):
            fields.append("")
            continue
        text = str(cell)
        needs_quoting = ',' in text or '"' in text
        fields.append('"' + text.replace('"', '""') + '"' if needs_quoting else text)

    header_line = ",".join(medical_df.columns.tolist())
    return header_line + "\n" + ",".join(fields)

In [None]:
def get_prompt_test_task(includesInst):
    """Build the opening task prompt for the binary compatibility experiment.

    Parameters
    ----------
    includesInst : bool
        When truthy, the prompt lists the explicit compatibility criteria
        (blood type, height difference, weight difference). When falsy, the
        model is told to decide based on its own knowledge.

    Returns
    -------
    str
        The task-description prompt text.
    """
    # The second sentence keeps the four-space indent of the original literal.
    intro = (
        "You will be given information about a kidney and a potential candidate.\n"
        "    You will determine if the kidney is compatible with the candidate "
    )

    if includesInst:
        criteria_lines = (
            "based on the following criteria:",
            "1. The blood type of the donor must be compatible with the blood type of the candidate.",
            "   - If the donor is type O, they can donate to any blood type.",
            "   - If the donor is type A, they can donate to type A and AB.",
            "   - If the donor is type B, they can donate to type B and AB.",
            "   - If the donor is type AB, they can only donate to type AB.",
            "2. The difference in the height of the donor and of the candidate must be within 12.5cm.",
            "3. The difference in weight of the donor and of the candidate must be within 15kg.",
        )
        # Every criteria line, including the last, is newline-terminated.
        return intro + "".join(line + "\n" for line in criteria_lines)

    return intro + "based on your knowledge."

In [None]:
def get_prompt_test_0(kidney_features, candidate_features):
    """Build the feature-description prompt for the binary experiment.

    NOTE: a later cell of this notebook redefines ``get_prompt_test_0`` with a
    different signature (a single ``feature_type`` argument). Whichever
    definition was executed last is the one in effect, so the binary cells must
    be run before the ranked ones.

    Parameters
    ----------
    kidney_features : dict
        Kidney column descriptions; interpolated via its string form.
    candidate_features : dict
        Candidate column descriptions; interpolated via its string form.

    Returns
    -------
    str
        The prompt text.
    """
    prompt_lines = [
        "Here are some descriptions of the features I'd like you to consider.",
        "",
        f"    Kidney Features: {kidney_features}",
        f"    Candidate Features: {candidate_features}",
        "",
        "    Remember this information for the next sets of tasks.",
    ]
    return "\n".join(prompt_lines)

In [None]:
def get_prompt_test_1(candidate_info):
    """Build the prompt that hands the model one candidate's CSV text (binary experiment).

    NOTE: a later cell of this notebook redefines ``get_prompt_test_1`` with
    different wording for the ranked experiment; the definition executed last
    is the one in effect.

    Parameters
    ----------
    candidate_info : str
        Candidate data in CSV form (header line plus value line).

    Returns
    -------
    str
        The prompt text.
    """
    opening = "Here is the candidate information in CSV format.\n\n"
    closing = "\n\n    Remember this information for the next sets of tasks."
    return opening + f"    {candidate_info}" + closing

In [None]:
def get_prompt_test_2(kidney_info):
    """Build the final binary prompt: kidney CSV text plus the Yes/No answer format.

    NOTE: a later cell of this notebook redefines ``get_prompt_test_2`` for the
    ranked experiment; the definition executed last is the one in effect.

    Parameters
    ----------
    kidney_info : str
        Kidney data in CSV form (header line plus value line).

    Returns
    -------
    str
        The prompt text.
    """
    # NOTE(review): "Compatibiltiy" is misspelled in the last line of the
    # prompt. It is kept verbatim so regenerated prompts stay identical to
    # earlier runs; fix deliberately if desired.
    prompt_lines = [
        "Here is the kidney information in CSV format.",
        "",
        f"    {kidney_info}",
        "",
        "    Please determine if this kidney is compatible for the candidate, based on the information you have received.",
        "    Please follow the format below:",
        "    Candidate ID, Compatibility (Yes/No)",
        "    ",  # whitespace-only line present in the original prompt text
        "    Just output the results in the format of Candidate ID and Compatibiltiy (Yes/No), separated by a comma.",
    ]
    return "\n".join(prompt_lines)

In [None]:
def get_prompt_test_few_shot(kidney_example, candidate_example, justification):
    """Build the few-shot prompt from one matching and three mismatching examples.

    The three arguments are parallel sequences. Position 0 is the compatible
    (match) example and positions 1-3 are the incompatible examples. The
    mismatches are listed first in the prompt and the match last.
    ``justification[0]`` is not used.

    Parameters
    ----------
    kidney_example : sequence of str
        Kidney profiles for the four examples.
    candidate_example : sequence of str
        Candidate profiles for the four examples.
    justification : sequence of str
        Reason text for each example; only positions 1-3 are inserted.

    Returns
    -------
    str
        The few-shot prompt text.

    Raises
    ------
    IndexError
        If any argument has fewer than four entries (same exception type the
        plain indexing raised before, now with an explanatory message).
    """
    required = 4
    for arg_name, examples in (
        ("kidney_example", kidney_example),
        ("candidate_example", candidate_example),
        ("justification", justification),
    ):
        if len(examples) < required:
            raise IndexError(
                f"{arg_name} has {len(examples)} entries but {required} are required "
                "(position 0 = match, positions 1-3 = mismatches)"
            )

    sections = ["For example,"]

    # Fix: every mismatch sentence now ends with a period; the third one
    # previously lacked it, unlike the first two.
    for position in (1, 2, 3):
        sections.append(
            f"    A kidney with this profile: {kidney_example[position]}\n"
            f"    A candidate with this profile: {candidate_example[position]}\n"
            f"    Is not a compatible match because {justification[position]}."
        )

    sections.append(
        f"    A kidney with this profile: {kidney_example[0]}\n"
        f"    A candidate with this profile: {candidate_example[0]}\n"
        "    Is a compatible match."
    )
    sections.append(
        "    Now determine if the provided kidney and candidate are a compatible match."
    )

    # Sections are separated by one empty line, as in the original literal.
    return "\n\n".join(sections)

In [None]:
def _sample_example(answer_df, row_mask, cell_mask, excluded_indices, random_state=None):
    """Sample one eligible row and return ``(row_label, column, cell_value)``, or None.

    ``row_mask`` selects the rows that contain at least one qualifying cell and
    ``cell_mask(row)`` marks the qualifying cells within a row. The first
    qualifying column of the sampled row is reported.
    """
    eligible = answer_df[row_mask]
    eligible = eligible[~eligible.index.isin(excluded_indices)]
    if eligible.empty:
        return None
    row = eligible.sample(n=1, random_state=random_state).iloc[0]
    column = row[cell_mask(row)].index[0]
    return row.name, column, row[column]


def get_few_shot_examples(excluded_indices, random_state=None):
    """Pick one example each for: match, ABO mismatch, height mismatch, weight mismatch.

    Reads ``data/candidate-binary-answer-NEW.csv``. For each category, in that
    order, one row containing a qualifying cell is sampled at random, and the
    first qualifying column of that row is used. A cell qualifies as a match
    when it equals ``'match'``, and as a mismatch when its string form contains
    ``'abo'``, ``'height'`` or ``'weight'`` respectively.

    Parameters
    ----------
    excluded_indices : iterable
        Row index labels of the answer table that must not be sampled. The CSV
        is read without ``index_col``, so these are the default integer row
        positions.
    random_state : optional
        Forwarded to ``DataFrame.sample`` for reproducible sampling. ``None``
        (default) keeps the previous, unseeded behaviour. The same value is
        used for each of the four draws.

    Returns
    -------
    (example_candidate, example_kidney, example_justification) : tuple of lists
        Parallel lists: sampled row label, kidney number, and the cell value.
        The kidney number is ``int(column.split('_')[1])``, which assumes
        column labels of the form ``<prefix>_<number>`` -- TODO confirm against
        the CSV header.

    Warns
    -----
    UserWarning
        When a category has no eligible row. That category is skipped, as
        before, so the lists are shorter than four and later categories shift
        to earlier positions; consumers that rely on fixed positions will be
        misaligned.
    """
    match_df = pd.read_csv("data/candidate-binary-answer-NEW.csv")

    def _contains(keyword):
        # Cell-level test: string form of the cell contains the keyword.
        return lambda row: row.astype(str).str.contains(keyword)

    def _rows_containing(keyword):
        # Row-level test: at least one cell of the row contains the keyword.
        cell_mask = _contains(keyword)
        return match_df.apply(lambda row: cell_mask(row).any(), axis=1)

    # (category label, row-level mask, cell-level mask), in output order.
    searches = [
        ("match", match_df.eq('match').any(axis=1), lambda row: row == 'match'),
        ("abo", _rows_containing('abo'), _contains('abo')),
        ("height", _rows_containing('height'), _contains('height')),
        ("weight", _rows_containing('weight'), _contains('weight')),
    ]

    example_kidney = []
    example_candidate = []
    example_justification = []

    for category, row_mask, cell_mask in searches:
        picked = _sample_example(match_df, row_mask, cell_mask, excluded_indices, random_state)
        if picked is None:
            warnings.warn(
                f"No eligible '{category}' example found; the returned lists "
                "will have fewer than four entries.",
                stacklevel=2,
            )
            continue
        row_label, column, cell_value = picked
        example_candidate.append(row_label)
        example_kidney.append(int(column.split('_')[1]))
        example_justification.append(cell_value)

    return example_candidate, example_kidney, example_justification

In [None]:
# save prompts

# exp_to_run -> (file label, includesInst, includesFewShot)
# plain (0): includesInst=False, includesFewShot=False
# zero-shot (1): includesInst=True, includesFewShot=False
# few-shot (2): includesInst=True, includesFewShot=True
BINARY_EXPERIMENTS = {
    0: ("plain", False, False),
    1: ("zero", True, False),
    2: ("fewshot", True, True),
}
BINARY_PROMPT_DIR = "prompts/binary"
N_BINARY_TRIALS = 5        # number of trials for each kidney-candidate problem
N_FEWSHOT_SHUFFLES = 3     # number of times the few-shot examples are re-sampled


def _write_jsonl(file_path, prompt_texts):
    """Write one JSON object per line: {"turn_number": i, "prompt_text": text}."""
    with open(file_path, "w") as f:
        for turn_number, prompt_text in enumerate(prompt_texts):
            json.dump({"turn_number": turn_number, "prompt_text": prompt_text}, f)
            f.write("\n")
    print(f"Saved {file_path}")


# open(..., "w") does not create missing directories.
os.makedirs(BINARY_PROMPT_DIR, exist_ok=True)

# Randomly select 5 kidney indices with ids 0-19 and 5 candidates with indices 0-99
# NOTE(review): the selection is unseeded, so each run tests a different set.
kidney_indices = random.sample(range(20), 5)
candidate_indices = random.sample(range(100), 5)

# Identical for every experiment / kidney / candidate, so built once.
prompt_test_0 = get_prompt_test_0(kidney_features, candidate_features)

for exp_to_run in [0, 1, 2]:
    exp_label, includesInst, includesFewShot = BINARY_EXPERIMENTS[exp_to_run]
    prompt_test_task = get_prompt_test_task(includesInst)

    for kidney_index in kidney_indices:
        # Fix: request the "binary" view explicitly. Without the argument the
        # donor demographic columns were never dropped from the kidney text,
        # although the candidate text has its demographic columns removed.
        kidney_info = get_kidney_info(kidney_index, "binary")
        prompt_test_2 = get_prompt_test_2(kidney_info)

        for candidate_index in candidate_indices:
            candidate_binary_info = get_one_candidate_binary_info(candidate_index)
            prompt_test_1 = get_prompt_test_1(candidate_binary_info)

            file_stem = (
                f"{BINARY_PROMPT_DIR}/binary_{exp_label}"
                f"_kidney{kidney_index}_candidate{candidate_index}"
            )

            for j in range(N_BINARY_TRIALS):

                # one prompt per kidney-candidate problem, no examples
                # (plain: no instructions; zero-shot: with instructions)
                if not includesFewShot:
                    _write_jsonl(
                        f"{file_stem}_trial{j}.jsonl",
                        [prompt_test_task, prompt_test_0, prompt_test_1, prompt_test_2],
                    )
                    continue

                # i prompts per kidney-candidate problem, yes instructions, yes examples
                for i in range(N_FEWSHOT_SHUFFLES):
                    # Fix: get_few_shot_examples filters the *rows* of the answer
                    # table, and those row labels are later used as candidate
                    # indices. Exclude the test candidates, not the kidney
                    # indices, so that no tested candidate appears as an example.
                    example_candidate, example_kidney, example_justification = (
                        get_few_shot_examples(candidate_indices)
                    )
                    kidney_example = [
                        get_kidney_info(example_kidney_index, "binary")
                        for example_kidney_index in example_kidney
                    ]
                    candidate_example = [
                        get_one_candidate_binary_info(example_candidate_index)
                        for example_candidate_index in example_candidate
                    ]
                    prompt_test_few_shot = get_prompt_test_few_shot(
                        kidney_example, candidate_example, example_justification
                    )
                    _write_jsonl(
                        f"{file_stem}_shuffle_{i}_trial{j}.jsonl",
                        [
                            prompt_test_task,
                            prompt_test_0,
                            prompt_test_few_shot,
                            prompt_test_1,
                            prompt_test_2,
                        ],
                    )

    print("Done with binary prompts for exp_to_run", exp_to_run, "\n")

print("All binary prompts generated")

# Ranked

In [3]:
def get_candidates_ranked_info(kidney_index, feature_type='all'):
    """Return the candidates listed for one kidney as CSV text with shuffled rows.

    Reads ``data/candidate-ranked-raw.csv``, keeps the rows whose
    ``Kidney Donor ID`` equals ``kidney_index``, drops that column, formats
    every row as a CSV line and shuffles the row order with ``random.shuffle``
    (the header stays first).

    Parameters
    ----------
    kidney_index : int
        Value matched against the ``Kidney Donor ID`` column.
    feature_type : {'all', 'demo_only'}, default 'all'
        ``'all'`` exports every remaining column. ``'demo_only'`` exports only
        ``Candidate ID``, ``GENDER``, ``ETHCAT``, ``EDUCATION``,
        ``DAYSWAIT_CHRON_KI`` and ``PERM_STATE``. The previous default was the
        list ``['all', 'demo_only']``, which matched neither branch and made
        the function fail with ``UnboundLocalError``.

    Returns
    -------
    str
        Header line followed by one line per candidate, newline-separated.
        Missing values become empty fields; values containing a comma or a
        double quote are quoted with inner quotes doubled.

    Raises
    ------
    ValueError
        If ``feature_type`` is not ``'all'`` or ``'demo_only'``.
    AssertionError
        If the CSV has no ``Candidate ID`` column.
    """
    allowed_types = ('all', 'demo_only')
    if feature_type not in allowed_types:
        raise ValueError(
            f"feature_type must be one of {allowed_types}, got {feature_type!r}"
        )

    candidates_df = pd.read_csv("data/candidate-ranked-raw.csv")
    assert 'Candidate ID' in candidates_df.columns, "Candidate ID column is missing in the dataframe"

    filtered_df = candidates_df[candidates_df["Kidney Donor ID"] == kidney_index]
    filtered_df = filtered_df.drop(columns=['Kidney Donor ID'])

    if feature_type == 'demo_only':
        # NOTE(review): WORK_INCOME_TCR is described in candidate_features_demo
        # but is not exported here -- confirm which is intended.
        demo_columns = ['Candidate ID', 'GENDER', 'ETHCAT', 'EDUCATION', 'DAYSWAIT_CHRON_KI', 'PERM_STATE']
        filtered_df = filtered_df[demo_columns]

    def _csv_field(value):
        # Minimal CSV escaping: empty field for NaN, quote only when needed.
        if pd.isna(value):
            return ""
        text = str(value)
        if ',' in text or '"' in text:
            text = '"' + text.replace('"', '""') + '"'
        return text

    formatted_rows = [
        ",".join(_csv_field(value) for value in row)
        for row in filtered_df.values.tolist()
    ]

    # Randomise the candidate order; the column order is left as in the file.
    random.shuffle(formatted_rows)

    header_line = ",".join(filtered_df.columns.tolist())
    return header_line + "\n" + "\n".join(formatted_rows)


In [4]:
def get_prompt_test_0(feature_type='all'):
    """Build the task/feature-description prompt for the ranked (select-one) experiment.

    Uses the notebook-level ``kidney_features`` for the kidney description and
    either ``candidate_features`` or ``candidate_features_demo`` for the
    candidate description. This definition replaces the two-argument
    ``get_prompt_test_0`` defined earlier in the notebook for the binary
    experiment.

    Parameters
    ----------
    feature_type : {'all', 'demo_only'}, default 'all'
        ``'all'`` describes every candidate feature; ``'demo_only'`` describes
        only the demographic ones. The previous default was the list
        ``['all', 'demo_only']``, which matched neither branch and made the
        function fail with ``UnboundLocalError``.

    Returns
    -------
    str
        The prompt text.

    Raises
    ------
    ValueError
        If ``feature_type`` is not ``'all'`` or ``'demo_only'``.
    """
    # Wording for the Borda variant of the first sentence was:
    # "We are tasking you to rank N candidates based on allocation priority for a given kidney."
    if feature_type == 'all':
        candidate_descriptions = candidate_features
        separator_line = ""
    elif feature_type == 'demo_only':
        candidate_descriptions = candidate_features_demo
        # The original demo_only literal has a whitespace-only line (8 spaces)
        # here where the 'all' literal has an empty line; kept byte-for-byte so
        # regenerated prompts match earlier runs.
        separator_line = "        "
    else:
        raise ValueError(
            f"feature_type must be 'all' or 'demo_only', got {feature_type!r}"
        )

    prompt_lines = [
        "We are tasking you to select 1 candidate to receive a given kidney.",
        "        Here are some descriptions of the features I'd like you to consider.",
        separator_line,
        f"        Kidney Features: {kidney_features}",
        f"        Candidate Features: {candidate_descriptions}",
        "",
        "        Remember this information for the next sets of tasks.",
    ]
    return "\n".join(prompt_lines)

In [5]:
def get_prompt_test_1(candidates_info):
    """Build the prompt that hands the model the candidate table (ranked experiment).

    This definition replaces the ``get_prompt_test_1`` defined earlier in the
    notebook for the binary experiment.

    Parameters
    ----------
    candidates_info : str
        Candidate table in CSV form (header line plus one line per candidate).

    Returns
    -------
    str
        The prompt text, which also tells the model to assume every candidate
        is medically viable for the kidney.
    """
    heading = "Here are the candidate informations in CSV format."
    viability_note = (
        "    Assume all candidates are medically viable with the kidney "
        "(blood types, size, etc compatible). "
        "Remember this dataset for the next sets of tasks."
    )
    return "\n\n".join([heading, f"    {candidates_info}", viability_note])

In [6]:
def get_prompt_test_2(kidney_info):
    """Build the final ranked prompt: kidney CSV text plus the select-one instruction.

    This definition replaces the ``get_prompt_test_2`` defined earlier in the
    notebook for the binary experiment.

    Parameters
    ----------
    kidney_info : str
        Kidney data in CSV form (header line plus value line).

    Returns
    -------
    str
        The prompt text asking for the Candidate ID of the single best match.
    """
    # NOTE(review): "recieved" is misspelled in the prompt; kept verbatim so
    # regenerated prompts stay identical to earlier runs.
    prompt_lines = [
        "Here is the kidney information in CSV format.",
        "",
        f"    {kidney_info}",
        "",
        "    Please select the top candidate for the given kidney, based on the patient information you have recieved.",
        "    ",  # whitespace-only line present in the original prompt text
        "    Just output the results by providing the Candidate ID of the one candidate you believe is the best match.",
    ]

    # Borda variant of the instruction block (not used for plurality runs):
    #   Please rank N candidates based on allocation priority for the given kidney,
    #   based on the patient information you have received.
    #   They can be ranked from 1 to N, where 1 is the highest priority.
    #   Please follow the format below:
    #   Candidate ID, Ranking (1 to N)
    #
    #   Just output the results in the format of Candidate ID and Ranking, separated by a comma.

    return "\n".join(prompt_lines)

In [8]:
# save prompts
#
# File name layout:
#   ranked
#   _kidney{kidney id}
#   _{all, demo_only depending on if all or only demographic features used}
#   _shuffle{0-2 to represent the shuffling of patient row ordering}
#   _trial{0-4 for repetitions of the same prompt}

RANKED_PROMPT_DIR = "prompts/ranked-plurality"   # the Borda runs used "prompts/ranked"

# open(..., "w") does not create missing directories.
os.makedirs(RANKED_PROMPT_DIR, exist_ok=True)

for kidney_index in range(20):
    # The kidney text does not depend on feature type, shuffle or trial, so the
    # CSV is read once per kidney instead of once per written file.
    # I did NOT randomize the column ordering of the donated kidney information -- This will be consistent.
    kidney_info = get_kidney_info(kidney_index, "ranked")
    prompt_test_2 = get_prompt_test_2(kidney_info)

    # Run with only demographic columns in 'demo_only' vs both medical and demographic in 'all'
    for feature_type in ['all', 'demo_only']:
        # I did NOT randomize the feature ordering in the original definition prompt -- This will be consistent.
        prompt_test_0 = get_prompt_test_0(feature_type=feature_type)

        # Randomize the order of the rows 3 times
        for i in range(3):
            # This contains randomized patient row ordering.
            candidates_ranked_info = get_candidates_ranked_info(kidney_index, feature_type=feature_type)
            prompt_test_1 = get_prompt_test_1(candidates_ranked_info)

            data = [
                {"turn_number": 0, "prompt_text": prompt_test_0},
                {"turn_number": 1, "prompt_text": prompt_test_1},
                {"turn_number": 2, "prompt_text": prompt_test_2}
            ]

            # Repeat each experiment with the same row ordering 5 times -- for consistency.
            for j in range(5):
                file_path = f"{RANKED_PROMPT_DIR}/ranked_kidney{kidney_index}_{feature_type}_shuffle{i}_trial{j}.jsonl"

                with open(file_path, "w") as f:
                    for d in data:
                        json.dump(d, f)
                        f.write("\n")

                print(f"Saved {file_path}")


Saved prompts/ranked-plurality/ranked_kidney0_all_shuffle0_trial0.jsonl
Saved prompts/ranked-plurality/ranked_kidney0_all_shuffle0_trial1.jsonl
Saved prompts/ranked-plurality/ranked_kidney0_all_shuffle0_trial2.jsonl
Saved prompts/ranked-plurality/ranked_kidney0_all_shuffle0_trial3.jsonl
Saved prompts/ranked-plurality/ranked_kidney0_all_shuffle0_trial4.jsonl
Saved prompts/ranked-plurality/ranked_kidney0_all_shuffle1_trial0.jsonl
Saved prompts/ranked-plurality/ranked_kidney0_all_shuffle1_trial1.jsonl
Saved prompts/ranked-plurality/ranked_kidney0_all_shuffle1_trial2.jsonl
Saved prompts/ranked-plurality/ranked_kidney0_all_shuffle1_trial3.jsonl
Saved prompts/ranked-plurality/ranked_kidney0_all_shuffle1_trial4.jsonl
Saved prompts/ranked-plurality/ranked_kidney0_all_shuffle2_trial0.jsonl
Saved prompts/ranked-plurality/ranked_kidney0_all_shuffle2_trial1.jsonl
Saved prompts/ranked-plurality/ranked_kidney0_all_shuffle2_trial2.jsonl
Saved prompts/ranked-plurality/ranked_kidney0_all_shuffle2_trial