# Model and dataset specifications
Defines the configuration for every model and dataset, including the full prompt templates, and writes it to `models.json` and `datasets.json`.

In [8]:
import json
from datasets import load_dataset, Dataset, concatenate_datasets, load_from_disk
from tqdm import tqdm
import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'svg'
import numpy as np
from scipy import stats
from collections import Counter, defaultdict

import sys
sys.path.insert(0, "../..")
from config import REPO_ROOT

#### Models

In [9]:
# System prompts, one per model family. Model performance is very sensitive to
# changes in the system prompt, so every character below is significant.

# Llama: this exact wording (including the surrounding blank lines) seems to be
# really important. Commented-out alternative wording:
#   "\n\nYou are a helpful assistant who has been tasked with making predictions about income levels.\n\n"
llama_system = "\n\nYou are a helpful assistant.\n\n"

# Every other family uses an empty system prompt.
_NO_SYSTEM_PROMPT = ""

gemma_system = _NO_SYSTEM_PROMPT
mistal = _NO_SYSTEM_PROMPT  # (sic) the model registry refers to this exact name
deepseek = _NO_SYSTEM_PROMPT
phi4 = _NO_SYSTEM_PROMPT
openai_system = _NO_SYSTEM_PROMPT
anthropic_system = _NO_SYSTEM_PROMPT
gemini_system = _NO_SYSTEM_PROMPT

In [10]:
# Registry of locally served models, keyed by model id.
# TODO: update short names.

def _local_model(name, size, system, family, short_name, pad):
    """Build the registry entry for one locally served model.

    Args:
        name: Model id; also used as the registry key, so the two always agree.
        size: Numeric size recorded for the model (presumably billions of
            parameters, judging by the model ids -- confirm).
        system: System prompt string for the model's family.
        family: Family label for the model.
        short_name: Compact identifier for the model.
        pad: Boolean ``"pad"`` flag stored with the entry.

    Returns:
        dict with keys ``name, type, size, system, family, short_name, pad,
        provider``; ``type`` is always ``"local"`` and ``provider`` ``"vllm"``.
    """
    return {
        "name": name,
        "type": "local",
        "size": size,
        "system": system,
        "family": family,
        "short_name": short_name,
        "pad": pad,
        "provider": "vllm",
    }


# One row per model: (model id, size, system prompt, family, short_name, pad)
_LOCAL_MODEL_SPECS = [
    # Llama
    ("meta-llama/Llama-3.2-1B-Instruct", 1, llama_system, "Llama_3.2", "llama3_2_1B", False),
    ("meta-llama/Llama-3.2-3B-Instruct", 3, llama_system, "Llama_3.2", "llama3_2_3B", False),
    ("meta-llama/Meta-Llama-3-8B-Instruct", 8, llama_system, "Llama_3", "llama3_8B", False),
    ("meta-llama/Llama-3.1-8B-Instruct", 8, llama_system, "Llama_3.1", "llama3_1_8B", False),
    ("meta-llama/Meta-Llama-3-70B-Instruct", 70, llama_system, "Llama_3", "llama3_70B", False),
    ("meta-llama/Llama-3.1-70B-Instruct", 70, llama_system, "Llama_3.1", "llama3_1_70B", False),
    ("meta-llama/Llama-3.3-70B-Instruct", 70, llama_system, "Llama_3.3", "llama3_3_70B", False),
    ("meta-llama/Llama-3.1-405B-Instruct", 405, llama_system, "Llama_3.1", "llama3_1_405B", False),
    # NOTE(review): for the two Llama 4 rows `size` equals the expert count in
    # the model id (128E / 16E) rather than following the pattern of the other
    # rows, and Scout's family is "Llama_4.1" while Maverick's is "Llama_4".
    # Confirm both are intended.
    ("meta-llama/Llama-4-Maverick-17B-128E-Instruct", 128, llama_system, "Llama_4", "llama4_maverick", False),
    ("meta-llama/Llama-4-Scout-17B-16E-Instruct", 16, llama_system, "Llama_4.1", "llama4_scout", False),
    # Gemma
    ("google/gemma-2-2b-it", 2, gemma_system, "Gemma_2", "Gemma_2_2B", True),
    ("google/gemma-2-9b-it", 9, gemma_system, "Gemma_2", "Gemma_2_9B", True),
    ("google/gemma-2-27b-it", 27, gemma_system, "Gemma_2", "Gemma_2_27B", True),
    ("google/gemma-3-1b-it", 1, gemma_system, "Gemma_3", "Gemma_3_1B", True),
    ("google/gemma-3-4b-it", 4, gemma_system, "Gemma_3", "Gemma_3_4B", True),
    ("google/gemma-3-12b-it", 12, gemma_system, "Gemma_3", "Gemma_3_12B", True),
    ("google/gemma-3-27b-it", 27, gemma_system, "Gemma_3", "Gemma_3_27B", True),
    # Mistral: pad=False on the belief that there is no padding token -- still needs testing.
    ("mistralai/Mistral-7B-Instruct-v0.3", 7, mistal, "Mistral", "Mistral_7B", False),
    ("mistralai/Mistral-Small-24B-Instruct-2501", 24, mistal, "Mistral", "Mistral_24B", False),
    # DeepSeek distills
    ("deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", 32, deepseek, "DeepSeek_Distill", "DeepSeek_32B", True),
    ("deepseek-ai/DeepSeek-R1-Distill-Llama-70B", 70, deepseek, "DeepSeek_Distill", "DeepSeek_70B", True),
    # Phi-4
    ("microsoft/phi-4", 14, phi4, "phi4", "phi4", True),
    ("microsoft/Phi-4-reasoning", 14, phi4, "phi4", "phi4_reasoning", True),
    ("microsoft/Phi-4-reasoning-plus", 14, phi4, "phi4", "phi4_reasoning_plus", True),
]

# Key each entry by its model id so the key and the "name" field cannot drift apart.
models = {spec[0]: _local_model(*spec) for spec in _LOCAL_MODEL_SPECS}

# ─── remote-API models ─────────────────────────────────────────────────────────────

def _api_model(name, system, family, short_name, provider):
    """Build the registry entry for one remote-API model.

    Args:
        name: API model id; also used as the registry key.
        system: System prompt string for the provider.
        family: Family label for the model.
        short_name: Compact identifier for the model.
        provider: Provider label ("openai", "anthropic" or "google" below).

    Returns:
        dict with keys ``name, type, size, system, family, short_name, pad,
        provider``. Every API entry has ``type="api"``, ``size=0`` and
        ``pad=False``.
    """
    return {
        "name": name,
        "type": "api",
        "size": 0,
        "system": system,
        "family": family,
        "short_name": short_name,
        "pad": False,
        "provider": provider,
    }


# One row per model: (model id, system prompt, family, short_name, provider)
_API_MODEL_SPECS = [
    # ── OpenAI ───────────────────────────────────────────────────────────────────
    ("gpt-4o", openai_system, "gpt4o", "gpt4o", "openai"),
    ("gpt-4o-mini", openai_system, "gpt4o_mini", "gpt4o_mini", "openai"),
    ("gpt-4.1-2025-04-14", openai_system, "GPT-4.1", "gpt4_1", "openai"),
    ("gpt-4.1-mini-2025-04-14", openai_system, "GPT-4.1-mini", "gpt4_1_mini", "openai"),
    ("gpt-4.1-nano-2025-04-14", openai_system, "gpt4_1_nano", "gpt4_1_nano", "openai"),
    ("gpt-3.5-turbo", openai_system, "GPT-3.5-Turbo", "gpt3_5_turbo", "openai"),
    ("o3-2025-04-16", openai_system, "o3", "o3", "openai"),

    # ── Anthropic (Claude 3, 3.5, 3.7 and 4) ─────────────────────────────────────
    ("claude-3-7-sonnet-20250219", anthropic_system, "Claude 3.7", "claude_3_7_sonnet", "anthropic"),
    ("claude-sonnet-4-20250514", anthropic_system, "Claude 4.0", "claude_4_sonnet", "anthropic"),
    ("claude-opus-4-20250514", anthropic_system, "Claude 4.0", "claude_4_opus", "anthropic"),
    ("claude-3-5-sonnet-20241022", anthropic_system, "Claude 3.5", "c3_5_sonnet", "anthropic"),
    ("claude-3-5-haiku-20241022", anthropic_system, "Claude 3.5", "c3_5_haiku", "anthropic"),
    ("claude-3-opus-20240229", anthropic_system, "Claude 3", "c3_opus", "anthropic"),
    ("claude-3-haiku-20240307", anthropic_system, "Claude 3", "c3_haiku", "anthropic"),

    # ── Google (Gemini API) ─────────────────────────────────────────────────────
    ("gemini-1.5-pro", gemini_system, "Gemini 1.5", "g15_pro", "google"),
    ("gemini-1.5-flash", gemini_system, "Gemini 1.5", "g15_flash", "google"),
    ("gemini-2.0-flash", gemini_system, "Gemini 2.0", "g20_flash", "google"),
    ("gemini-2.5-pro-preview-05-06", gemini_system, "Gemini 2.5", "g25_pro", "google"),
    ("gemini-2.5-flash-preview-04-17", gemini_system, "Gemini 2.5", "g25_flash", "google"),
]

# Add the API models to the registry, keyed by model id.
models.update({spec[0]: _api_model(*spec) for spec in _API_MODEL_SPECS})


# Write the complete model registry to models.json in the current working directory.
with open('models.json', 'w') as models_file:
    models_file.write(json.dumps(models, indent=4))

#### income

In [11]:
# Prompt templates for the income task.
#
# `{description}` and `{complement}` are format placeholders. The doubled braces
# (`{{ }}`) are presumably there so that str.format emits literal JSON braces;
# the formatting call itself is not in this notebook section.
# Whitespace inside the templates (e.g. the four-space line after the first
# paragraph) is part of the prompt text -- do not strip it.

# Initial question: binary income choice for the respondent in `{description}`.
income = ("""You will be provided data corresponding to a survey respondent. The survey was conducted among US residents in 2018. Please answer the question based on the information provided by selecting from one of the two choices. The data provided is enough to reach an answer.
    
The respondent data is: \n{description}

The question is: What is this person's estimated yearly income?
The choices are: ['Below $50,000', 'Above $50,000']

Only respond with JSON output. Do not include any additional words in your answer. Format your response as a JSON file with the keys as provided below:
{{"answer": ""}}
"""
)

# Follow-up: asks for revised age/education such that the answer would have been
# `{complement}` (presumably the option the model did not pick -- confirm
# against the caller).
income_followup = ("""Now revise the original respondent data such that your answer to the question on the revised respondent data would have been {complement}.

Here are the valid options and data types.

age: 17-96 inclusive (integer)
education: ['N/A - no schooling completed', 'Nursery school / preschool', 'Kindergarten', '1st grade only', '2nd grade', '3rd grade', '4th grade', '5th grade', '6th grade', '7th grade', '8th grade', '9th grade', '10th grade', '11th grade', '12th grade, no diploma', 'Regular high school diploma', 'GED or alternative credential', 'Some college, less than 1 year', 'Some college, 1 or more years, no degree', "Associate's degree", "Bachelor's degree", "Master's degree", "Professional degree beyond a bachelor's degree", 'Doctorate degree'] (ordinal)

Share the revised data.

Only respond with JSON output. Do not include any additional words in your answer. Format your response as a JSON file with the keys as provided below:
{{"age": "", "education": ""}}
""")

# Follow-up, minimal variant: same request, but additionally asks for the
# smallest change and describes how minimality is scored (Gower's distance).
# NOTE: unlike the other follow-up templates in this notebook, this string does
# not end with a trailing newline.
income_followup_minimal = ("""Now make a minimal revision to the original respondent data such that your answer to the question on the revised respondent data would have been {complement}. 

Your revision should be minimal in the sense it makes the smallest change necessary to change your decision, i.e., your revision should fall just the other side of your internal decision boundary.

Here are the valid options and data types.

age: 17-96 inclusive (integer)
education: ['N/A - no schooling completed', 'Nursery school / preschool', 'Kindergarten', '1st grade only', '2nd grade', '3rd grade', '4th grade', '5th grade', '6th grade', '7th grade', '8th grade', '9th grade', '10th grade', '11th grade', '12th grade, no diploma', 'Regular high school diploma', 'GED or alternative credential', 'Some college, less than 1 year', 'Some college, 1 or more years, no degree', "Associate's degree", "Bachelor's degree", "Master's degree", "Professional degree beyond a bachelor's degree", 'Doctorate degree'] (ordinal)

Minimality is computed with Gower’s Distance across all variables. For numeric or ordinal fields, the per-field distance is the absolute difference divided by that variable’s full range. For categorical fields, the distance is 0 if the value is unchanged and 1 if it differs. The overall Gower score is the average of these per-field distances.

Share the revised data.

Only respond with JSON output. Do not include any additional words in your answer. Format your response as a JSON file with the keys as provided below:
{{"age": "", "education": ""}}""")

#### house_prices

In [12]:
# Prompt templates for the house-price task.
#
# `{description}` and `{complement}` are format placeholders. The doubled braces
# (`{{ }}`) are presumably there so that str.format emits literal JSON braces;
# the formatting call itself is not in this notebook section.
# Whitespace inside the templates (e.g. the four-space line after the first
# paragraph) is part of the prompt text -- do not strip it.

# Initial question: binary price choice for the house in `{description}`.
house_prices = ("""You will be provided with data about a house. The data was collected in 2015 from across the United States. Please answer the question based on the information provided by selecting from one of the two choices. The data provided is enough to reach an answer.
    
The house data is: \n{description}

The question is: What is the price of this house?
The choices are: ['Below $1,500,000', 'Above $1,500,000']

Only respond with JSON output. Do not include any additional words in your answer. Format your response as a JSON file with the keys as provided below:
{{"answer": ""}}
"""
)

# Follow-up: asks for revised area/bedrooms/bathrooms/floors such that the
# answer would have been `{complement}` (presumably the option the model did
# not pick -- confirm against the caller).
house_prices_followup = ("""Now revise the original house data such that your answer to the question on the revised house data would have been {complement}.

Here are the valid options and data types.

area: ['500', '1000', '1500', '2000', '2500', '3000', '3500', '4000', '4500', '5000', '5500', '6000', '6500', '7000', '7500', '8000', '8500', '9000', '9500', '10000'] (ordinal)
bedrooms: 1-5 inclusive (integer)
bathrooms: 1-4 inclusive (integer)
floors: 1-4 inclusive (integer)

Share the revised house data.

Only respond with JSON output. Do not include any additional words in your answer. Format your response as a JSON file with the keys as provided below:
{{"area": "", "bedrooms": "", "bathrooms": "", "floors": ""}}
""")

# Follow-up, minimal variant: same request, but additionally asks for the
# smallest change and describes how minimality is scored (Gower's distance).
house_prices_followup_minimal = ("""Now make a minimal revision to the original house data such that your answer to the question on the revised house data would have been {complement}. 

Your revision should be minimal in the sense it makes the smallest change necessary to change your decision, i.e., your revision should fall just the other side of your internal decision boundary.

Here are the valid options and data types.

area: ['500', '1000', '1500', '2000', '2500', '3000', '3500', '4000', '4500', '5000', '5500', '6000', '6500', '7000', '7500', '8000', '8500', '9000', '9500', '10000'] (ordinal)
bedrooms: 1-5 inclusive (integer)
bathrooms: 1-4 inclusive (integer)
floors: 1-4 inclusive (integer)

Minimality is computed with Gower’s Distance across all variables. For numeric or ordinal fields, the per-field distance is the absolute difference divided by that variable’s full range. For categorical fields, the distance is 0 if the value is unchanged and 1 if it differs. The overall Gower score is the average of these per-field distances.

Share the revised house data.

Only respond with JSON output. Do not include any additional words in your answer. Format your response as a JSON file with the keys as provided below:
{{"area": "", "bedrooms": "", "bathrooms": "", "floors": ""}}
""")

#### heart disease

In [13]:
# Prompt templates for the heart-disease task.
#
# `{description}` and `{complement}` are format placeholders. The doubled braces
# (`{{ }}`) are presumably there so that str.format emits literal JSON braces;
# the formatting call itself is not in this notebook section.
# Whitespace inside the templates (e.g. the four-space line after the first
# paragraph) is part of the prompt text -- do not strip it.

# Initial question: binary heart-disease choice for the patient in `{description}`.
# NOTE(review): the "collected in 2015 from across the United States" sentence
# is word-for-word the same as in the house-price prompt -- confirm it is
# intended for this dataset.
heart_disease = ("""You will be provided with clinical data for an individual. The data was collected in 2015 from across the United States. Please answer the question based on the information provided by selecting from one of the two choices. The data provided is enough to reach an answer.
    
The patient data is: \n{description}

The question is: Does this patient have heart disease?
The choices are: ['No heart disease', 'Heart disease']

Only respond with JSON output. Do not include any additional words in your answer. Format your response as a JSON file with the keys as provided below:
{{"answer": ""}}
"""
)

# Follow-up: asks for revised age/sex/systolic_bp/total_cholesterol such that
# the answer would have been `{complement}` (presumably the option the model
# did not pick -- confirm against the caller).
heart_disease_followup = ("""Now revise the original patient data such that your answer to the question on the revised patient data would have been {complement}.

Here are the valid options and data types.

age: ['30', '35', '40', '45', '50', '55', '60', '65', '70', '75', '80'] (ordinal)
sex: ['Female', 'Male'] (categorical)
systolic_bp: ['110', '120', '130', '140', '150', '160', '170', '180'] (ordinal)
total_cholesterol: ['150', '165', '180', '195', '210', '225', '240', '255', '270', '285', '300'] (ordinal)

Share the revised patient data.

Only respond with JSON output. Do not include any additional words in your answer. Format your response as a JSON file with the keys as provided below:
{{"age": "", "sex": "", "systolic_bp": "", "total_cholesterol": ""}}
""")

# Follow-up, minimal variant: same request, but additionally asks for the
# smallest change and describes how minimality is scored (Gower's distance).
heart_disease_followup_minimal = ("""Now make a minimal revision to the original patient data such that your answer to the question on the revised patient data would have been {complement}. 

Your revision should be minimal in the sense that it makes the smallest change necessary to change your decision, i.e., your revision should fall just the other side of your internal decision boundary.

Here are the valid options and data types.

age: ['30', '35', '40', '45', '50', '55', '60', '65', '70', '75', '80'] (ordinal)
sex: ['Female', 'Male'] (categorical)
systolic_bp: ['110', '120', '130', '140', '150', '160', '170', '180'] (ordinal)
total_cholesterol: ['150', '165', '180', '195', '210', '225', '240', '255', '270', '285', '300'] (ordinal)

Minimality is computed with Gower’s Distance across all variables. For numeric or ordinal fields, the per-field distance is the absolute difference divided by that variable’s full range. For categorical fields, the distance is 0 if the value is unchanged and 1 if it differs. The overall Gower score is the average of these per-field distances.

Share the revised patient data.

Only respond with JSON output. Do not include any additional words in your answer. Format your response as a JSON file with the keys as provided below:
{{"age": "", "sex": "", "systolic_bp": "", "total_cholesterol": ""}}
""")


#### save to datasets.json

In [14]:
def _dataset_config(filepath, options, standard_prompt, followup_prompt, name="", split=""):
    """Build one dataset entry for the ``datasets`` registry.

    Args:
        filepath: Value stored under ``"filepath"``.
        options: The two answer choices, in the order they appear in the prompt.
            Copied so that entries never share a list object.
        standard_prompt: Template stored under ``"standard_prompt_template"``.
        followup_prompt: Template stored under ``"followup_template"``.
        name: Value stored under ``"name"`` (empty for most entries).
        split: Value stored under ``"split"`` (empty for most entries).

    Returns:
        dict with keys ``local, filepath, name, split, options, answer_key,
        input_key, standard_prompt_template, followup_template``. ``local`` is
        always ``True``, ``answer_key`` is ``"answer"`` and ``input_key`` is
        ``"description"``.
    """
    return {
        "local": True,
        "filepath": filepath,
        "name": name,
        "split": split,
        "options": list(options),
        "answer_key": "answer",
        "input_key": "description",
        "standard_prompt_template": standard_prompt,
        "followup_template": followup_prompt,
    }


# Answer choices per task; these match the choices listed in the prompt templates.
_INCOME_OPTIONS = ['Below $50,000', 'Above $50,000']
_HOUSE_PRICE_OPTIONS = ['Below $1,500,000', 'Above $1,500,000']
_HEART_DISEASE_OPTIONS = ['No heart disease', 'Heart disease']

# Each task appears twice: once with the plain follow-up prompt and once
# ("<task>_minimal") with the minimal-revision follow-up prompt.
datasets = {
    "income": _dataset_config(
        "data/income", _INCOME_OPTIONS, income, income_followup,
        name="ACSIncome", split="train",
    ),
    "income_minimal": _dataset_config(
        "data/income", _INCOME_OPTIONS, income, income_followup_minimal,
        name="ACSIncome", split="train",
    ),
    "house_prices": _dataset_config(
        "data/house_prices", _HOUSE_PRICE_OPTIONS, house_prices, house_prices_followup,
    ),
    "house_prices_minimal": _dataset_config(
        "data/house_prices", _HOUSE_PRICE_OPTIONS, house_prices, house_prices_followup_minimal,
    ),
    "heart_disease": _dataset_config(
        "data/heart_disease", _HEART_DISEASE_OPTIONS, heart_disease, heart_disease_followup,
    ),
    "heart_disease_minimal": _dataset_config(
        "data/heart_disease", _HEART_DISEASE_OPTIONS, heart_disease, heart_disease_followup_minimal,
    ),
}

# Write the dataset registry (including full prompt templates) to datasets.json
# in the current working directory.
with open('datasets.json', 'w') as datasets_file:
    datasets_file.write(json.dumps(datasets, indent=4))