In [106]:
from datasets import load_dataset
import json

# Load the 'LLM-Digital-Twin/Twin-2K-500' dataset, requesting 'full_persona'.
ds = load_dataset('LLM-Digital-Twin/Twin-2K-500', 'full_persona')
personas_split = ds['data']
N_PERSON = len(personas_split)

# Every record stores its persona as a JSON string under 'persona_json';
# decode all of them once so later cells can work on plain Python objects.
data = []
for person_idx in range(N_PERSON):
    data.append(json.loads(personas_split[person_idx]['persona_json']))

# Number of top-level entries (blocks) in the first persona.
N_BLOCKS = len(data[0])

Inspect the survey blocks of the first persona (block names and question counts)

In [109]:
# Summarise the first persona: one line per block (name and number of
# questions), followed by the total number of questions over all blocks.
first_person = data[0]
per_block_counts = []
for idx, block in enumerate(first_person):
    block_name = block['BlockName']
    n_questions = len(block['Questions'])
    print(f"Block {idx}: {block_name} | number of questions: {n_questions}")
    per_block_counts.append(n_questions)
cnt_questions = sum(per_block_counts)
print(f"Total number of questions: {cnt_questions}")

Block 0: Demographics | number of questions: 14
Block 1: Personality  | number of questions: 8
Block 2: Cognitive tests  | number of questions: 48
Block 3: Economic preferences - intro  | number of questions: 1
Block 4: Economic preferences  | number of questions: 11
Block 5: Personality  | number of questions: 8
Block 6: Cognitive tests  | number of questions: 21
Block 7: Forward Flow | number of questions: 1
Block 8: Economic preferences - intro  | number of questions: 1
Block 9: Economic preferences  | number of questions: 23
Block 10: Personality  | number of questions: 33
Block 11: Cognitive tests  | number of questions: 1
Block 12: Economic preferences  | number of questions: 2
Block 13: False consensus  | number of questions: 1
Block 14: Base-rate 70 engineers | number of questions: 1
Block 15: Disease-loss | number of questions: 1
Block 16: Linda-conjunction | number of questions: 1
Block 17: Outcome bias - success  | number of questions: 1
Block 18: Anchoring - African countri

In [110]:
# Catalogue of the first persona's questions that carry an 'Options' entry.
# Keys have the form BLOCK<block index><QuestionID>_W1; each value records the
# question text, its type and its list of options.
question_dict = {}

for block_id, block in enumerate(first_person):
    questions = block['Questions']
    for question in questions:
        qid = question['QuestionID']
        qtext = question['QuestionText']
        qtype = question['QuestionType']
        qoptions = question.get('Options')
        if qoptions is not None:
            # Only questions that define options are kept in the catalogue.
            question_key = "BLOCK" + str(block_id) + qid + "_W1"
            question_dict[question_key] = dict(text=qtext, type=qtype, options=qoptions)

print(f"Total number of questions with options: {len(question_dict)}")

Total number of questions with options: 160


Inspect the questions that have answer options (optional)

In [None]:
# Print text, type and options of every catalogued question, with a separator
# line after each one.
for question in question_dict.values():
    for label, field in (("Text: ", 'text'), ("Type: ", 'type'), ("Options: ", 'options')):
        print(label, question[field])
    print("=====")

Save the question keys (`qkeys`) and question texts (`qstrings`) to JSON

In [112]:
# Ordered list of question keys, in the insertion order of question_dict.
qkeys = [*question_dict]
assert len(set(qkeys)) == len(qkeys)
with open('twin_qkeys.json', 'w') as f:
    f.write(json.dumps(qkeys, indent=4))

# Mapping from question key to the question text only.
qstrings = {key: entry['text'] for key, entry in question_dict.items()}
with open('twin_qstrings.json', 'w') as f:
    f.write(json.dumps(qstrings, indent=4))

Build one prompt per answer option (`qwoptions`) and the option-number-to-text map (`options_map`)

In [113]:
def question_formatter(qtext, options):
    """Render a multiple-choice prompt once per option, each ending in that option's letter.

    The prompt consists of the question text, one "<letter>. <option>" line per
    option (each option stripped of surrounding whitespace) and a final
    "Answer: " line. Letters are consecutive characters starting at 'A', so a
    question with more than 26 options continues into the characters after 'Z'.

    Args:
        qtext: The question text.
        options: Sequence of option strings.

    Returns:
        A list with one string per option, in option order: the shared prompt
        followed by the option's letter, with leading and trailing whitespace
        of the whole string removed. Empty when ``options`` is empty.
    """
    letters = [chr(ord('A') + position) for position in range(len(options))]
    option_lines = "".join(
        letter + ". " + option.strip() + "\n"
        for letter, option in zip(letters, options)
    )
    prompt = f"{qtext}\n" + option_lines + "\nAnswer: "
    return [(prompt + letter).strip() for letter in letters]

# qwoptions: "<question key>_option_<n>" -> prompt ending in the n-th option's letter.
# options_map: question key -> {1.0: first option text, 2.0: second option text, ...}.
qwoptions = {}
options_map = {}
for qkey, question in question_dict.items():
    qtext = question['text']
    options = question['options']
    qformatted = question_formatter(qtext, options)
    qwoptions.update(
        (f"{qkey}_option_{position}", prompt)
        for position, prompt in enumerate(qformatted, start=1)
    )
    options_map[qkey] = {
        float(position): option_text
        for position, option_text in enumerate(options, start=1)
    }

with open('twin_qwoptions.json', 'w') as f:
    f.write(json.dumps(qwoptions, indent=4))
# The float keys of options_map are written by json as strings such as "1.0".
with open('twin_options_map.json', 'w') as f:
    f.write(json.dumps(options_map, indent=4))

process data into tabular form

In [120]:
import pandas as pd

# Build one row per persona holding the 1-based position of the selected option
# for every catalogued question (pd.NA where the persona has no such question),
# plus a constant WEIGHT_W1 column, and save the table as CSV.
all_rows = []
row_ids = []
# Number of answers stored as a list whose length is not exactly one; only the
# first selection of such an answer is kept.
multiple_choice_cnt = 0

# `data` already holds every decoded persona, so the JSON is not parsed again.
for i, person in enumerate(data):
    answers_for_person = {}
    for block_id, block in enumerate(person):
        for q in block.get("Questions", []):
            qid = f'BLOCK{block_id}' + q.get("QuestionID") + "_W1"
            if qid not in question_dict:
                # Not one of the questions with options found in the first persona.
                continue
            options = question_dict[qid]["options"]
            person_answer_ind = q['Answers'].get('SelectedByPosition')
            person_answer_text = q['Answers'].get('SelectedText')
            # Fail loudly instead of dropping into a debugger, so the notebook
            # can run unattended (Restart Kernel -> Run All).
            if person_answer_ind is None or person_answer_text is None:
                raise ValueError(
                    f"Person {i}, question {qid}: 'SelectedByPosition' or "
                    f"'SelectedText' is missing in answers {q['Answers']!r}"
                )
            if isinstance(person_answer_ind, list):
                if len(person_answer_ind) != 1:
                    multiple_choice_cnt += 1
                person_answer_ind = person_answer_ind[0]
                person_answer_text = person_answer_text[0]
            # The position must be a valid 1-based index whose option text
            # matches the recorded answer text.
            if not (
                isinstance(person_answer_ind, int)
                and 1 <= person_answer_ind <= len(options)
                and options[person_answer_ind - 1] == person_answer_text
            ):
                raise ValueError(
                    f"Person {i}, question {qid}: selected position "
                    f"{person_answer_ind!r} with text {person_answer_text!r} "
                    f"does not match the catalogued options {options!r}"
                )
            answers_for_person[qid] = person_answer_ind

    row = {qid: answers_for_person.get(qid, pd.NA) for qid in qkeys}
    row['WEIGHT_W1'] = 1.0  # every persona gets the same weight
    all_rows.append(row)
    row_ids.append(10000001 + i)  # row ids count up from 10000001

df = pd.DataFrame(all_rows, index=row_ids)
df.index.name = "QKEY"

# Optional: save
df.to_csv("twin_responses.csv")

print(df.shape)

(2058, 161)


df inspection

In [121]:
# Show the response table; for a frame this large the rendered output is a
# truncated preview (elided rows and columns appear as "...").
df

Unnamed: 0_level_0,BLOCK0QID11_W1,BLOCK0QID12_W1,BLOCK0QID13_W1,BLOCK0QID14_W1,BLOCK0QID15_W1,BLOCK0QID16_W1,BLOCK0QID17_W1,BLOCK0QID18_W1,BLOCK0QID19_W1,BLOCK0QID20_W1,...,BLOCK30QID9_32_W1,BLOCK30QID9_33_W1,BLOCK30QID9_34_W1,BLOCK30QID9_35_W1,BLOCK30QID9_36_W1,BLOCK30QID9_37_W1,BLOCK30QID9_38_W1,BLOCK30QID9_39_W1,BLOCK30QID9_40_W1,WEIGHT_W1
QKEY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10000001,3,1,1,3,1,1,6,1,3,1,...,2,1,2,2,2,2,1,2,1,1.0
10000002,2,2,3,5,1,1,1,12,5,2,...,2,2,2,1,1,1,1,2,2,1.0
10000003,3,2,3,6,1,1,1,1,3,1,...,2,1,2,1,1,2,1,1,1,1.0
10000004,4,2,2,4,1,1,4,11,6,3,...,2,2,1,2,1,2,2,2,1,1.0
10000005,2,2,3,2,2,1,6,11,5,1,...,1,2,2,2,2,2,1,2,2,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10002054,2,1,1,5,1,1,6,1,5,2,...,2,2,2,1,2,2,1,2,2,1.0
10002055,3,1,2,4,5,1,1,12,4,2,...,1,2,2,2,2,1,1,2,2,1.0
10002056,4,1,1,3,1,1,6,2,4,1,...,2,2,2,2,1,2,2,1,2,1.0
10002057,2,1,3,5,1,1,1,1,5,2,...,1,2,2,1,2,1,1,2,1,1.0


What accuracy does uniform random guessing achieve?

In [123]:
# Chance-level baseline: a uniform random guess on a question with k options is
# right with probability 1/k. Questions whose key starts with "BLOCK0QID" are
# left out of the average.
random_accs = [
    1.0 / len(question_dict[qkey]['options'])
    for qkey in question_dict
    if not qkey.startswith("BLOCK0QID")
]

print("Average random accuracy (excluding BLOCK0 questions): ", sum(random_accs) / len(random_accs))

Average random accuracy (excluding BLOCK0 questions):  0.35050945059506705


LLM inference and training sampler