This notebook will
1. reorganize the original PRISM dataset into "prism_data_user.json" and "prism_data_dialog.json";
2. split the dataset into train/test and save the split ids in "prism_split_ids.json";
3. select 50 dialogues for win-rate evaluation and save them in "prism_selected_examples.json".

Please set the corresponding parameters below.

In [None]:
# Preprocessing parameters -- keep them in sync with training/evaluation.
# Character budget for one example (user utterance + one response).
max_text_length = 2300
# Not referenced by the cells visible here; presumably consumed by
# training/evaluation -- TODO confirm.
max_prompt_string_length = 1400
# Random seed for the train/test split.
seed = 123
# Folder holding the downloaded PRISM files (survey.jsonl, conversations.jsonl).
original_prism_data_path = 'your/downloaded/original/prism/data/path'
# NOTE(review): the save cells below write to the current working directory
# ("./prism_*.json"); run the notebook from ./data if the files should land there.

In [2]:
import os
import json
import numpy as np
# Seed NumPy's global RNG from the configured `seed` parameter instead of a
# hard-coded literal, so changing the parameter actually takes effect.
np.random.seed(seed=seed)

1. Reorganize

In [3]:
from pydantic import BaseModel
from typing import List, Optional, Dict

class Demographics(BaseModel):
    """Survey attributes kept for one PRISM participant."""
    # Copied from the survey record's "self_description".
    self_description: str
    # Keys of the survey's "order_stated_prefs" whose value is 1, 2 or 3.
    preference: List[str] = []
    age: str
    gender: str
    education: str
    # Copied from the survey record's "employment_status".
    employment: str
    # Copied from the survey record's "marital_status".
    marital: str
    english_proficiency: str

class UserInfo(BaseModel):
    """One PRISM participant: id, survey data and the ids of their dialogues."""
    user_id: str
    # Conversation ids of this user; filled while reading conversations.jsonl.
    dialog_ids: List[str] = []
    demographics: Demographics
    # Copied from the survey record's "system_string".
    system_string: str

class DataUser(BaseModel):
    """Container for all users, keyed by the PRISM user_id."""
    data: Dict[str, UserInfo] = {}

class Turn(BaseModel):
    """All utterances that belong to one turn of a dialogue."""
    # Value of the utterance's "turn" field; also used as the index into
    # DialogInfo.turns.
    turn_nb: int
    # Contents of utterances whose role is "user".
    user_utterance: List[str] = []
    # Contents of non-user utterances flagged "if_chosen".
    chosen_utterance: List[str] = []
    # Contents of the remaining (non-user, not chosen) utterances.
    rejected_utterance: List[str] = []

class DialogInfo(BaseModel):
    """One PRISM conversation regrouped by turn."""
    # The conversation's "conversation_id".
    dialog_id: str
    user_id: str
    # Pre-allocated with one None per turn and filled as utterances are read,
    # hence Optional: a slot stays None if no utterance refers to that turn.
    turns: List[Optional[Turn]] = []
    # The conversation's "conversation_turns".
    total_turn_nb: int
    open_feedback: str = ""

class DataDialog(BaseModel):
    """Container for all dialogues, keyed by conversation id."""
    data: Dict[str, DialogInfo] = {}

In [4]:
# Reorganize user-related data into `data_user` (user_id -> UserInfo).
# Users with num_completed_conversations == 0 are skipped.
data_user = DataUser()

survey_path = os.path.join(original_prism_data_path, "survey.jsonl")
# Read explicitly as UTF-8 so the result does not depend on the platform's
# default text encoding.
with open(survey_path, 'r', encoding="utf-8") as f:
    for line in f:
        # Tolerate blank lines (e.g. a trailing newline at the end of the file).
        if not line.strip():
            continue
        d = json.loads(line)
        if d["num_completed_conversations"] == 0:
            continue
        demographics = Demographics(
            self_description=d["self_description"],
            # Keep only the preferences the user ranked 1, 2 or 3.
            preference=[k for k, v in d["order_stated_prefs"].items() if v in (1, 2, 3)],
            age=d["age"],
            gender=d["gender"],
            education=d["education"],
            employment=d["employment_status"],
            marital=d["marital_status"],
            english_proficiency=d["english_proficiency"],
        )
        data_user.data[d["user_id"]] = UserInfo(
            user_id=d["user_id"],
            demographics=demographics,
            system_string=d["system_string"],
        )


In [5]:
# Reorganize dialogue-related data into `data_dialog` (conversation id ->
# DialogInfo) and register every dialogue on its user in `data_user`.
data_dialog = DataDialog()

conversations_path = os.path.join(original_prism_data_path, "conversations.jsonl")
# Read explicitly as UTF-8 so the result does not depend on the platform's
# default text encoding.
with open(conversations_path, 'r', encoding="utf-8") as f:
    for line in f:
        # Tolerate blank lines (e.g. a trailing newline at the end of the file).
        if not line.strip():
            continue
        d = json.loads(line)
        dialog_id = d["conversation_id"]
        # Raises KeyError if the user was skipped in the survey pass.
        data_user.data[d["user_id"]].dialog_ids.append(dialog_id)
        dialog = DialogInfo(
            dialog_id=dialog_id,
            user_id=d["user_id"],
            total_turn_nb=d["conversation_turns"],
            # One placeholder per turn; replaced by a Turn on first use below.
            turns=[None] * d["conversation_turns"],
            open_feedback=d["open_feedback"],
        )
        data_dialog.data[dialog_id] = dialog
        for utterance in d["conversation_history"]:
            turn_idx = utterance["turn"]
            # First utterance of a turn: create the Turn record.
            if dialog.turns[turn_idx] is None:
                dialog.turns[turn_idx] = Turn(turn_nb=turn_idx)
            turn = dialog.turns[turn_idx]
            # Route the content by role / chosen flag.
            if utterance["role"] == "user":
                turn.user_utterance.append(utterance["content"])
            elif utterance["if_chosen"]:
                turn.chosen_utterance.append(utterance["content"])
            else:
                turn.rejected_utterance.append(utterance["content"])

In [6]:
# Swap both pydantic containers for their plain nested-dict payloads
# (dialogues first, then users).
data_dialog, data_user = (
    container.dict()["data"] for container in (data_dialog, data_user)
)

In [7]:
# Drop dialogues with no qualified example, and users left with no dialogue.
def _turn_is_qualified(turn, max_length):
    """Return True when `turn` can be used as a preference example.

    A turn qualifies when it has at least one user, one chosen and one
    rejected utterance, and the first user utterance combined with the first
    chosen utterance -- and with every rejected utterance -- does not exceed
    `max_length` characters. A missing turn (None) never qualifies.
    """
    if turn is None:
        return False
    users = turn["user_utterance"]
    chosen = turn["chosen_utterance"]
    rejected = turn["rejected_utterance"]
    if not users or not chosen or not rejected:
        return False
    prompt_len = len(users[0])
    if prompt_len + len(chosen[0]) > max_length:
        return False
    return all(prompt_len + len(r) <= max_length for r in rejected)


# Snapshot the keys first: entries are deleted from data_dialog in the loop.
dialog_ids = list(data_dialog.keys())
for dialog_id in dialog_ids:
    dialog = data_dialog[dialog_id]
    unqualified_num = sum(
        not _turn_is_qualified(turn, max_text_length) for turn in dialog["turns"]
    )
    qualified_num = dialog["total_turn_nb"] - unqualified_num
    # Only delete when the whole dialogue is not qualified.
    if qualified_num == 0:
        user_id = dialog["user_id"]
        print("delete dialogue", dialog_id, "by", user_id)
        data_user[user_id]["dialog_ids"].remove(dialog_id)
        # A user whose last dialogue was removed is dropped as well.
        if data_user[user_id]["dialog_ids"] == []:
            print("delete user", user_id)
            del data_user[user_id]
        del data_dialog[dialog_id]
        


delete dialogue c2536 by user469
delete dialogue c4146 by user779


In [8]:
# Persist both reorganized tables as indented JSON (users first, then dialogues).
for out_path, payload in (
    ("./prism_data_user.json", data_user),
    ("./prism_data_dialog.json", data_dialog),
):
    with open(out_path, 'w') as f:
        json.dump(payload, f, indent=4)

2. Split train/test

In [9]:
# Split users: shuffle all user ids, then take the first 90% as candidate
# "seen" users and the remaining 10% as "unseen" users.
import numpy as np
# Use the configured `seed` parameter rather than a hard-coded literal.
np.random.seed(seed=seed)

user_ids = np.array(list(data_user.keys()))
np.random.shuffle(user_ids)
n_seen = int(len(user_ids) * 0.9)
seen_user_ids_init = user_ids[:n_seen]
unseen_user_ids_init = user_ids[n_seen:]

In [10]:
# Split each seen user's dialogues 90/10 into train/test; every dialogue of an
# unseen user goes to test.
# Plain lists are used as accumulators: growing an array with np.concatenate
# in a loop copies it every time, and starting from np.array([]) (float64)
# mixes a numeric dtype with the string ids.
train_dialog_ids = []
test_dialog_ids = []

seen_user_ids = []
unseen_user_ids = []

for user_id in seen_user_ids_init:
    to_choose_from = np.array(data_user[user_id]["dialog_ids"])
    np.random.shuffle(to_choose_from)
    n_train = int(len(to_choose_from) * 0.9)
    train_dialog_ids.extend(to_choose_from[:n_train].tolist())
    test_dialog_ids.extend(to_choose_from[n_train:].tolist())
    # move users with no dialog in train to unseen, because int(1*0.9)=0
    if len(to_choose_from) > 1:
        seen_user_ids.append(user_id)
    else:
        unseen_user_ids.append(user_id)

for user_id in unseen_user_ids_init:
    test_dialog_ids.extend(data_user[user_id]["dialog_ids"])
    unseen_user_ids.append(user_id)

In [11]:
# Report the size of each user group: seen first, then unseen.
for group in (seen_user_ids, unseen_user_ids):
    print(len(group))

1234
162


In [12]:
# Save the split as JSON and assign our own user ids: 0 = unseen, 1.. = seen.
split_ids = {
    "train_dialog_ids": list(train_dialog_ids),
    "test_dialog_ids": list(test_dialog_ids),
    "seen_user_ids": {uid: idx for idx, uid in enumerate(seen_user_ids, start=1)},
    "unseen_user_ids": dict.fromkeys(unseen_user_ids, 0),
}
with open("./prism_split_ids.json", 'w') as f:
    json.dump(split_ids, f, indent=4)

3. Select prompts: select 50 dialogues from the test split, one from each of 25 seen users and 25 unseen users.

In [13]:
# Group the test-split dialogues by their owner: {user_id: [dialog_id, ...]}.
test_dict = {}

for dialog_id in split_ids["test_dialog_ids"]:
    user_id = data_dialog[dialog_id]["user_id"]
    test_dict.setdefault(user_id, []).append(dialog_id)

In [14]:
# Select 25 seen and 25 unseen users at random.
import random

# Seed the stdlib RNG as well: np.random.seed does not affect the `random`
# module, so without this the selection differs on every run.
random.seed(seed)

seen_users_keys = list(split_ids["seen_user_ids"].keys())  # key = original user id in prism dataset
random.shuffle(seen_users_keys)
seen_25_user_ids = {k: split_ids["seen_user_ids"][k] for k in seen_users_keys[:25]}

unseen_users_keys = list(split_ids["unseen_user_ids"].keys())
random.shuffle(unseen_users_keys)
unseen_25_user_ids = {k: split_ids["unseen_user_ids"][k] for k in unseen_users_keys[:25]}

In [15]:
# Draw one random test dialogue per selected user (seen users first, then
# unseen, so the random draws happen in the same order).
seen_dialogs_from25 = [
    random.choice(test_dict[user_id]) for user_id in seen_25_user_ids
]
unseen_dialogs_from25 = [
    random.choice(test_dict[user_id]) for user_id in unseen_25_user_ids
]

In [16]:
class SelectedTurn(BaseModel):
    """One evaluation example taken from a single turn of a selected dialogue."""
    # Chat-formatted transcript accumulated so far, ending with this turn's
    # user utterance.
    history: str
    # First user utterance of the turn.
    user_utterance: str
    # First chosen utterance of the turn.
    chosen_utterance: str
    # First rejected utterance of the turn, or "" when the turn has none.
    rejected_utterance: str
    turn_nb: int

class Demographics(BaseModel):
    """Survey attributes of a user, rebuilt from the stored demographics dict."""
    # NOTE(review): this redeclares the Demographics model defined earlier in
    # this notebook with the same fields; the later definition shadows it.
    self_description: str
    preference: List[str] = []
    age: str
    gender: str
    education: str
    employment: str
    marital: str
    english_proficiency: str

class SelectedDialog(BaseModel):
    """A dialogue selected for evaluation, with its user's profile attached."""
    # Original PRISM user id.
    user_id: str
    # Id assigned in the split: 0 for unseen users, 1.. for seen users.
    our_id: int
    dialog_id: str
    turns: Dict[int, SelectedTurn] = {}  # turn_nb can have skips
    open_feedback: str
    demographics: Demographics
    system_string: str

class SelectedExamples(BaseModel):
    """Container for the selected dialogues, keyed by dialog_id."""
    data: Dict[str, SelectedDialog] = {}
    

In [17]:
# Build one SelectedDialog per chosen dialogue, with one SelectedTurn per
# usable turn and the running chat history up to that turn.
selected_examples = SelectedExamples()

dialog_ids = seen_dialogs_from25 + unseen_dialogs_from25
# Merge into a NEW dict. Aliasing seen_25_user_ids and calling update() on it
# would silently add the unseen users to seen_25_user_ids.
user_ids = {**seen_25_user_ids, **unseen_25_user_ids}
seen_dialog_id_set = set(seen_dialogs_from25)

total_turn_num = 0
seen_turn_num = 0

for dialog_id in dialog_ids:
    dialog = data_dialog[dialog_id]
    user_id = dialog["user_id"]
    user = data_user[user_id]
    selected_dialog = SelectedDialog(
        user_id=user_id,
        our_id=user_ids[user_id],
        dialog_id=dialog_id,
        turns={},
        open_feedback=dialog["open_feedback"],
        demographics=Demographics(**user["demographics"]),
        system_string=user["system_string"],
    )

    history = ""
    for turn in dialog["turns"]:
        # A turn slot can be None when no utterance referred to it.
        if turn is None:
            continue
        user_utterances = turn["user_utterance"]
        chosen_utterances = turn["chosen_utterance"]
        rejected_utterances = turn["rejected_utterance"]

        # Add the user utterance to the history. Guarded so that a turn
        # without a user utterance is skipped instead of raising IndexError.
        if user_utterances:
            history += f"<|start_header_id|>user<|end_header_id|>\n\n{user_utterances[0]}<|eot_id|>\n"

        # Prepare examples; skip empty or too long ones.
        # NOTE(review): the filtering cell treats length == max_text_length as
        # qualified (it rejects only `>`), while this check uses a strict `<`.
        if (
            user_utterances
            and chosen_utterances
            and len(user_utterances[0]) + len(chosen_utterances[0]) < max_text_length
        ):
            selected_dialog.turns[turn["turn_nb"]] = SelectedTurn(
                history=history,
                user_utterance=user_utterances[0],
                chosen_utterance=chosen_utterances[0],
                rejected_utterance=rejected_utterances[0] if rejected_utterances else "",
                turn_nb=turn["turn_nb"],
            )

            total_turn_num += 1
            if dialog_id in seen_dialog_id_set:
                seen_turn_num += 1

        # Add the first chosen utterance to the history for the next turn.
        if chosen_utterances:
            history += f"<|start_header_id|>assistant<|end_header_id|>\n\n{chosen_utterances[0]}<|eot_id|>\n"

    selected_examples.data[dialog_id] = selected_dialog

In [18]:
# Write the selected examples (dialog_id -> example record) as indented JSON.
with open("./prism_selected_examples.json", 'w') as f:
    examples_by_dialog = selected_examples.dict()["data"]
    json.dump(examples_by_dialog, f, indent=4)