In [1]:
import urllib.request, json 
import string, re
import random

In [107]:
# Load the original Compositional Celebrities (CC) dataset.
# The file is a JSON object; its "data" entry holds the list of datapoints and
# its "canary" entry is carried along unchanged when derived files are written.
data_path = "data/compositional_celebrities_original.json"
# Read as UTF-8 explicitly: the dataset has categories for Japanese, Russian and
# Urdu names, and the platform default encoding is not UTF-8 everywhere.
with open(data_path, 'r', encoding='utf-8') as f:
    data_with_canary = json.load(f)
data = data_with_canary["data"]
len(data)

8693

In [106]:
# Distinct categories in order of first appearance in `data`
# (dict keys keep insertion order, so this is an order-preserving dedupe).
categories = list(dict.fromkeys(entry['category'] for entry in data))
print(categories)
print(len(categories))

['birthplace_capital', 'birthplace_rounded_lat', 'birthplace_rounded_lng', 'birthplace_tld', 'birthplace_ccn3', 'birthplace_currency', 'birthplace_currency_short', 'birthplace_currency_symbol', 'birthplace_jpn_common_name', 'birthplace_spa_common_name', 'birthplace_rus_common_name', 'birthplace_est_common_name', 'birthplace_urd_common_name', 'birthplace_callingcode', 'birthyear_nobelLiterature', 'birthdate_uspresident', 'birthyear_masterchamp']
17


In [109]:
# Edit the phrasing of birthplace questions: "birthplace of" -> "birth country of".
# str.replace returns the string unchanged when the phrase is absent, so no
# membership check is needed. Note: this mutates the datapoints in `data` in place.
for dp in data:
  dp["Question"] = dp["Question"].replace("birthplace of", "birth country of")

# Save the edited dataset together with the original canary entry.
data_edited_path = "data/compositional_celebrities_edited.json"
# ensure_ascii=False writes non-ASCII characters verbatim, so the file must be
# opened with an explicit UTF-8 encoding rather than the platform default.
with open(data_edited_path, "w", encoding="utf-8") as json_file:
    json.dump({"data": data, "canary": data_with_canary["canary"]}, json_file, ensure_ascii=False, indent=4)

In [110]:
# Phrase that links the intermediate answer (A1) to the second hop in the
# chain-of-thought prompts, keyed by dataset category.
category_to_prompt_phrase = dict([
    ("birthplace_rounded_lng", "(rounded down) longitude of"),
    ("birthplace_capital", "capital of"),
    ("birthyear_nobelLiterature", "winner of the Nobel Prize in Literature in"),
    ("birthplace_currency", "currency of"),
    ("birthplace_ccn3", "ISO 3166-1 numeric code of"),
    ("birthdate_uspresident", "President of the United States on"),
    ("birthplace_urd_common_name", "name in Urdu of"),
    ("birthplace_rus_common_name", "name in Russian of"),
    ("birthplace_callingcode", "calling code of"),
    ("birthyear_masterchamp", "champion of the Masters Tournament in"),
    ("birthplace_currency_short", "currency abbreviation of"),
    ("birthplace_spa_common_name", "name in Spanish of"),
    ("birthplace_tld", "top-level domain of"),
    ("birthplace_rounded_lat", "(rounded down) latitude of"),
    ("birthplace_jpn_common_name", "name in Japanese of"),
    ("birthplace_currency_symbol", "currency symbol of"),
    ("birthplace_est_common_name", "name in Estonian of"),
])

# Short description of the expected answer type, keyed by dataset category;
# inserted into the "So the final answer (...)" line of the chain-of-thought prompts.
category_to_type = dict([
    ("birthplace_rounded_lng", "a rounded down longitude"),
    ("birthplace_capital", "a capital city"),
    ("birthyear_nobelLiterature", "a nobel prize winner"),
    ("birthplace_currency", "a currency"),
    ("birthplace_ccn3", "an ISO 3166-1 numeric code"),
    ("birthdate_uspresident", "a US president"),
    ("birthplace_urd_common_name", "an Urdu name"),
    ("birthplace_rus_common_name", "a Russian name"),
    ("birthplace_callingcode", "a calling code"),
    ("birthyear_masterchamp", "a Masters champion"),
    ("birthplace_currency_short", "a currency abbreviation"),
    ("birthplace_spa_common_name", "a Spanish name"),
    ("birthplace_tld", "a top-level domain"),
    ("birthplace_rounded_lat", "a rounded down latitude"),
    ("birthplace_jpn_common_name", "a Japanese name"),
    ("birthplace_currency_symbol", "a currency symbol"),
    ("birthplace_est_common_name", "an Estonian name"),
])

In [92]:
# Hold out few-shot examples for the prompts: for every category, draw `shots`
# datapoints at random and take them out of `data`.
# NOTE: `data` is modified in place, so re-running this cell without reloading
# `data` removes a further `shots` datapoints per category.
shots = 2
samples_by_category = {}
random.seed(481)  # fixed seed so the same examples are drawn on every fresh run
for category in categories:
    pool = list(filter(lambda dp: dp['category'] == category, data))
    picked = random.sample(pool, shots)
    samples_by_category[category] = picked
    for held_out in picked:
        data.remove(held_out)
print(len(data))

8659


In [111]:
# Draw a reproducible random subset of 1000 datapoints from `data` and save it
# together with the original canary entry.
random.seed(481)
data_subset = random.sample(data, 1000)
data_subset_path = "data/compositional_celebrities_subset.json"
# ensure_ascii=False writes non-ASCII characters verbatim, so the file must be
# opened with an explicit UTF-8 encoding rather than the platform default.
with open(data_subset_path, "w", encoding="utf-8") as json_file:
    json.dump({"data": data_subset, "canary": data_with_canary["canary"]}, json_file, ensure_ascii=False, indent=4)
# Display the subset size as the cell output.
len(data_subset)

1000


In [93]:
#generate a chain of thought prompt for each category
def make_chain_of_thought_prompts(samples=None, prompt_phrases=None, answer_types=None):
  """Build one few-shot chain-of-thought prompt string per category.

  Each example has the form::

      Question: <Question>
      Answer: The <birthplace (country)|year of birth|date of birth> of <person> is <A1>. The <phrase> <A1> <is|was> <A2>.
      So the final answer (<type>) is: <Answer>.

  Examples of the same category are separated by a blank line.

  Args:
    samples: mapping category -> list of sample dicts. Each sample is read via
      the keys 'Question', 'person', 'A1', 'A2' and 'Answer'; the last three
      are indexed with [0]. Defaults to the module-level `samples_by_category`.
    prompt_phrases: mapping category -> linking phrase. Defaults to the
      module-level `category_to_prompt_phrase`.
    answer_types: mapping category -> answer type description. Defaults to the
      module-level `category_to_type`.

  Returns:
    dict mapping each category in `samples` to its prompt string (an empty
    string for a category with no samples).
  """
  # Fall back to the notebook globals so existing zero-argument calls keep working.
  if samples is None:
    samples = samples_by_category
  if prompt_phrases is None:
    prompt_phrases = category_to_prompt_phrase
  if answer_types is None:
    answer_types = category_to_type

  prompts_by_category = {}
  for category, category_samples in samples.items():
    # The wording of the first reasoning step depends on the category name.
    is_birthplace = 'birthplace' in category
    if is_birthplace:
      first_hop = 'The birthplace (country) of '
    elif 'year' in category:
      first_hop = 'The year of birth of '
    else:
      first_hop = 'The date of birth of '
    # Birthplace facts are stated in the present tense, year/date facts in the past.
    verb = ' is ' if is_birthplace else ' was '

    examples = []
    for sample in category_samples:
      intermediate = str(sample['A1'][0])
      examples.append(
        'Question: ' + sample['Question']
        + '\nAnswer: ' + first_hop + sample['person'] + ' is ' + intermediate
        + '. The ' + prompt_phrases[category] + ' ' + intermediate + verb
        + str(sample['A2'][0])
        + '.\nSo the final answer (' + answer_types[category] + ') is: '
        + str(sample['Answer'][0]) + '.'
      )
    prompts_by_category[category] = '\n\n'.join(examples)
  return prompts_by_category

# Build the chain-of-thought prompts once; the resulting dict is saved with the other prompts below.
chain_of_thought_prompt_dict = make_chain_of_thought_prompts()

In [94]:
def make_self_ask_prompts(samples=None):
  """Build one few-shot self-ask prompt string per category.

  Each example has the form::

      Question: <Question>
      Are follow up questions needed here: Yes.
      Follow up: <Q1>
      Intermediate answer: <A1>.
      Follow up: <Q2>
      Intermediate answer: <A2>.
      So the final answer is: <Answer>.

  Examples of the same category are separated by a blank line.

  Args:
    samples: mapping category -> list of sample dicts. Each sample is read via
      the keys 'Question', 'Q1', 'A1', 'Q2', 'A2' and 'Answer'; 'A1', 'A2' and
      'Answer' are indexed with [0]. Defaults to the module-level
      `samples_by_category`.

  Returns:
    dict mapping each category in `samples` to its prompt string (an empty
    string for a category with no samples).
  """
  # Fall back to the notebook global so existing zero-argument calls keep working.
  if samples is None:
    samples = samples_by_category

  prompts_by_category = {}
  for category, category_samples in samples.items():
    examples = [
      'Question: ' + sample['Question']
      + '\nAre follow up questions needed here: Yes.\nFollow up: '
      + sample['Q1'] + '\nIntermediate answer: '
      + str(sample['A1'][0]) + '.\nFollow up: '
      + sample['Q2'] + '\nIntermediate answer: '
      + str(sample['A2'][0]) + '.\nSo the final answer is: '
      + str(sample['Answer'][0]) + '.'
      for sample in category_samples
    ]
    prompts_by_category[category] = '\n\n'.join(examples)
  return prompts_by_category

# Build the self-ask prompts once; the resulting dict is saved with the other prompts below.
self_ask_prompt_dict = make_self_ask_prompts()

In [95]:
def make_direct_answer_prompts(samples=None):
  """Build one few-shot direct-answer prompt string per category.

  Each example has the form::

      Question: <Question>
      Answer: <Answer>

  with no trailing period after the answer. Examples of the same category are
  separated by a blank line.

  Args:
    samples: mapping category -> list of sample dicts. Each sample is read via
      the keys 'Question' and 'Answer'; 'Answer' is indexed with [0]. Defaults
      to the module-level `samples_by_category`.

  Returns:
    dict mapping each category in `samples` to its prompt string (an empty
    string for a category with no samples).
  """
  # Fall back to the notebook global so existing zero-argument calls keep working.
  if samples is None:
    samples = samples_by_category

  prompts_by_category = {}
  for category, category_samples in samples.items():
    examples = [
      'Question: ' + sample['Question'] + '\nAnswer: ' + str(sample['Answer'][0])
      for sample in category_samples
    ]
    prompts_by_category[category] = '\n\n'.join(examples)
  return prompts_by_category

# Build the direct-answer prompts once; the resulting dict is saved with the other prompts below.
direct_answer_prompt_dict = make_direct_answer_prompts()

In [96]:
def make_subquestion_prompt(which_question, samples=None):
  """Build one few-shot prompt string per category for a single sub-question.

  Each example has the form::

      Question: <Q{which_question}>
      Answer: <A{which_question}>

  with no trailing period after the answer. Examples of the same category are
  separated by a blank line.

  Args:
    which_question: suffix selecting the sub-question; the sample keys read are
      f'Q{which_question}' and f'A{which_question}' (this notebook calls it
      with 1 and 2).
    samples: mapping category -> list of sample dicts; the selected answer
      entry is indexed with [0]. Defaults to the module-level
      `samples_by_category`.

  Returns:
    dict mapping each category in `samples` to its prompt string (an empty
    string for a category with no samples).
  """
  # Fall back to the notebook global so existing one-argument calls keep working.
  if samples is None:
    samples = samples_by_category

  question_key = f'Q{which_question}'
  answer_key = f'A{which_question}'

  prompts_by_category = {}
  for category, category_samples in samples.items():
    examples = [
      'Question: ' + sample[question_key] + '\nAnswer: ' + str(sample[answer_key][0])
      for sample in category_samples
    ]
    prompts_by_category[category] = '\n\n'.join(examples)
  return prompts_by_category

# One prompt dict per sub-question: built from the Q1/A1 and the Q2/A2 sample fields respectively.
subquestion_1_prompt_dict = make_subquestion_prompt(1)
subquestion_2_prompt_dict = make_subquestion_prompt(2)

In [98]:
# Combine the per-method prompt dictionaries into one main dictionary,
# keyed by the name of the prompt style; each value maps category -> prompt string.
all_prompts = {
    "chain_of_thought_prompt_dict": chain_of_thought_prompt_dict,
    "self_ask_prompt_dict": self_ask_prompt_dict,
    "direct_answer_prompt_dict": direct_answer_prompt_dict,
    "subquestion_1_prompt_dict": subquestion_1_prompt_dict,
    "subquestion_2_prompt_dict": subquestion_2_prompt_dict,
}


# Save the combined dictionary to a JSON file in the data folder.
# ensure_ascii=False writes non-ASCII characters verbatim, so the file must be
# opened with an explicit UTF-8 encoding rather than the platform default.
with open("data/prompts.json", "w", encoding="utf-8") as json_file:
    json.dump(all_prompts, json_file, ensure_ascii=False, indent=4)
