In [1]:
## prepare data
import datasets
import json,os,random

In [2]:
# Prompt templates for question answering. Each contains a single `{question}`
# placeholder; the dataset cells draw one at random per sample.
# NOTE(review): several templates append "?" after the placeholder, so a question
# that already ends in "?" or "\n" yields prompts such as "...have?\n?" — confirm
# this is intended.
templates_for_qa = [
    "Question: {question}?\nAnswer:",
    "{question}?",
    "Answer the following question:\n\n{question}",
    "Answer this question:\n\n{question}?",
    "Please answer this question: {question}",
    "Answer the question...{question}?",
    "What is the answer to this question? {question}\n\n",
    "Can you tell me the answer to {question}?",
    "Next question: {question}\n\n",
    "Q: {question} A:",
    "{question}\nWhat is the answer?",
    "Write the answer: {question}",
    "{question}???",
]

# Instructions for summarization. They have no placeholder: the summarization
# cells use them verbatim as the user turn and store the text to summarize in the
# record's "background" field.
templates_for_sum = [
    "Write a short summary for the text\n\nSummary:",
    "Briefly summarize this article:\nSummary:", 
    "What is a shorter version of this:\n\nSummary:",
    "Write a brief summary in a sentence or less.", 
    "What is a very short summary of the above text?",
    "Summarize the aforementioned text in a single phrase.",
    "Can you generate a short summary of the above paragraph?",
    "Summarize the above articles\n\ntl;dr:",
]
# Single fact-checking template; the fm2 cell always uses index 0 and fills
# `{question}` with the claim.
template_for_fact_checking = [
    "Verify the following claims with \"True\" or \"False\":\n{question}",
]

In [3]:
# Accumulator shared by every dataset cell below. Each record is a dict with
# "id", "messages" and "task_type", plus "background" for tasks that carry a
# context passage. Re-running this cell empties it.
total_data = []

In [4]:
## commonsense_qa: multiple-choice questions, answered with the `answerKey` label
data = datasets.load_dataset("commonsense_qa")
train_split = data['train']
print(len(train_split))
for idx, sample in enumerate(train_split):
    # Question stem, a blank line, then one "<label>. <text>" line per option.
    choices = sample['choices']
    option_lines = "".join(
        f"{label}. {text}\n"
        for label, text in zip(choices['label'], choices['text'])
    )
    question = random.choice(templates_for_qa).format(
        question=sample['question'] + '\n\n' + option_lines
    )
    answer = sample['answerKey']

    messages = [
        dict(role="user", content=question),
        dict(role="assistant", content=answer),
    ]
    total_data.append(
        dict(id=f"commonsense_qa_{idx}", messages=messages, task_type="open_qa")
    )

9741


In [5]:
## web_questions: open QA, using the first listed answer as the target
data = datasets.load_dataset('web_questions')
train_split = data['train']
print(len(train_split))
for idx, sample in enumerate(train_split):
    prompt_template = random.choice(templates_for_qa)
    # A newline is appended to the raw question before it is templated.
    question = prompt_template.format(question=sample['question'] + '\n')
    answer = sample['answers'][0]

    messages = [
        dict(role="user", content=question),
        dict(role="assistant", content=answer),
    ]
    total_data.append(
        dict(id=f"web_questions_{idx}", messages=messages, task_type="open_qa")
    )
print(total_data[-1])

3778
{'id': 'web_questions_3777', 'messages': [{'role': 'user', 'content': 'what kind government does the us have?\n?'}, {'role': 'assistant', 'content': 'Presidential system'}], 'task_type': 'open_qa'}


In [6]:
## wiki_qa: rows whose label is 0 are dropped
data = datasets.load_dataset('wiki_qa')
train_split = data['train']
print(len(train_split))
for idx, sample in enumerate(train_split):
    # Skipped rows still consume an index, so the resulting ids are not contiguous.
    if sample['label'] == 0:
        continue

    prompt_template = random.choice(templates_for_qa)
    question = prompt_template.format(question=sample['question'] + '\n')
    answer = sample['answer']

    messages = [
        dict(role="user", content=question),
        dict(role="assistant", content=answer),
    ]
    total_data.append(
        dict(id=f"wiki_qa_{idx}", messages=messages, task_type="open_qa")
    )
print(total_data[-1])

20360
{'id': 'wiki_qa_20349', 'messages': [{'role': 'user', 'content': 'Answer this question:\n\nwhat is section eight housing\n?'}, {'role': 'assistant', 'content': 'It operates through several programs, the largest of which, the Housing Choice Voucher program, pays a large portion of the rents and utilities of about 2.1 million households.'}], 'task_type': 'open_qa'}


In [7]:
## yahoo_answers_qa: each question is paired with the row's `answer` field
data = datasets.load_dataset('yahoo_answers_qa')
train_split = data['train']
print(len(train_split))
print(train_split[0])
for idx, sample in enumerate(train_split):
    prompt_template = random.choice(templates_for_qa)
    question = prompt_template.format(question=sample['question'] + '\n')
    answer = sample['answer']

    messages = [
        dict(role="user", content=question),
        dict(role="assistant", content=answer),
    ]
    total_data.append(
        dict(id=f"yahoo_answers_qa_{idx}", messages=messages, task_type="open_qa")
    )
print(total_data[-1])

87362
{'id': '2020338', 'question': 'Why did the U.S Invade Iraq ?', 'answer': "A small group of politicians believed strongly that the fact that Saddam Hussien remained in power after the first Gulf War was a signal of weakness to the rest of the world, one that invited attacks and terrorism. Shortly after taking power with George Bush in 2000 and after the attack on 9/11, they were able to use the terrorist attacks to justify war with Iraq on this basis and exaggerated threats of the development of weapons of mass destruction. The military strength of the U.S. and the brutality of Saddam's regime led them to imagine that the military and political victory would be relatively easy.", 'nbestanswers': ["A small group of politicians believed strongly that the fact that Saddam Hussien remained in power after the first Gulf War was a signal of weakness to the rest of the world, one that invited attacks and terrorism. Shortly after taking power with George Bush in 2000 and after the attack 

In [8]:
## freebase_qa: the target is the first answer name of the first parse
data = datasets.load_dataset('freebase_qa')
train_split = data['train']
print(len(train_split))
print(train_split[0])
for idx, sample in enumerate(train_split):
    prompt_template = random.choice(templates_for_qa)
    question = prompt_template.format(question=sample['RawQuestion'] + '\n')
    # Parses -> Answers[0] -> AnswersName[0][0]: first name listed for the first answer.
    first_answer = sample["Parses"]['Answers'][0]
    answer = first_answer['AnswersName'][0][0]

    messages = [
        dict(role="user", content=question),
        dict(role="assistant", content=answer),
    ]
    total_data.append(
        dict(id=f"freebase_qa_{idx}", messages=messages, task_type="open_qa")
    )
print(total_data[-1])

20358
{'Question-ID': 'FreebaseQA-train-0', 'RawQuestion': "What was Pierce Brosnan's first outing as 007?", 'ProcessedQuestion': "what was pierce brosnan's first outing as 007", 'Parses': {'Parse-Id': ['FreebaseQA-train-0.P0', 'FreebaseQA-train-0.P1'], 'PotentialTopicEntityMention': ['007', 'pierce brosnan'], 'TopicEntityName': ['james bond', 'pierce brosnan'], 'TopicEntityMid': ['m.0clpml', 'm.018p4y'], 'InferentialChain': ['film.film_character.portrayed_in_films..film.performance.film', 'film.actor.film..film.performance.film'], 'Answers': [{'AnswersMid': ['m.01npcx'], 'AnswersName': [['goldeneye']]}, {'AnswersMid': ['m.01npcx'], 'AnswersName': [['goldeneye']]}]}}
{'id': 'freebase_qa_20357', 'messages': [{'role': 'user', 'content': 'Answer the question...Zydeco is a type of music from which country?\n?'}, {'role': 'assistant', 'content': 'united states'}], 'task_type': 'open_qa'}


In [9]:
## ms_marco v2.1: first 100,000 training queries, first listed answer as the target
data = datasets.load_dataset('ms_marco', "v2.1")
# Materialise the split as a list so that it can be sliced below.
data = list(data['train'])
for idx, sample in enumerate(data[:100_000]):
    # Any leading ")" characters are stripped from the query before templating.
    query = sample['query'].lstrip(")")
    question = random.choice(templates_for_qa).format(question=query + '\n')
    answer = sample["answers"][0]

    messages = [
        dict(role="user", content=question),
        dict(role="assistant", content=answer),
    ]
    total_data.append(
        dict(id=f"ms_marco_{idx}", messages=messages, task_type="open_qa")
    )
print(total_data[-1])

{'id': 'ms_marco_99999', 'messages': [{'role': 'user', 'content': 'Answer the following question:\n\nis carbonic acid soluble\n'}, {'role': 'assistant', 'content': 'Yes'}], 'task_type': 'open_qa'}


In [10]:
## coqa: multi-turn conversational QA; the story is stored as background
data = datasets.load_dataset("coqa")
print(len(data['train']))
print(data['train'][0])
for idx, sample in enumerate(data['train']):
    turn_questions = sample['questions']
    turn_answers = sample['answers']['input_text']
    assert len(turn_answers) == len(turn_questions)

    messages = []
    # The turn counter must not reuse the name `idx`: doing so overwrites the
    # sample index, and the record id below would then be the last turn number
    # (producing wrong, duplicated ids across conversations).
    for turn_idx, (q, a) in enumerate(zip(turn_questions, turn_answers)):
        question = q + '\n'
        # Only the opening turn is wrapped in a prompt template; follow-up
        # questions are passed through as-is.
        if turn_idx == 0:
            question = random.choice(templates_for_qa).format_map(dict(question=question))
        answer = a

        messages.append({"role":"user","content":question})
        messages.append({"role":"assistant","content":answer})

    total_data.append(
        {
            "id":f"coqa_{idx}",
            "messages":messages,
            "task_type":"close_qa",
            "background":sample['story'],
        }
    )
print(total_data[-1])

7199
{'source': 'wikipedia', 'story': 'The Vatican Apostolic Library (), more commonly called the Vatican Library or simply the Vat, is the library of the Holy See, located in Vatican City. Formally established in 1475, although it is much older, it is one of the oldest libraries in the world and contains one of the most significant collections of historical texts. It has 75,000 codices from throughout history, as well as 1.1 million printed books, which include some 8,500 incunabula. \n\nThe Vatican Library is a research library for history, law, philosophy, science and theology. The Vatican Library is open to anyone who can document their qualifications and research needs. Photocopies for private study of pages from books published between 1801 and 1990 can be requested in person or by mail. \n\nIn March 2014, the Vatican Library began an initial four-year project of digitising its collection of manuscripts, to be made available online. \n\nThe Vatican Secret Archives were separated 

In [11]:
## drop: the passage is stored as background, the first answer span is the target
data = datasets.load_dataset("drop")
train_split = data['train']
print(len(train_split))
print(train_split[0])
for idx, sample in enumerate(train_split):
    prompt_template = random.choice(templates_for_qa)
    question = prompt_template.format(question=sample['question'] + '\n')
    answer = sample["answers_spans"]['spans'][0]

    messages = [
        dict(role="user", content=question),
        dict(role="assistant", content=answer),
    ]
    total_data.append(
        dict(
            id=f"drop_{idx}",
            messages=messages,
            task_type="close_qa",
            background=sample['passage'],
        )
    )
print(total_data[-1])

77400
{'section_id': 'nfl_2201', 'query_id': 'f16c0ee7-f131-4a8b-a6ac-4d275ea68066', 'passage': "To start the season, the Lions traveled south to Tampa, Florida to take on the Tampa Bay Buccaneers. The Lions scored first in the first quarter with a 23-yard field goal by Jason Hanson. The Buccaneers tied it up with a 38-yard field goal by Connor Barth, then took the lead when Aqib Talib intercepted a pass from Matthew Stafford and ran it in 28 yards. The Lions responded with a 28-yard field goal. In the second quarter, Detroit took the lead with a 36-yard touchdown catch by Calvin Johnson, and later added more points when Tony Scheffler caught an 11-yard TD pass. Tampa Bay responded with a 31-yard field goal just before halftime. The second half was relatively quiet, with each team only scoring one touchdown. First, Detroit's Calvin Johnson caught a 1-yard pass in the third quarter. The game's final points came when Mike Williams of Tampa Bay caught a 5-yard pass.  The Lions won their r

In [12]:
## narrativeqa: the document summary is the background, the first answer is the target
data = datasets.load_dataset("narrativeqa")
train_split = data['train']
print(len(train_split))
print(train_split[0])
for idx, sample in enumerate(train_split):
    # Unlike most QA cells, the question text is templated without an added newline.
    prompt_template = random.choice(templates_for_qa)
    question = prompt_template.format(question=sample['question']['text'])
    answer = sample["answers"][0]['text']

    messages = [
        dict(role="user", content=question),
        dict(role="assistant", content=answer),
    ]
    total_data.append(
        dict(
            id=f"narrativeqa_{idx}",
            messages=messages,
            task_type="close_qa",
            background=sample['document']['summary']['text'],
        )
    )
print(total_data[-1])

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Loading dataset shards:   0%|          | 0/17 [00:00<?, ?it/s]

32747
{'id': 'narrativeqa_32746', 'messages': [{'role': 'user', 'content': 'Answer the following question:\n\nHow does Brad get promoted to manager at Mi-T-Mart?'}, {'role': 'assistant', 'content': 'He stops a robbery'}], 'task_type': 'close_qa', 'background': ' Brad Hamilton (Judge Reinhold) is a popular senior who is looking forward to his last year of school and almost has his 1960 Buick LeSabre paid off. He has a part-time job at All-American Burger, a fast food joint where his girlfriend, Lisa, also works. This esteemed establishment has a strict policy of etiquette: "No Shirt, No Shoes, No Dice." Brad is subsequently fired for losing his temper at an obnoxious customer. When Brad tries to tell Lisa how much he needs her, she says she is breaking up with him to see other guys. Brad quits his job at Captain Hook Fish & Chips because of the humiliation of having to wear a pirate costume when delivering food. He later gets a job at Mi-T-Mart, where he successfully thwarts an attempte

In [13]:
## pubmed_qa (pqa_labeled): long-form answer followed by the final decision
data = datasets.load_dataset("pubmed_qa","pqa_labeled")
print(len(data['train']))
print(data['train'][0])
for idx,sample in enumerate(data['train']):
    messages = []
    question = sample['question']
    # The leading space keeps the closing sentence from being glued directly onto
    # the last character of `long_answer` ("...cells.So the final answer is: yes").
    answer = sample['long_answer'] + " So the final answer is: " + sample["final_decision"]
    question = random.choice(templates_for_qa).format_map(dict(question=question))

    messages.append({"role":"user","content":question})
    messages.append({"role":"assistant","content":answer})

    total_data.append(
        {
            "id":f"pubmed_qa_{idx}",
            "messages":messages,
            "task_type":"close_qa",
            # The abstract's context sections are joined into one passage, one per line.
            "background":"\n".join(sample['context']['contexts']),
        }
    )
print(total_data[-1])

1000
{'pubid': 21645374, 'question': 'Do mitochondria play a role in remodelling lace plant leaves during programmed cell death?', 'context': {'contexts': ['Programmed cell death (PCD) is the regulated death of cells within an organism. The lace plant (Aponogeton madagascariensis) produces perforations in its leaves through PCD. The leaves of the plant consist of a latticework of longitudinal and transverse veins enclosing areoles. PCD occurs in the cells at the center of these areoles and progresses outwards, stopping approximately five cells from the vasculature. The role of mitochondria during PCD has been recognized in animals; however, it has been less studied during PCD in plants.', 'The following paper elucidates the role of mitochondrial dynamics during developmentally regulated PCD in vivo in A. madagascariensis. A single areole within a window stage leaf (PCD is occurring) was divided into three areas based on the progression of PCD; cells that will not undergo PCD (NPCD), ce

In [14]:
## quail: four-way multiple choice over a context passage, answered with a letter
data = datasets.load_dataset("quail")
train_split = data['train']
print(len(train_split))
print(train_split[0])
for idx, sample in enumerate(train_split):
    # Options are rendered as "A. ...", "B. ...", ... lines under the question.
    option_prefixes = ["A. ", "B. ", "C. ", "D. "]
    option_lines = "".join(
        option_prefixes[answer_id] + option_text + '\n'
        for answer_id, option_text in enumerate(sample['answers'])
    )
    prompt_template = random.choice(templates_for_qa)
    question = prompt_template.format(question=sample['question'] + '\n' + option_lines)
    # The target is the letter at the index of the correct answer.
    answer = ["A", "B", "C", "D"][sample["correct_answer_id"]]

    messages = [
        dict(role="user", content=question),
        dict(role="assistant", content=answer),
    ]
    total_data.append(
        dict(
            id=f"quail_{idx}",
            messages=messages,
            task_type="close_qa",
            background=sample['context'],
        )
    )
total_data[-1]

10246
{'id': 'f001_0', 'context_id': 'f001', 'question_id': '0', 'domain': 'fiction', 'metadata': {'author': 'Joseph Devon', 'title': 'Black Eyed Susan', 'url': 'http://manybooks.net/pages/devonjother08black_eyed_susan/0.html'}, 'context': "That fall came and I went back to Michigan and the school year went by and summer came and I never really thought about it. I'm not even sure if I was officially asked, I just wound up heading back to New Jersey when school was out. I think my parents thought it was a good enough deal. They were already having some problems and without Nonna there anymore to take care of me I think my cousin's house on the coast seemed like as good a spot as any to stick me for the summer. It certainly wasn't because of any great love between me and my cousin. We weren't really very good friends at that point. I think she saw me as sort of foisted off on her and getting in the way of her summers. Which was a fair enough judgment. But she could have been nicer. It's 

{'id': 'quail_10245',
 'messages': [{'role': 'user',
   'content': "Answer the question...Why is there most likely support for Pres. Trump's tariffs?\nA. Steel plants will prosper\nB. not enough information\nC. Other businesses will prosper\nD. It will keep American jobs\n?"},
  {'role': 'assistant', 'content': 'B'}],
 'task_type': 'close_qa',
 'background': 'U.S. President Donald Trump’s plan to impose tariffs of 25 percent on steel and 10 percent on aluminum has met criticism from his Republican allies in Congress, many of whom worry the measures could trigger a trade war that damages U.S. businesses.\nBut the president does have supporters among some Senate Democrats from states where voters are concerned about the long-term loss of American manufacturing jobs.\n“This welcome action is long overdue for shuttered steel plants across Ohio and steelworkers who live in fear that their jobs will be the next victims of Chinese cheating,” Senator Sherrod Brown, a Democrat from Ohio, said i

In [15]:
## squad_v2: extractive QA; questions without any answer text get "I don't know."
data = datasets.load_dataset("squad_v2")
train_split = data['train']
print(len(train_split))
print(train_split[0])
for idx, sample in enumerate(train_split):
    answer_texts = sample['answers']['text']
    if answer_texts:
        answer = answer_texts[0]
    else:
        # No answer text is listed for this question.
        answer = "I don't know."
    question = random.choice(templates_for_qa).format(question=sample['question'])

    messages = [
        dict(role="user", content=question),
        dict(role="assistant", content=answer),
    ]
    total_data.append(
        dict(
            id=f"squad_v2_{idx}",
            messages=messages,
            task_type="close_qa",
            background=sample['context'],
        )
    )
total_data[-1]

130319
{'id': '56be85543aeaaa14008c9063', 'title': 'Beyoncé', 'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".', 'question': 'When did Beyonce start becoming popular?', 'answers': {'text': ['in the late 1990s'], 'answer_start': [269]}}


{'id': 'squad_v2_130318',
 'messages': [{'role': 'user',
   'content': 'Can you tell me the answer to What field of study has a variety of unusual contexts??'},
  {'role': 'assistant', 'content': "I don't know."}],
 'task_type': 'close_qa',
 'background': 'The term "matter" is used throughout physics in a bewildering variety of contexts: for example, one refers to "condensed matter physics", "elementary matter", "partonic" matter, "dark" matter, "anti"-matter, "strange" matter, and "nuclear" matter. In discussions of matter and antimatter, normal matter has been referred to by Alfvén as koinomatter (Gk. common matter). It is fair to say that in physics, there is no broad consensus as to a general definition of matter, and the term "matter" usually is used in conjunction with a specifying modifier.'}

In [16]:
## cnn_dailymail 3.0.0: first 100,000 training articles, highlights as the summary
data = datasets.load_dataset("cnn_dailymail", '3.0.0')
# Materialise the split as a list so that it can be sliced below.
data = list(data['train'])
for idx, sample in enumerate(data[:100_000]):
    # The instruction has no placeholder; the article travels in "background".
    question = random.choice(templates_for_sum)
    answer = sample['highlights']

    messages = [
        dict(role="user", content=question),
        dict(role="assistant", content=answer),
    ]
    total_data.append(
        dict(
            id=f"cnn_dailymail_{idx}",
            messages=messages,
            task_type="summarization",
            background=sample['article'],
        )
    )
total_data[-1]

{'id': 'cnn_dailymail_99999',
 'messages': [{'role': 'user',
   'content': 'What is a shorter version of this:\n\nSummary:'},
  {'role': 'assistant',
   'content': 'The Merseyside-based traffickers plotted to flood the streets of Scotland, South Wales, Lancashire and Cheshire with heroin and cocaine .\nPolice seized drugs with a wholesale value of £1 million - including 9.25kg of heroin, 8.25kg of cocaine, and 12kg of amphetamine .'}],
 'task_type': 'summarization',
 'background': "By . Daily Mail Reporter . PUBLISHED: . 15:15 EST, 8 August 2012 . | . UPDATED: . 18:55 EST, 8 August 2012 . Thirteen people involved in a multi-million pound drugs distribution gang have been jailed. The Merseyside-based traffickers plotted to flood the streets of Scotland, South Wales, Lancashire and Cheshire with heroin and cocaine. Police said the 'major players' in the drugs trade were caught following an eight-month covert operation by the North West Regional Organised Crime Unit, Titan. Drug trafficke

In [17]:
## samsum: chat dialogues with human-written summaries
dataset = datasets.load_dataset("samsum")
train_split = dataset['train']
print(len(train_split))
print(train_split[0])
# Known quirk: the record "samsum_6054" ends up with an empty background.
for idx, sample in enumerate(train_split):
    question = random.choice(templates_for_sum)
    answer = sample['summary']
    # Normalise Windows line endings in the dialogue.
    dialogue = sample['dialogue'].replace("\r\n", '\n')

    messages = [
        dict(role="user", content=question),
        dict(role="assistant", content=answer),
    ]
    total_data.append(
        dict(
            id=f"samsum_{idx}",
            messages=messages,
            task_type="summarization",
            background=dialogue,
        )
    )
total_data[-1]


14732
{'id': '13818513', 'dialogue': "Amanda: I baked  cookies. Do you want some?\r\nJerry: Sure!\r\nAmanda: I'll bring you tomorrow :-)", 'summary': 'Amanda baked cookies and will bring Jerry some tomorrow.'}


{'id': 'samsum_14731',
 'messages': [{'role': 'user',
   'content': 'Can you generate a short summary of the above paragraph?'},
  {'role': 'assistant',
   'content': "Georgia and Juliette are looking for a hotel in Lisbon. Juliette dislikes Georgia's choices. Juliette and Georgia decide on the second option presented by Georgia, but it has already been booked. Finally Georgia books the third hotel. "}],
 'task_type': 'summarization',
 'background': "Georgia: are you ready for hotel hunting? We need to book something finally for Lisbon\r\nJuliette: sure we can go on, show me what you found\r\nGeorgia: <file_photo>\r\nJuliette: nah... it looks like an old lady's room lol\r\nGeorgia: <file_photo>\r\nJuliette: that's better... but the bed doesn't look very comfortable\r\nGeorgia: i kind of like it and it's really close to the city center\r\nJuliette: show me the others please\r\nGeorgia: <file_photo>\r\nJuliette: nah... this one sucks too, look at those horrible curtains \r\nGeorgia: aff 

In [18]:
## dialogsum: dialogues with summaries, loaded from "knkarthick/dialogsum"
dataset = datasets.load_dataset("knkarthick/dialogsum")
train_split = dataset['train']
print(len(train_split))
print(train_split[0])
for idx, sample in enumerate(train_split):
    question = random.choice(templates_for_sum)
    answer = sample['summary']

    messages = [
        dict(role="user", content=question),
        dict(role="assistant", content=answer),
    ]
    total_data.append(
        dict(
            id=f"dialogsum_{idx}",
            messages=messages,
            task_type="summarization",
            background=sample['dialogue'],
        )
    )
total_data[-1]

12460
{'id': 'train_0', 'dialogue': "#Person1#: Hi, Mr. Smith. I'm Doctor Hawkins. Why are you here today?\n#Person2#: I found it would be a good idea to get a check-up.\n#Person1#: Yes, well, you haven't had one for 5 years. You should have one every year.\n#Person2#: I know. I figure as long as there is nothing wrong, why go see the doctor?\n#Person1#: Well, the best way to avoid serious illnesses is to find out about them early. So try to come at least once a year for your own good.\n#Person2#: Ok.\n#Person1#: Let me see here. Your eyes and ears look fine. Take a deep breath, please. Do you smoke, Mr. Smith?\n#Person2#: Yes.\n#Person1#: Smoking is the leading cause of lung cancer and heart disease, you know. You really should quit.\n#Person2#: I've tried hundreds of times, but I just can't seem to kick the habit.\n#Person1#: Well, we have classes and some medications that might help. I'll give you more information before you leave.\n#Person2#: Ok, thanks doctor.", 'summary': "Mr. Sm

{'id': 'dialogsum_12459',
 'messages': [{'role': 'user',
   'content': 'Summarize the above articles\n\ntl;dr:'},
  {'role': 'assistant',
   'content': "#Person1# asks for #Person2#'s idea of packing the bag when visiting uncle Lee's family next Saturday."}],
 'task_type': 'summarization',
 'background': "#Person1#: Mom, I'm flying to visit uncle Lee's family next Saturday. Should I pack my bags today?\n#Person2#: Yes, I think so.\n#Person1#: OK. What clothes should I take? I know it's hot there.\n#Person2#: Yes, but it rains a lot. You can borrow an umbrella or a jacket if it's wet. Just pack some T-shirts.\n#Person1#: OK. And who is meeting me at the airport?\n#Person2#: Well, uncle Lee and aunt Wong will be busy, but your cousin Susan can pick you up."}

In [19]:
## pwc: local JSONL file whose rows already carry a ready-made prompt
import json

# Parse one JSON object per line. The context manager closes the file as soon as
# the rows are read, and the encoding is pinned so the result does not depend on
# the platform default.
with open("data/pwc/PwC_train.jsonl", encoding="utf-8") as pwc_file:
    dataset = [json.loads(line) for line in pwc_file]
print(len(dataset))
for idx,sample in enumerate(dataset):
    messages = []
    answer = sample['answer']
    # No template is applied here: the row's own `prompt` is used as the user turn.
    question = sample['prompt']

    messages.append({"role":"user","content":question})
    messages.append({"role":"assistant","content":answer})

    total_data.append(
        {
            "id":f"pwc_{idx}",
            "messages":messages,
            "task_type":"close_qa",
            "background":sample['input'],
        }
    )
total_data[-1]

241564


{'id': 'pwc_241563',
 'messages': [{'role': 'user',
   'content': 'Write a paragraph (i.e., continuation) that follows the above text.'},
  {'role': 'assistant',
   'content': "As the situation unfolds, it is crucial for consumers to remain aware of the potential ethical concerns surrounding their favorite brands. With increasing globalization, it is more important than ever to ensure fair labor practices and prevent exploitation. Should the allegations against Ivy Park prove to be unfounded, the brand can continue to promote its empowering message for women worldwide. However, if the allegations hold true, it may be time for a reevaluation of Ivy Park's ethical trading program and a renewed commitment to fair labor practices."}],
 'task_type': 'close_qa',
 'background': 'Tyga is trying to be the new Kanye West, but it isn\'t going to happen\n\nIf you think Jenelle Evans is pregnant with baby No. 3, think again\n\nLeah Prinzivalli writes about pop culture, beauty and health and has int

In [20]:
## nq_open: open QA, using the first listed answer as the target
dataset = datasets.load_dataset("nq_open")
train_split = dataset['train']
print(len(train_split))
print(train_split[0])
for idx, sample in enumerate(train_split):
    answer = sample['answer'][0]
    # The question is templated as-is, without an added newline.
    question = random.choice(templates_for_qa).format(question=sample['question'])

    messages = [
        dict(role="user", content=question),
        dict(role="assistant", content=answer),
    ]
    total_data.append(
        dict(id=f"nq_{idx}", messages=messages, task_type="open_qa")
    )
total_data[-1]

87925
{'question': 'where did they film hot tub time machine', 'answer': ['Fernie Alpine Resort']}


{'id': 'nq_87924',
 'messages': [{'role': 'user',
   'content': "Answer the question...nigeria was given it's name by who?"},
  {'role': 'assistant', 'content': 'Flora Louise Shaw'}],
 'task_type': 'open_qa'}

In [21]:
## fm2: fact checking; each claim is answered with "True" or "False"
# Parse one JSON object per line. The context manager closes the file as soon as
# the rows are read, and the encoding is pinned so the result does not depend on
# the platform default.
with open("data/eval/fm2/fm2-train.jsonl", encoding="utf-8") as fm2_file:
    fm2 = [json.loads(line) for line in fm2_file]
print(len(fm2))
for idx,sample in enumerate(fm2):
    question = sample['question']
    # NOTE(review): the 'supports' check is case-sensitive — confirm the labels in
    # this file are lowercase, otherwise every claim is labelled "False".
    messages = [
        {"role":"user","content":template_for_fact_checking[0].format_map(dict(question=question))},
        {"role":"assistant","content":"True" if 'supports' in sample['answer'] else "False"},
    ]
    total_data.append(
        {
            "id":f"fm2_{idx}",
            "task_type":"fact_checking",
            "messages":messages,
        }
    )

10419


In [22]:
## triviaqa: open QA from a local JSONL file, first listed answer as the target
# Parse one JSON object per line. The context manager closes the file as soon as
# the rows are read, and the encoding is pinned so the result does not depend on
# the platform default.
with open("data/eval/triviaqa/tqa-train.jsonl", encoding="utf-8") as tqa_file:
    tqa = [json.loads(line) for line in tqa_file]
print(len(tqa))
for idx,sample in enumerate(tqa):
    messages = []
    answer = sample['answer'][0]
    question = sample['question']
    question = random.choice(templates_for_qa).format_map(dict(question=question))

    messages.append({"role":"user","content":question})
    messages.append({"role":"assistant","content":answer})

    total_data.append(
        {
            "id":f"triviaqa_{idx}",
            "messages":messages,
            "task_type":"open_qa",
        }
    )

78785
