In [77]:
from datasets import load_dataset

# Load each dataset, passing its configuration name explicitly.
gsm8k = load_dataset('gsm8k', 'main')  # GSM8K is loaded with the 'main' config
csqa = load_dataset('commonsense_qa', 'default')  # CommonsenseQA (CSQA) using 'default'
squad_v1 = load_dataset('squad', 'plain_text')  # SQuAD v1 uses 'plain_text'
squad_v2 = load_dataset('squad_v2', 'squad_v2')  # SQuAD v2 using 'squad_v2'
# HotpotQA with the 'distractor' config.
# NOTE(review): trust_remote_code=True opts in to running the dataset's own
# loading script — confirm this is still required by the installed `datasets`.
hotpotqa = load_dataset('hotpot_qa', 'distractor', trust_remote_code=True)

# Print the first training example of each dataset to inspect its fields.
for loaded in (gsm8k, csqa, squad_v1, squad_v2, hotpotqa):
    print(loaded['train'][0])



{'question': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?', 'answer': 'Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nNatalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n#### 72'}

{'id': '075e483d21c29a511267ef62bedc0461', 'question': 'The sanctions against the school were a punishing blow, and they seemed to what the efforts the school had made to change?', 'question_concept': 'punishing', 'choices': {'label': ['A', 'B', 'C', 'D', 'E'], 'text': ['ignore', 'enforce', 'authoritarian', 'yell at', 'avoid']}, 'answerKey': 'A'}

{'id': '5733be284776f41900661182', 'title': 'University_of_Notre_Dame', 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "

In [81]:


# Dict mapping a dataset display name (e.g. 'GSM8K') to its list of
# extracted question-answer records. Re-running this cell resets it to empty.
qa_lists: dict = {}

# Function to extract questions and answers from GSM8K
def extract_gsm8k(data):
    """Collect question/answer records from the GSM8K training split.

    Args:
        data: Mapping with a 'train' entry; each item of that entry must
            expose 'question' and 'answer' keys.

    Returns:
        A list with one ``{'question', 'correct_answer'}`` dict per training
        item, in iteration order. 'correct_answer' is the item's 'answer'
        value, passed through unchanged.
    """
    records = []
    for example in data['train']:
        records.append({
            'question': example['question'],
            'correct_answer': example['answer'],
        })
    return records

# Function to extract questions and answers from CSQA
def extract_csqa(data, n=10):
    """Extract up to ``n`` question/choices/answer records from CSQA's train split.

    Args:
        data: Mapping with a 'train' entry. Each item must provide
            'question', 'answerKey', and 'choices' (a dict holding parallel
            'label' and 'text' sequences).
        n: Maximum number of records to return. ``None`` means no limit;
            zero or a negative value yields an empty list.

    Returns:
        A list of dicts with keys 'question', 'choices' (label -> text
        mapping) and 'correct_answer' (the text of the choice whose label
        equals the item's 'answerKey').

    Raises:
        KeyError: If an item's 'answerKey' is not one of its choice labels.
    """
    records = []
    # Check the limit up front so n <= 0 returns nothing instead of one record.
    if n is not None and n <= 0:
        return records

    for item in data['train']:
        # Pair each label with its text; the answer is looked up by label,
        # so no numeric index (and no ord() on the key) is needed.
        choices = dict(zip(item['choices']['label'], item['choices']['text']))
        records.append({
            'question': item['question'],
            'choices': choices,
            'correct_answer': choices[item['answerKey']],
        })

        # Stop as soon as enough entries are collected.
        if n is not None and len(records) >= n:
            break

    return records

# Function to extract questions and answers from SQuAD v1
def extract_squad_v1(data):
    """Build question/answer records from the SQuAD v1 training split.

    Args:
        data: Mapping with a 'train' entry; each item must provide
            'question' and an 'answers' dict containing a 'text' sequence.

    Returns:
        A list of ``{'question', 'correct_answer'}`` dicts in iteration
        order. 'correct_answer' is the first element of the item's answer
        texts, or ``None`` when that sequence is empty.
    """
    return [
        {
            'question': example['question'],
            # Only the first listed answer text is kept.
            'correct_answer': (
                example['answers']['text'][0] if example['answers']['text'] else None
            ),
        }
        for example in data['train']
    ]

# Function to extract questions and answers from SQuAD v2
def extract_squad_v2(data):
    """Build question/answer records from the SQuAD v2 training split.

    Args:
        data: Mapping with a 'train' entry; each item must provide
            'question' and an 'answers' dict containing a 'text' sequence.

    Returns:
        A list of ``{'question', 'correct_answer'}`` dicts in iteration
        order. 'correct_answer' is the first answer text, or ``None`` when
        the item has no answer texts (presumably the unanswerable SQuAD v2
        questions — confirm against the dataset card).
    """
    records = []
    for example in data['train']:
        answer_texts = example['answers']['text']
        if answer_texts:
            first_answer = answer_texts[0]
        else:
            first_answer = None
        records.append({
            'question': example['question'],
            'correct_answer': first_answer,
        })
    return records

# Function to extract questions and answers from HotpotQA
def extract_hotpotqa(data):
    """Collect question/answer records from the HotpotQA training split.

    Args:
        data: Mapping with a 'train' entry; each item must expose
            'question' and 'answer' keys.

    Returns:
        A list with one ``{'question', 'correct_answer'}`` dict per training
        item, in iteration order.
    """
    def to_record(example):
        # Rename 'answer' to 'correct_answer'; drop every other field.
        return {'question': example['question'], 'correct_answer': example['answer']}

    return list(map(to_record, data['train']))

# Extract questions and answers from each dataset.
# Each result is stored in qa_lists under a display name for the dataset.
# Every extractor reads the 'train' entry and returns a list of dicts that
# carry at least 'question' and 'correct_answer' (CSQA adds 'choices').
qa_lists['GSM8K'] = extract_gsm8k(gsm8k)
qa_lists['CSQA'] = extract_csqa(csqa, n=5)  # only CSQA is capped (at 5 records); the other extractors take no limit
qa_lists['SQuAD_v1'] = extract_squad_v1(squad_v1)
qa_lists['SQuAD_v2'] = extract_squad_v2(squad_v2)
qa_lists['HotpotQA'] = extract_hotpotqa(hotpotqa)


In [85]:
n = 2  # Specify how many entries to print


def _format_qa(entry):
    """Render one QA record as ``'Q: <question>\\nA: <answer>\\n'``.

    The extractors in this notebook return dicts with 'question' and
    'correct_answer' keys; plain ``(question, answer)`` tuples are also
    accepted in case qa_lists holds entries in that older shape.
    """
    if isinstance(entry, tuple):
        question, answer = entry
    else:
        question, answer = entry['question'], entry['correct_answer']
    return f"Q: {question}\nA: {answer}\n"


# Print the extracted question-answer pairs
for dataset, qa in qa_lists.items():
    print(f"Dataset: {dataset}")
    for entry in qa[:n]:  # Limit printing to n entries for readability
        print(_format_qa(entry))

# Write the same output to a file. The encoding is explicit because the
# dataset text may contain non-ASCII characters and the platform default
# encoding is not guaranteed to be able to represent them.
with open('qa_output.txt', 'w', encoding='utf-8') as f:
    for dataset, qa in qa_lists.items():
        f.write(f"Dataset: {dataset}\n")
        for entry in qa[:n]:  # Limit to n entries for brevity
            # print() appends a newline; add it by hand so the file matches stdout.
            f.write(_format_qa(entry) + "\n")



Dataset: GSM8K

Q: Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?

A: Natalia sold 48/2 = <<48/2=24>>24 clips in May.

Natalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.

#### 72



Q: Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?

A: Weng earns 12/60 = $<<12/60=0.2>>0.2 per minute.

Working 50 minutes, she earned 0.2 x 50 = $<<0.2*50=10>>10.

#### 10



Dataset: CSQA

Q: The sanctions against the school were a punishing blow, and they seemed to what the efforts the school had made to change?

A: ignore



Q: Sammy wanted to go to where the people were.  Where might he go?

A: populated areas



Dataset: SQuAD_v1

Q: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?

A: Saint Bernadette Soubirous



Q: What is in front of the Notre Dame Main Building?

A: a copper statue

In [83]:
import pandas as pd

# Flatten the first n records of every dataset into (dataset, question, answer)
# rows and load them into a DataFrame.
all_qa = []
for dataset, qa in qa_lists.items():
    for entry in qa[:n]:  # Limit to n pairs for readability
        # Entries are normally dicts with 'question' / 'correct_answer';
        # a plain (question, answer) tuple is unpacked directly.
        q, a = (
            entry
            if isinstance(entry, tuple)
            else (entry['question'], entry['correct_answer'])
        )
        all_qa.append((dataset, q, a))

df = pd.DataFrame(all_qa, columns=['Dataset', 'Question', 'Answer'])
print(df)



     Dataset                                           Question  \

0      GSM8K  Natalia sold clips to 48 of her friends in Apr...   

1      GSM8K  Weng earns $12 an hour for babysitting. Yester...   

2      GSM8K  Betty is saving money for a new wallet which c...   

3      GSM8K  Julie is reading a 120-page book. Yesterday, s...   

4      GSM8K  James writes a 3-page letter to 2 different fr...   

5       CSQA  The sanctions against the school were a punish...   

6       CSQA  Sammy wanted to go to where the people were.  ...   

7       CSQA  To locate a choker not located in a jewelry bo...   

8       CSQA  Google Maps and other highway and street GPS s...   

9       CSQA  The fox walked from the city into the forest, ...   

10  SQuAD_v1  To whom did the Virgin Mary allegedly appear i...   

11  SQuAD_v1  What is in front of the Notre Dame Main Building?   

12  SQuAD_v1  The Basilica of the Sacred heart at Notre Dame...   

13  SQuAD_v1                  What is the Grotto

In [46]:
# Show each dataset's record count followed by its first few QA pairs.
for dataset, qa in qa_lists.items():
    print(f"Dataset: {dataset} - {len(qa)} entries")
    for entry in qa[:5]:  # Change 5 to however many you want to display
        # Entries are dicts, so unpacking them directly would yield their
        # keys (or raise for the 3-key CSQA records); read the fields by name.
        if isinstance(entry, tuple):
            q, a = entry
        else:
            q, a = entry['question'], entry['correct_answer']
        print(f"Q: {q}\nA: {a}\n")


Dataset: GSM8K - 7473 entries

Q: Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?

A: Natalia sold 48/2 = <<48/2=24>>24 clips in May.

Natalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.

#### 72



Q: Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?

A: Weng earns 12/60 = $<<12/60=0.2>>0.2 per minute.

Working 50 minutes, she earned 0.2 x 50 = $<<0.2*50=10>>10.

#### 10



Q: Betty is saving money for a new wallet which costs $100. Betty has only half of the money she needs. Her parents decided to give her $15 for that purpose, and her grandparents twice as much as her parents. How much more money does Betty need to buy the wallet?

A: In the beginning, Betty has only 100 / 2 = $<<100/2=50>>50.

Betty's grandparents gave her 15 * 2 = $<<15*2=30>>30.

This means, Betty needs 100 - 50 - 30 - 15 = $