In [None]:
from datasets import load_dataset

# Load each benchmark with `datasets.load_dataset`. The second positional
# argument is the configuration (subset) name of the dataset.
gsm8k = load_dataset('gsm8k', 'main')  # GSM8K, 'main' config (not 'default')
csqa = load_dataset('commonsense_qa', 'default')  # CommonsenseQA (CSQA), 'default' config
squad_v1 = load_dataset('squad', 'plain_text')  # SQuAD v1, 'plain_text' config
squad_v2 = load_dataset('squad_v2', 'squad_v2')  # SQuAD v2, 'squad_v2' config
hotpotqa = load_dataset('hotpot_qa', 'distractor', trust_remote_code=True)  # HotpotQA, 'distractor' config; NOTE(review): trust_remote_code=True lets the dataset's own loading script run locally - confirm this is still required

# Print a sample from each dataset
# print(gsm8k['train'][0])
# print(csqa['train'][0])
# print(squad_v1['train'][0])
# print(squad_v2['train'][0])
# print(hotpotqa['train'][0])

In [None]:
# Registry of extracted question-answer pairs: a dict mapping a dataset name
# to the list of entry dicts produced by the matching extract_* function.
qa_lists = {}

# Function to extract questions and answers from GSM8K
def extract_gsm8k(data):
    """Collect question/answer pairs from the 'train' split of GSM8K.

    Args:
        data: Mapping whose 'train' entry is an iterable of records, each
            providing 'question' and 'answer'.

    Returns:
        A list with one dict per record, holding the record's question under
        'question' and its answer under 'correct_answer'.
    """
    pairs = []
    for record in data['train']:
        pairs.append({
            'question': record['question'],
            'correct_answer': record['answer'],
        })
    return pairs

# Function to extract questions and answers from CSQA
def extract_csqa(data, n=10):
    """Extract up to ``n`` multiple-choice entries from the CSQA 'train' split.

    Args:
        data: Mapping whose 'train' entry is an iterable of items. Each item
            provides 'question', 'answerKey' and 'choices', where 'choices'
            holds two parallel lists under 'label' and 'text'.
        n: Maximum number of entries to return. Values <= 0 yield an empty
            list.

    Returns:
        A list of dicts with keys 'question', 'choices' (a label -> text
        mapping) and 'correct_answer' (the text of the choice whose label
        equals the item's 'answerKey').

    Raises:
        KeyError: If an item's 'answerKey' is not one of its choice labels.
    """
    qa = []
    # Guard up front: the in-loop limit check runs only after an append, so
    # without this a non-positive n would still return one entry.
    if n <= 0:
        return qa

    for item in data['train']:
        # Pair each label with its text so the answer can be found by label.
        choices = dict(zip(item['choices']['label'], item['choices']['text']))

        qa.append({
            'question': item['question'],
            'choices': choices,
            # Look up by label, not by position, so the result does not depend
            # on the labels being 'A', 'B', ... in order.
            'correct_answer': choices[item['answerKey']],
        })

        # Stop as soon as enough entries are collected, without pulling
        # another item from the split.
        if len(qa) >= n:
            break

    return qa

# Function to extract questions and answers from SQuAD v1
def extract_squad_v1(data):
    """Collect question/answer pairs from the 'train' split of SQuAD v1.

    Args:
        data: Mapping whose 'train' entry is an iterable of records, each
            providing 'question' and an 'answers' dict with a 'text' list.

    Returns:
        A list with one dict per record: 'question' holds the question and
        'correct_answer' holds the first answer text, or None when the
        record's answer text list is empty.
    """
    def first_answer(record):
        # Only the first listed answer text is kept.
        texts = record['answers']['text']
        return texts[0] if texts else None

    return [
        {'question': record['question'], 'correct_answer': first_answer(record)}
        for record in data['train']
    ]

# Function to extract questions and answers from SQuAD v2
def extract_squad_v2(data):
    """Collect question/answer pairs from the 'train' split of SQuAD v2.

    Args:
        data: Mapping whose 'train' entry is an iterable of records, each
            providing 'question' and an 'answers' dict with a 'text' list.

    Returns:
        A list with one dict per record: 'question' holds the question and
        'correct_answer' holds the first answer text. Records whose answer
        text list is empty (presumably SQuAD v2's unanswerable questions -
        TODO confirm) get None instead.
    """
    pairs = []
    for record in data['train']:
        prompt = record['question']
        candidates = record['answers']['text']
        if candidates:
            gold = candidates[0]
        else:
            gold = None
        pairs.append({'question': prompt, 'correct_answer': gold})
    return pairs

# Function to extract questions and answers from HotpotQA
def extract_hotpotqa(data):
    """Collect question/answer pairs from the 'train' split of HotpotQA.

    Args:
        data: Mapping whose 'train' entry is an iterable of records, each
            providing 'question' and 'answer'.

    Returns:
        A list with one dict per record, holding the record's question under
        'question' and its answer under 'correct_answer'.
    """
    def to_pair(record):
        return {'question': record['question'], 'correct_answer': record['answer']}

    return list(map(to_pair, data['train']))

# Populate qa_lists from the 'train' split of each loaded dataset, keyed by
# dataset name. Only CSQA is capped (n=1000 entries); every other dataset is
# extracted in full.
qa_lists['GSM8K'] = extract_gsm8k(gsm8k)
qa_lists['CSQA'] = extract_csqa(csqa, n=1000)
qa_lists['SQuAD_v1'] = extract_squad_v1(squad_v1)
qa_lists['SQuAD_v2'] = extract_squad_v2(squad_v2)
qa_lists['HotpotQA'] = extract_hotpotqa(hotpotqa)

In [None]:
def load_datasets():
    """Return the module-level ``qa_lists`` registry.

    Nothing is loaded or extracted here: the dict is returned as-is (the same
    object, not a copy), so its contents are whatever was stored in
    ``qa_lists`` before the call.
    """
    return qa_lists

In [None]:
# n = 2  # Specify how many entries to print
# # Print the extracted question-answer pairs
# for dataset, qa in qa_lists.items():
#     print(f"Dataset: {dataset}")
#     for entry in qa[:n]:  # Limit printing to n entries for readability
#         if isinstance(entry, tuple):
#             # For datasets returning tuples (e.g., GSM8K, SQuAD)
#             q, a = entry
#             print(f"Q: {q}\nA: {a}\n")
#         else:
#             # For datasets returning dictionaries (e.g., CSQA)
#             print(f"Q: {entry['question']}\nA: {entry['correct_answer']}\n")

# # Write the output to a file
# with open('qa_output.txt', 'w') as f:
#     for dataset, qa in qa_lists.items():
#         f.write(f"Dataset: {dataset}\n")
#         for entry in qa[:n]:  # Limit to n entries for brevity
#             if isinstance(entry, tuple):
#                 # For datasets returning tuples
#                 q, a = entry
#                 f.write(f"Q: {q}\nA: {a}\n\n")
#             else:
#                 # For datasets returning dictionaries
#                 f.write(f"Q: {entry['question']}\nA: {entry['correct_answer']}\n\n")


In [None]:
# import pandas as pd

# # Combine all questions and answers into a DataFrame
# all_qa = []
# for dataset, qa in qa_lists.items():
#     for entry in qa[:n]:  # Limit to n pairs for readability
#         if isinstance(entry, tuple):
#             # For datasets returning tuples (e.g., GSM8K, SQuAD)
#             q, a = entry
#         else:
#             # For datasets returning dictionaries (e.g., CSQA)
#             q = entry['question']
#             a = entry['correct_answer']
#         all_qa.append((dataset, q, a))

# df = pd.DataFrame(all_qa, columns=['Dataset', 'Question', 'Answer'])
# print(df)



In [None]:
# for dataset, qa in qa_lists.items():
#     print(f"Dataset: {dataset} - {len(qa)} entries")
#     for q, a in qa[:5]:  # Change 5 to however many you want to display
#         print(f"Q: {q}\nA: {a}\n")