In [1]:
from datasets import load_dataset

# Load dataset configurations
def load_gsm8k():
    """Load the 'gsm8k' dataset with its 'main' configuration via `load_dataset`."""
    gsm8k = load_dataset('gsm8k', 'main')
    return gsm8k

def load_csqa():
    """Load the 'commonsense_qa' dataset with its 'default' configuration via `load_dataset`."""
    csqa = load_dataset('commonsense_qa', 'default')
    return csqa

def load_squad_v1():
    """Load the 'squad' dataset with its 'plain_text' configuration via `load_dataset`."""
    squad = load_dataset('squad', 'plain_text')
    return squad

def load_squad_v2():
    """Load the 'squad_v2' dataset with its 'squad_v2' configuration via `load_dataset`."""
    squad_v2 = load_dataset('squad_v2', 'squad_v2')
    return squad_v2

def load_hotpotqa():
    """Load the 'hotpot_qa' dataset with its 'distractor' configuration via `load_dataset`."""
    # NOTE(review): trust_remote_code=True presumably lets the dataset's own
    # loading script run locally -- confirm against the installed `datasets`
    # version before relying on it.
    hotpotqa = load_dataset('hotpot_qa', 'distractor', trust_remote_code=True)
    return hotpotqa

In [2]:
# Function to extract questions and answers from GSM8K
def extract_gsm8k(data):
    """Turn every item of ``data['train']`` into a question/answer record.

    Args:
        data: Mapping whose 'train' entry iterates over items exposing
            'question' and 'answer' keys.

    Returns:
        A list of dicts, one per train item, with the item's 'question'
        under 'question' and its 'answer' under 'correct_answer'.
    """
    records = []
    for example in data['train']:
        record = {
            'question': example['question'],
            'correct_answer': example['answer'],
        }
        records.append(record)
    return records

# Function to extract questions and answers from CSQA
def extract_csqa(data, n=10):
    """Build up to ``n`` multiple-choice QA records from ``data['train']``.

    Each record's 'question' is the item's question text followed by an
    "Options:" header and one "<label>. <text>" line per choice. The
    'correct_answer' is the text of the choice whose label equals the
    item's 'answerKey'.

    Args:
        data: Mapping whose 'train' entry iterates over items exposing
            'question', 'answerKey' and 'choices', where 'choices' holds
            parallel 'label' and 'text' lists.
        n: Maximum number of records to return. ``n <= 0`` yields an empty
            list; ``None`` means no limit.

    Returns:
        A list of dicts with 'question' and 'correct_answer' keys.
        'correct_answer' is None when 'answerKey' matches no choice label
        (for example an empty key on an unlabeled item).
    """
    qa = []

    # A non-positive limit asks for nothing; bail out before the first
    # append so the post-append length check below cannot let one through.
    if n is not None and n <= 0:
        return qa

    for item in data['train']:
        question = item['question']
        choices_labels = item['choices']['label']
        choices_texts = item['choices']['text']

        # Question text followed by one "<label>. <text>" line per option.
        options = "\n".join(
            f"{label}. {text}" for label, text in zip(choices_labels, choices_texts)
        )
        formatted_question = f"{question}\nOptions:\n{options}"

        # Resolve the answer by its label rather than by alphabet arithmetic,
        # so the result does not depend on labels being exactly A, B, C, ...
        # in order.
        answer = dict(zip(choices_labels, choices_texts)).get(item['answerKey'])

        qa.append({
            'question': formatted_question,
            'correct_answer': answer
        })

        # Stop as soon as the requested number of entries is collected.
        if n is not None and len(qa) >= n:
            break

    return qa

# Function to extract questions and answers from SQuAD v1
def extract_squad_v1(data):
    """Turn every item of ``data['train']`` into a question/answer record.

    Args:
        data: Mapping whose 'train' entry iterates over items exposing
            'question' and 'answers', where 'answers' holds a 'text' list.

    Returns:
        A list of dicts with 'question' and 'correct_answer' keys.
        'correct_answer' is the first entry of the item's answer texts, or
        None when that list is empty.
    """
    def first_answer(example):
        # Only the first listed answer text is kept; empty list -> None.
        texts = example['answers']['text']
        return texts[0] if texts else None

    return [
        {'question': example['question'], 'correct_answer': first_answer(example)}
        for example in data['train']
    ]

# Function to extract questions and answers from SQuAD v2
def extract_squad_v2(data):
    """Turn every item of ``data['train']`` into a question/answer record.

    Args:
        data: Mapping whose 'train' entry iterates over items exposing
            'question' and 'answers', where 'answers' holds a 'text' list.

    Returns:
        A list of dicts with 'question' and 'correct_answer' keys.
        'correct_answer' is the first entry of the item's answer texts, or
        None when the item has no answer text.
    """
    records = []
    for example in data['train']:
        prompt = example['question']
        answer_texts = example['answers']['text']
        if answer_texts:
            # Keep only the first listed answer text.
            chosen = answer_texts[0]
        else:
            chosen = None
        records.append({'question': prompt, 'correct_answer': chosen})
    return records

# Function to extract questions and answers from HotpotQA
def extract_hotpotqa(data):
    """Turn every item of ``data['train']`` into a question/answer record.

    Args:
        data: Mapping whose 'train' entry iterates over items exposing
            'question' and 'answer' keys.

    Returns:
        A list of dicts, one per train item, with the item's 'question'
        under 'question' and its 'answer' under 'correct_answer'.
    """
    pairs = []
    for example in data['train']:
        pairs.append({
            'question': example['question'],
            'correct_answer': example['answer'],
        })
    return pairs

In [3]:
# Load specific datasets based on input
def load_datasets(dataset_names, n_csqa=1000):
    """Load each requested dataset and extract its question/answer records.

    Args:
        dataset_names: Iterable of dataset names. Supported names are
            'GSM8K', 'CSQA', 'SQuAD_v1', 'SQuAD_v2' and 'HotpotQA'; any
            other name is reported with a printed message and skipped.
        n_csqa: Limit passed as ``n`` to the CSQA extractor. It is not used
            for any other dataset.

    Returns:
        A dict mapping each supported requested name to the list returned
        by that dataset's extractor.
    """
    # name -> (loader, extractor)
    registry = {
        'GSM8K': (load_gsm8k, extract_gsm8k),
        'CSQA': (load_csqa, extract_csqa),
        'SQuAD_v1': (load_squad_v1, extract_squad_v1),
        'SQuAD_v2': (load_squad_v2, extract_squad_v2),
        'HotpotQA': (load_hotpotqa, extract_hotpotqa)
    }

    extracted = {}
    for dataset_name in dataset_names:
        if dataset_name not in registry:
            print(f"Dataset {dataset_name} is not supported.")
            continue

        load_fn, extract_fn = registry[dataset_name]
        raw = load_fn()

        # Only the CSQA extractor accepts a cap on the number of entries.
        limit_kwargs = {'n': n_csqa} if dataset_name == 'CSQA' else {}
        extracted[dataset_name] = extract_fn(raw, **limit_kwargs)

    return extracted