In [1]:
import datasets

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Load the PolicyQA train/dev/test splits from the local `policyqa/` folder.
# Each split lives in `policyqa/<split>.json`; `field='data'` is passed so the
# JSON loader reads the records stored under the top-level "data" key.
dataset = datasets.load_dataset(
    'json',
    data_files={split: f'policyqa/{split}.json' for split in ('train', 'dev', 'test')},
    field='data',
)


Using custom data configuration default-0c63ba6e2e28f3b2
Reusing dataset json (/home/liam/.cache/huggingface/datasets/json/default-0c63ba6e2e28f3b2/0.0.0/a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253)
100%|██████████| 3/3 [00:00<00:00, 445.68it/s]


In [18]:
# Inspect the first document of the *test* split: show its paragraph at
# index 10, then the top-level keys of the document record.
first_test_document = dataset['test'][0]
print(first_test_document['paragraphs'][10])
print(first_test_document.keys())

{'context': 'create and deliver personalized promotions, including by combining your Personal Information with Other Information, such as the amounts and types of bookings or itineraries you make and discounts or benefits you use', 'index': 11, 'qas': [{'answers': [{'answer_start': 67, 'text': 'your Personal Information'}], 'id': 'wo1uc0blyt1k5tsx', 'question': 'What type of information about me does the website collect?', 'type': 'First Party Collection/Use|||Personal Information Type|||Generic personal information'}, {'answers': [{'answer_start': 0, 'text': 'create and deliver personalized promotions'}], 'id': 'vycw7xem04dd090w', 'question': 'For what purpose do you use my data?', 'type': 'First Party Collection/Use|||Purpose|||Basic service/feature'}, {'answers': [{'answer_start': 57, 'text': 'combining your Personal Information with Other Information'}], 'id': '78981ujzwheh9wvj', 'question': 'Does the company collect my personal information?', 'type': 'First Party Collection/Use|||

In [31]:
def make_dataset_use_entire_document_as_context(row):
    """
    Give every paragraph of a document the whole document as its context.

    The paragraph contexts are joined with a single newline to form the
    document text. Each paragraph's 'context' is then replaced by that text,
    and every answer's 'answer_start' is shifted so that it indexes into the
    document instead of into the original paragraph.

    The row is modified in place and the same object is returned.

    :param row: a row from the dataset representing a single document; it must
        have a 'paragraphs' list whose items carry 'context' and 'qas'
    :return: the same row, with updated contexts and answer offsets
    """
    paragraphs = row['paragraphs']

    # Snapshot the original paragraph texts before any of them is overwritten.
    original_contexts = [paragraph['context'] for paragraph in paragraphs]
    document_text = '\n'.join(original_contexts)

    # Position in the document at which the current paragraph begins.
    paragraph_start = 0

    for paragraph, own_text in zip(paragraphs, original_contexts):
        paragraph['context'] = document_text

        answers = (
            answer
            for qa in paragraph['qas']
            for answer in qa['answers']
        )
        for answer in answers:
            answer['answer_start'] += paragraph_start

        # Skip past this paragraph and the newline that joins it to the next.
        paragraph_start += len(own_text) + 1

    return row


def check_answer_offsets(row):
    """
    Verify that every answer's offset points at its text inside the context.

    For each answer of each question of each paragraph, the slice of the
    paragraph's 'context' beginning at 'answer_start' and as long as the
    answer's 'text' must equal that 'text'.

    :param row: a row from the dataset representing a single document
    :return: Nothing
    :raises AssertionError: on the first answer whose slice does not match
    """
    # Flatten the paragraph -> question -> answer nesting into one stream,
    # keeping the owning paragraph alongside each answer.
    paragraph_answer_pairs = (
        (paragraph, answer)
        for paragraph in row['paragraphs']
        for qa in paragraph['qas']
        for answer in qa['answers']
    )

    for paragraph, answer in paragraph_answer_pairs:
        begin = answer['answer_start']
        context = paragraph['context']
        actual_text = answer['text']
        selected_from_context = context[begin:begin + len(actual_text)]
        assert selected_from_context == actual_text, f"Expected {actual_text} but got {selected_from_context}"


In [36]:
# Build the long-context variant: run the document-level rewrite over every
# row of each split, so each paragraph carries the whole document as context.
long_dataset = dataset.map(make_dataset_use_entire_document_as_context)

# Sanity check: re-run over the rewritten rows and assert that every shifted
# answer offset still selects the answer text. check_answer_offsets has no
# return statement, so this pass is used only for its assertions; the value
# of this last expression is what the cell displays.
long_dataset.map(check_answer_offsets)


Loading cached processed dataset at /home/liam/.cache/huggingface/datasets/json/default-0c63ba6e2e28f3b2/0.0.0/a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253/cache-be0d73e610aebd79.arrow
Loading cached processed dataset at /home/liam/.cache/huggingface/datasets/json/default-0c63ba6e2e28f3b2/0.0.0/a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253/cache-dec5f827c7192b57.arrow
Loading cached processed dataset at /home/liam/.cache/huggingface/datasets/json/default-0c63ba6e2e28f3b2/0.0.0/a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253/cache-6ef676e5dcca3aea.arrow
100%|██████████| 75/75 [00:01<00:00, 66.61ex/s]
100%|██████████| 20/20 [00:00<00:00, 127.92ex/s]
100%|██████████| 20/20 [00:00<00:00, 77.21ex/s]


DatasetDict({
    train: Dataset({
        features: ['title', 'paragraphs'],
        num_rows: 75
    })
    dev: Dataset({
        features: ['title', 'paragraphs'],
        num_rows: 20
    })
    test: Dataset({
        features: ['title', 'paragraphs'],
        num_rows: 20
    })
})