In [12]:
# Mount Google Drive
#
# Later cells read and write relative 'drive/MyDrive/...' paths, so the working
# directory is pinned to /content, which is where the Drive mount point lives.
import os

from google.colab import drive

os.chdir("/content/")
drive.mount("/content/drive")

In [None]:
# Environment setup (run once per runtime).
# `%pip` installs into the environment of the running kernel, which `!pip`
# does not guarantee.
%pip install -U transformers==3.0.0
!python -m nltk.downloader punkt
# Only clone when the checkout is missing, so re-running this cell does not
# fail with "destination path already exists".
![ -d question_generation ] || git clone https://github.com/patil-suraj/question_generation.git
# NOTE(review): a later cell imports `datasets`; confirm it is available in the
# runtime or add a pinned install here. The clone above tracks the repository's
# default branch - consider pinning a commit for reproducibility.

In [9]:
# Import torch in this cell: the first `import torch` otherwise appears in a
# later cell, so on a fresh kernel (Restart & Run All) this check would raise
# NameError.
import torch

torch.cuda.is_available()

True

In [13]:
# Generate Data with the MEDIQA-AnS (question-driven answer summarization) dataset


import json
import h5py
import numpy as np
import string
import re
import nltk
import random
import torch
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from question_generation.pipelines import pipeline


def generate_dataset(gen_pipeline, data: dict,
                     gen_path: str = 'drive/MyDrive/gen_dataset.json',
                     summary_path: str = 'drive/MyDrive/summary_dataset.json') -> None:
    """Build a dialog-style QA dataset from the summarization data and save it as JSON.

    For every entry in ``data`` one dialog is created. Its first turn pairs the
    entry's own question with the extractive summaries of its answers; further
    turns are produced by running ``gen_pipeline`` on each answer's article.

    Args:
        gen_pipeline: Callable taking an article string and returning an
            iterable of dicts with ``'question'`` and ``'answer'`` keys.
        data: Mapping of entry id -> dict with the keys ``'question'``,
            ``'multi_ext_summ'`` and ``'answers'``; ``'answers'`` maps an answer
            id to a dict with ``'answer_ext_summ'`` and ``'article'``.
        gen_path: Destination of the generated dataset, written as
            ``{'data': {'questions': [...], 'answers': [...], 'dialogs': [...]}}``.
        summary_path: Destination of the ``{summary_id: summary}`` mapping.

    Returns:
        None. The results are written to ``gen_path`` and ``summary_path``.

    Entries without answers are skipped, and articles the pipeline cannot
    process contribute no generated turns; both cases are reported on stdout.
    """
    dataset = {'questions': [], 'answers': [], 'dialogs': []}
    summary_dataset = {}

    skipped_entries = 0
    skipped_articles = 0
    last_error = None

    print('Parsing dataset initiated ...')
    for key, entry in data.items():
        answers = entry['answers']
        answer_keys = list(answers)

        # The first article becomes the dialog's document, so an entry without
        # answers cannot form a dialog. Skip it before touching any list so the
        # question/answer indices stay consistent.
        if not answer_keys:
            skipped_entries += 1
            continue

        # Indices the entry's own question and first answer are about to get.
        question_id = len(dataset['questions'])
        first_answer_id = len(dataset['answers'])

        dataset['questions'].append(entry['question'])
        dataset['answers'].extend(
            answers[answer_key]['answer_ext_summ'] for answer_key in answer_keys)

        # Summaries live in their own file; the dialog only stores the id.
        summary_id = str(len(summary_dataset))
        summary_dataset[summary_id] = entry['multi_ext_summ']

        current_dialog = {
            'summary': summary_id,
            'document': answers[answer_keys[0]]['article'],
            'dialog': [{
                # Turn taken from the source data: the candidates are the
                # entry's extractive summaries, the first one is ground truth.
                'question': question_id,
                'answer': first_answer_id,
                'answer_options': list(range(first_answer_id, len(dataset['answers']))),
                'gt_index': 0,
            }],
        }

        # Generate more questions and answers with the transformer pipeline.
        last_answer_id = first_answer_id
        for answer_key in answer_keys:
            try:
                generated_qas = gen_pipeline(answers[answer_key]['article'])
            except Exception as exc:  # best effort: one bad article must not abort the run
                skipped_articles += 1
                last_error = exc
                continue

            # NOTE(review): for the first article the candidate window starts at
            # the entry's first answer; for later articles it starts at the last
            # answer generated so far. This mirrors the existing output format -
            # confirm whether the window was meant to restart per article.
            options_start = last_answer_id
            for qa in generated_qas:
                last_answer_id = len(dataset['answers'])
                dataset['questions'].append(qa['question'])
                dataset['answers'].append(qa['answer'])

                current_dialog['dialog'].append({
                    'question': len(dataset['questions']) - 1,
                    'answer': last_answer_id,
                    'answer_options': list(range(options_start, last_answer_id + 1)),
                    # The freshly generated answer is the last candidate.
                    'gt_index': last_answer_id - options_start,
                })

        dataset['dialogs'].append(current_dialog)

    if skipped_entries:
        print('Skipped {} entries without answers'.format(skipped_entries))
    if skipped_articles:
        print('Skipped {} article(s) the pipeline could not process (last error: {!r})'.format(
            skipped_articles, last_error))

    print('Ready to save dataset ...')
    with open(gen_path, 'w') as json_file:
        json.dump({'data': dataset}, json_file, indent=4)

    with open(summary_path, 'w') as json_file:
        json.dump(summary_dataset, json_file, indent=4)

    print('Created generated dataset ...')


def main():
    """Load the QA-generation pipeline and the MEDIQA data, then build the dataset.

    Reads the question-driven answer summarization JSON from Google Drive and
    hands it, together with the pipeline, to ``generate_dataset``.
    """
    # Question Driven Answer Summarization Primary Dataset path
    mediqa_ans_summ_dataset_path = 'drive/MyDrive/question_driven_answer_summarization_primary_dataset.json'

    # Load Pipeline for QA Generation
    print('Loading pipeline ...')
    qa_gen_pipeline = pipeline(
        'multitask-qa-qg', model="valhalla/t5-base-qa-qg-hl")

    # Load the source data; the context manager closes the file handle as soon
    # as parsing is done instead of leaving it to the garbage collector.
    print('Loading dataset ...')
    with open(mediqa_ans_summ_dataset_path) as dataset_file:
        json_data = json.load(dataset_file)

    # Generate Dataset with VisDial-like structure
    generate_dataset(qa_gen_pipeline, json_data)


# Entry point: inside a notebook kernel __name__ is "__main__", so the MEDIQA
# generation starts as soon as this cell is executed.
if __name__ == "__main__":
    main()


Loading pipeline ...
Loading dataset ...
Parsing dataset initiated ...
Ready to save dataset ...
Created generated dataset ...


In [None]:
# Generate Data with part of XSUM


import json
import h5py
import numpy as np
import string
import re
import nltk
import random
import torch
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from question_generation.pipelines import pipeline


def generate_xsum(xsum, gen_pipeline, num_samples=20332, checkpoint_every=100,
                  gen_path='drive/MyDrive/xgen_dataset.json',
                  summary_path='drive/MyDrive/xsummary_dataset.json'):
    """Build a dialog-style QA dataset from the first XSum training pairs.

    Each processed document becomes one dialog whose turns are the
    question/answer pairs returned by ``gen_pipeline`` for that document.

    Args:
        xsum: Mapping with a ``'train'`` split that supports ``len()`` and
            integer indexing; every item is a dict with ``'document'`` and
            ``'summary'`` keys.
        gen_pipeline: Callable taking a document string and returning an
            iterable of dicts with ``'question'`` and ``'answer'`` keys.
        num_samples: Number of leading training pairs to process. Clamped to
            the size of the split so a short split does not raise IndexError.
        checkpoint_every: Save a snapshot every this many pairs; ``0`` or
            ``None`` disables intermediate snapshots.
        gen_path: Destination of the generated dataset, written as
            ``{'data': {'questions': [...], 'answers': [...], 'dialogs': [...]}}``.
        summary_path: Destination of the ``{summary_id: summary}`` mapping.

    Returns:
        None. The results are written to ``gen_path`` and ``summary_path``.

    Documents the pipeline cannot process are skipped (they get no summary id);
    the number of skipped documents is reported on stdout.
    """
    dataset = {'questions': [], 'answers': [], 'dialogs': []}
    summary_dataset = {}

    def save_snapshot():
        # Single place that serialises both outputs; used for the periodic
        # checkpoints and for the final save.
        with open(gen_path, 'w') as json_file:
            json.dump({'data': dataset}, json_file, indent=4)
        with open(summary_path, 'w') as json_file:
            json.dump(summary_dataset, json_file, indent=4)

    train_split = xsum['train']
    total = min(num_samples, len(train_split))
    skipped_documents = 0
    last_error = None

    print('Parsing dataset initiated ...')
    for index in range(total):
        # Periodic checkpoint so a long run can be resumed or inspected. The
        # snapshot at index 0 also verifies the output paths are writable
        # before the first expensive pipeline call.
        if checkpoint_every and index % checkpoint_every == 0:
            save_snapshot()

        pair = train_split[index]

        try:
            generated_qas = gen_pipeline(pair['document'])
        except Exception as exc:  # best effort: one bad document must not abort the run
            skipped_documents += 1
            last_error = exc
            continue

        # Summaries live in their own file; the dialog only stores the id.
        summary_id = str(len(summary_dataset))
        summary_dataset[summary_id] = pair['summary']

        current_dialog = {
            'summary': summary_id,
            'document': pair['document'],
            'dialog': [],
        }

        # Candidate answers of every turn start at the dialog's first answer.
        options_start = len(dataset['answers'])
        for qa in generated_qas:
            answer_id = len(dataset['answers'])
            dataset['questions'].append(qa['question'])
            dataset['answers'].append(qa['answer'])

            current_dialog['dialog'].append({
                'question': len(dataset['questions']) - 1,
                'answer': answer_id,
                'answer_options': list(range(options_start, answer_id + 1)),
                # The freshly generated answer is the last candidate.
                'gt_index': answer_id - options_start,
            })

        dataset['dialogs'].append(current_dialog)

    if skipped_documents:
        print('Skipped {} document(s) the pipeline could not process (last error: {!r})'.format(
            skipped_documents, last_error))

    print('Ready to save dataset ...')
    save_snapshot()

    print('Created generated dataset ...')
    
    
    
def main():
    """Fetch XSum, load the QA-generation pipeline and build the XSum dataset."""
    # Step 1: download / load the XSum splits.
    print('Loading dataset ...')
    from datasets import load_dataset
    xsum_splits = load_dataset("xsum")

    # Step 2: multitask T5 pipeline used for question/answer generation.
    print('Loading pipeline ...')
    generator = pipeline('multitask-qa-qg', model="valhalla/t5-base-qa-qg-hl")

    # Step 3: write the VisDial-like dataset derived from XSum.
    generate_xsum(xsum_splits, generator)


# Entry point: inside a notebook kernel __name__ is "__main__", so the XSum
# generation starts as soon as this cell is executed.
if __name__ == "__main__":
    main()

In [None]:
# Ensure Answer-Options Lengths
import json
import random

NUM_ANSWER_OPTIONS = 100  # minimum number of candidates every dialog turn must offer
PADDING_SEED = 42         # fixed seed so re-running the cell reproduces the same padding


def pad_answer_options(dataset, num_options=NUM_ANSWER_OPTIONS, rng=None):
    """Pad every turn's ``answer_options`` with random answer ids, in place.

    Args:
        dataset: Dict with an ``'answers'`` list and a ``'dialogs'`` list; each
            dialog has a ``'dialog'`` list of turns carrying ``'answer_options'``.
        num_options: Minimum length of ``answer_options`` after padding. Lists
            that are already this long (or longer) are left untouched.
        rng: Object exposing ``randrange`` (e.g. ``random.Random(seed)``);
            defaults to the ``random`` module.

    Returns:
        The same ``dataset`` object, for convenience.

    Raises:
        ValueError: If a turn needs padding but the dataset has no answers.

    Padding ids are drawn from ids not yet present in the turn, so the ground
    truth is never re-drawn as a distractor and candidates do not repeat.
    Repeats are only allowed once every answer id is already in the list
    (answer pool smaller than ``num_options``).
    """
    if rng is None:
        rng = random

    pool_size = len(dataset['answers'])
    for dialog in dataset['dialogs']:
        for turn in dialog['dialog']:
            options = turn['answer_options']
            if len(options) >= num_options:
                continue
            if pool_size == 0:
                raise ValueError('cannot pad answer_options: dataset has no answers')

            taken = set(options)
            while len(options) < num_options:
                candidate = rng.randrange(pool_size)
                # Reject ids already offered while unused ids remain.
                if candidate in taken and len(taken) < pool_size:
                    continue
                options.append(candidate)
                taken.add(candidate)
    return dataset


def pad_dataset_file(src_path, dst_path, num_options=NUM_ANSWER_OPTIONS, rng=None):
    """Load ``{'data': ...}`` from ``src_path``, pad it and write it to ``dst_path``.

    Returns the padded dataset dict.
    """
    with open(src_path) as src_file:
        dataset = json.load(src_file)['data']

    pad_answer_options(dataset, num_options, rng)

    with open(dst_path, 'w') as dst_file:
        json.dump({'data': dataset}, dst_file, indent=4)
    return dataset


if __name__ == '__main__':
    padding_rng = random.Random(PADDING_SEED)

    medqa = pad_dataset_file('drive/MyDrive/gen_dataset.json',
                             'drive/MyDrive/gen_dataset_new.json',
                             rng=padding_rng)
    print('medqa finished')

    xsum = pad_dataset_file('drive/MyDrive/xgen_dataset.json',
                            'drive/MyDrive/xgen_dataset_new.json',
                            rng=padding_rng)
    print('xsum finished')