In [1]:
import os
# Work from the Colab workspace root so the relative paths used later
# (e.g. 'drive/MyDrive/...') resolve under /content/.
os.chdir("/content/")

In [3]:
from google.colab import drive
# Mount Google Drive at /content/drive; the source dataset is read from, and the
# generated files are written to, drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
!pip install -U transformers==3.0.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==3.0.0
  Downloading transformers-3.0.0-py3-none-any.whl (754 kB)
[K     |████████████████████████████████| 754 kB 24.0 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 59.1 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 72.2 MB/s 
Collecting tokenizers==0.8.0-rc4
  Downloading tokenizers-0.8.0rc4-cp38-cp38-manylinux1_x86_64.whl (3.0 MB)
[K     |████████████████████████████████| 3.0 MB 67.9 MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.53-py3-none-any.whl size=895260 sha256=03cb023f79a3f4ee9fcc80de73d60355e70c0aa634873d801f976

In [5]:
# Download the NLTK "punkt" tokenizer data.
# NOTE(review): presumably required by the question_generation pipeline — confirm.
!python -m nltk.downloader punkt

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [9]:
# Imported here because this cell runs before the main import cell; without it a
# fresh "Restart & Run All" fails with NameError on `torch`.
import torch

# Report whether PyTorch can see a CUDA device.
torch.cuda.is_available()

True

In [6]:
# Clone the repository that provides the `question_generation.pipelines` module
# imported further down.
# NOTE(review): no commit/tag is pinned, so this fetches the current default
# branch — consider pinning for reproducibility.
!git clone https://github.com/patil-suraj/question_generation.git

Cloning into 'question_generation'...
remote: Enumerating objects: 268, done.[K
remote: Total 268 (delta 0), reused 0 (delta 0), pack-reused 268[K
Receiving objects: 100% (268/268), 299.04 KiB | 8.54 MiB/s, done.
Resolving deltas: 100% (140/140), done.


In [11]:
import json
import h5py
import numpy as np
import string
import re
import nltk
import random
import torch
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from question_generation.pipelines import pipeline


def generate_dataset(gen_pipeline, data: dict,
                     dataset_path: str = 'drive/MyDrive/gen_dataset.json',
                     summary_path: str = 'drive/MyDrive/summary_dataset.json') -> None:
    """Build a VisDial-like dialog dataset from question/answer entries and save it as JSON.

    For every entry in ``data`` the original question and all of its answer
    summaries are stored, then ``gen_pipeline`` is run on each answer's article
    to produce additional question/answer pairs. Each stored question becomes
    one dialogue turn that references questions and answers by their index in
    the global ``questions`` / ``answers`` lists.

    Args:
        gen_pipeline: Callable invoked with an article and returning an
            iterable of mappings that have ``'question'`` and ``'answer'`` keys.
        data: Mapping of entry id to an entry with the keys ``'question'``,
            ``'multi_ext_summ'`` and ``'answers'``; ``'answers'`` maps an answer
            id to a mapping with ``'answer_ext_summ'`` and ``'article'``.
        dataset_path: Destination of the generated dataset, written as
            ``{'data': {'questions': [...], 'answers': [...], 'dialogs': [...]}}``.
        summary_path: Destination of the summary dataset, a mapping of summary
            id (as a string) to the entry's ``'multi_ext_summ'`` value.

    Returns:
        None. The two JSON files are the only result.

    Raises:
        KeyError: If an entry lacks one of the required keys.
        OSError: If an output file cannot be written.
    """
    questions, answers, dialogs = [], [], []
    summary_dataset = {}

    print('Parsing dataset initiated ...')
    for key, entry in data.items():
        entry_answers = entry['answers']
        answer_keys = list(entry_answers.keys())

        # An entry without answers has no document and no ground-truth answer;
        # skip it before anything is appended so the indices stay consistent.
        if not answer_keys:
            print(f'Skipping entry {key!r}: it has no answers ...')
            continue

        # Indices the original question and its first answer will occupy.
        first_question_id = len(questions)
        first_answer_id = len(answers)

        questions.append(entry['question'])
        answers.extend(
            entry_answers[answer_key]['answer_ext_summ'] for answer_key in answer_keys)

        # Store the summary separately; the dialog only keeps its id.
        summary_id = str(len(summary_dataset))
        summary_dataset[summary_id] = entry['multi_ext_summ']

        current_dialog = {
            'summary': summary_id,
            # The first article serves as the dialog's document.
            'document': entry_answers[answer_keys[0]]['article'],
            # First turn: the original question, with every answer of this
            # entry as an option and the first one as ground truth.
            'dialog': [{
                'question': str(first_question_id),
                'answer': str(first_answer_id),
                'answer_options': list(range(first_answer_id, len(answers))),
                'gt_index': '0',
            }],
        }

        # Generate more questions and answers with the transformer pipeline.
        options_start = first_answer_id
        for answer_key in answer_keys:
            try:
                generated_qas = gen_pipeline(entry_answers[answer_key]['article'])
            except Exception as error:
                # Best effort: one failing article must not abort the whole run,
                # but the failure is reported instead of silently dropped.
                # KeyboardInterrupt/SystemExit are deliberately not caught.
                print(f'Skipping generation for entry {key!r}, '
                      f'answer {answer_key!r}: {error!r}')
                continue

            # NOTE(review): for the first article the option window starts at
            # this entry's first answer; for later articles it starts at the
            # last answer generated for the previous article. This mirrors the
            # existing behaviour — confirm that it is intended.
            article_options_start = options_start
            for qa in generated_qas:
                question_id = len(questions)
                answer_id = len(answers)

                questions.append(qa['question'])
                answers.append(qa['answer'])

                # The new answer is always the last option, hence the gt index.
                current_dialog['dialog'].append({
                    'question': str(question_id),
                    'answer': str(answer_id),
                    'answer_options': list(range(article_options_start, answer_id + 1)),
                    'gt_index': str(answer_id - article_options_start),
                })
                options_start = answer_id

        dialogs.append(current_dialog)

    print('Ready to save dataset ...')
    dataset_to_create = {
        'data': {
            'questions': questions,
            'answers': answers,
            'dialogs': dialogs,
        }
    }

    with open(dataset_path, 'w') as json_file:
        json.dump(dataset_to_create, json_file, indent=4)

    with open(summary_path, 'w') as json_file:
        json.dump(summary_dataset, json_file, indent=4)

    print('Created generated dataset ...')


def main():
    """Load the QA-generation pipeline and the source dataset, then generate the dialog dataset.

    Reads the question-driven answer summarization dataset from Google Drive,
    builds the ``multitask-qa-qg`` pipeline and hands both to
    ``generate_dataset``, which writes the resulting JSON files.
    """
    # Question Driven Answer Summarization Primary Dataset path
    mediqa_ans_summ_dataset_path = 'drive/MyDrive/question_driven_answer_summarization_primary_dataset.json'

    # Load Pipeline for QA Generation
    print('Loading pipeline ...')
    qa_gen_pipeline = pipeline(
        'multitask-qa-qg', model="valhalla/t5-base-qa-qg-hl")

    # Generate Dataset with VisDial-like structure
    print('Loading dataset ...')
    # Use a context manager so the file handle is closed deterministically;
    # JSON text is UTF-8, so do not depend on the locale's default encoding.
    with open(mediqa_ans_summ_dataset_path, encoding='utf-8') as dataset_file:
        json_data = json.load(dataset_file)

    generate_dataset(qa_gen_pipeline, json_data)


# Run the generation when this code is executed directly (script or notebook
# cell, where __name__ is '__main__') rather than imported.
if __name__ == '__main__':
    main()


Loading pipeline ...
Loading dataset ...
Parsing dataset initiated ...
Ready to save dataset ...
Created generated dataset ...
