In [1]:
import json
import os

import pandas as pd
from tqdm import tqdm

In [2]:
def construct_sections_dataset(dial_filepath, doc_filepath):
    """Flatten a dialogue file into one record per user -> agent exchange.

    Every user turn that is immediately followed by an agent turn yields one
    record pairing the user utterance with the agent reply and the document
    material the agent turn references.

    Args:
        dial_filepath: Path to a JSON file whose ``'dial_data'`` entry maps a
            domain name to a list of dialogues (each with ``'dial_id'`` and
            ``'turns'``).
        doc_filepath: Path to a JSON file whose ``'doc_data'`` entry maps
            domain -> doc_id -> document (with ``'doc_text'`` and ``'spans'``).

    Returns:
        A list of dicts with the keys ``question_id``, ``question_text``,
        ``domain``, ``answer_id``, ``utterance``, ``grounding``,
        ``section_text``, ``document_id`` and ``document_text``.
        For an agent turn without references, ``grounding``, ``section_text``
        and ``document_text`` are empty strings and ``document_id`` is None.

    Raises:
        KeyError: If a referenced document or span id is not present in the
            document file, or an expected key is missing from the input.
    """
    with open(dial_filepath, 'r', encoding='utf-8') as f:
        questions_dataset = json.load(f)['dial_data']

    with open(doc_filepath, 'r', encoding='utf-8') as f:
        doc_dataset = json.load(f)['doc_data']

    sections_dataset = []
    for domain, domain_dials in questions_dataset.items():
        for dial in tqdm(domain_dials):
            turns = dial['turns']
            # Walk consecutive (turn, next turn) pairs; i is the index of the first one.
            for i, (turn, next_turn) in enumerate(zip(turns, turns[1:])):
                if turn['role'] != 'user' or next_turn['role'] != 'agent':
                    continue

                # Reset the grounding fields for every exchange so that an
                # ungrounded agent turn can neither hit undefined names nor
                # inherit the grounding of the previous exchange.
                doc_id = None
                doc_text = ""
                spans_text = ""
                section_text = ""

                references = next_turn.get('references') or []
                if references:
                    # All spans are looked up in the document named by the first
                    # reference (assumes every reference of a turn points into
                    # that same document -- TODO confirm against the data).
                    doc_id = references[0]['doc_id']
                    document = doc_dataset[domain][doc_id]
                    doc_text = document['doc_text']
                    for ref in references:
                        span = document['spans'][ref['id_sp']]
                        # Span texts are concatenated, each followed by one space.
                        spans_text += span['text_sp'] + " "
                        # The section of the last referenced span wins.
                        section_text = span['text_sec']
                else:
                    # Diagnostic banner for agent turns that carry no grounding.
                    print(f"{dial['dial_id']}_{i}".center(50, '='))

                sections_dataset.append({
                    'question_id': f"{dial['dial_id']}_{i+1}",
                    'question_text': turn['utterance'],
                    'domain': domain,
                    'answer_id': f"{dial['dial_id']}_{i+2}",
                    'utterance': next_turn['utterance'],
                    'grounding': spans_text,
                    'section_text': section_text,
                    'document_id': doc_id,
                    'document_text': doc_text,
                })
    return sections_dataset

In [None]:
# Flatten each dialogue split into per-exchange records; all three splits
# are resolved against the same document file.
train = construct_sections_dataset(
    dial_filepath='multidoc2dial/multidoc2dial_dial_train.json',
    doc_filepath='multidoc2dial/multidoc2dial_doc.json',
)
val = construct_sections_dataset(
    dial_filepath='multidoc2dial/multidoc2dial_dial_validation.json',
    doc_filepath='multidoc2dial/multidoc2dial_doc.json',
)
test = construct_sections_dataset(
    dial_filepath='multidoc2dial/multidoc2dial_dial_test.json',
    doc_filepath='multidoc2dial/multidoc2dial_doc.json',
)

# One DataFrame per split, built from the record lists above in the same order.
df_train, df_val, df_test = map(pd.DataFrame, (train, val, test))



In [4]:
# Persist every split as a tab-separated file without the index column.
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (needed on a fresh checkout).
os.makedirs('dataset', exist_ok=True)
df_train.to_csv('dataset/multidoc2dial_train_set.csv', sep='\t', index=False)
df_val.to_csv('dataset/multidoc2dial_validation_set.csv', sep='\t', index=False)
df_test.to_csv('dataset/multidoc2dial_test_set.csv', sep='\t', index=False)


In [7]:
def generate_traininset(path_in, path_out):
    """Convert a flattened sections file into a two-column training file.

    Reads the tab-separated file at ``path_in`` (it must contain the columns
    ``question_text``, ``grounding`` and ``utterance``) and writes a
    tab-separated file to ``path_out`` with the columns:

    * ``question+grounding`` -- ``"<question_text> <sep> <grounding>"``
    * ``utterance`` -- the utterance text, unchanged

    Args:
        path_in: Path of the tab-separated input file.
        path_out: Path of the tab-separated output file to write.

    Returns:
        None. The result is written to ``path_out``.
    """
    # Read every field as literal text. With the default NA handling, empty
    # groundings and utterances such as "None", "NA" or "null" are parsed as
    # NaN and would end up in the training data as the string "nan".
    df = pd.read_csv(path_in, sep='\t', dtype=str, keep_default_na=False)

    dataset = [
        {
            'question+grounding': f"{question} <sep> {grounding}",
            'utterance': f"{utterance}",
        }
        for question, grounding, utterance in zip(
            df['question_text'], df['grounding'], df['utterance']
        )
    ]

    # Naming the columns explicitly keeps the header even when there are no rows.
    df_out = pd.DataFrame(dataset, columns=['question+grounding', 'utterance'])
    df_out.to_csv(path_out, sep='\t', index=False)

In [9]:
# Derive the two-column model inputs from the flattened train and validation
# files; the test split is not converted in this cell.
generate_traininset(
    path_in='dataset/multidoc2dial_train_set.csv',
    path_out='dataset/train_set.csv',
)
generate_traininset(
    path_in='dataset/multidoc2dial_validation_set.csv',
    path_out='dataset/eval_set.csv',
)