# 將 Input Dataset (sentence, distractors and answer) 與 Triplet Dataset 合併

## MCQ

### Prepare Data - Input Dataset

In [1]:
import json
import os, sys
import fnmatch

In [2]:
def read_data(item: str):
    """Load one split of the cleaned MCQ input dataset.

    Args:
        item: Split name inserted into the file name
            ``./data/mcq/total_new_cleaned_{item}.json``.

    Returns:
        The object decoded from the JSON file by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    path = './data/mcq/total_new_cleaned_{}.json'.format(item)
    # Decode explicitly as UTF-8: without it open() falls back to the
    # platform locale encoding, which breaks non-ASCII text on some systems.
    with open(path, encoding='utf-8') as f:
        return json.load(f)

In [3]:
# Load both splits of the input dataset through read_data.
train = read_data(item='train')
test = read_data(item='test')

In [4]:
# Show how many entries were loaded for the train and test splits.
len(train), len(test)

(2321, 259)

In [5]:
# Inspect the field names of the first training entry.
train[0].keys()

dict_keys(['answer', 'distractors', 'sentence'])

### Prepare Data - Triplet Dataset

In [6]:
def read_triplet_data(item: str):
    """Load the triplet file for one split.

    Args:
        item: Split name inserted into the file name
            ``./data/mcq/triplets/generate_masked_lm/{item}.triplet.json``.

    Returns:
        The object decoded from the JSON file by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    path = './data/mcq/triplets/generate_masked_lm/{}.triplet.json'.format(item)
    # Decode explicitly as UTF-8: without it open() falls back to the
    # platform locale encoding, which breaks non-ASCII text on some systems.
    with open(path, encoding='utf-8') as f:
        return json.load(f)

In [7]:
# Load the triplet lists for both splits through read_triplet_data.
train_triplet = read_triplet_data(item='train')
test_triplet = read_triplet_data(item='test')

In [8]:
# Show how many triplet groups were loaded for the train and test splits.
len(train_triplet), len(test_triplet)

(2321, 259)

Remove duplicate triplets

In [9]:
# Drop duplicate triplets inside every group, in place, for both splits.
# dict.fromkeys keeps the first occurrence of each triplet in its original
# position, whereas set() yields an order that depends on per-process string
# hashing, which would make the order of equal-ranked triplets vary by run.
for triplet_groups in (train_triplet, test_triplet):
    for i, group in enumerate(triplet_groups):
        # Triplets are lists (unhashable), so use tuples as the dedupe keys
        # and convert back to lists afterwards.
        unique = dict.fromkeys(tuple(element) for element in group)
        triplet_groups[i] = [list(t) for t in unique]

### Merge triplet into input data

In [10]:
def merge_data(data, triplet):
    """Combine each input entry with the triplets stored at the same index.

    Args:
        data: Sequence of mappings providing the keys 'sentence',
            'distractors' and 'answer'.
        triplet: Sequence indexed in parallel with ``data``; each item is a
            collection of triplets whose element at index 3 is used as the
            sort key (presumably a score; confirm against the triplet
            generator).

    Returns:
        A new list with one dict per entry of ``data`` holding the keys
        'sentence', 'distractors', 'answer' and 'triplets', where 'triplets'
        is sorted by element 3 in descending order. The inputs are not
        modified.
    """
    def fourth_field(entry):
        return entry[3]

    return [
        {
            'sentence': example['sentence'],
            'distractors': example['distractors'],
            'answer': example['answer'],
            # Stable sort: triplets with equal keys keep their input order.
            'triplets': sorted(triplet[position], key=fourth_field, reverse=True),
        }
        for position, example in enumerate(data)
    ]

In [12]:
# Rebind each split to its merged version (input entries plus sorted triplets).
train = merge_data(data=train, triplet=train_triplet)
test = merge_data(data=test, triplet=test_triplet)

In [13]:
# Show the number of entries in each split after merging.
len(train), len(test)

(2321, 259)

In [14]:
# Inspect the field names of the first merged training entry.
train[0].keys()

dict_keys(['sentence', 'distractors', 'answer', 'triplets'])

### Save File

In [17]:
def save_data(item: str, data) -> None:
    """Write ``data`` as JSON to ``./data/mcq/reranker.{item}.json``.

    Args:
        item: Split name inserted into the output file name.
        data: JSON-serializable object to store.

    Raises:
        TypeError: If ``data`` contains values ``json.dumps`` cannot encode.
        OSError: If the output file cannot be opened or written.
    """
    path = './data/mcq/reranker.{}.json'.format(item)
    # Serialize before opening the file so that a serialization error cannot
    # truncate an existing output file.
    json_string = json.dumps(data)
    # The context manager closes the handle even if the write fails.
    with open(path, "w", encoding="utf-8") as json_file:
        json_file.write(json_string)

In [18]:
# Persist both merged splits through save_data.
save_data(item='train', data=train)
save_data(item='test', data=test)