# Data preprocessing for [MLQA](https://github.com/facebookresearch/MLQA?tab=readme-ov-file) [dataset](https://dl.fbaipublicfiles.com/MLQA/MLQA_V1.zip)

In [1]:
import json
import random 

# Check data architecture

## English data

In [2]:
file_path = './data/MLQA/test/test-context-en-question-en.json'

# Decode explicitly as UTF-8 instead of relying on the platform's locale
# encoding (JSON interchange text is UTF-8 per RFC 8259); otherwise non-ASCII
# characters can fail to decode or be silently garbled on some systems.
with open(file_path, 'r', encoding='utf-8') as f:
    data_en = json.load(f)

def collect_keys(obj, depth=0, keys_per_level=None):
    """Collect the dictionary keys found at each nesting depth of a JSON-like object.

    Dict keys are recorded at the current depth and their values are scanned one
    level deeper. Lists do not add a level: their items are scanned at the same
    depth as the list itself. Any other value is ignored.

    Args:
        obj: Nested structure of dicts, lists and scalars.
        depth: Depth assigned to the keys of ``obj`` when it is a dict.
        keys_per_level: Optional ``{depth: set_of_keys}`` dict to accumulate
            into. A new dict is created for every call that omits it, so
            separate calls never share results.

    Returns:
        The ``{depth: set_of_keys}`` mapping (the same object as
        ``keys_per_level`` when one was passed in).
    """
    # A `{}` default would be created once and shared by every call, mixing
    # the keys of unrelated inputs; use None as the sentinel instead.
    if keys_per_level is None:
        keys_per_level = {}

    if isinstance(obj, dict):
        level_keys = keys_per_level.setdefault(depth, set())
        for k, v in obj.items():
            level_keys.add(k)
            collect_keys(v, depth + 1, keys_per_level)
    elif isinstance(obj, list):
        for item in obj:
            collect_keys(item, depth, keys_per_level)

    return keys_per_level

# Summarise which keys occur at each nesting depth of the English file.
keys_per_level = collect_keys(data_en)

for depth_idx in keys_per_level:
    joined_keys = ', '.join(keys_per_level[depth_idx])
    print(f"Level {depth_idx}: {joined_keys}")

Level 0: version, data
Level 1: title, paragraphs
Level 2: qas, context
Level 3: answers, id, question
Level 4: text, answer_start


- version
- data
    - title
    - paragraphs 
        - context 
        - qas
            - id
            - question
            - answers
                - text
                - answer_start

Each `paragraphs` list contains multiple entries, each with its own `context` and `qas`; a single context can also have multiple QA pairs.

```
{'title': 'Cell culture',
 'paragraphs': 
      [ {'context': 'An established or immortalized of the telomerase gene....',
         'qas': 
          [{
            'question': 'What thing composes the line?',
            'answers': 
              [{
                'text': 'cell', 
                'answer_start': 31
              }],
            'id': '037e8929e7e4d2f949ffbabd10f0f860499ff7c9'
          }]
        },
        {'context': 'The 19th-century English physiologist Sydney Ringer developed salt solutions......',
         'qas': 
          [{
            'question': 'When did Roux remove some of his medullary plate?',
            'answers': 
              [{
                'text': '1885', 
                'answer_start': 232
              }],
            'id': '4b36724f3cbde7c287bde512ff09194cbba7f932'
           },
           {
            'question': 'When were cell culture techniques significantly advanced?',
            'answers': 
              [{
                'text': 'the 1940s and 1950s', 
                'answer_start': 677
              }],
            'id': 'c8acddd587c933917a0a09a214aee83c30764a0d'
          }]
        }
      ]
}
```

In [3]:
# Flatten the article -> paragraph nesting: one list with every context string
# and one list with every QA record, both in file order.
paragraphs_en = [
    para
    for article in data_en["data"]
    for para in article["paragraphs"]
]
all_contexts_en = [para["context"] for para in paragraphs_en]
all_qas_en = [qa for para in paragraphs_en for qa in para["qas"]]

In [4]:
# Peek at the first flattened context and the first QA record.
all_contexts_en[0], all_qas_en[0]
# print(json.dumps(all_qas_en, indent=2))

('In 1994, five unnamed civilian contractors and the widows of contractors Walter Kasza and Robert Frost sued the USAF and the United States Environmental Protection Agency. Their suit, in which they were represented by George Washington University law professor Jonathan Turley, alleged they had been present when large quantities of unknown chemicals had been burned in open pits and trenches at Groom. Biopsies taken from the complainants were analyzed by Rutgers University biochemists, who found high levels of dioxin, dibenzofuran, and trichloroethylene in their body fat. The complainants alleged they had sustained skin, liver, and respiratory injuries due to their work at Groom, and that this had contributed to the deaths of Frost and Kasza. The suit sought compensation for the injuries they had sustained, claiming the USAF had illegally handled toxic materials, and that the EPA had failed in its duty to enforce the Resource Conservation and Recovery Act (which governs handling of dan

In [5]:
# Report how many contexts and QA records the English split contains.
n_contexts_en = len(all_contexts_en)
n_qas_en = len(all_qas_en)
print(f"For English data:\n\tNumber of Contexts: {n_contexts_en}")
print(f"\tNumber of QA pairs: {n_qas_en}")

For English data:
	Number of Contexts: 9916
	Number of QA pairs: 11590


## Chinese data

In [6]:
file_path = './data/MLQA/test/test-context-zh-question-zh.json'

# Decode explicitly as UTF-8: the Chinese text must not depend on the
# platform's locale encoding, which is not UTF-8 on every system.
with open(file_path, 'r', encoding='utf-8') as f:
    data_zh = json.load(f)

# Pass a fresh accumulator so the result describes the Chinese file only and
# cannot contain keys gathered by an earlier collect_keys() call.
keys_per_level = collect_keys(data_zh, keys_per_level={})
for level, keys in keys_per_level.items():
    print(f"Level {level}: {', '.join(keys)}")

Level 0: version, data
Level 1: title, paragraphs
Level 2: qas, context
Level 3: answers, id, question
Level 4: text, answer_start


In [7]:
# Flatten the article -> paragraph nesting: one list with every context string
# and one list with every QA record, both in file order.
paragraphs_zh = [
    para
    for article in data_zh["data"]
    for para in article["paragraphs"]
]
all_contexts_zh = [para["context"] for para in paragraphs_zh]
all_qas_zh = [qa for para in paragraphs_zh for qa in para["qas"]]

In [8]:
# Peek at the first flattened context and the first QA record.
all_contexts_zh[0], all_qas_zh[0]
# print(json.dumps(all_qas_zh, indent=2))

('在电路学里，电动势（英语：electromotive force，缩写为emf）表征一些电路元件供应电能的特性。这些电路元件称为「电动势源」。电化电池、太阳能电池、燃料电池、热电装置、发电机等等，都是电动势源。电动势源所供应的能量每单位电荷是其电动势。假设，电荷',
 {'question': '各电化电池都能提供电动势？',
  'answers': [{'text': '电化电池', 'answer_start': 71}],
  'id': '465f3fb044b5c50a78a2e2f9bc94c424d1f7d039'})

In [9]:
# Report how many contexts and QA records the Chinese split contains.
n_contexts_zh = len(all_contexts_zh)
n_qas_zh = len(all_qas_zh)
print(f"For Chinese data:\n\tNumber of Contexts: {n_contexts_zh}")
print(f"\tNumber of QA pairs: {n_qas_zh}")

For Chinese data:
	Number of Contexts: 4546
	Number of QA pairs: 5137


## Check if all zh data overlap with en data

In [10]:
# Gather the QA ids of each language so the two collections can be compared.
ids_en = [qa_record['id'] for qa_record in all_qas_en]
ids_zh = [qa_record['id'] for qa_record in all_qas_zh]
(len(ids_en), len(ids_zh))

(11590, 5137)

In [11]:
# Count how many Chinese QA ids also occur among the English ids.
# Membership is tested against a set (constant time per lookup) rather than
# scanning the whole English list for every id, and the loop variable no
# longer shadows the builtin `id`.
ids_en_lookup = set(ids_en)
count = sum(1 for qa_id in ids_zh if qa_id in ids_en_lookup)
count

5137

```
Summary:

1. Dataset architecture:
- version
- data
    - title
    - paragraphs 
        - context 
        - qas
            - id
            - question
            - answers
                - text
                - answer_start

Each `paragraphs` list may contain multiple contexts, and each context may have multiple QA pairs.

For English data:
	Number of Contexts: 9916
	Number of QA pairs: 11590

For Chinese data:
	Number of Contexts: 4546
	Number of QA pairs: 5137

All 5137 Chinese QAs can be matched to the English dataset by their IDs.
```

# Check data quality

## Manual check of the quality of QA data

In [12]:
# Draw one random article for manual inspection. random.choice samples from
# the whole list, so no hard-coded upper index has to match its length.
random.choice(data_zh['data'])

{'title': '那不勒斯王国',
 'paragraphs': [{'context': '查理八世于1495年将阿方索二世赶出那不勒斯。但不久后被迫撤离。这是由于费尔南多得到费尔南多二世的支持。费尔南多恢复王权，但是于1496年去世。',
   'qas': [{'question': '费伦蒂诺什么时候死的?',
     'answers': [{'text': '1496年', 'answer_start': 64}],
     'id': '0ed544d1945001e9636bc50875837a971bae1d46'}]}]}

```
# Mark 1 for an incorrect answer, 0 for correct.
10000,00000
00000,00000
10100,10000
00000,00000
00100,00000

error rate: 5/50 = 10%

{'title': '赤色黎明',
 'paragraphs': [{'context': '《赤色黎明》（英语：Red Dawn）是由丹·布拉德利执导的一部2012年美国战争片。剧本由卡尔·埃尔斯沃斯和杰里米·帕斯（Jeremy Passmore）改编自1984年同名电影。演员阵容有克里斯·海姆斯沃斯、乔希·佩克、乔什·哈切森、阿德琳妮·帕里奇、伊莎贝尔·卢卡斯、康纳·克鲁斯和杰弗里·迪恩·摩根。影片聚焦于一群帮助家乡抵御北朝鲜入侵的年轻人。',
   'qas': [{'question': '他们想保卫哪个国家？',
     'answers': [{'text': '北朝鲜', 'answer_start': 167}],
     'id': 'd221b071d496de0aa07a11addfa5202f30edaa4c'}]}]}
```

## Add `title` to each `question`
The data is grouped by topic (title), so a question taken on its own, without its article, is often too broad or ambiguous. The article title is therefore attached to each QA pair (stored under a separate `keyword` key) so that each question can be understood in isolation.

In [13]:
# Inspect the first five English QA records.
all_qas_en[:5]

[{'question': 'Who analyzed the biopsies?',
  'answers': [{'text': 'Rutgers University biochemists', 'answer_start': 457}],
  'id': 'a4968ca8a18de16aa3859be760e43dbd3af3fce9'},
 {'question': 'who represented robert frost and walter kasza in their suit?',
  'answers': [{'text': 'George Washington University law professor Jonathan Turley',
    'answer_start': 218}],
  'id': 'f251ea56c4f1aa1df270137f7e6d89c0cc1b6ef4'},
 {'question': 'What was the law suit against Groom about',
  'answers': [{'text': 'the USAF had illegally handled toxic materials, and that the EPA had failed in its duty to enforce the Resource Conservation and Recovery Act (which governs handling of dangerous materials)',
    'answer_start': 826}],
  'id': '04ecd5555635bc05fd2f379d1b9027edd663cebf'},
 {'question': 'what did the complainants alleged happen to them?',
  'answers': [{'text': 'had sustained skin, liver, and respiratory injuries',
    'answer_start': 607}],
  'id': 'd066a75dbe8cd3e2b57c415a8eb54a08dc7e72a7'},


In [14]:
# Build QA records that also carry their article title under a `keyword` key.
qas_with_title_en = []

for item in data_en["data"]:
    title = item["title"]
    for paragraph in item["paragraphs"]:
        for qa in paragraph["qas"]:
            # Extend a shallow copy rather than the loaded record itself, so
            # `data_en` and `all_qas_en` keep their original keys.
            # Alternative (not used): prefix the question text instead, i.e.
            #   f"{title}: {qa['question']}"
            qas_with_title_en.append({**qa, 'keyword': title})

# Every QA record must have been carried over exactly once.
assert len(qas_with_title_en) == len(all_qas_en)
qas_with_title_en[:5]

[{'question': 'Who analyzed the biopsies?',
  'answers': [{'text': 'Rutgers University biochemists', 'answer_start': 457}],
  'id': 'a4968ca8a18de16aa3859be760e43dbd3af3fce9',
  'keyword': 'Area 51'},
 {'question': 'who represented robert frost and walter kasza in their suit?',
  'answers': [{'text': 'George Washington University law professor Jonathan Turley',
    'answer_start': 218}],
  'id': 'f251ea56c4f1aa1df270137f7e6d89c0cc1b6ef4',
  'keyword': 'Area 51'},
 {'question': 'What was the law suit against Groom about',
  'answers': [{'text': 'the USAF had illegally handled toxic materials, and that the EPA had failed in its duty to enforce the Resource Conservation and Recovery Act (which governs handling of dangerous materials)',
    'answer_start': 826}],
  'id': '04ecd5555635bc05fd2f379d1b9027edd663cebf',
  'keyword': 'Area 51'},
 {'question': 'what did the complainants alleged happen to them?',
  'answers': [{'text': 'had sustained skin, liver, and respiratory injuries',
    'ans

In [15]:
# Inspect the first five Chinese QA records.
all_qas_zh[:5]

[{'question': '各电化电池都能提供电动势？',
  'answers': [{'text': '电化电池', 'answer_start': 71}],
  'id': '465f3fb044b5c50a78a2e2f9bc94c424d1f7d039'},
 {'question': '哪水体有助土地如此多产？',
  'answers': [{'text': '楚河', 'answer_start': 36}],
  'id': '1aee17dd937cc1043e3ff47c38396541fc3409e5'},
 {'question': '它用来写什么类型的记录？',
  'answers': [{'text': '法律、行政和私人记录', 'answer_start': 90}],
  'id': 'c1100f360fed1386068a5dc584b875cc9aefb60a'},
 {'question': '在哪帝国期间凯提文广受使用？',
  'answers': [{'text': '莫卧儿帝国期间', 'answer_start': 28}],
  'id': '89325aff92794352bde6c064b6160e601aed56b6'},
 {'question': '爱丽丝怎样恢复她原来的身高？',
  'answers': [{'text': '经过一番努力', 'answer_start': 224}],
  'id': '9fd571d90b8081f45cfd263c961c131c257634c2'}]

In [16]:
# Build QA records that also carry their article title under a `keyword` key.
qas_with_title_zh = []

for item in data_zh["data"]:
    title = item["title"]
    for paragraph in item["paragraphs"]:
        for qa in paragraph["qas"]:
            # Extend a shallow copy rather than the loaded record itself, so
            # `data_zh` and `all_qas_zh` keep their original keys.
            # Alternative (not used): prefix the question text instead, i.e.
            #   f"{title}: {qa['question']}"
            qas_with_title_zh.append({**qa, 'keyword': title})

# Every QA record must have been carried over exactly once.
assert len(qas_with_title_zh) == len(all_qas_zh)
qas_with_title_zh[:5]

[{'question': '各电化电池都能提供电动势？',
  'answers': [{'text': '电化电池', 'answer_start': 71}],
  'id': '465f3fb044b5c50a78a2e2f9bc94c424d1f7d039',
  'keyword': '電動勢'},
 {'question': '哪水体有助土地如此多产？',
  'answers': [{'text': '楚河', 'answer_start': 36}],
  'id': '1aee17dd937cc1043e3ff47c38396541fc3409e5',
  'keyword': '楚河州'},
 {'question': '它用来写什么类型的记录？',
  'answers': [{'text': '法律、行政和私人记录', 'answer_start': 90}],
  'id': 'c1100f360fed1386068a5dc584b875cc9aefb60a',
  'keyword': '凱提文'},
 {'question': '在哪帝国期间凯提文广受使用？',
  'answers': [{'text': '莫卧儿帝国期间', 'answer_start': 28}],
  'id': '89325aff92794352bde6c064b6160e601aed56b6',
  'keyword': '凱提文'},
 {'question': '爱丽丝怎样恢复她原来的身高？',
  'answers': [{'text': '经过一番努力', 'answer_start': 224}],
  'id': '9fd571d90b8081f45cfd263c961c131c257634c2',
  'keyword': '爱丽丝梦游仙境'}]

# Output data

In [18]:
# Add id for context
all_contexts_with_id_en = [{"id": i, "context": context} for i, context in enumerate(all_contexts_en)]

with open('./data/Context_EN.json', 'w') as file:
    json.dump(all_contexts_with_id_en, file, indent=2)

all_contexts_with_id_zh = [{"id": i, "context": context} for i, context in enumerate(all_contexts_zh)]

with open('./data/Context_ZH.json', 'w') as file:
    json.dump(all_contexts_with_id_zh, file, indent=2)

In [20]:
# Save the title-annotated QA records, one JSON file per language.
qa_outputs = (
    ('./data/QA_EN.json', qas_with_title_en),
    ('./data/QA_ZH.json', qas_with_title_zh),
)
for out_path, qa_records in qa_outputs:
    with open(out_path, 'w') as out_file:
        json.dump(qa_records, out_file, indent=2)