# Study on HotpotQA

In [1]:
%load_ext autoreload
%autoreload 2

from IPython.display import display, HTML
import os
from os import path

import sys
sys.path.append("./../src")  # lets the notebook import modules that live under ../src

# Download/cache directory located one level above the current working directory.
cache_path = path.join(os.getcwd(), os.pardir, '.cache')

# Dated scratch directory (relative to the working directory) for intermediate results;
# created up front so later cells can write into it.
tmp_path = path.join('.cache', '2022-07-26')
os.makedirs(tmp_path, exist_ok=True)

In [2]:
from datasets import load_dataset

# Training split of the HotpotQA 'distractor' configuration, cached under `cache_path`.
train_set = load_dataset(
    'hotpot_qa',
    'distractor',
    split='train',
    cache_dir=cache_path,
)

Reusing dataset hotpot_qa (/Users/dunguyen/Projects/explanation_on_pair_sequences_task/notebooks/../.cache/hotpot_qa/distractor/1.0.0/133b9501f892e5193babbad937bee3b4899deb4691ef4d791e6ac0111c875bb5)


In [3]:
# Convert the loaded split to a pandas DataFrame and print its column summary.
df_train = train_set.to_pandas()
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90447 entries, 0 to 90446
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                90447 non-null  object
 1   question          90447 non-null  object
 2   answer            90447 non-null  object
 3   type              90447 non-null  object
 4   level             90447 non-null  object
 5   supporting_facts  90447 non-null  object
 6   context           90447 non-null  object
dtypes: object(7)
memory usage: 4.8+ MB


In [4]:
import spacy

# spaCy pipeline used for lemmatisation here and for entity detection in later cells.
spacy_model = spacy.load('en_core_web_sm')

# Print each of the first ten questions followed by the lemma of every token.
lemma_sample = df_train['question'][:10]
for parsed in spacy_model.pipe(lemma_sample):
    lemmas = [token.lemma_ for token in parsed]
    print(parsed)
    print(lemmas)

Which magazine was started first Arthur's Magazine or First for Women?
['which', 'magazine', 'be', 'start', 'first', 'Arthur', "'s", 'Magazine', 'or', 'first', 'for', 'Women', '?']
The Oberoi family is part of a hotel company that has a head office in what city?
['the', 'oberoi', 'family', 'be', 'part', 'of', 'a', 'hotel', 'company', 'that', 'have', 'a', 'head', 'office', 'in', 'what', 'city', '?']
Musician and satirist Allie Goertz wrote a song about the "The Simpsons" character Milhouse, who Matt Groening named after who?
['musician', 'and', 'satirist', 'Allie', 'Goertz', 'write', 'a', 'song', 'about', 'the', '"', 'the', 'Simpsons', '"', 'character', 'Milhouse', ',', 'who', 'Matt', 'Groening', 'name', 'after', 'who', '?']
 What nationality was James Henry Miller's wife?
[' ', 'what', 'nationality', 'be', 'James', 'Henry', 'Miller', "'s", 'wife', '?']
Cadmium Chloride is slightly soluble in this chemical, it is also called what?
['Cadmium', 'Chloride', 'be', 'slightly', 'soluble', 'in

In [5]:
from transformers import AutoTokenizer

# Tokenizer checkpoints to compare on the same sample of questions.
pretrained_tokenizer = [
    'gpt2', 'distilbert-base-uncased', 'bert-base-uncased', 'dslim/bert-base-NER'
]

# Take the sample once so the questions printed and the questions tokenized
# always come from the same list (the size used to be hard-coded in two places).
tokenizer_sample = df_train['question'][:5].tolist()

for question in tokenizer_sample:
    print(question)

for pretrained in pretrained_tokenizer:
    print('='*30)
    tokenizer = AutoTokenizer.from_pretrained(pretrained, cache_dir=cache_path)
    # One line per question: checkpoint name followed by its token list.
    for question in tokenizer_sample:
        print(pretrained, ':', tokenizer.tokenize(question))

Which magazine was started first Arthur's Magazine or First for Women?
The Oberoi family is part of a hotel company that has a head office in what city?
Musician and satirist Allie Goertz wrote a song about the "The Simpsons" character Milhouse, who Matt Groening named after who?
 What nationality was James Henry Miller's wife?
Cadmium Chloride is slightly soluble in this chemical, it is also called what?
gpt2 : ['Which', 'Ġmagazine', 'Ġwas', 'Ġstarted', 'Ġfirst', 'ĠArthur', "'s", 'ĠMagazine', 'Ġor', 'ĠFirst', 'Ġfor', 'ĠWomen', '?']
gpt2 : ['The', 'ĠOber', 'oi', 'Ġfamily', 'Ġis', 'Ġpart', 'Ġof', 'Ġa', 'Ġhotel', 'Ġcompany', 'Ġthat', 'Ġhas', 'Ġa', 'Ġhead', 'Ġoffice', 'Ġin', 'Ġwhat', 'Ġcity', '?']
gpt2 : ['Mus', 'ician', 'Ġand', 'Ġsatir', 'ist', 'ĠAll', 'ie', 'ĠGo', 'ert', 'z', 'Ġwrote', 'Ġa', 'Ġsong', 'Ġabout', 'Ġthe', 'Ġ"', 'The', 'ĠSimpsons', '"', 'Ġcharacter', 'ĠMil', 'house', ',', 'Ġwho', 'ĠMatt', 'ĠGro', 'ening', 'Ġnamed', 'Ġafter', 'Ġwho', '?']
gpt2 : ['ĠWhat', 'Ġnationality', 'Ġwa

In [6]:
# First ten questions as a plain Python list.
df_train['question'].head(10).tolist()

["Which magazine was started first Arthur's Magazine or First for Women?",
 'The Oberoi family is part of a hotel company that has a head office in what city?',
 'Musician and satirist Allie Goertz wrote a song about the "The Simpsons" character Milhouse, who Matt Groening named after who?',
 " What nationality was James Henry Miller's wife?",
 'Cadmium Chloride is slightly soluble in this chemical, it is also called what?',
 'Which tennis player won more Grand Slam titles, Henri Leconte or Jonathan Stark?',
 "Which genus of moth in the world's seventh-largest country contains only one species?",
 'Who was once considered the best kick boxer in the world, however he has been involved in a number of controversies relating to his "unsportsmanlike conducts" in the sport and crimes of violence outside of the ring.',
 'The Dutch-Belgian television series that "House of Anubis" was based on first aired in what year?',
 'What is the length of the track where the 2013 Liqui Moly Bathurst 12 Ho

In [7]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

# Single source of truth for the checkpoint so model and tokenizer cannot diverge.
NER_CHECKPOINT = 'dslim/bert-base-NER'

ner_model = AutoModelForTokenClassification.from_pretrained(NER_CHECKPOINT, cache_dir=cache_path)
ner_tokenizer = AutoTokenizer.from_pretrained(NER_CHECKPOINT, cache_dir=cache_path)

# Tag the first training question only.
inputs = ner_tokenizer(df_train['question'][0], return_tensors="pt")

# Inference only: no gradients are needed, so skip autograd bookkeeping.
with torch.no_grad():
    outputs = ner_model(**inputs).logits

# Highest-scoring label id along dim 2, then drop the leading dimension
# (presumably logits are (batch=1, tokens, labels) — TODO confirm).
predictions = torch.argmax(outputs, dim=2).squeeze(0)
labels = [ner_model.config.id2label[p.item()] for p in predictions]
print(labels)

['O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'O', 'O']


## Check % of Named Entity

Exemple des entités nommées dans les 10 premières phrases de `question`

In [8]:
# Text of every entity spaCy finds, one list per question (first ten questions).
docs = spacy_model.pipe(df_train['question'][:10])
for parsed in docs:
    entity_texts = [entity.text for entity in parsed.ents]
    print(entity_texts)

["Arthur's Magazine", 'First for Women']
['Oberoi']
['Musician', 'Allie Goertz', 'the "The Simpsons', 'Milhouse', 'Matt Groening']
["James Henry Miller's"]
['Cadmium Chloride']
['Grand Slam', 'Jonathan Stark']
['seventh', 'only one']
[]
['Dutch', 'House of Anubis', 'what year']
['2013', '12 Hour']


Une entité est-elle comptée comme un seul token ?

In [6]:
docs = spacy_model.pipe(df_train['question'][:10])

print("Longueur de span d'entité ; nombre de token par phrase")

# entity_span collects, per question, the list of `end - start` values of its entities.
entity_span = []
for parsed in docs:
    span_lengths = [entity.end - entity.start for entity in parsed.ents]
    entity_span.append(span_lengths)
    # Entity span lengths next to the total number of tokens in the question.
    print(span_lengths, len(parsed))

Longueur de span d'entité ; nombre de token par phrase
[3, 3] 13
[1] 18
[1, 2, 4, 1, 2] 24
[4] 10
[2] 15
[2, 2] 15
[1, 2] 17
[] 41
[1, 3, 2] 21
[1, 2] 18


Pourcentage de tokens couverts par des entités, par question, si nous comptons en tokens

In [13]:
from tqdm.notebook import tqdm
import json


# Upper bound on the number of examples analysed per split (keeps the spaCy pass tractable).
MAX_EXAMPLES_PER_SPLIT = 30000


def _count_entity_tokens(texts):
    """Accumulate named-entity token counters over `texts` using `spacy_model`.

    Args:
        texts: sized collection of strings; its length is used as the progress-bar total.

    Returns:
        dict of four integer counters:
            'n_token_entity'   -- tokens lying inside an entity span,
            'n_token_all'      -- all tokens,
            'n_entity_ent1tok' -- number of entity spans,
            'n_token_ent1tok'  -- token count if every entity span were collapsed
                                  into a single token.
    """
    counts = {
        'n_token_entity': 0,
        'n_token_all': 0,
        'n_entity_ent1tok': 0,
        'n_token_ent1tok': 0,
    }
    for doc in tqdm(spacy_model.pipe(texts), total=len(texts)):
        n_in_entities = sum(ent.end - ent.start for ent in doc.ents)
        counts['n_token_entity'] += n_in_entities
        counts['n_token_all'] += len(doc)
        counts['n_entity_ent1tok'] += len(doc.ents)
        # Replace the tokens of each entity by one token per entity.
        counts['n_token_ent1tok'] += len(doc) - n_in_entities + len(doc.ents)
    return counts


def _percentage(numerator, denominator):
    """Return `numerator / denominator` as a percentage rounded to 2 decimals.

    Returns 0.0 when `denominator` is 0 instead of raising ZeroDivisionError.
    """
    if not denominator:
        return 0.0
    return round(numerator * 100 / denominator, 2)


# The statistics are expensive to compute, so they are cached as JSON in `tmp_path`.
stats_path = path.join(tmp_path, 'stats_entity.json')
if path.exists(stats_path):
    with open(stats_path, 'r', encoding='utf-8') as f:
        stats = json.load(f)
    print(f'Load stats in {stats_path}')

else:
    stats = dict()
    for split in ['train', 'validation']:
        dataset = load_dataset('hotpot_qa', 'distractor', split=split, cache_dir=cache_path)
        dataset = dataset.to_pandas()[:MAX_EXAMPLES_PER_SPLIT].reset_index()

        # Every context holds a list of sentence lists; join each inner list into one text.
        contexts = [
            ' '.join(sentences)
            for context in dataset['context']
            for sentences in context['sentences']
        ]

        # Questions are processed first, then contexts (dict literals evaluate in order).
        stats[split] = {
            'question': _count_entity_tokens(dataset['question']),
            'context': _count_entity_tokens(contexts),
        }

        # 'total' is the key-wise sum of the question and context counters.
        stats[split]['total'] = {
            key: sum(stats[split][part][key] for part in ['question', 'context'])
            for key in stats[split]['context']
        }

    with open(stats_path, 'w', encoding='utf-8') as f:
        json.dump(stats, f)
    print(f'Save stats in {stats_path}')

# Render one header row per split, followed by one row per part (question / context / total).
html = '<table>'

for split, split_stat in stats.items():

    html += f'<tr><th>{split}</th><th>%Token_in_entity</th><th>%Entity_as_token</th></tr>'

    for k, v in split_stat.items():
        pct_token_in_entity = _percentage(v['n_token_entity'], v['n_token_all'])
        pct_entity_as_token = _percentage(v['n_entity_ent1tok'], v['n_token_ent1tok'])
        html += f'<tr><td>{k}</td><td>{pct_token_in_entity}%</td><td>{pct_entity_as_token}%</td></tr>'

html += '</table>'
display(HTML(html))

Reusing dataset hotpot_qa (/Users/dunguyen/Projects/explanation_on_pair_sequences_task/notebooks/../.cache/hotpot_qa/distractor/1.0.0/133b9501f892e5193babbad937bee3b4899deb4691ef4d791e6ac0111c875bb5)


  0%|          | 0/30000 [00:00<?, ?it/s]

  0%|          | 0/298331 [00:00<?, ?it/s]

Reusing dataset hotpot_qa (/Users/dunguyen/Projects/explanation_on_pair_sequences_task/notebooks/../.cache/hotpot_qa/distractor/1.0.0/133b9501f892e5193babbad937bee3b4899deb4691ef4d791e6ac0111c875bb5)


  0%|          | 0/7405 [00:00<?, ?it/s]

  0%|          | 0/73700 [00:00<?, ?it/s]

Save stats in .cache/2022-07-26/stats_entity.json


train,%Token_in_entity,%Entity_as_token
question,23.91%,14.0%
context,25.8%,14.89%
total,25.77%,14.87%
validation,%Token_in_entity,%Entity_as_token
question,23.21%,13.36%
context,25.64%,14.79%
total,25.6%,14.76%


## Prepare HotpotQA dataset in csv

In [9]:
# Split the official training data 70/30 into train and validation parts.
# shuffle=False means no random permutation is applied before splitting.
training_set = load_dataset(
    'hotpot_qa', 'distractor', split='train', cache_dir=cache_path
).train_test_split(test_size=0.3, shuffle=False)

# The held-out part is exposed under the 'test' key; it is used as validation here.
train_set, val_set = training_set['train'], training_set['test']

Reusing dataset hotpot_qa (/Users/dunguyen/Projects/explanation_on_pair_sequences_task/notebooks/../.cache/hotpot_qa/distractor/1.0.0/133b9501f892e5193babbad937bee3b4899deb4691ef4d791e6ac0111c875bb5)


In [10]:
# List the names of the parts produced by the split.
for split_name in training_set.keys():
    print(split_name)

train
test


In [13]:
from tqdm.notebook import tqdm

In [14]:
import pandas as pd

# Column-wise accumulator: one entry per (question, context paragraph) pair.
data_dict = {
    'question': [],
    'context': [],
    'facts': [],
    'support': [],
    'answer': [],
    'level': [],
}

for row in tqdm(train_set, total=len(train_set)):

    _id = row['id']
    question = row['question']

    facts = row['supporting_facts']
    context = row['context']

    # Per paragraph: number of sentences, joined text, and an all-False sentence mask.
    context_length = [len(c) for c in context['sentences']]
    context_text = [' '.join(c) for c in context['sentences']]
    facts_mask = [[False] * cl for cl in context_length]

    # Mark every supporting sentence in the mask of the paragraph with the matching title.
    for title, id_sent in zip(facts['title'], facts['sent_id']):
        id_title = context['title'].index(title)
        if id_sent >= context_length[id_title]:
            # Some annotations point past the last sentence of the paragraph; report and skip.
            print(f'id sent not valid in [title={id_title},sent={id_sent}], id={_id}')
        else:
            facts_mask[id_title][id_sent] = True

    # Emit one record per paragraph; `support` is True when any sentence is a supporting fact.
    for text, mask in zip(context_text, facts_mask):
        data_dict['question'].append(question)
        data_dict['context'].append(text)
        data_dict['facts'].append(mask)
        data_dict['support'].append(any(mask))
        data_dict['answer'].append(row['answer'])
        data_dict['level'].append(row['level'])

pd.DataFrame(data_dict)

  0%|          | 0/63312 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
id sent not valid in [title=7,sent=2], id=5a7b23ca554299042af8f703
id sent not valid in [title=6,sent=20], id=5abed6d45542990832d3a0ef
id sent not valid in [title=9,sent=3], id=5ab6b2fb5542995eadef0060
id sent not valid in [title=9,sent=2], id=5ae0e2df5542990adbacf6b1
id sent not valid in [title=1,sent=4], id=5a8d6138554299585d9e37c7
id sent not valid in [title=3,sent=52], id=5ab740165542992aa3b8c7fa
id sent not valid in [title=8,sent=2], id=5ab2f812554299545a2cfaee
id sent not valid in [title=3,sent=52], id=5ae7e8ef5542994a481bbe05
id sent not valid in [title=4,sent=4], id=5ab273ee5542997061209606
id sent not valid in [title=4,sent=2], id=5a84517355429933447460d5
id sent not valid in [title=3,sent=20], id=5

Unnamed: 0,question,context,facts,support,answer,level
0,Which magazine was started first Arthur's Maga...,Radio City is India's first private FM radio s...,"[False, False, False, False, False, False, False]",False,Arthur's Magazine,medium
1,Which magazine was started first Arthur's Maga...,Football in Albania existed before the Albania...,"[False, False, False, False]",False,Arthur's Magazine,medium
2,Which magazine was started first Arthur's Maga...,"Echosmith is an American, Corporate indie pop ...","[False, False, False, False, False, False, Fal...",False,Arthur's Magazine,medium
3,Which magazine was started first Arthur's Maga...,Women's colleges in the Southern United States...,"[False, False, False, False]",False,Arthur's Magazine,medium
4,Which magazine was started first Arthur's Maga...,"The First Arthur County Courthouse and Jail, w...",[False],False,Arthur's Magazine,medium
...,...,...,...,...,...,...
629571,Who was the producer of the series of politica...,"The Angelina Jolie trapdoor spider (""Aptostich...","[False, False, False]",False,Jerry Bruckheimer,medium
629572,Who was the producer of the series of politica...,"Tiffany Claus (born July 14, 1980) is an Ameri...","[False, False, False, False]",False,Jerry Bruckheimer,medium
629573,Who was the producer of the series of politica...,Aptostichus miwok is a species of spider in th...,"[False, False]",False,Jerry Bruckheimer,medium
629574,Who was the producer of the series of politica...,"James Haven (born James Haven Voight; May 11, ...","[False, False]",False,Jerry Bruckheimer,medium


In [15]:
from data.hotpot_qa.dataset import HotpotNLIDataset

In [21]:
# Instantiate the project's HotpotNLIDataset for the 'train' split, rooted at <cache_path>/dataset.
dataset_root = path.join(cache_path, 'dataset')
dataset = HotpotNLIDataset(root=dataset_root, split='train')

Reusing dataset hotpot_qa (/Users/dunguyen/Projects/explanation_on_pair_sequences_task/notebooks/../.cache/dataset/hotpot_qa/hotpot_qa/distractor/1.0.0/133b9501f892e5193babbad937bee3b4899deb4691ef4d791e6ac0111c875bb5)
Formating train: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 63312/63312 [00:13<00:00, 4677.13it/s]
id sent not valid in [title=7,sent=2], id=5a7b23ca554299042af8f703
id sent not valid in [title=6,sent=20], id=5abed6d45542990832d3a0ef
id sent not valid in [title=9,sent=3], id=5ab6b2fb5542995eadef0060
id sent not valid in [title=9,sent=2], id=5ae0e2df5542990adbacf6b1
id sent not valid in [title=1,sent=4], id=5a8d6138554299585d9e37c7
id sent not valid in [title=3,sent=52], id=5ab740165542992aa3b8c7fa
id sent not valid in [title=8,sent=2], id=5ab2f812554299545a2cfaee
id sent not valid in [title=3,sent=52], id=5ae7e8ef5542994a481bbe05
id sent not valid in [title=4,sent=4

In [25]:
# Value stored under 'support' for the first item of the dataset.
first_sample = dataset[0]
first_sample['support']

False