In [1]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertForQuestionAnswering
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = pd.read_csv("exec.csv")

In [3]:
data.columns

Index(['Unnamed: 0', 'id', 'question', 'context', 'context_id', 'answer_start',
       'answer_text'],
      dtype='object')

In [4]:
data = data.drop(columns='Unnamed: 0')

In [5]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [6]:
data

Unnamed: 0,id,question,context,context_id,answer_start,answer_text
0,5733be284776f41900661182,To whom did the Virgin Mary allegedly appear i...,"Architecturally, the school has a Catholic cha...",0,515,Saint Bernadette Soubirous
1,5733bf84d058e614000b61be,When did the Scholastic Magazine of Notre dame...,"As at most other universities, Notre Dame's st...",1,248,September 1876
2,5733bed24776f41900661188,Where is the headquarters of the Congregation ...,The university is the major seat of the Congre...,2,119,Rome
3,5733a6424776f41900660f51,How many BS level degrees are offered in the C...,The College of Engineering was established in ...,3,487,eight
4,5733a70c4776f41900660f64,What entity provides help with the management ...,All of Notre Dame's undergraduate students are...,4,496,Learning Resource Center
...,...,...,...,...,...,...
819,57324bd1b9d445190005e9de,How often did Jehovah Witnesses congregations ...,Meetings for worship and study are held at Kin...,18479,779,three times each week
820,573255bce99e3014001e66d8,When did NYC buy land for its parks?,The northern side of the borough includes the ...,18610,1240,1888
821,5732a488d6dcfa19001e8a5b,Who quoted the line of Terence most notably?,The ad fontes principle also had many applicat...,18678,1035,Seneca
822,5735a9fbe853931400426ab2,What is the Kathmandu Valley's average tempera...,Five major climatic regions are found in Nepal...,18847,749,50.2


In [7]:
def encode_data(row):
    question = row["question"]
    context = row["context"]
    inputs = tokenizer(question, context, padding="max_length", max_length=512, truncation=True, return_tensors="pt")
    return inputs

data["encoded_data"] = data.apply(encode_data, axis=1)

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


In [8]:
data["input_ids"] = data["encoded_data"].apply(lambda x: x["input_ids"])
data["attention_mask"] = data["encoded_data"].apply(lambda x: x["attention_mask"])
data["token_type_ids"] = data["encoded_data"].apply(lambda x: x.get("token_type_ids"))

In [9]:
from torch.utils.data import DataLoader, TensorDataset

input_ids = torch.stack(data["input_ids"].tolist())
attention_mask = torch.stack(data["attention_mask"].tolist())
token_type_ids = torch.stack(data["token_type_ids"].tolist())

dataset = TensorDataset(input_ids, attention_mask, token_type_ids)

dataloader = DataLoader(dataset, batch_size=16)


In [10]:
model = BertForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
import torch

def answer_question(question, context, model, tokenizer):
    inputs = tokenizer(question, context, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    
    start_logits = outputs.start_logits
    end_logits = outputs.end_logits
    
    start_idx = torch.argmax(start_logits)
    end_idx = torch.argmax(end_logits)
    
    answer = tokenizer.decode(inputs.input_ids[0][start_idx:end_idx+1])
    
    return answer

In [12]:
sample_question = "what did the mob do"
sample_context= "mob did not keep peace"
answer = answer_question(sample_question, sample_context, model, tokenizer)
print("Answer:", answer)

Answer: not keep peace


In [13]:
import json
def convert_from_json_to_dataframe(file_path, record_path=['data', 'paragraphs', 'qas', 'answers']):
    file = json.loads(open(file_path).read())
    js = pd.json_normalize(file, record_path)
    m = pd.json_normalize(file, record_path[:-1])
    r = pd.json_normalize(file, record_path[:-2])
    index = np.repeat(r['context'].values, r.qas.str.len())
    m['context'] = index
    data = m[['id', 'question', 'context', 'answers']].set_index('id').reset_index()
    data['context_id'] = data['context'].factorize()[0]
    return data

test_file_path = './dev-v1.1.json'
test_data = convert_from_json_to_dataframe(test_file_path)
test_data

Unnamed: 0,id,question,context,answers,context_id
0,56be4db0acb8001400a502ec,Which NFL team represented the AFC at Super Bo...,Super Bowl 50 was an American football game to...,"[{'answer_start': 177, 'text': 'Denver Broncos...",0
1,56be4db0acb8001400a502ed,Which NFL team represented the NFC at Super Bo...,Super Bowl 50 was an American football game to...,"[{'answer_start': 249, 'text': 'Carolina Panth...",0
2,56be4db0acb8001400a502ee,Where did Super Bowl 50 take place?,Super Bowl 50 was an American football game to...,"[{'answer_start': 403, 'text': 'Santa Clara, C...",0
3,56be4db0acb8001400a502ef,Which NFL team won Super Bowl 50?,Super Bowl 50 was an American football game to...,"[{'answer_start': 177, 'text': 'Denver Broncos...",0
4,56be4db0acb8001400a502f0,What color was used to emphasize the 50th anni...,Super Bowl 50 was an American football game to...,"[{'answer_start': 488, 'text': 'gold'}, {'answ...",0
...,...,...,...,...,...
10565,5737aafd1c456719005744fb,What is the metric term less used than the New...,"The pound-force has a metric counterpart, less...","[{'answer_start': 82, 'text': 'kilogram-force'...",2066
10566,5737aafd1c456719005744fc,What is the kilogram-force sometimes reffered ...,"The pound-force has a metric counterpart, less...","[{'answer_start': 114, 'text': 'kilopond'}, {'...",2066
10567,5737aafd1c456719005744fd,What is a very seldom used unit of mass in the...,"The pound-force has a metric counterpart, less...","[{'answer_start': 274, 'text': 'slug'}, {'answ...",2066
10568,5737aafd1c456719005744fe,What seldom used term of a unit of force equal...,"The pound-force has a metric counterpart, less...","[{'answer_start': 712, 'text': 'kip'}, {'answe...",2066


In [14]:
import json
def convert_from_json_to_dataframe(file_path, record_path=['data', 'paragraphs', 'qas', 'answers']):
    file = json.loads(open(file_path).read())
    js = pd.json_normalize(file, record_path)
    m = pd.json_normalize(file, record_path[:-1])
    r = pd.json_normalize(file, record_path[:-2])
    index = np.repeat(r['context'].values, r.qas.str.len())
    m['context'] = index
    data = m[['id', 'question', 'context', 'answers']].set_index('id').reset_index()
    data['context_id'] = data['context'].factorize()[0]
    return data
    

test_file_path = 'dev-v1.1.json'
test_data = convert_from_json_to_dataframe(test_file_path)
test_data

Unnamed: 0,id,question,context,answers,context_id
0,56be4db0acb8001400a502ec,Which NFL team represented the AFC at Super Bo...,Super Bowl 50 was an American football game to...,"[{'answer_start': 177, 'text': 'Denver Broncos...",0
1,56be4db0acb8001400a502ed,Which NFL team represented the NFC at Super Bo...,Super Bowl 50 was an American football game to...,"[{'answer_start': 249, 'text': 'Carolina Panth...",0
2,56be4db0acb8001400a502ee,Where did Super Bowl 50 take place?,Super Bowl 50 was an American football game to...,"[{'answer_start': 403, 'text': 'Santa Clara, C...",0
3,56be4db0acb8001400a502ef,Which NFL team won Super Bowl 50?,Super Bowl 50 was an American football game to...,"[{'answer_start': 177, 'text': 'Denver Broncos...",0
4,56be4db0acb8001400a502f0,What color was used to emphasize the 50th anni...,Super Bowl 50 was an American football game to...,"[{'answer_start': 488, 'text': 'gold'}, {'answ...",0
...,...,...,...,...,...
10565,5737aafd1c456719005744fb,What is the metric term less used than the New...,"The pound-force has a metric counterpart, less...","[{'answer_start': 82, 'text': 'kilogram-force'...",2066
10566,5737aafd1c456719005744fc,What is the kilogram-force sometimes reffered ...,"The pound-force has a metric counterpart, less...","[{'answer_start': 114, 'text': 'kilopond'}, {'...",2066
10567,5737aafd1c456719005744fd,What is a very seldom used unit of mass in the...,"The pound-force has a metric counterpart, less...","[{'answer_start': 274, 'text': 'slug'}, {'answ...",2066
10568,5737aafd1c456719005744fe,What seldom used term of a unit of force equal...,"The pound-force has a metric counterpart, less...","[{'answer_start': 712, 'text': 'kip'}, {'answe...",2066


In [15]:
test_data.to_csv('test_data.csv')

In [16]:
test_data = pd.read_csv('./test_data.csv')

In [17]:
print("question: ")
print(test_data['question'][3])
print("\ncontext: ")
print(test_data['context'][3])
answer = answer_question(test_data['question'][3], test_data['context'][3], model, tokenizer)
print("\nanswer: ")
print(answer)
print("\noriginal answers: ")
print(test_data['answers'][3])

question: 
Which NFL team won Super Bowl 50?

context: 
Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi's Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50.

answer: 
denver broncos

original answers: 
[{'answer_start': 177, 'text': 'Denver Broncos'}, {'answer_start': 177, 'text': 'Denver Broncos'}, {'answer_start': 177, 't

In [18]:
test_data.shape

(10570, 6)

In [19]:
for i in range(0, 2067):
    test_data['answers'][i] = [answer.lower() for answer in test_data['answers'][i]]
    test_data['answers'][i] = ''.join(test_data['answers'][i]).lower()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['answers'][i] = [answer.lower() for answer in test_data['answers'][i]]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['answers'][i] = ''.join(test_data['answers'][i]).lower()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['answers'][i] = [answer.lower() for answer in test_data['answers'][i]]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stab

In [20]:
test_data['answers']

0        [{'answer_start': 177, 'text': 'denver broncos...
1        [{'answer_start': 249, 'text': 'carolina panth...
2        [{'answer_start': 403, 'text': 'santa clara, c...
3        [{'answer_start': 177, 'text': 'denver broncos...
4        [{'answer_start': 488, 'text': 'gold'}, {'answ...
                               ...                        
10565    [{'answer_start': 82, 'text': 'kilogram-force'...
10566    [{'answer_start': 114, 'text': 'kilopond'}, {'...
10567    [{'answer_start': 274, 'text': 'slug'}, {'answ...
10568    [{'answer_start': 712, 'text': 'kip'}, {'answe...
10569    [{'answer_start': 665, 'text': 'sthène'}, {'an...
Name: answers, Length: 10570, dtype: object

In [21]:
test_data.head()

Unnamed: 0.1,Unnamed: 0,id,question,context,answers,context_id
0,0,56be4db0acb8001400a502ec,Which NFL team represented the AFC at Super Bo...,Super Bowl 50 was an American football game to...,"[{'answer_start': 177, 'text': 'denver broncos...",0
1,1,56be4db0acb8001400a502ed,Which NFL team represented the NFC at Super Bo...,Super Bowl 50 was an American football game to...,"[{'answer_start': 249, 'text': 'carolina panth...",0
2,2,56be4db0acb8001400a502ee,Where did Super Bowl 50 take place?,Super Bowl 50 was an American football game to...,"[{'answer_start': 403, 'text': 'santa clara, c...",0
3,3,56be4db0acb8001400a502ef,Which NFL team won Super Bowl 50?,Super Bowl 50 was an American football game to...,"[{'answer_start': 177, 'text': 'denver broncos...",0
4,4,56be4db0acb8001400a502f0,What color was used to emphasize the 50th anni...,Super Bowl 50 was an American football game to...,"[{'answer_start': 488, 'text': 'gold'}, {'answ...",0


In [22]:
test_data = test_data.drop(columns=['Unnamed: 0', 'context_id'], axis=1)

In [23]:
test_data.head()

Unnamed: 0,id,question,context,answers
0,56be4db0acb8001400a502ec,Which NFL team represented the AFC at Super Bo...,Super Bowl 50 was an American football game to...,"[{'answer_start': 177, 'text': 'denver broncos..."
1,56be4db0acb8001400a502ed,Which NFL team represented the NFC at Super Bo...,Super Bowl 50 was an American football game to...,"[{'answer_start': 249, 'text': 'carolina panth..."
2,56be4db0acb8001400a502ee,Where did Super Bowl 50 take place?,Super Bowl 50 was an American football game to...,"[{'answer_start': 403, 'text': 'santa clara, c..."
3,56be4db0acb8001400a502ef,Which NFL team won Super Bowl 50?,Super Bowl 50 was an American football game to...,"[{'answer_start': 177, 'text': 'denver broncos..."
4,56be4db0acb8001400a502f0,What color was used to emphasize the 50th anni...,Super Bowl 50 was an American football game to...,"[{'answer_start': 488, 'text': 'gold'}, {'answ..."


In [24]:
test_data.shape

(10570, 4)

In [25]:
test_data.to_csv('./final_test.csv')

In [26]:
test_data = pd.read_csv('./final_test.csv')

In [27]:
#Measure accuracy
correct = 0

for i in range(0, 2067):
    if test_data['answers'][i] == answer_question(test_data['question'][i], test_data['context'][i], model, tokenizer):
        correct += 1

print("Accuracy: ", correct/2067)


KeyboardInterrupt: 

In [28]:
#Checking if the model gueeses the answer almost correctly

for i in range(0, 2067):
    if test_data['answers'][i] != answer_question(test_data['question'][i], test_data['context'][i], model, tokenizer):
        print("question: ")
        print(test_data['question'][i])
        print("\ncontext: ")
        print(test_data['context'][i])
        answer = answer_question(test_data['question'][i], test_data['context'][i], model, tokenizer)
        print("\nanswer: ")
        print(answer)
        print("\noriginal answers: ")
        print(test_data['answers'][i])
        print("\n")

question: 
Which NFL team represented the AFC at Super Bowl 50?

context: 
Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi's Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50.

answer: 
denver broncos

original answers: 
[{'answer_start': 177, 'text': 'denver broncos'}, {'answer_start': 177, 'text': 'denver broncos'}, {'ans

KeyboardInterrupt: 