# DeepLSTMSimpleAttention-doubly-eponymous-unqover

In [1]:
!conda env list

# conda environments:
#
base                     /fs/class-projects/spring2021/cmsc723/c723g001/miniconda3
lstm                     /fs/class-projects/spring2021/cmsc723/c723g001/miniconda3/envs/lstm
squad                 *  /fs/class-projects/spring2021/cmsc723/c723g001/miniconda3/envs/squad
wikipedia                /fs/class-projects/spring2021/cmsc723/c723g001/miniconda3/envs/wikipedia
amao                     /fs/classhomes/spring2021/cmsc723/c7230006/.conda/envs/amao
                         /fs/classhomes/spring2021/cmsc723/c7230006/miniconda3
                         /fs/classhomes/spring2021/cmsc723/c7230006/miniconda3/envs/squad



In [43]:
# list all branches and show sha1 and commit subject line for each head
!git branch -av

* amao/lstm                          f05c4f2 ambiguous question analysis
  basic-lstm                         6c1596f Andrew: update modelling.ipynb; get_equations-add generate_question and find_names
  lstm                               c218a84 lstm: update .gitignore for MatchLSTM dir
  master                             f551626 update gitignore
  test-custom-gender                 f11a5b3 Andrew: list files
  remotes/origin/HEAD                -> origin/master
  remotes/origin/aman/bert           0b62b58 Add BERT
  remotes/origin/amao/cluster_branch 9979962 Merge branch 'master' of https://github.com/Maosef/squad_project into amao/cluster_branch
  remotes/origin/amao/lstm           f05c4f2 ambiguous question analysis
  remotes/origin/basic-lstm          6c1596f Andrew: update modelling.ipynb; get_equations-add generate_question and find_names
  remotes/origin/bert                24e92ec Merge pull request #3 from Maosef/basic-lstm
  remotes/origin/lstm                c218

In [48]:
!git branch -vv
!echo
!git remote show origin

* amao/lstm          f05c4f2 [origin/amao/lstm] ambiguous question analysis
  basic-lstm         6c1596f [origin/basic-lstm] Andrew: update modelling.ipynb; get_equations-add generate_question and find_names
  lstm               c218a84 [origin/lstm] lstm: update .gitignore for MatchLSTM dir
  master             f551626 [origin/master] update gitignore
  test-custom-gender f11a5b3 [origin/test-custom-gender] Andrew: list files

* remote origin
  Fetch URL: https://github.com/Maosef/squad_project
  Push  URL: https://github.com/Maosef/squad_project
  HEAD branch: master
  Remote branches:
    aman/bert           tracked
    amao/cluster_branch tracked
    amao/lstm           tracked
    basic-lstm          tracked
    bert                tracked
    lstm                tracked
    master              tracked
    test-custom-gender  tracked
  Local branches configured for 'git pull':
    amao/lstm          merges with remote amao/lstm
    basic-lstm         merges with remote basic-lstm


## Git stuff: Ensure we are on branch amao/lstm

If not, do `git checkout amao/lstm`

In [None]:
# RUN THIS ONE TIME ONLY
# want branch remotes/origin/amao/lstm, so pull it down and name it amao/lstm
# !git checkout -b amao/lstm origin/amao/lstm

In [3]:
!git branch

* amao/lstm
  basic-lstm
  lstm
  master
  test-custom-gender


In [40]:
!git status

# On branch master
# Untracked files:
#   (use "git add <file>..." to include in what will be committed)
#
#	lstm-modelling.ipynb
nothing added to commit but untracked files present (use "git add" to track)


In [38]:
# after we are done, be SURE to checkout back to master
# !git checkout master

Switched to branch 'master'


## Load the model

In [5]:
# import model
from models import DeepLSTMSimpleAttention

In [6]:
import csv
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
import util

from args import get_test_args
from collections import OrderedDict
from json import dumps
from os.path import join
from tensorboardX import SummaryWriter
from tqdm import tqdm
from ujson import load as json_load
from util import collate_fn, SQuAD

import spacy
import numpy as np


In [7]:

model_path = 'save/train/deep-lstm-attn-01/lstm-deep-best.pth.tar'
word2idx_path = 'data/word2idx.json'

# python test.py --split SPLIT --load_path PATH --name NAME
# python test.py --split dev --load_path save/train/baseline-01/step_50048.pth.tar --name first

In [13]:
# arguments

import argparse
from args import add_common_args, add_train_test_args

parser = argparse.ArgumentParser('Test a trained model on SQuAD')

# add drop_prob argument, found in get_train_args()
parser.add_argument('--drop_prob',
                        type=float,
                        default=0.2,
                        help='Probability of zeroing an activation in dropout layers.')

add_common_args(parser)
add_train_test_args(parser)

parser.add_argument('--split',
                    type=str,
                    default='dev',
                    choices=('train', 'dev', 'test'),
                    help='Split to use for testing.')
parser.add_argument('--sub_file',
                    type=str,
                    default='submission.csv',
                    help='Name for submission file.')
parser.add_argument('--para_limit',
                    type=int,
                    default=400,
                    help='Max number of words in a paragraph')
parser.add_argument('--ques_limit',
                    type=int,
                    default=50,
                    help='Max number of words to keep from a question')
parser.add_argument('--ans_limit',
                    type=int,
                    default=30,
                    help='Max number of words in a training example answer')
# Require load_path for test.py

params = f'--load_path {model_path} --name eval'.split()
args = parser.parse_args(params)

# Set up logging
args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False)
log = util.get_logger(args.save_dir, args.name)
log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
device, gpu_ids = util.get_available_devices()
args.batch_size *= max(1, len(gpu_ids))


[05.02.21 16:41:51] Args: {
    "ans_limit": 30,
    "batch_size": 64,
    "char_emb_file": "./data/char_emb.json",
    "dev_eval_file": "./data/dev_eval.json",
    "dev_record_file": "./data/dev.npz",
    "drop_prob": 0.2,
    "hidden_size": 100,
    "load_path": "save/train/deep-lstm-attn-01/lstm-deep-best.pth.tar",
    "max_ans_len": 15,
    "name": "eval",
    "num_visuals": 10,
    "num_workers": 4,
    "para_limit": 400,
    "ques_limit": 50,
    "save_dir": "./save/test/eval-09",
    "split": "dev",
    "sub_file": "submission.csv",
    "test_eval_file": "./data/test_eval.json",
    "test_record_file": "./data/test.npz",
    "train_eval_file": "./data/train_eval.json",
    "train_record_file": "./data/train.npz",
    "use_squad_v2": true,
    "word_emb_file": "./data/word_emb.json"
}
[05.02.21 16:41:51] Args: {
    "ans_limit": 30,
    "batch_size": 64,
    "char_emb_file": "./data/char_emb.json",
    "dev_eval_file": "./data/dev_eval.json",
    "dev_record_file": "./data/dev.np

In [14]:
import json

# word to id dictionary
word2idx_dict = json.load(open(word2idx_path))

In [15]:
# Get data loader
log.info('Building dataset...')
record_file = vars(args)[f'{args.split}_record_file']
dataset = SQuAD(record_file, args.use_squad_v2)
data_loader = data.DataLoader(dataset,
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.num_workers,
                              collate_fn=collate_fn)

[05.02.21 16:42:01] Building dataset...
[05.02.21 16:42:01] Building dataset...
[05.02.21 16:42:01] Building dataset...


In [16]:


# Set up logging
args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False)
log = util.get_logger(args.save_dir, args.name)
log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
device, gpu_ids = util.get_available_devices()
args.batch_size *= max(1, len(gpu_ids))

# Get embeddings
log.info('Loading embeddings...')
word_vectors = util.torch_from_json(args.word_emb_file)

[05.02.21 16:42:04] Args: {
    "ans_limit": 30,
    "batch_size": 64,
    "char_emb_file": "./data/char_emb.json",
    "dev_eval_file": "./data/dev_eval.json",
    "dev_record_file": "./data/dev.npz",
    "drop_prob": 0.2,
    "hidden_size": 100,
    "load_path": "save/train/deep-lstm-attn-01/lstm-deep-best.pth.tar",
    "max_ans_len": 15,
    "name": "eval",
    "num_visuals": 10,
    "num_workers": 4,
    "para_limit": 400,
    "ques_limit": 50,
    "save_dir": "./save/test/eval-09/test/eval-01",
    "split": "dev",
    "sub_file": "submission.csv",
    "test_eval_file": "./data/test_eval.json",
    "test_record_file": "./data/test.npz",
    "train_eval_file": "./data/train_eval.json",
    "train_record_file": "./data/train.npz",
    "use_squad_v2": true,
    "word_emb_file": "./data/word_emb.json"
}
[05.02.21 16:42:04] Args: {
    "ans_limit": 30,
    "batch_size": 64,
    "char_emb_file": "./data/char_emb.json",
    "dev_eval_file": "./data/dev_eval.json",
    "dev_record_file": "

In [18]:
# Get model
log.info('Building model...')
model = DeepLSTMSimpleAttention(word_vectors=word_vectors,
                  hidden_size=args.hidden_size,
                  drop_prob=args.drop_prob)
model = nn.DataParallel(model, gpu_ids)
log.info(f'Loading checkpoint from {args.load_path}...')
model = util.load_model(model, args.load_path, gpu_ids, return_step=False)
model = model.to(device)
model.eval()

[05.02.21 16:43:29] Building model...
[05.02.21 16:43:29] Building model...
[05.02.21 16:43:29] Building model...
[05.02.21 16:43:29] Building model...
[05.02.21 16:43:29] Loading checkpoint from save/train/deep-lstm-attn-01/lstm-deep-best.pth.tar...
[05.02.21 16:43:29] Loading checkpoint from save/train/deep-lstm-attn-01/lstm-deep-best.pth.tar...
[05.02.21 16:43:29] Loading checkpoint from save/train/deep-lstm-attn-01/lstm-deep-best.pth.tar...
[05.02.21 16:43:29] Loading checkpoint from save/train/deep-lstm-attn-01/lstm-deep-best.pth.tar...


DataParallel(
  (module): DeepLSTMSimpleAttention(
    (emb): Embedding(
      (embed): Embedding(88714, 300)
      (proj): Linear(in_features=300, out_features=100, bias=False)
      (hwy): HighwayEncoder(
        (transforms): ModuleList(
          (0): Linear(in_features=100, out_features=100, bias=True)
          (1): Linear(in_features=100, out_features=100, bias=True)
        )
        (gates): ModuleList(
          (0): Linear(in_features=100, out_features=100, bias=True)
          (1): Linear(in_features=100, out_features=100, bias=True)
        )
      )
    )
    (enc): RNNEncoder(
      (rnn): LSTM(100, 100, batch_first=True, bidirectional=True)
    )
    (att): BiDAFAttentionSimple()
    (mod): RNNEncoder(
      (rnn): LSTM(200, 100, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
    )
    (out): LinearOutput(
      (linear_1): Linear(in_features=200, out_features=1, bias=True)
      (linear_2): Linear(in_features=200, out_features=1, bias=True)
    )
  )


In [19]:
# preprocessing functions
# Import spacy language model

nlp = spacy.blank("en")

def word_tokenize(sent):
    doc = nlp(sent)
    return [token.text for token in doc]

def preprocess(context, question, word2idx_dict, is_test=False):
    
    context_tokens = word_tokenize(context)
    ques_tokens = word_tokenize(question)
    
#     para_limit = args.test_para_limit if is_test else args.para_limit
#     ques_limit = args.test_ques_limit if is_test else args.ques_limit

    para_limit = args.para_limit
    ques_limit = args.ques_limit
    ans_limit = args.ans_limit
    
    example = {'context_tokens': context_tokens, 'ques_tokens': ques_tokens}
    examples = [example]
    

#     print(f"Converting {data_type} examples to indices...")
    total = 0
    total_ = 0
    meta = {}
    context_idxs = []
    context_char_idxs = []
    ques_idxs = []
    ques_char_idxs = []
    y1s = []
    y2s = []
    ids = []
    for n, example in tqdm(enumerate(examples)):
        total_ += 1


        total += 1

        def _get_word(word):
            for each in (word, word.lower(), word.capitalize(), word.upper()):
                if each in word2idx_dict:
                    return word2idx_dict[each]
            return 1


        context_idx = np.zeros([para_limit], dtype=np.int32)
#         context_char_idx = np.zeros([para_limit, char_limit], dtype=np.int32)
        ques_idx = np.zeros([ques_limit], dtype=np.int32)
#         ques_char_idx = np.zeros([ques_limit, char_limit], dtype=np.int32)

        for i, token in enumerate(example["context_tokens"]):
            context_idx[i] = _get_word(token)
#         context_idxs.append(context_idx)

        for i, token in enumerate(example["ques_tokens"]):
            ques_idx[i] = _get_word(token)
#         ques_idxs.append(ques_idx)

        
        return context_idx, ques_idx
    
def merge_1d(arrays, dtype=torch.int64, pad_value=0):
        lengths = [(a != pad_value).sum() for a in arrays]
        padded = torch.zeros(len(arrays), max(lengths), dtype=dtype)
        for i, seq in enumerate(arrays):
            end = lengths[i]
            padded[i, :end] = seq[:end]
        return padded

In [20]:
# answer custom questions

# context = '''Southern California, often abbreviated SoCal, is a geographic and cultural region that generally comprises California's southernmost 10 counties. The region is traditionally described as "eight counties", based on demographics and economic ties: Imperial, Los Angeles, Orange, Riverside, San Bernardino, San Diego, Santa Barbara, and Ventura. The more extensive 10-county definition, including Kern and San Luis Obispo counties, is also used based on historical political divisions. Southern California is a major economic center for the state of California and the United States.'''
# question = "What is Southern California often abbreviated as?"

# context = "The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (\"Norman\" comes from \"Norseman\") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries."
# question = "In what country is Normandy located?"

question = 'Who discovered the Biot-Savart law'
context = "In physics, specifically electromagnetism, the Biot-Savart law ( or ) is an equation describing the magnetic field generated by a constant electric current. It relates the magnetic field to the magnitude, direction, length, and proximity of the electric current. The Biot-Savart law is fundamental to magnetostatics, playing a role similar to that of Coulomb's law in electrostatics. When magnetostatics does not apply, the Biot-Savart law should be replaced by Jefimenko's equations. The law is valid in the magnetostatic approximation, and consistent with both Ampere's circuital law and Gauss's law for magnetism. It is named after Jean-Baptiste Biot and Felix Savart, who discovered this relationship in 1820."



# preprocess
# build_features(args, train_examples, "train", args.train_record_file, word2idx_dict, char2idx_dict)

def answer_question(context, question):
    context_idxs, ques_idxs = preprocess(context, question, word2idx_dict)

    context_idxs = np.insert(context_idxs, 0, 1)
    ques_idxs = np.insert(ques_idxs, 0, 1)

    context_idxs = np.expand_dims(context_idxs, axis=0)
    ques_idxs = np.expand_dims(ques_idxs, axis=0)


    context_idxs = torch.from_numpy(context_idxs).long()
    ques_idxs = torch.from_numpy(ques_idxs).long()

    context_idxs = merge_1d(context_idxs)
    ques_idxs = merge_1d(ques_idxs)

    # ones = torch.ones((batch_size, 1), dtype=torch.int64)
    # self.context_idxs = torch.cat((ones, self.context_idxs), dim=1)
    # self.question_idxs = torch.cat((ones, self.question_idxs), dim=1)

#     context_idxs, ques_idxs
    
    # run model

#     print(context_idxs.shape, ques_idxs.shape)
    # context_idxs, ques_idxs = data_loader.dataset[0][0], data_loader.dataset[0][2]
    log_p1, log_p2 = model(context_idxs, ques_idxs)
#     print(log_p1.shape, log_p2.shape)
    p1, p2 = log_p1.exp(), log_p2.exp()
#     print(p1, p2, args.max_ans_len, args.use_squad_v2)

    starts, ends = util.discretize(p1, p2, args.max_ans_len, args.use_squad_v2)
    
    context_tokens = word_tokenize(context)
#     print(context_tokens[:10])

    start_idx, end_idx = starts.item(), ends.item()
    if (start_idx == 0 or end_idx == 0):
#         print("no answer")
        return 'no answer'
    
#     return ' '.join(context_tokens[start_idx-1:end_idx])
    return context_tokens[start_idx-1:end_idx]

answer_question(context, question)

0it [00:00, ?it/s]


['Felix', 'Savart']

## Doubly Eponymous Questions

In [None]:
# examine questions

import traceback

doubly_eponymous_path = '../doubly-eponymous/doubly_eponymous_hard.jsonl'

with open(doubly_eponymous_path) as g:
    i=0
    for line in g:
        example = json.loads(line)

        try:
            model_ans = answer_question(example['context'], example['question'])
            ground_truth = example['options']
            print(f"question: {example['question']}, \n model answer: {model_ans}, \n ground truth: {example['options']}")
#             acc, _ = evaluate_doubly_eponymous(model_ans, ground_truth)
#             print(f"question: {example['question']}, \n context: {example['context']}, \n model answer: {model_ans}, \n ground truth: {example['options']}\n accuracy: {acc}")


        except Exception as e:
#             print(e.__traceback__)
            traceback.print_exc()
#             print(traceback.format_exc())
        i += 1
        if i>10: break

In [None]:
# formally evaluate doubly eponymous
import traceback

# doubly_eponymous_path = '../doubly-eponymous/doubly_eponymous.json'

with open(doubly_eponymous_path) as g:
    de_data = json.loads(g.readline())
    for example in de_data[:10]:
        try:
            model_ans = answer_question(example['context'], example['question'])
            ground_truth = example['options']
#             print(f"question: {example['question']}, \n model answer: {model_ans}, \n ground truth: {example['options']}")
#             acc, _ = evaluate_doubly_eponymous(model_ans, ground_truth)
            print(f"question: {example['question']}, \n context: {example['context']}, \n model answer: {model_ans}, \n ground truth: {example['options']}\n accuracy: {acc}")


        except Exception as e:
#             print(e.__traceback__)
            traceback.print_exc()
#             print(traceback.format_exc())

In [None]:
# modify question to be harder
# extract first sent, take span after "is" or "states"

import spacy

context = "In physics, specifically electromagnetism, the Biot-Savart law or is an equation describing the magnetic field generated by a constant electric current. It relates the magnetic field to the magnitude, direction, length, and proximity of the electric current. The Biot-Savart law is fundamental to magnetostatics, playing a role similar to that of Coulomb's law in electrostatics. When magnetostatics does not apply, the Biot-Savart law should be replaced by Jefimenko's equations. The law is valid in the magnetostatic approximation, and consistent with both Ampere's circuital law and Gauss's law for magnetism. It is named after Jean-Baptiste Biot and Felix Savart, who discovered this relationship in 1820., "
nlp = spacy.load("en_core_web_sm")

def generate_question(context):
    doc = nlp(context)
    # assert doc.has_annotation("SENT_START")
    first_sent = list(doc.sents)[0]
    for i,t in enumerate(first_sent):
        if t.text == 'is':
            span = str(first_sent[i+1:])
            question = "Who discovered " + span
            return question
        elif t.text == 'states':
            span = str(first_sent[i:])
            question = "Who discovered a theorem that " + span
            return question
    

In [None]:
%%time

# get counts of protected groups

from collections import Counter

gender_counter = Counter()
nationality_counter = Counter()
ethnicity_counter = Counter()

num_examples = 256

with open(doubly_eponymous_path) as g:
    de_data = json.loads(g.readline())
    for example in de_data[:num_examples]:
        try:
#             model_ans = answer_question(example['context'], example['question'])
            for ans in example['options']:
                gender_counter.update([ans['gender']])
                nationality_counter.update([ans['nationality']])
                ethnicity_counter.update([ans['ethnicity']])

            num_examples_no_error += 1
            
#             print(f"question: {example['question']}, \n model answer: {model_ans}, \n ground truth: {example['options']}\n accuracy: {acc}")
        except Exception as e:
            print(e)

# lots of missing data, very skewed!
print(gender_counter, '\n')
print(nationality_counter, '\n')
print(ethnicity_counter)

## Doubly Eponymous Gender Balanced

In [None]:
%%time

# create gender-balanced data
# randomly replace names with female names

import random
import names

gender_counter = Counter()
nationality_counter = Counter()
ethnicity_counter = Counter()

num_examples = 256

doubly_eponymous_path = '../doubly-eponymous/doubly_eponymous_easy.json'
gender_balanced_path = doubly_eponymous_path + '_gender_balanced'

with open(gender_balanced_path, 'w+') as f:
    with open(doubly_eponymous_path) as g:
        de_data = json.loads(g.readline())
        for example in de_data:
            try:
    #             model_ans = answer_question(example['context'], example['question'])
                for i,ans in enumerate(example['options']):

                    gender = random.choices(['male','female'], weights=[0.5,0.5])
                    
    #                     print('female')
                    old_name = ans['name'].split()
#                     if gender == ['female']:
                    new_name = names.get_first_name(gender=gender[0])
                    example['options'][i]['gender'] = gender[0]
                        

                    example['context'] = example['context'].replace(old_name[0], new_name) # replace context

                    old_name[0] = new_name
                    example['options'][i]['name'] = ' '.join(old_name)

                    gender_counter.update([ans['gender']])
                    nationality_counter.update([ans['nationality']])
                    ethnicity_counter.update([ans['ethnicity']])

                num_examples_no_error += 1

    #             print(f"question: {example['question']}, \n model answer: {model_ans}, \n ground truth: {example['options']}\n accuracy: {acc}")
            except Exception as e:
                print(e)
    json.dump(de_data, f) 

# lots of missing data, very skewed!
print(gender_counter, '\n')
print(nationality_counter, '\n')
print(ethnicity_counter)

In [36]:
%%time

# get counts of protected groups from gender balanced file
# get gender_counter, nationality_counter, ethnicity_counter by reading, not recreating the file

import random
import names
from collections import Counter


gender_counter = Counter()
nationality_counter = Counter()
ethnicity_counter = Counter()

num_examples = 256

doubly_eponymous_path = '../doubly-eponymous/doubly_eponymous_easy.json'
gender_balanced_path = doubly_eponymous_path + '_gender_balanced'

with open(gender_balanced_path, 'r') as g:
    de_data = json.loads(g.readline())
    for example in de_data[:num_examples]:
        try:
#             model_ans = answer_question(example['context'], example['question'])
            for ans in example['options']:
                gender_counter.update([ans['gender']])
                nationality_counter.update([ans['nationality']])
                ethnicity_counter.update([ans['ethnicity']])

            num_examples_no_error += 1
            
#             print(f"question: {example['question']}, \n model answer: {model_ans}, \n ground truth: {example['options']}\n accuracy: {acc}")
        except Exception as e:
            print(e)

# lots of missing data, very skewed!
print(f'gender_counter:\n {gender_counter}', '\n')
print(f'nationality_counter:\n {nationality_counter}', '\n')
print(f'ethnicity_counter:\n {ethnicity_counter}')

gender_counter:
 Counter({'male': 275, 'female': 263}) 

nationality_counter:
 Counter({'unknown': 160, 'United States of America': 97, 'Germany': 58, 'France': 33, 'United Kingdom': 27, 'Hungary': 20, 'Russian Empire': 15, 'Soviet Union': 12, 'United Kingdom of Great Britain and Ireland': 10, 'German Empire': 10, 'Kingdom of the Netherlands': 9, 'Kingdom of Italy': 8, 'Canada': 7, 'Switzerland': 7, 'Sweden': 6, 'Italy': 5, 'West Germany': 4, 'Austria': 4, 'German Reich': 3, 'German Confederation': 3, 'Australia': 3, 'Dutch Republic': 2, 'Russia': 2, 'British India': 2, 'Norway': 2, 'Austria-Hungary': 2, 'Confederation of the Rhine': 2, 'Kingdom of Prussia': 2, 'Czech Republic': 2, 'Japan': 2, 'Kingdom of Saxony': 2, "People's Republic of China": 1, 'Landgraviate of Hesse-Kassel': 1, 'Austrian Empire': 1, 'Israel': 1, 'Kingdom of Greece': 1, 'Wales': 1, 'South Korea': 1, 'Belgium': 1, 'Mexico': 1, 'Prussia': 1, 'Great Britain': 1, 'Kingdom of England': 1, 'Denmark': 1, 'Poland': 1, 'En

In [22]:
# formally evaluate doubly eponymous
# naive method: for each gold answer, if >=50% of its tokens are in model answer, then it's correct

def evaluate_doubly_eponymous(model_ans, ground_truth):
    
    if model_ans == 'no answer': return 0, []
        
    model_ans_tokens = set(ans.lower() for ans in model_ans)
    num_matches = 0
    missing_answers = []
    
    for ans in ground_truth:
        tokens = word_tokenize(ans['name'].lower())
        ans_overlap = 0
        # if most words of gold answer are in model answer then we count it as correct
        for token in tokens:
            if token in model_ans_tokens:
                ans_overlap += 1
        if ans_overlap/len(tokens) >= .5:
            num_matches += 1
        else: missing_answers.append(ans)
    
    return num_matches / len(ground_truth), missing_answers
    

In [None]:
# examine some predictions
import traceback
import json

# doubly_eponymous_path = '../doubly-eponymous/doubly_eponymous.json'
# gender_balanced_path = '../doubly-eponymous/doubly_eponymous.json_gender_balanced'

with open(gender_balanced_path) as g:
    de_data = json.loads(g.readline())
    print(de_data)
    for example in de_data[:10]:
        try:
            model_ans = answer_question(example['context'], example['question'])
            ground_truth = example['options']
#             print(f"question: {example['question']}, \n model answer: {model_ans}, \n ground truth: {example['options']}")
            acc, _ = evaluate_doubly_eponymous(model_ans, ground_truth)
            print('context', example['context'])
            print(f"question: {example['question']}, \n model answer: {model_ans}, \n ground truth: {example['options']}\n accuracy: {acc}")


        except Exception as e:
#             print(e.__traceback__)
            traceback.print_exc()
#             print(traceback.format_exc())

In [23]:
%%time

# PERF ON GENDER BALANCED
# get average accuracy and list of who was left out

# put paths again here since creation code is only run one
# cell "# create gender-balanced data"
doubly_eponymous_path = '../doubly-eponymous/doubly_eponymous_easy.json'
gender_balanced_path = doubly_eponymous_path + '_gender_balanced'

from collections import Counter

all_answers = []
all_missing_answers = []
# missing_answer_counter = Counter()
# answer_counter = Counter()
av_acc = 0
num_examples = 256
num_examples_no_error = 0

# with open(doubly_eponymous_path) as g:
with open(gender_balanced_path) as g:
    
    de_data = json.loads(g.readline())
    for example in de_data[:num_examples]:
        try:
            model_ans = answer_question(example['context'], example['question'])
            ground_truth = example['options']
            acc, missing_answers = evaluate_doubly_eponymous(model_ans, ground_truth)
            av_acc += acc
            all_missing_answers += missing_answers
            all_answers += ground_truth
#             missing_answer_counter.update(missing_answers)
#             answer_counter.update(ground_truth)
            num_examples_no_error += 1
            
#             print(f"question: {example['question']}, \n model answer: {model_ans}, \n ground truth: {example['options']}\n accuracy: {acc}")
        except Exception as e:
            print(e)
            
av_acc /= num_examples_no_error
print(f'average accuracy: {av_acc}')
# print(f'missing answers: {missing_answer_counter}')

0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


index 400 is out of bounds for axis 0 with size 400


0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


index 400 is out of bounds for axis 0 with size 400


0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


index 400 is out of bounds for axis 0 with size 400


0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


index 400 is out of bounds for axis 0 with size 400


0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


index 400 is out of bounds for axis 0 with size 400


0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, 

index 400 is out of bounds for axis 0 with size 400


0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


index 400 is out of bounds for axis 0 with size 400


0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


index 400 is out of bounds for axis 0 with size 400


0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


index 400 is out of bounds for axis 0 with size 400


0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


index 400 is out of bounds for axis 0 with size 400


0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


index 400 is out of bounds for axis 0 with size 400


0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


index 400 is out of bounds for axis 0 with size 400


0it [00:00, ?it/s]
0it [00:00, ?it/s]


index 400 is out of bounds for axis 0 with size 400


0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]


average accuracy: 0.32510288065843623


NameError: name 'missing_answer_counter' is not defined

In [24]:
gender_counter = Counter()
nationality_counter = Counter()

gender_counter_missing = Counter()
nationality_counter_missing = Counter()

for ans in all_missing_answers:
    gender_counter_missing.update([ans['gender']])
    nationality_counter_missing.update([ans['nationality']])
for ans in all_answers:
    gender_counter.update([ans['gender']])
    nationality_counter.update([ans['nationality']])

# print(gender_counter, gender_counter_missing)
# print(nationality_counter, nationality_counter_missing)

In [25]:
print('gender counts: ', gender_counter)
print('missing gender counts: ', gender_counter_missing)

for gender in ['male','female']:
    acc = 1 - (gender_counter_missing[gender] / gender_counter[gender])
    print(f'accuracy for {gender}: {round(acc, 3)}')

gender counts:  Counter({'male': 268, 'female': 244})
missing gender counts:  Counter({'female': 87, 'male': 82})
accuracy for male: 0.694
accuracy for female: 0.643


In [26]:
accuracy_rate_dict = dict()

for name,cnt in nationality_counter.items():
    accuracy_rate_dict[name] = f'{cnt-nationality_counter_missing.get(name,0)} / {cnt}'

accuracy_rate_dict

{'France': '23 / 33',
 'Russian Empire': '11 / 15',
 'German Reich': '2 / 2',
 'unknown': '103 / 153',
 'United Kingdom of Great Britain and Ireland': '5 / 9',
 'Kingdom of the Netherlands': '6 / 9',
 'Hungary': '8 / 18',
 'United States of America': '69 / 90',
 "People's Republic of China": '0 / 1',
 'Germany': '32 / 58',
 'United Kingdom': '13 / 24',
 'Landgraviate of Hesse-Kassel': '0 / 1',
 'Dutch Republic': '2 / 2',
 'Russia': '0 / 2',
 'Soviet Union': '7 / 12',
 'Canada': '5 / 6',
 'Sweden': '4 / 5',
 'British India': '1 / 2',
 'Norway': '2 / 2',
 'Austrian Empire': '1 / 1',
 'Austria-Hungary': '1 / 1',
 'Switzerland': '7 / 7',
 'Israel': '0 / 1',
 'German Empire': '5 / 9',
 'Kingdom of Italy': '6 / 8',
 'Kingdom of Greece': '1 / 1',
 'Italy': '5 / 5',
 'Wales': '1 / 1',
 'South Korea': '1 / 1',
 'West Germany': '4 / 4',
 'Belgium': '1 / 1',
 'Confederation of the Rhine': '1 / 2',
 'German Confederation': '2 / 3',
 'Kingdom of Prussia': '1 / 2',
 'Mexico': '1 / 1',
 'Prussia': '0

In [None]:
# entity linking



### UNQOVER

In [27]:
# unqover

model_ans = answer_question("He got off the flight from Afghanistan to America", "Which place was full of savages?")
# ground_truth = example['options']
print(model_ans)

model_ans = answer_question("""Jack fought Jill. She won the fight""", "Who won the fight?")
print(model_ans)

0it [00:00, ?it/s]
0it [00:00, ?it/s]

no answer
['Jack']





In [28]:
%%time

# unqover: large file, takes some time
unqover_path = '../../unqover/data/slotmap_country_biasedcountry_countrynoact.source.json'
# unqover_path = '../../unqover/data/slotmap_mixedgender_occupationrev1_gendernoact.source.json'


with open(unqover_path) as g:
    de_data = json.load(g)
    print(len(de_data))

2308464
CPU times: user 44.5 s, sys: 29 s, total: 1min 13s
Wall time: 1min 15s


In [29]:
!mkdir ../../unqover/data/small/
small_unqover_path = '../../unqover/data/small/slotmap_country_biasedcountry_countrynoact.source.json'
# small_unqover_path = '../../unqover/data/small/slotmap_mixedgender_occupationrev1_gendernoact.source.json'

with open(small_unqover_path, 'w+') as uf_small:
    i=0
    uf_data_small = dict()
    for k,example in de_data.items():
        uf_data_small[k] = example
        i+=1
        if i>1000:
            json.dump(uf_data_small, uf_small)
            break

mkdir: cannot create directory '../../unqover/data/small/': File exists


In [30]:
answered_questions = []

i=0
for k,example in de_data.items():
    try:
        model_ans = answer_question(example['context'], example['q0']['question'])
#         print(f"question: {example['q0']['question']}, \n model answer: {model_ans}")
        if model_ans != "no answer":
            answered_questions.append({'context': example['context'], 'question': example['q0']['question'], 'model answer': model_ans})

        model_ans = answer_question(example['context'], example['q1']['question'])
#         print(f"question: {example['q1']['question']}, \n model answer: {model_ans}")
        if model_ans != "no answer":
            answered_questions.append({'context': example['context'], 'question': example['q1']['question'], 'model answer': model_ans})

#             ground_truth = example['options']
#             print(f"question: {example['question']}, \n model answer: {model_ans}, \n ground truth: {example['options']}")
#             acc, _ = evaluate_doubly_eponymous(model_ans, ground_truth)


    except Exception as e:
        print(e.__traceback__)
#         traceback.print_exc()
#             print(traceback.format_exc())
    i+=1
    if i>1000: break
        
print(len(answered_questions))

0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, 

0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, 

0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, 

0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, 

0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, 

0


In [31]:
answered_questions

[]

In [None]:
# retriever API

import requests

# query = "In what country is Normandy located?"
query = "What branch of theoretical computer science deals with broadly classifying computational problems by difficulty and class of relationship?"

limit = 10
r = requests.get(f"http://127.0.0.1:5000/search_passages?query={query}&n_docs={limit}")
# r = requests.get(f"http://127.0.0.1:5000/get_document_by_id/{doc_id}")
assert r.status_code == requests.codes.ok

search_results = r.json()
# print('search_results', search_results)

for res in search_results:
    print(res['page'])
    print(res['text'])

In [None]:
# get gender of answers to ambiguous questions

!ls

In [None]:
# with open('data/dev_eval.json') as fh:
#     gold_dict = json.load(fh)
# len(gold_dict)

In [None]:
# Evaluate
    log.info(f'Evaluating on {args.split} split...')
    nll_meter = util.AverageMeter()
    pred_dict = {}  # Predictions for TensorBoard
    sub_dict = {}   # Predictions for submission
    eval_file = vars(args)[f'{args.split}_eval_file']
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)
    with torch.no_grad(), \
            tqdm(total=len(dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            batch_size = cw_idxs.size(0)

            # Forward
            log_p1, log_p2 = model(cw_idxs, qw_idxs)
            y1, y2 = y1.to(device), y2.to(device)
            loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            nll_meter.update(loss.item(), batch_size)

            # Get F1 and EM scores
            p1, p2 = log_p1.exp(), log_p2.exp()
            starts, ends = util.discretize(p1, p2, args.max_ans_len, args.use_squad_v2)

            # Log info
            progress_bar.update(batch_size)
            if args.split != 'test':
                # No labels for the test set, so NLL would be invalid
                progress_bar.set_postfix(NLL=nll_meter.avg)

            idx2pred, uuid2pred = util.convert_tokens(gold_dict,
                                                      ids.tolist(),
                                                      starts.tolist(),
                                                      ends.tolist(),
                                                      args.use_squad_v2)
            pred_dict.update(idx2pred)
            sub_dict.update(uuid2pred)

In [None]:
for i, (cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids) in enumerate(data_loader):

    print(cw_idxs.shape)
    print(cw_idxs)
    
    if i>5: break

In [None]:
data_loader.

In [None]:
# Log results (except for test set, since it does not come with labels)
    if args.split != 'test':
        results = util.eval_dicts(gold_dict, pred_dict, args.use_squad_v2)
        results_list = [('NLL', nll_meter.avg),
                        ('F1', results['F1']),
                        ('EM', results['EM'])]
        if args.use_squad_v2:
            results_list.append(('AvNA', results['AvNA']))
        results = OrderedDict(results_list)

        # Log to console
        results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items())
        log.info(f'{args.split.title()} {results_str}')

        # Log to TensorBoard
        tbx = SummaryWriter(args.save_dir)
        util.visualize(tbx,
                       pred_dict=pred_dict,
                       eval_path=eval_file,
                       step=0,
                       split=args.split,
                       num_visuals=args.num_visuals)

In [None]:
# pred_dict[:10]
# sub_dict
cw_idxs
