In [1]:
%load_ext autoreload
%autoreload 2
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [2]:
import pandas as pd
pd.set_option('display.max_colwidth', -1)

import numpy as np
import sklearn
from sklearn import metrics
import nltk 
from collections import Counter
import matplotlib.pyplot as plt
import itertools
import ast

nltk.download('stopwords')
from nltk.corpus import stopwords 
stop_words = set(stopwords.words('english'))


### BOOTLEG ###
import sys
import load_entity_profiles

[nltk_data] Downloading package stopwords to
[nltk_data]     /lfs/1/simran/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /lfs/1/simran/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Reading in vocab from /dfs/scratch0/lorr1/data_prep/data/wiki0516/entity_db/entity_mappings/entity_all_words/all_words_vocab.marisa
Loaded entity symbols.
Loading alias qid counts from /dfs/scratch0/lorr1/data_prep/data/wiki0516/stats/alias_qid_traindata_withaugment.json
Found 506 hyena types
Found 67110 wikidata types
FINISHED LOADING IN 85.02814197540283


In [6]:
from load_entity_profiles import es, esp, type_vocab, typeid2typename, type_vocab_wd, typeid2typename_wd, alias_qid_count, rel_to_name, du

In [7]:
# Bootleg utility functions:
# BY ALIAS: 
def get_candidates(alias):
    try:
        # To get qid candidates of an alias
        cands = esp.get_qid_cands(alias)
        return cands
    except:
        cands = esp.get_qid_cands(alias)
    
# get_candidates('distortion')

In [8]:
# Manual User Inputs
model = '/dfs/scratch1/simran/bootleg_downstream/error_outputs/baseline_bootleg_dev_rev_ent.csv' #baseline spanbert without bootleg
spanbert = '/dfs/scratch1/simran/bootleg_downstream/error_outputs/predictionsspanbert_dev_0908.csv'
spanboot = '/dfs/scratch1/simran/bootleg_downstream/error_outputs/0925_spanboot_dev_predictions.csv'

In [9]:
LABEL_TO_ID = {'no_relation': 0, 'per:title': 1, 'org:top_members/employees': 2, 'per:employee_of': 3, 
               'org:alternate_names': 4, 'org:country_of_headquarters': 5, 'per:countries_of_residence': 6, 
               'org:city_of_headquarters': 7, 'per:cities_of_residence': 8, 'per:age': 9, 
               'per:stateorprovinces_of_residence': 10, 'per:origin': 11, 'org:subsidiaries': 12, 
               'org:parents': 13, 'per:spouse': 14, 'org:stateorprovince_of_headquarters': 15, 'per:children': 16, 
               'per:other_family': 17, 'per:alternate_names': 18, 'org:members': 19, 'per:siblings': 20, 
               'per:schools_attended': 21, 'per:parents': 22, 'per:date_of_death': 23, 'org:member_of': 24, 
               'org:founded_by': 25, 'org:website': 26, 'per:cause_of_death': 27, 
               'org:political/religious_affiliation': 28, 'org:founded': 29, 'per:city_of_death': 30, 
               'org:shareholders': 31, 'org:number_of_employees/members': 32, 'per:date_of_birth': 33, 
               'per:city_of_birth': 34, 'per:charges': 35, 'per:stateorprovince_of_death': 36, 'per:religion': 37, 
               'per:stateorprovince_of_birth': 38, 'per:country_of_birth': 39, 'org:dissolved': 40, 
               'per:country_of_death': 41}

LABEL_LST = list(LABEL_TO_ID.keys())

STANFORD_NER_TYPES = ['DATE', 'LOCATION', 'MONEY', 'ORGANIZATION', 'PERCENT', 'PERSON', 'TIME']
punctuation = [',', ':', '.', ';', "'", ')', '(', "'s", '--', '-', '``', "''"]

In [10]:
# Utility functions: accepts a df with corrected slices, and outputs the predicted result
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support as score
def score_corrections(df, pred_name):
    import scorer
    scorer.score(df['relation'].tolist(), df[pred_name].tolist(), verbose=True)
    df_errs = df[df['relation'] != df[pred_name]]
    print("Number of Errors: ", df_errs.shape[0])
# score_corrections(df_results)

In [11]:
# basic error rate by relation 
def error_rate_by_relation(df_r, key, df_e):
    relation_df = pd.DataFrame(columns=['relation','error_rate','error_count','total_count'])
    index = 0
    for k, v in LABEL_TO_ID.items():
        df_relation_tot = df_r[df_r[key] == k] # all rows with k = true trelation
        tot = df_relation_tot.shape[0] # number of examples with this true relation
    
        df_relation_err = df_e[df_e[key] == k] # error rows with k = true trelation
        err = df_relation_err.shape[0] # number of errors with this true relation
        error_rate = err/tot
    
        relation_df.loc[index] = pd.Series({'relation':k, 'error_rate':error_rate, 'error_count':err, 'total_count':tot})
        index += 1
    
    print(relation_df.sort_values('relation'))

In [12]:
# Load the model data 
df_results = pd.read_csv(model)
df_results.rename(columns={'prediction':'prediction_model'}, inplace=True)

# Load the spanbert data
df_results_spanbert = pd.read_csv(spanbert)
map_id_pred = {}
for ind, row in df_results_spanbert.iterrows():
    map_id_pred[row['id']] = row['prediction']
df_results['prediction_spanbert'] = df_results['id'].map(map_id_pred)
for key in df_results.columns.values:
    if key != 'prediction':
        df_results_spanbert[key] = df_results[key]
        
        
# Load the spanboot data
df_results_spanboot = pd.read_csv(spanboot)
map_id_pred = {}
for ind, row in df_results_spanboot.iterrows():
    map_id_pred[row['id']] = row['prediction']
df_results['prediction_spanboot'] = df_results['id'].map(map_id_pred)
for key in df_results.columns.values:
    if key != 'prediction':
        df_results_spanboot[key] = df_results[key]
        
# error rates
df_errors = df_results[df_results['relation'] != df_results['prediction_model']]
df_errors_spanbert = df_results[df_results['relation'] != df_results['prediction_spanbert']]
df_errors_spanboot = df_results[df_results['relation'] != df_results['prediction_spanboot']]


print("FULL_model size: ", df_results.shape)
print("ERRS_model size: {}\n".format(df_errors.shape))

print("FULL_baseline size: ", df_results_spanbert.shape)
print("ERRS_baseline size: {}\n".format(df_errors_spanbert.shape))

print("FULL_spanboot size: ", df_results_spanboot.shape)
print("ERRS_spanboot size: ", df_errors_spanboot.shape)



FULL_model size:  (22631, 28)
ERRS_model size: (2596, 28)

FULL_baseline size:  (22631, 28)
ERRS_baseline size: (2145, 28)

FULL_spanboot size:  (22631, 29)
ERRS_spanboot size:  (1897, 28)


In [13]:
print(df_results.columns.values)

['obj' 'obj_mentions' 'subj_qids' 'prop_mentions' 'real_mentions'
 'subj_type' 'obj_type' 'id' 'subj_pos' 'subj_leng' 'obj_qids'
 'subj_mentions' 'qids' 'obj_leng' 'obj_ner' 'stanford_ner' 'subj_ner'
 'prediction_model' 'subj' 'example' 'mentions' 'separation_dist'
 'prop_ner' 'obj_pos' 'relation' 'num_ner' 'prediction_spanbert'
 'prediction_spanboot']


## Analysis by Relation

In [14]:
#error_rate_by_relation(df_results, 'relation_model', df_errors)
relation_df = pd.DataFrame(columns=['relation','errrate_span','errcount_span', 'errrate_boot','errcount_boot','total_count'])
index = 0
for k, v in LABEL_TO_ID.items():
    if df_results_spanbert[df_results_spanbert['relation'] == k].shape[0] > 0:
        df_relation_tot_span = df_results_spanbert[df_results_spanbert['relation'] == k] # all rows with k = true trelation
        tot_span = df_relation_tot_span.shape[0] # number of examples with this true relation
    
        df_relation_err_span = df_errors_spanbert[df_errors_spanbert['relation'] == k] # error rows with k = true trelation
        err_span = df_relation_err_span.shape[0] # number of errors with this true relation
        error_rate_span = err_span/tot_span
        
        df_relation_tot_boot = df_results_spanboot[df_results_spanboot['relation'] == k] # all rows with k = true trelation
        tot_boot = df_relation_tot_boot.shape[0] # number of examples with this true relation
    
        df_relation_err_boot = df_errors_spanboot[df_errors_spanboot['relation'] == k] # error rows with k = true trelation
        err_boot = df_relation_err_boot.shape[0] # number of errors with this true relation
        error_rate_boot = err_boot/tot_boot
    
        relation_df.loc[index] = pd.Series({'relation':k, 'errrate_span':error_rate_span, 'errcount_span':err_span, 'errrate_boot':error_rate_boot,'errcount_boot': err_boot, 'total_count':tot_span})
        index += 1
    
print(relation_df.sort_values('relation'))

                               relation  errrate_span errcount_span  \
0   no_relation                          0.061566      1067           
4   org:alternate_names                  0.109195      38             
7   org:city_of_headquarters             0.268519      29             
5   org:country_of_headquarters          0.410256      80             
40  org:dissolved                        1.000000      1              
29  org:founded                          0.088235      3              
25  org:founded_by                       0.146667      11             
24  org:member_of                        1.000000      7              
19  org:members                          0.125000      8              
32  org:number_of_employees/members      0.125000      3              
13  org:parents                          0.345679      28             
28  org:political/religious_affiliation  0.714286      10             
31  org:shareholders                     0.800000      28             
15  or

# Compare Model vs. Baseline Errors

We want to confirm that the improvement ties back to insights in bootleg - i.e., the mentions/types/relations in bootleg's database.

In [15]:
# get the set difference of errors
df_spanbert_only_errors = df_errors_spanbert[~df_errors_spanbert['id'].isin(df_errors_spanboot['id'])]
df_spanboot_only_errors = df_errors_spanboot[~df_errors_spanboot['id'].isin(df_errors_spanbert['id'])]
print("The #examples our model improves on vs. baseline are: ", df_spanbert_only_errors.shape[0])
print("The #examples our model does worse on vs. baseline are: ", df_spanboot_only_errors.shape[0])

# for i, row in df_spanbert_only_errors.iterrows():
#     if row['relation'] != "no_relation":
#         print(row[['example', 'subj', 'obj', 'subj_qids', 'obj_qids', 'relation', 'prediction_spanboot', 'prediction_spanbert', 'mentions', 'stanford_ner']])
#         print()
#     if i == 10:
#         break

The #examples our model improves on vs. baseline are:  949
The #examples our model does worse on vs. baseline are:  701


In [16]:
# THE DISTRIBUTION of RELATIONS WHEN MODEL IMPOVES OVER SPANBERT
print("THE DISTRIBUTION of RELATIONS WHEN MODEL IMPROVES OVER SPANBERT")
relations_errs_matchqids = []
for index, row in df_spanbert_only_errors.iterrows():
    relations_errs_matchqids.append(row['relation'])
print(Counter(relations_errs_matchqids))
print()

# THE DISTRIBUTION OF RELATIONS WHEN SPANBERT IMPROVES OVER MODEL
print(" THE DISTRIBUTION OF RELATIONS WHEN SPANBERT IMPROVES OVER MODEL")
relations_errs_matchqids = []
for index, row in df_spanboot_only_errors.iterrows():
    relations_errs_matchqids.append(row['relation'])
print(Counter(relations_errs_matchqids))


THE DISTRIBUTION of RELATIONS WHEN MODEL IMPROVES OVER SPANBERT
Counter({'no_relation': 566, 'per:title': 46, 'per:employee_of': 38, 'org:country_of_headquarters': 31, 'per:date_of_death': 27, 'per:cause_of_death': 20, 'per:origin': 19, 'org:city_of_headquarters': 16, 'org:top_members/employees': 16, 'per:age': 15, 'per:cities_of_residence': 12, 'org:alternate_names': 12, 'per:city_of_death': 11, 'per:countries_of_residence': 11, 'per:stateorprovince_of_death': 11, 'per:schools_attended': 11, 'org:stateorprovince_of_headquarters': 10, 'per:charges': 8, 'org:subsidiaries': 8, 'per:country_of_birth': 8, 'per:children': 8, 'org:shareholders': 7, 'per:spouse': 4, 'org:political/religious_affiliation': 4, 'per:stateorprovinces_of_residence': 4, 'org:founded_by': 4, 'per:other_family': 4, 'per:city_of_birth': 3, 'per:religion': 3, 'per:stateorprovince_of_birth': 2, 'org:founded': 2, 'per:date_of_birth': 2, 'org:parents': 2, 'per:siblings': 1, 'per:parents': 1, 'org:members': 1, 'org:number_o

### SPANBERT WORSE THAN BOOTLEG

Here we find that the *number* of proper nouns that have errors and receive bootleg mentions are about the same w/ vs. w/out using bootleg entity embeddings. 

In [17]:
# some constants
proper_noun = ['NNP', 'NNPS']
nonproper_noun = ['NN', 'NNS']
number_pos = ['CD']
nomention = ["['UNK']", "['UNK', 'UNK']", "['UNK', 'UNK', 'UNK']", "['UNK', 'UNK', 'UNK', 'UNK']", 
             "['UNK', 'UNK', 'UNK', 'UNK', 'UNK']"]
cols = ['example', 'relation','prediction_model', 'prediction_spanbert', 'prediction_spanboot', 'mentions', 'subj_pos', 'obj_pos', 'subj', 'obj', 'subj_mentions', 'subj_qids', 'obj_mentions', 'obj_qids', 'id']

In [18]:
def get_relations(subj_qids_str, obj_qids_str):
    subj_qids = ast.literal_eval(subj_qids_str)
    obj_qids = ast.literal_eval(obj_qids_str)
    subj_qid = subj_qids[0]
    obj_qid = obj_qids[0]
    
    rels = [esp.get_relation_name(r) for r in esp.get_all_relations(subj_qid, obj_qid)]
    rels = [rel_to_name.get(rel, rel) for rel in rels]
    return rels

In [19]:
def get_related_entities(qid):
    # Get all connected qids for a given qid
    related_qids = du.get_related_qids(qid, es)
    print(f"Related QIDs {related_qids}")
    print([es.get_title(qid) for qid in related_qids])

    lst = [es.get_title(qid) for qid in related_qids]

In [20]:
lst_pos_subj_spanbert = []
lst_subj_nomention_spanbert = {}
lst_pos_obj_spanbert = []
lst_obj_nomention_spanbert = {}

sub_df = df_spanbert_only_errors[cols]
for index, row in sub_df.iterrows():
    if any(pos in row['subj_pos'] for pos in proper_noun): # is the subj a proper noun?
        lst_pos_subj_spanbert.append(row['subj'])
        if any(null in row['subj_mentions'] for null in nomention): # is the bootleg mention empty for this proper noun?
            lst_subj_nomention_spanbert[row['id']] = row['subj']
            
    if any(pos in row['obj_pos'] for pos in proper_noun):
        lst_pos_obj_spanbert.append(row['obj'])
        if any(null in row['obj_mentions'] for null in nomention):
            lst_obj_nomention_spanbert[row['id']] = row['obj']
        
print("The number of proper noun subj in spanbert errors are:", len(lst_pos_subj_spanbert), "and obj are:", len(lst_pos_obj_spanbert))
print("The number of proper noun subj in spanbert errors that don't get a bootleg mention are:", len(lst_subj_nomention_spanbert.keys()), "and obj are:", len(lst_obj_nomention_spanbert.keys()))
print()

lst_pos_subj_spanbert.extend(lst_pos_obj_spanbert)
print("SPANBERT DOES WORSE THAN BOOTLEG MODEL ON THESE SUBJ/OBJ PROPER NOUNS, THIS MANY TIMES:")#print(len(lst_pos_subj_spanbert))
worst_for_spanbert = Counter(lst_pos_subj_spanbert).most_common(50)
print(worst_for_spanbert)
worst_for_spanbert = [tup[0] for tup in worst_for_spanbert]


The number of proper noun subj in spanbert errors are: 703 and obj are: 648
The number of proper noun subj in spanbert errors that don't get a bootleg mention are: 45 and obj are: 68

SPANBERT DOES WORSE THAN BOOTLEG MODEL ON THESE SUBJ/OBJ PROPER NOUNS, THIS MANY TIMES:
[("['millipore']", 19), ("['eta']", 16), ("['bipartisan', 'policy', 'center']", 16), ("['eco']", 14), ("['us']", 14), ("['julius', 'baer']", 13), ("['thomas', 'more', 'law', 'center']", 13), ("['burlington', 'northern', 'santa', 'fe', 'corp.']", 13), ("['united', 'states']", 13), ("['sasac']", 12), ("['maria', 'kaczynska']", 11), ("['global', 'infrastructure', 'partners']", 11), ("['galleon', 'group']", 11), ("['stockholm', 'international', 'water', 'institute']", 10), ("['access', 'industries']", 10), ("['rosoboronexport']", 10), ("['france']", 10), ("['mikel', 'karrera', 'sarobe']", 9), ("['freedom', 'communications']", 9), ("['washington', 'national', 'opera']", 9), ("['gwathmey']", 9), ("['pakistan']", 9), ("['arca

In [21]:
# Inspecting examples where bootleg had any relation for the subj, obj pair

count = 0
for index, row in df_spanbert_only_errors.iterrows():
    if get_relations(row['subj_qids'], row['obj_qids']):
        count += 1
print("For ", count, " spanbert only errors, bootleg has *some* relation between the subj and obj")

count = 0
for index, row in df_spanbert_only_errors.iterrows():
    rels = get_relations(row['subj_qids'], row['obj_qids'])
    if rels and not any(rel for rel in rels if rel in row['relation']):
#         print(row[['example', 'subj', 'obj', 'subj_qids', 'obj_qids', 'relation', 'prediction_spanboot', 'prediction_spanbert', 'mentions', 'stanford_ner']])
#         print(f"Rels {rels}")
#         print()
        count += 1

print("For ", count, " spanbert only errors, the existing bootleg relation is NOT a subset of the gold relation")


For  120  spanbert only errors, bootleg has *some* relation between the subj and obj
For  104  spanbert only errors, the existing bootleg relation is NOT a subset of the gold relation


In [22]:
# alternate names
alternate_name_errs = df_spanbert_only_errors[df_spanbert_only_errors['relation'].str.contains("alternate")].shape
print("For ", alternate_name_errs[0], " spanbert errors, the gold label is ALTERNATE NAMES ")


For  12  spanbert errors, the gold label is ALTERNATE NAMES 


In [23]:
# Inspecting examples where bootleg had a relation for the subj, obj pair
# booterrors_with_relation_df = df_errors[df_errors['id'].isin(ids_missed_propernoun_with_bootrels)]
count = 0
for index, row in df_spanbert_only_errors.iterrows():
    rels = get_relations(row['subj_qids'], row['obj_qids'])
    if rels and any(rel for rel in rels if rel in row['relation']):
#         print(row[['example', 'subj', 'obj', 'subj_qids', 'obj_qids', 'relation', 'prediction_model', 'prediction_spanbert', 'mentions']])
#         print(f"Rels {rels}")
#         print()
        count += 1
        
print("For ", count, " spanbert errors, the bootleg relation is a subset (or the same) as the gold relation")


For  16  spanbert errors, the bootleg relation is a subset (or the same) as the gold relation


In [24]:
# Inspecting examples where bootleg had a relation for the subj, obj pair
# booterrors_with_relation_df = df_errors[df_errors['id'].isin(ids_missed_propernoun_with_bootrels)]
count = 0
for index, row in df_spanbert_only_errors.iterrows():
    rels = get_relations(row['subj_qids'], row['obj_qids'])
    if rels and any(rel for rel in rels if rel in row['prediction_spanbert']):
#         print(row[['example', 'subj', 'obj', 'subj_qids', 'obj_qids', 'relation', 'prediction_model', 'prediction_spanbert', 'mentions']])
#         print(f"Rels {rels}")
#         print()
        count += 1
        
print("For ", count, " spanbert errors, the bootleg relation is a subset (or the same) as the spanbert prediction")


For  7  spanbert errors, the bootleg relation is a subset (or the same) as the spanbert prediction


### FOR EXAMPLES WHERE SPANBERT IS BETTER THAN BOOTLEG

In [25]:
lst_pos_subj_model = []
lst_subj_nomention_model = {}
lst_pos_obj_model = []
lst_obj_nomention_model= {}

sub_df = df_spanboot_only_errors[cols]
for index, row in sub_df.iterrows():
    if any(pos in row['subj_pos'] for pos in proper_noun): # is the subj a proper noun?
        lst_pos_subj_model.append(row['subj'])
        if any(null in row['subj_mentions'] for null in nomention): # is the bootleg mention empty for this proper noun?
            lst_subj_nomention_model[row['id']] = row['subj']
            
    if any(pos in row['obj_pos'] for pos in proper_noun):
        lst_pos_obj_model.append(row['obj'])
        if any(null in row['obj_mentions'] for null in nomention):
            lst_obj_nomention_model[row['id']] = row['obj']
        
print("The number of proper noun subj in bootleg (&not spanbert) errors are:", len(lst_pos_subj_spanbert), "and obj are:", len(lst_pos_obj_spanbert))
print("The number of proper noun subj in bootleg (&not spanbert) errors that don't get a bootleg mention are:", len(lst_subj_nomention_spanbert.keys()), "and obj are:", len(lst_obj_nomention_spanbert.keys()))
print()

lst_pos_subj_model.extend(lst_pos_obj_model)
print("SPANBERT DOES BETTER THAN BOOTLEG MODEL ON THESE SUBJ/OBJ PROPER NOUNS")#print(len(lst_pos_subj_spanbert))
worst_for_model = Counter(lst_pos_subj_model).most_common(50)
print(worst_for_model)
worst_for_model = [tup[0] for tup in worst_for_model]

The number of proper noun subj in bootleg (&not spanbert) errors are: 1351 and obj are: 648
The number of proper noun subj in bootleg (&not spanbert) errors that don't get a bootleg mention are: 45 and obj are: 68

SPANBERT DOES BETTER THAN BOOTLEG MODEL ON THESE SUBJ/OBJ PROPER NOUNS
[("['eco']", 31), ("['millipore']", 18), ("['global', 'infrastructure', 'partners']", 16), ("['access', 'industries']", 15), ("['maria', 'kaczynska']", 13), ("['galleon', 'group']", 11), ("['tahawwur', 'hussain', 'rana']", 11), ("['eta']", 11), ("['rosoboronexport']", 10), ("['u.s.']", 10), ("['burlington', 'northern', 'santa', 'fe', 'corp.']", 9), ("['clifton']", 8), ("['mohammed', 'sayed', 'tantawi']", 8), ("['freedom', 'communications']", 8), ("['access', 'industries', 'inc.']", 8), ("['dj', 'am']", 8), ("['len', 'blavatnik']", 8), ("['stuart', 'rose']", 7), ("['united', 'steelworkers']", 7), ("['douglas', 'flint']", 7), ("['orlando', 'zapata']", 7), ("['escada']", 7), ("['adam', 'goldstein']", 7), ("[

In [26]:
# model_worse_than_spanbert = [tup for tup in worst_for_model if tup not in worst_for_spanbert]
# for item in model_worse_than_spanbert:
#     item = ast.literal_eval(item)
#     item = ' '.join(item)
#     cands = get_candidates(item)
#     print("Cands for ", item, " are: ", f"Cands {cands}", "\n")

In [27]:
sub_df = df_spanboot_only_errors[cols] #BOOTLEG ERRORS, and NOT BASELINE ERRORS
count_missed_propernoun = 0
count_missed_propernoun_with_bootrels = 0
ids_missed_propernoun_with_bootrels = []
for index, row in sub_df.iterrows():
    if any(pos in row['subj_pos'] for pos in proper_noun) or any(pos in row['obj_pos'] for pos in proper_noun):
#        print(row[['example', 'subj', 'obj', 'subj_qids', 'obj_qids', 'relation', 'prediction_model', 'prediction_spanbert', 'mentions']])
        rels = get_relations(row['subj_qids'], row['obj_qids'])
        if rels:
            count_missed_propernoun_with_bootrels += 1
            ids_missed_propernoun_with_bootrels.append(row['id'])
#         print(f"Rels {rels}")
#         print()
        count_missed_propernoun += 1
        
print("Number of examples where bootleg had a relation for the subj, obj pair: ", count_missed_propernoun_with_bootrels)
print("Number of examples where subj and/or obj is a proper noun: ", count_missed_propernoun)


Number of examples where bootleg had a relation for the subj, obj pair:  104
Number of examples where subj and/or obj is a proper noun:  649


In [28]:
# Inspecting examples where bootleg had any relation for the subj, obj pair
# booterrors_with_relation_df = df_errors[df_errors['id'].isin(ids_missed_propernoun_with_bootrels)]
count = 0
for index, row in df_spanboot_only_errors.iterrows():
    if get_relations(row['subj_qids'], row['obj_qids']):
        count += 1
print("For ", count, " bootleg errors, bootleg has *some* relation between the subj and obj\n")


print("For these bootleg errors, the existing bootleg relation is NOT a string-subset of the gold relation")
count = 0
for index, row in df_spanboot_only_errors.iterrows():
    rels = get_relations(row['subj_qids'], row['obj_qids'])
    if rels and not any(rel for rel in rels if rel in row['relation']):
#         print(row[['example', 'subj', 'obj', 'subj_qids', 'obj_qids', 'relation', 'prediction_model', 'prediction_spanbert', 'mentions']])
#         print(f"Rels {rels}")
#         print()
        count += 1

print("For ", count, " bootleg errors, the existing bootleg relation is NOT a subset of the gold relation")


For  104  bootleg errors, bootleg has *some* relation between the subj and obj

For these bootleg errors, the existing bootleg relation is NOT a string-subset of the gold relation
For  93  bootleg errors, the existing bootleg relation is NOT a subset of the gold relation


In [29]:
# Inspecting examples where bootleg had a relation for the subj, obj pair
# booterrors_with_relation_df = df_errors[df_errors['id'].isin(ids_missed_propernoun_with_bootrels)]
count = 0
for index, row in df_spanboot_only_errors.iterrows():
    rels = get_relations(row['subj_qids'], row['obj_qids'])
    if rels and any(rel for rel in rels if rel in row['relation']):
#         print(row[['example', 'subj', 'obj', 'subj_qids', 'obj_qids', 'relation', 'prediction_model', 'prediction_spanbert', 'mentions']])
#         print(f"Rels {rels}")
#         print()
        count += 1
        
print("For ", count, " bootleg errors, the bootleg relation is a subset (or the same) as the gold relation")


For  11  bootleg errors, the bootleg relation is a subset (or the same) as the gold relation


In [30]:
# Inspecting examples where bootleg had a relation for the subj, obj pair
# booterrors_with_relation_df = df_errors[df_errors['id'].isin(ids_missed_propernoun_with_bootrels)]
count = 0
for index, row in df_errors.iterrows():
    rels = get_relations(row['subj_qids'], row['obj_qids'])
    if rels and any(rel for rel in rels if rel in row['prediction_model']):
#         print(row[['example', 'subj', 'obj', 'subj_qids', 'obj_qids', 'relation', 'prediction_model', 'prediction_spanbert', 'mentions']])
#         print(f"Rels {rels}")
#         print()
        count += 1
        
print("For ", count, " bootleg errors, the bootleg relation is a subset (or the same) as the bootleg prediction")


For  6  bootleg errors, the bootleg relation is a subset (or the same) as the bootleg prediction


#### For these proper nouns that don't get a bootleg mention, is it because they are not in bootleg's database?

In [31]:
#print(lst_subj_nomention_model)

no_candidates_values = []
has_candidates_values = []
# alias is the subj/obj that did not receive a bootleg mention
for idx, value in lst_subj_nomention_model.items():
    row = df_results[df_results['id'] == idx]   
    value = ast.literal_eval(value)
    alias = " ".join(value)
    try:
        cands = esp.get_qid_cands(alias)
        #print("{}\n{}\n{}\n\n".format(row['example'], alias, f"Cands {cands}") # uncomment to view the alias in context downstream
        has_candidates_values.append(alias)
    except: 
        no_candidates_values.append(alias)

print("Of these values that received no bootleg mention,", len(has_candidates_values), "(total)/", len(set(has_candidates_values)), "(unique), ARE IN the bootleg database.")

######## THIS SET OF ENTITIES SIMPLY DON'T HAVE CANDIDATES FOR THEM IN BOOTLEG! WE SHOULD ADD THEM! ############
print("Of these values that received no bootleg mention,", len(no_candidates_values), "(total)/", len(set(no_candidates_values)), "(unique), ARE NOT IN the bootleg database.")
print()
print(set(no_candidates_values))


Of these values that received no bootleg mention, 3 (total)/ 2 (unique), ARE IN the bootleg database.
Of these values that received no bootleg mention, 37 (total)/ 9 (unique), ARE NOT IN the bootleg database.

{'sarobe', 'millipore', 'economic cooperation organisation', 'millipore corp', 'southgobi energy resources', 'attenti holdings', 'chinaco', 'election complaints commission', 'millipore corp.'}


#### Error rates for words

In [32]:
nomention_model_not_spanbert = []
sub_df = df_spanbert_only_errors[cols]
for index, row in sub_df.iterrows():
    qids = row['mentions']
    qids = ast.literal_eval(qids)
    words = row['example'].split(' ')
    for i in range(len(words)):
        if qids[i] == 'UNK' and words[i] not in stop_words and words[i] not in punctuation:
            nomention_model_not_spanbert.append(words[i])

print("WORSE FOR SPANBERT THAN MODEL")
ctr_nomention = Counter(nomention_model_not_spanbert).most_common(50)
print(ctr_nomention)


nomention_spanbert_or_model = []
sub_df = df_spanboot_only_errors[cols]
for index, row in sub_df.iterrows():
    qids = row['mentions']
    qids = ast.literal_eval(qids)
    words = row['example'].split(' ')
    for i in range(len(words)):
        if qids[i] == 'UNK' and words[i] not in stop_words and words[i] not in punctuation:
            nomention_spanbert_or_model.append(words[i])
            
print("WORSE FOR MODEL THAN SPANBERT")
print(Counter(nomention_spanbert_or_model).most_common(50))


WORSE FOR SPANBERT THAN MODEL
[('said', 245), ('-rrb-', 119), ('-lrb-', 113), ('died', 66), ('former', 48), ('$', 48), ('billion', 46), ('company', 45), ('president', 44), ('also', 43), ('two', 41), ('group', 40), ('last', 40), ('years', 38), ('year', 37), ('home', 37), ('would', 37), ('million', 36), ('one', 35), ('percent', 35), ('new', 32), ('cancer', 29), ('became', 29), ('family', 29), ('told', 29), ('including', 28), ('found', 27), ('death', 27), ('spokesman', 27), ('son', 25), ('three', 25), ('statement', 25), ('millipore', 25), ('us', 25), ('sunday', 25), ('senior', 24), ('career', 24), ('known', 24), ('ago', 23), ('monday', 23), ('leader', 23), ('whose', 23), ('director', 23), ('world', 23), ('father', 22), ('wife', 22), ('tuesday', 22), ('wednesday', 21), ('firm', 21), ('first', 21)]
WORSE FOR MODEL THAN SPANBERT
[('said', 172), ('-rrb-', 77), ('-lrb-', 75), ('including', 45), ('billion', 44), ('last', 39), ('president', 39), ('years', 39), ('members', 38), ('died', 36), ('on

In [33]:
errates_model = {}
for index, row in df_results.iterrows():
    err = 0
    if row['relation'] != row['prediction_spanboot']:
        err = 1
    words = row['example'].split(' ')
    for i in range(len(words)):
        if words[i] not in stop_words and words[i] not in punctuation:
            if words[i] in errates_model:
                errates_model[words[i]]['count'] += 1
                errates_model[words[i]]['errs'] += err
            else:
                errates_model[words[i]] = {}
                errates_model[words[i]]['count'] = 1
                errates_model[words[i]]['errs'] = err

for k, v in errates_model.items():
    errates_model[k]['errrate'] = errates_model[k]['errs']/errates_model[k]['count'] 
    if errates_model[k]['errs'] > 30:
        print(k, v)


time {'count': 600, 'errs': 40, 'errrate': 0.06666666666666667}
chief {'count': 895, 'errs': 117, 'errrate': 0.13072625698324022}
financial {'count': 345, 'errs': 32, 'errrate': 0.0927536231884058}
chairman {'count': 578, 'errs': 45, 'errrate': 0.07785467128027682}
government {'count': 589, 'errs': 48, 'errrate': 0.08149405772495756}
u.s. {'count': 782, 'errs': 81, 'errrate': 0.10358056265984655}
bank {'count': 778, 'errs': 50, 'errrate': 0.06426735218508997}
julius {'count': 308, 'errs': 35, 'errrate': 0.11363636363636363}
baer {'count': 331, 'errs': 37, 'errrate': 0.11178247734138973}
former {'count': 993, 'errs': 83, 'errrate': 0.08358509566968782}
paris {'count': 266, 'errs': 41, 'errrate': 0.15413533834586465}
french {'count': 579, 'errs': 45, 'errrate': 0.07772020725388601}
media {'count': 289, 'errs': 32, 'errrate': 0.11072664359861592}
reported {'count': 422, 'errs': 40, 'errrate': 0.0947867298578199}
found {'count': 445, 'errs': 47, 'errrate': 0.10561797752808989}
dead {'count

In [34]:
errates_spanbert = {}
for index, row in df_results.iterrows():
    err = 0
    if row['relation'] != row['prediction_spanbert']:
        err = 1
    words = row['example'].split(' ')
    for i in range(len(words)):
        if words[i] not in punctuation and words[i] not in stop_words:
            if words[i] in errates_spanbert:
                errates_spanbert[words[i]]['count'] += 1
                errates_spanbert[words[i]]['errs'] += err
            else:
                errates_spanbert[words[i]] = {}
                errates_spanbert[words[i]]['count'] = 1
                errates_spanbert[words[i]]['errs'] = err

for k, v in errates_model.items():
    if k in errates_spanbert:
        spanbert_error_ct = errates_spanbert[k]['errs']
    else:
        spanbert_error_ct = 0
    
    spanboot_error_ct = errates_model[k]['errs']
    if (spanboot_error_ct-spanbert_error_ct) >= 5 and spanboot_error_ct>8:
        print("{}: spanbert_errs {}; spanboot_errs {}".format(k, spanbert_error_ct, spanboot_error_ct))
#     elif abs(spanboot_error_ct) > 1:
#         print(errates_model[k])
#         print(errates_spanbert[k])

chief: spanbert_errs 107; spanboot_errs 117
stephen: spanbert_errs 5; spanboot_errs 10
accused: spanbert_errs 20; spanboot_errs 25
inc.: spanbert_errs 44; spanboot_errs 53
film: spanbert_errs 11; spanboot_errs 19
start: spanbert_errs 23; spanboot_errs 30
executive: spanbert_errs 83; spanboot_errs 93
death: spanbert_errs 80; spanboot_errs 85
appointment: spanbert_errs 16; spanboot_errs 23
middle: spanbert_errs 8; spanboot_errs 13
workers: spanbert_errs 23; spanboot_errs 29
including: spanbert_errs 65; spanboot_errs 82
50: spanbert_errs 15; spanboot_errs 21
since: spanbert_errs 56; spanboot_errs 61
dissident: spanbert_errs 14; spanboot_errs 20
tamayo: spanbert_errs 8; spanboot_errs 13
de: spanbert_errs 18; spanboot_errs 23
february: spanbert_errs 21; spanboot_errs 28
23: spanbert_errs 20; spanboot_errs 26
british: spanbert_errs 32; spanboot_errs 45
wife: spanbert_errs 68; spanboot_errs 80
plotting: spanbert_errs 3; spanboot_errs 9
hotels: spanbert_errs 17; spanboot_errs 24
maria: spanber

# Just Dig into Model Errors

### For models that used manual aliases, what happened on alias-examples

In [35]:
aliases_to_add = {}
aliases_to_add['Millipore'] = {'alias':'merck millipore', 'qid':'Q1669719', 'len':1}
aliases_to_add['US '] = {'alias':'united states', 'qid':'Q30', 'len':1}
aliases_to_add['U.S.'] = {'alias':'united states', 'qid':'Q30', 'len':1}
aliases_to_add['WNO'] = {'alias':"washington national opera", 'qid':'Q386613', 'len':1}

for k, v in errates_model.items():
    for alias, content in aliases_to_add.items():
        if alias.lower() in k:  
            if k in errates_spanbert:
                spanbert_error_ct = errates_spanbert[k]['errs']
            else:
                spanbert_error_ct = 0
            spanboot_error_ct = errates_model[k]['errs']
            print("{}: spanbert_errs {}; spanboot_errs {}".format(k, spanbert_error_ct, spanboot_error_ct))
            

u.s.: spanbert_errs 86; spanboot_errs 81
u.s.-based: spanbert_errs 2; spanboot_errs 2
millipore: spanbert_errs 58; spanboot_errs 58
wno: spanbert_errs 5; spanboot_errs 2
http://www.millipore.com/edm/files/images/$file/spacergraphic.gif: spanbert_errs 0; spanboot_errs 0
u.s.-born: spanbert_errs 1; spanboot_errs 1
u.s.-iraqi: spanbert_errs 0; spanboot_errs 0
china-u.s.: spanbert_errs 0; spanboot_errs 1
u.s.a.: spanbert_errs 0; spanboot_errs 0
u.s.-run: spanbert_errs 0; spanboot_errs 0
anti-u.s.: spanbert_errs 0; spanboot_errs 0


In [36]:
for ind, row in df_errors_spanboot.iterrows():
    if ' he ' in row['example']:
        print(row[cols])
        print()

example                a former us ambassador and decorated marine corps veteran of the world war ii battle at iwo jima , marshall was accused of callously cheating and lying to his mother about her finances which he appropriated to live a lavish lifestyle .                                                                                                                                                                                                                               
relation               per:origin                                                                                                                                                                                                                                                                                                                                                                                                                                                                
prediction_model       no_relation  

example                `` al-hakim was a big brother and a strong supporter during the struggle against the former regime , and he was a major player in the process of building the new iraq , '' prime minister nouri al-maliki said in a statement .                                                                                                                                                                                                                  
relation               per:origin                                                                                                                                                                                                                                                                                                                                                                                                                                        
prediction_model       per:countries_of_residence                                   

example                last january , he told the magazine paris match : `` i do not want to die at 50 years .                                                                            
relation               no_relation                                                                                                                                                        
prediction_model       no_relation                                                                                                                                                        
prediction_spanbert    no_relation                                                                                                                                                        
prediction_spanboot    per:age                                                                                                                                                            
mentions               ['UNK', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK',

In [37]:
count = 0
ids_spanboot = []
pronouns = ['he', 'she', 'her', 'his', 'him']
for index, row in df_errors_spanboot.iterrows():
    subj = ast.literal_eval(row['subj'])
    obj = ast.literal_eval(row['obj'])
    if any(s for s in subj if s in pronouns) or any(o for o in obj if o in pronouns):
        print(row[cols])
        ids_spanboot.append(row['id'])
        count += 1
        
count
      


example                he is survived by his third wife , former television news correspondent marilyn berger ; his sons , steven and jeffrey ; his daughter , lisa cassara ; his stepdaughter , jilian childers hewitt , whom hewitt adopted ; and three grandchildren .                                                                                                                                                                                             
relation               no_relation                                                                                                                                                                                                                                                                                                                                                                                                                                    
prediction_model       no_relation                                                        

example                she also had success with barry as the duo the raindrops with the songs `` what a guy '' and `` the kind of boy you ca n't forget . ''                                                                                                                                                             
relation               per:employee_of                                                                                                                                                                                                                                                                                    
prediction_model       no_relation                                                                                                                                                                                                                                                                                        
prediction_spanbert    per:employee_of                 

example                we have lost a great man and a great artist , but we celebrate his extraordinary life , his art , and the dancers and the artists with whom he worked , '' said judith fishman , chairman of the cunningham dance foundation .                                                                                                                                                          
relation               per:title                                                                                                                                                                                                                                                                                                                                                                               
prediction_model       no_relation                                                                                                                                                                      

example                hariri is a business graduate of georgetown university in washington and heads his late father 's saudi-based construction firm , saudi oger .                                                                                          
relation               org:top_members/employees                                                                                                                                                                                                               
prediction_model       no_relation                                                                                                                                                                                                                             
prediction_spanbert    no_relation                                                                                                                                                                                                      

example                sixty-two year-old dudu topaz is a household name in israel for his popular variety shows and decades-long showbiz career , although his career has been on the wane .                                                                                                              
relation               no_relation                                                                                                                                                                                                                                                                         
prediction_model       no_relation                                                                                                                                                                                                                                                                         
prediction_spanbert    per:age                                                                      

399

In [38]:
# count = 0
# ids_spanboot = []
# for index, row in df_errors_spanboot.iterrows():
#     if row['relation'] == 'per:age':
#         row_spanbert = df_errors_spanbert[row['id']].item()
#         row_spanbert['pred']
            
        
# count

In [39]:
count = 0
ids_spanbert = []
pronouns = ['he', 'she', 'her', 'his', 'him']
for index, row in df_errors_spanbert.iterrows():
    subj = ast.literal_eval(row['subj'])
    obj = ast.literal_eval(row['obj'])
    if any(s for s in subj if s in pronouns) or any(o for o in obj if o in pronouns):
        # print(row[cols])
        ids_spanbert.append(row['id'])
        count += 1
        
count
      

514

In [40]:
print(len(list(set(ids_spanboot) - set(ids_spanbert))))
print(len(list(set(ids_spanbert) - set(ids_spanboot))))

137
252


# Tacred paper insights

### For different relations, what is the error rate with and without using Bootleg embeddings?

In [41]:
count_boot = 0
for index, row in df_errors_spanboot.iterrows():
    if get_relations(row['subj_qids'], row['obj_qids']):
        count_boot += 1
        
count_bert = 0
for index, row in df_errors_spanbert.iterrows():
    if get_relations(row['subj_qids'], row['obj_qids']):
        count_bert += 1
        
print("For {}/{} errors, spanboot has *some* relation between the subj and obj\n".format(count_boot, df_results_spanboot.shape[0]))
print("For {}/{} errors, spanbert has *some* relation between the subj and obj\n".format(count_bert, df_results_spanbert.shape[0]))


For 328/22631 errors, spanboot has *some* relation between the subj and obj

For 344/22631 errors, spanbert has *some* relation between the subj and obj



### If subj and/or obj in tacred example has a Bootleg type, what are the error rates?

In [42]:
import json
typeqid_to_id = json.load(open("/dfs/scratch0/lorr1/bootleg/embs/wikidata_to_typeid.json"))

wikidata_types_wiki_filt_to_id = json.load(open('/dfs/scratch1/simran/tacred/tacred-relation-bootleg/dataset_bootleg_cidr_model/bootleg_downloads/emb_data/wikidata_types_wiki_filt.json'))

def get_type_ids(df):
    type_dict = {}
    
    # analysis structures
    row_ids_subj = []
    row_ids_obj = []
    exists_subj = 0
    subj_not_mapped = 0
    exists_obj = 0
    obj_not_mapped = 0
    not_exists = 0
    not_in_map_list = []
    
    for ind, row in df.iterrows():
        subj_qids = ast.literal_eval(row['subj_qids'])
        obj_qids = ast.literal_eval(row['obj_qids'])
        idx = row['id']
                     
        # get the first type in the types list (TODO, better way to do this in the future!)    
        if subj_qids[0] and subj_qids[0] != 'UNK':
            if subj_qids[0] in wikidata_types_wiki_filt_to_id:
                type_subj_id_lst = wikidata_types_wiki_filt_to_id[subj_qids[0]]
                if type_subj_id_lst:
                    row_ids_subj.append(idx)
                    exists_subj +=1
                else:
                    not_exists += 1
            else:
                subj_not_mapped += 1
                # print(subj_qids[0])

        if obj_qids[0] and obj_qids[0] != 'UNK':
            if obj_qids[0] in wikidata_types_wiki_filt_to_id:
                type_obj_id_lst = wikidata_types_wiki_filt_to_id[obj_qids[0]]
                if type_obj_id_lst:
                    row_ids_obj.append(idx)
                    exists_obj +=1
                else:
                    not_exists += 1
            else:
                obj_not_mapped += 1
                # print(obj_qids[0])

    print("The number of subjects with a type relation: ", exists_subj)
    print("The number of objects with a type relation: ",exists_obj)   
    
    return row_ids_subj, row_ids_obj


In [43]:
row_ids_subj, row_ids_obj = get_type_ids(df_results_spanboot)

The number of subjects with a type relation:  15671
The number of objects with a type relation:  13111


In [44]:
subj_df = df_results[df_results['id'].isin(row_ids_subj)]

tot_subj = subj_df.shape
spanbert_errs = 0
spanboot_errs = 0
for ind, row in subj_df.iterrows():
    if row['relation'] != row['prediction_spanbert']:
        spanbert_errs += 1
    if row['relation'] != row['prediction_spanboot']:
        spanboot_errs += 1
        
print("For {}/{} subj_df examples, spanboot makes an error".format(spanboot_errs, tot_subj))
print("For {}/{} subj_df examples, spanbert makes an error".format(spanbert_errs, tot_subj))


For 1408/(15671, 28) subj_df examples, spanboot makes an error
For 1542/(15671, 28) subj_df examples, spanbert makes an error


In [45]:
obj_df = df_results[df_results['id'].isin(row_ids_obj)]

tot_obj = obj_df.shape
spanbert_errs = 0
spanboot_errs = 0
for ind, row in obj_df.iterrows():
    if row['relation'] != row['prediction_spanbert']:
        spanbert_errs += 1
    if row['relation'] != row['prediction_spanboot']:
        spanboot_errs += 1
        
print("For {}/{} obj_df examples, spanboot makes an error".format(spanboot_errs, tot_obj))
print("For {}/{} obj_df examples, spanbert makes an error".format(spanbert_errs, tot_obj))


For 1418/(13111, 28) obj_df examples, spanboot makes an error
For 1601/(13111, 28) obj_df examples, spanbert makes an error


In [46]:
# either the subj/obj/both have a type!

all_subj_and_obj_ids = row_ids_subj
all_subj_and_obj_ids.extend(row_ids_obj) 
all_subj_and_obj_ids = set(all_subj_and_obj_ids)
all_type_df = df_results[df_results['id'].isin(all_subj_and_obj_ids)]

tot_type = all_type_df.shape
spanbert_errs = 0
spanboot_errs = 0
for ind, row in all_type_df.iterrows():
    if row['relation'] != row['prediction_spanbert']:
        spanbert_errs += 1
    if row['relation'] != row['prediction_spanboot']:
        spanboot_errs += 1
        
print("For {}/{} subj_df examples, spanboot makes an error".format(spanboot_errs, tot_type))
print("For {}/{} subj_df examples, spanbert makes an error".format(spanbert_errs, tot_type))
print("Error Rate SpanBERT/SpanBOOT = {}".format((spanbert_errs/tot_type[0])-(spanboot_errs/tot_type[0])))

For 1774/(19309, 28) subj_df examples, spanboot makes an error
For 1981/(19309, 28) subj_df examples, spanbert makes an error
Error Rate SpanBERT/SpanBOOT = 0.010720389455694243


In [47]:
print(1778/19384)
print(df_errors_spanboot.shape[0]/df_results.shape[0])

0.09172513413124227
0.0838230745437674


#### alternate names case

In [48]:
#SPANBOOT
count_boot_errors = 0
relations_errs_matchqids = []
for index, row in df_errors_spanboot.iterrows():
    # subj qid
    subj_qids_str = row['subj_qids']
    subj_qids = ast.literal_eval(subj_qids_str)
    subj_qid = subj_qids[0]
    
    # obj qid
    obj_qids_str = row['obj_qids']
    obj_qids = ast.literal_eval(obj_qids_str)
    obj_qid = obj_qids[0]
    
    if subj_qid == obj_qid and subj_qid != 'UNK':
        count_boot_errors += 1
        relations_errs_matchqids.append(row['relation'])
print(Counter(relations_errs_matchqids))

#SPANBERT
count_bert_errors = 0
relations_errs_matchqids = []
for index, row in df_errors_spanbert.iterrows():
    # subj qid
    subj_qids_str = row['subj_qids']
    subj_qids = ast.literal_eval(subj_qids_str)
    subj_qid = subj_qids[0]
    
    # obj qid
    obj_qids_str = row['obj_qids']
    obj_qids = ast.literal_eval(obj_qids_str)
    obj_qid = obj_qids[0]
    
    if subj_qid == obj_qid and subj_qid != 'UNK':
        count_bert_errors += 1
        relations_errs_matchqids.append(row['relation'])
print(Counter(relations_errs_matchqids))

count_results = 0
relations_matchqids = []
for index, row in df_results.iterrows():
    # subj qid
    subj_qids_str = row['subj_qids']
    subj_qids = ast.literal_eval(subj_qids_str)
    subj_qid = subj_qids[0]
    
    # obj qid
    obj_qids_str = row['obj_qids']
    obj_qids = ast.literal_eval(obj_qids_str)
    obj_qid = obj_qids[0]
    
    if subj_qid == obj_qid and subj_qid != 'UNK':
        count_results += 1
        relations_matchqids.append(row['relation'])
        
print(Counter(relations_matchqids))
count_results

print("total instances of two matching qids subj and obj: ", count_results)
print("BOOT errors: ", count_boot_errors)
print("BERT errors: ", count_bert_errors)

Counter({'no_relation': 34, 'org:alternate_names': 13, 'per:alternate_names': 8, 'org:subsidiaries': 7, 'org:parents': 3, 'per:parents': 1, 'org:founded_by': 1})
Counter({'no_relation': 39, 'org:alternate_names': 18, 'org:subsidiaries': 5, 'org:parents': 3, 'org:founded_by': 1, 'per:cities_of_residence': 1, 'per:alternate_names': 1})
Counter({'org:alternate_names': 187, 'no_relation': 181, 'org:subsidiaries': 15, 'per:alternate_names': 12, 'per:children': 6, 'per:parents': 4, 'org:parents': 3, 'org:founded_by': 1, 'per:employee_of': 1, 'per:spouse': 1, 'per:cities_of_residence': 1})
total instances of two matching qids subj and obj:  412
BOOT errors:  67
BERT errors:  68


In [65]:
#SPANBOOT
count_boot_errors = 0
relations_errs_matchqids = []
sub_df_boot = df_errors_spanboot[df_errors_spanboot['relation'] == 'org:alternate_names']
sub_df_bert = df_errors_spanbert[df_errors_spanbert['relation'] == 'org:alternate_names']
sub_df_total = df_results[df_results['relation'] == 'org:alternate_names']

print("Total Alternate Names ", sub_df_total.shape)
print("BOOT errors: ", sub_df_boot.shape)
print("BERT errors: ", sub_df_bert.shape)

print(sub_df_bert.shape[0]/sub_df_boot.shape[0])

# not a match for bootleg's two mentions extracted in error examples
#sub_df_boot[['subj_mentions', 'obj_mentions']]
# count_not_equal = 0
# for ind, row in sub_df_boot.iterrows():
#     subj_mentions = ast.literal_eval(row['subj_mentions'])
#     obj_mentions = ast.literal_eval(row['obj_mentions'])
#     if subj_mentions[0] != obj_mentions[0]:
#         count_not_equal += 1
#         print(row['subj'], row['obj'])
# print(count_not_equal)


# a match for bootleg's two mentions across ALL EXAMPLES
count_equal = 0
for ind, row in sub_df_total.iterrows():
    subj_mentions = ast.literal_eval(row['subj_mentions'])
    obj_mentions = ast.literal_eval(row['obj_mentions'])
    if subj_mentions[0] == obj_mentions[0]:
        count_equal += 1
        #print(row['subj'], row['obj'])
print(count_equal)

Total Alternate Names  (348, 28)
BOOT errors:  (33, 28)
BERT errors:  (38, 28)
1.1515151515151516
187


### Error rate by number of entities in the Bootleg example

In [None]:
import statistics

In [108]:
def split_by_median(dict_of_errors, median):
    lt_median_count = 0
    mt_median_count = 0
    lt_median_error = 0
    mt_median_error = 0
    for k, v in dict_of_errors.items():
        if k < median:
            lt_median_count += v['count']
            lt_median_error += v['error']
        else:
            mt_median_count += v['count']
            mt_median_error += v['error']
    print("Counts: ", lt_median_count, mt_median_count)
    print(lt_median_error/lt_median_count, mt_median_error/mt_median_count)
    
    lt_median_len = []
    mt_median_len = []
    for k, v in dict_of_errors.items():
        if k < median:
            lt_median_len.append(v['len'])
        else:
            mt_median_len.append(v['len'])
            
    print(sum(lt_median_len)/lt_median_count, sum(mt_median_len)/mt_median_count)
    

In [110]:
dict_of_errors_spanbert = {}
num_mentions_spanbert = []

dict_of_errors_spanboot = {}
num_mentions_spanboot = []

for index, row in df_results.iterrows():
    
    
    words = row['example'].split(' ')
    word_count = len([word for word in words if word not in stop_words and word not in punctuation])
    mentions = ast.literal_eval(row['qids'])
    non_unk_mentions = len([ment for ment in mentions if ment != 'UNK' and ment != 'NF'])/word_count
    subj_qids = row['subj_qids']
    obj_qids = row['obj_qids']
    rels = get_relations(row['subj_qids'], row['obj_qids'])
    has_rel = 0 
    if rels:
        has_rel = 0
    
    err = 0
    if row['relation'] != row['prediction_spanbert']:
        err = 1
    if non_unk_mentions in dict_of_errors_spanbert:
        dict_of_errors_spanbert[non_unk_mentions]['count'] += 1
        dict_of_errors_spanbert[non_unk_mentions]['error'] += err
        dict_of_errors_spanbert[non_unk_mentions]['len'] += len(words)
    else:
        dict_of_errors_spanbert[non_unk_mentions] = {}
        dict_of_errors_spanbert[non_unk_mentions]['count'] = 1
        dict_of_errors_spanbert[non_unk_mentions]['error'] = err
        dict_of_errors_spanbert[non_unk_mentions]['len'] = len(words)
        
    num_mentions_spanbert.append(non_unk_mentions)
    
    err = 0
    if row['relation'] != row['prediction_spanboot']:
        err = 1
    if non_unk_mentions in dict_of_errors_spanboot:
        dict_of_errors_spanboot[non_unk_mentions]['count'] += 1
        dict_of_errors_spanboot[non_unk_mentions]['error'] += err
        dict_of_errors_spanboot[non_unk_mentions]['len'] += len(words)
    else:
        dict_of_errors_spanboot[non_unk_mentions] = {}
        dict_of_errors_spanboot[non_unk_mentions]['count'] = 1
        dict_of_errors_spanboot[non_unk_mentions]['error'] = err
        dict_of_errors_spanboot[non_unk_mentions]['len'] = len(words)
    num_mentions_spanboot.append(non_unk_mentions)


In [109]:
median_boot = statistics.median(num_mentions_spanboot)
split_by_median(dict_of_errors_spanboot, median_boot)
print(median_boot)

Counts:  11296 11335
0.07604461756373938 0.09157476841640935
35.18440155807365 35.740185266872516
0.4482758620689655


In [111]:
median_bert = statistics.median(num_mentions_spanbert)
split_by_median(dict_of_errors_spanbert, median_bert)
print(median_bert)

Counts:  11296 11335
0.08640226628895184 0.1031318923687693
35.18440155807365 35.740185266872516
0.4482758620689655


In [119]:
print("The error gap between the spanbert and spanboot model is 11.6 % higher on the slice with an above-median number of entities.")
print((0.1031318923687693-0.09157476841640935)/(0.08640226628895184-0.07604461756373938))

The error gap between the spanbert and spanboot model is 11.6 % higher on the slice with an above-median number of entities.
1.1158057450073335


In [112]:
# error_rates = {}
# for k, v in dict_of_errors.items():
#     error_rates[k] = v['error']/v['count']
# error_rates = {k: v for k, v in sorted(error_rates.items(), key=lambda item: item[0], reverse=True)}


In [113]:
# import matplotlib.pylab as plt
# plt.figure()
# lists = error_rates.items() # return a list of tuples
# x, y = zip(*lists) # unpack a list of pairs into two tuples
# plt.plot(x, y)
# plt.show()

### Now for number of relations

In [124]:
rel_errors_spanbert = {}
num_rels_spanbert = []

rel_errors_spanboot = {}
num_rels_spanboot = []

for index, row in df_results.iterrows():
    qids = ast.literal_eval(row['qids'])
    
    rels_count = 0
    total_pairs = 0
    for i in range(len(qids)):
        if qids[i] != 'UNK':
            for j in range(len(qids)):
                if i < j and qids[j] != 'UNK':
                    rels = [esp.get_relation_name(r) for r in esp.get_all_relations(qids[i], qids[j])]
#                     if len(rels) > 1:
#                         print(rels)
                    if rels:
                        rels_count += len(rels)
                    total_pairs += 1
    err = 0
    if row['relation'] != row['prediction_spanbert']:
        err = 1
    if rels_count in rel_errors_spanbert:
        rel_errors_spanbert[rels_count]['count'] += 1
        rel_errors_spanbert[rels_count]['error'] += err
        rel_errors_spanbert[rels_count]['len'] += total_pairs
    else:
        rel_errors_spanbert[rels_count] = {}
        rel_errors_spanbert[rels_count]['count'] = 1
        rel_errors_spanbert[rels_count]['error'] = err
        rel_errors_spanbert[rels_count]['len'] = total_pairs
    num_rels_spanbert.append(rels_count)
    
    err = 0
    if row['relation'] != row['prediction_spanboot']:
        err = 1
    if rels_count in rel_errors_spanboot:
        rel_errors_spanboot[rels_count]['count'] += 1
        rel_errors_spanboot[rels_count]['error'] += err
        rel_errors_spanboot[rels_count]['len'] += total_pairs
    else:
        rel_errors_spanboot[rels_count] = {}
        rel_errors_spanboot[rels_count]['count'] = 1
        rel_errors_spanboot[rels_count]['error'] = err
        rel_errors_spanboot[rels_count]['len'] = total_pairs
    num_rels_spanboot.append(rels_count)
    

In [125]:
median_boot = statistics.median(num_rels_spanboot)
split_by_median(rel_errors_spanboot, median_boot)
print(median_boot)

Counts:  11266 11365
0.06275519261494764 0.10470743510778707
32.76761938576247 70.85094588649362
1


In [126]:
median_bert = statistics.median(num_rels_spanbert)
split_by_median(rel_errors_spanbert, median_bert)
print(median_bert)

Counts:  11266 11365
0.08086277294514468 0.10857897052353717
32.76761938576247 70.85094588649362
1


### conditioning on the train-set popularity of a subj/obj

In [None]:
with open('/dfs/scratch1/simran/bootleg_downstream/error_outputs/subj_counts.json', 'r') as fp:
    train_subj_counts = json.load(fp)
    
with open('/dfs/scratch1/simran/bootleg_downstream/error_outputs/obj_counts.json', 'r') as fp:
    train_obj_counts = json.load(fp)

In [None]:
train_subj_counts = {k: v for k, v in sorted(train_subj_counts.items(), key=lambda item: item[1], reverse=True)}
print(len(train_subj_counts))

In [None]:
train_obj_counts = {k: v for k, v in sorted(train_obj_counts.items(), key=lambda item: item[1], reverse=True)}
print(len(train_obj_counts))

In [None]:
# import matplotlib.pylab as plt
# plt.figure()
# lists = subj_results_dict.items() # return a list of tuples
# x, y = zip(*lists) # unpack a list of pairs into two tuples
# plt.plot(x, y)
# plt.show()

In [None]:
subj_spanboot = {}
subj_spanbert = {}
obj_spanboot = {}
obj_spanbert = {}

from tqdm import tqdm
for ind, row in tqdm(df_results.iterrows()):
    
    subj = ' '.join(ast.literal_eval(row['subj']))
    obj = ' '.join(ast.literal_eval(row['obj']))
    
    # boot
    if subj in subj_spanboot:
        subj_spanboot[subj]['count'] = subj_spanboot[subj]['count'] + 1
    else:
        subj_spanboot[subj] = {}
        subj_spanboot[subj]['errs'] = 0
        subj_spanboot[subj]['count'] = 1
    
    if obj in obj_spanboot:
        obj_spanboot[obj]['count'] = obj_spanboot[obj]['count'] + 1
    else:
        obj_spanboot[obj] = {}
        obj_spanboot[obj]['errs'] = 0
        obj_spanboot[obj]['count'] = 1
    
    if row['relation'] != row['prediction_spanboot']:
        subj_spanboot[subj]['errs'] = subj_spanboot[subj]['errs'] + 1
        obj_spanboot[obj]['errs'] = obj_spanboot[obj]['errs'] + 1
           
            
     # bert
    if subj in subj_spanbert:
        subj_spanbert[subj]['count'] = subj_spanbert[subj]['count'] + 1
    else:
        subj_spanbert[subj] = {}
        subj_spanbert[subj]['errs'] = 0
        subj_spanbert[subj]['count'] = 1
    
    if obj in obj_spanbert:
        obj_spanbert[obj]['count'] = obj_spanbert[obj]['count'] + 1
    else:
        obj_spanbert[obj] = {}
        obj_spanbert[obj]['errs'] = 0
        obj_spanbert[obj]['count'] = 1
        
    if row['relation'] != row['prediction_spanbert']:
        subj_spanbert[subj]['errs'] = subj_spanbert[subj]['errs'] + 1   
        obj_spanbert[obj]['errs'] = obj_spanbert[obj]['errs'] + 1


In [None]:
subj_results_dict = {}
train_subj_counts_lst = []
for k, v in subj_spanboot.items():
    subj_results_dict[k] = {}
    
    if k in train_subj_counts:
        subj_results_dict[k]['train_count'] = train_subj_counts[k]
    else:
         subj_results_dict[k]['train_count'] = 0
    train_subj_counts_lst.append(subj_results_dict[k]['train_count'])
            
    subj_results_dict[k]['spanboot_errate'] = subj_spanboot[k]['errs']/subj_spanboot[k]['count']
    subj_results_dict[k]['spanbert_errate'] = subj_spanbert[k]['errs']/subj_spanbert[k]['count']


In [None]:
import statistics
mean = statistics.mean(train_subj_counts_lst)
print(mean)

In [None]:
boot_tot = 0
bert_tot = 0
tot_count = 0

for k, v in subj_results_dict.items():
    if v['train_count'] >= mean:
        boot_tot += v['spanboot_errate']
        bert_tot += v['spanbert_errate']
        tot_count += 1
        
# print(boot_tot/tot_count)
# print(bert_tot/tot_count)
gt_mean_diff = bert_tot/tot_count - boot_tot/tot_count
print(abs(gt_mean_diff)*100) 

In [None]:
boot_tot = 0
bert_tot = 0
tot_count = 0

for k, v in subj_results_dict.items():
    if v['train_count'] < mean:
        boot_tot += v['spanboot_errate']
        bert_tot += v['spanbert_errate']
        tot_count += 1
        
# print(boot_tot/tot_count)
# print(bert_tot/tot_count)
lt_mean_diff = bert_tot/tot_count - boot_tot/tot_count
print(lt_mean_diff*100) 

In [None]:
print("The difference in the (BERT-BOOT error rate) is {}x bigger below-mean train-occurrences than above-mean.".format(lt_mean_diff/gt_mean_diff))