In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os, sys
sys.path.append("/dfs/scratch0/lorr1/projects/bootleg/tutorials")
from utils import score_predictions, load_train_data
import pandas as pd
import time
import numpy as np
import cytoolz as tz
import feather
import ujson as json
from multiprocessing import  Pool
import jsonlines
from pathlib import Path
from tqdm import tqdm
from collections import defaultdict
from IPython.core.display import display, HTML, Markdown
from bootleg.symbols.entity_symbols import EntitySymbols
from bootleg.symbols.type_symbols import TypeSymbols
from bootleg.symbols.kg_symbols import KGSymbols
def printmd(string):
    """Render ``string`` as Markdown in the notebook output area."""
    rendered = Markdown(string)
    display(rendered)
# Notebook-wide setup: enable tqdm's pandas integration (used later via
# ``progress_apply``), widen the notebook container, and raise pandas'
# display limits so long cells and wide frames are shown.
tqdm.pandas()
display(HTML("<style>.container { width:90% !important; }</style>"))
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 5000)

In [3]:
def accuracy(df, crc_col="gold_qid", pred_col="pred_qid"):
    """Return the fraction of rows whose ``pred_col`` equals ``crc_col``.

    An empty frame yields 0 instead of dividing by zero.
    """
    n_rows = len(df)
    is_match = df[crc_col] == df[pred_col]
    n_hits = len(df.loc[is_match])
    if n_rows > 0:
        return n_hits / n_rows
    return 0

def get_incorrect(df, crc_col="gold_qid", pred_col="pred_qid"):
    """Return the rows of ``df`` whose ``pred_col`` differs from ``crc_col``."""
    is_wrong = df[crc_col] != df[pred_col]
    return df.loc[is_wrong]

def num_incorrect(df, crc_col="gold_qid", pred_col="pred_qid"):
    """Return how many rows of ``df`` have ``pred_col`` different from ``crc_col``."""
    is_wrong = df[crc_col] != df[pred_col]
    return len(df.loc[is_wrong])

def print_incorrect_over_total(df, crc_col="gold_qid", pred_col="pred_qid"):
    """Print ``<incorrect> / <total> = <error rate>`` for ``df``.

    Args:
        df: frame holding one row per mention.
        crc_col: name of the column with the correct (gold) label.
        pred_col: name of the column with the predicted label.

    An empty frame prints a rate of 0 (the same convention ``accuracy`` uses)
    instead of raising ``ZeroDivisionError``.
    """
    num_in = int((df[crc_col] != df[pred_col]).sum())
    total = len(df)
    # Guard the division: filtered subsets passed in here can be empty.
    rate = num_in / total if total > 0 else 0
    print(f"{num_in} / {total} = {rate}")

def errors_by_type(df, type_sys, pred_col="pred_qid", crc_col="gold_qid"):
    """Count, per type, how many mispredicted rows carry that type.

    Args:
        df: frame holding one row per mention. NOTE: a boolean ``"correct"``
            column is added to (or overwritten on) ``df`` as a side effect.
        type_sys: name of the column whose cells are iterables of types.
        pred_col: name of the column with the predicted label.
        crc_col: name of the column with the correct (gold) label; defaults
            to ``"gold_qid"`` to match the other helpers in this notebook.

    Returns:
        ``defaultdict(int)`` mapping each type to the number of incorrect rows
        that list it (a type repeated within one row is counted each time).
    """
    df["correct"] = df[crc_col] == df[pred_col]
    errors_type = defaultdict(int)
    # Select the wrong rows with a mask instead of testing `row.correct is True`
    # per row, which relies on the cell being the Python `True` singleton.
    for row_types in df.loc[~df["correct"], type_sys]:
        for t in row_types:
            errors_type[t] += 1
    return errors_type

def apply_lfs(df, lfs):
    """Apply each labeling function row-wise and report its accuracy and support.

    For every ``lf`` in ``lfs`` the result of ``lf(row)`` is stored in a new
    ``lf_<function name>`` column of ``df`` (``df`` is modified in place), and
    one summary line is printed. The same ``df`` is returned.
    """
    for lf in lfs:
        lf_name = lf.__name__
        flag_col = f"lf_{lf_name}"
        df[flag_col] = df.progress_apply(lf, axis=1)
        # Rows on which this labeling function fired.
        subset = df[df[flag_col]]
        lf_acc = accuracy(subset)
        print(f"LF: {lf_name} Acc: {lf_acc} Supp: {subset.shape[0]} Overall Acc: {accuracy(df)} Overall Size: {df.shape[0]}")
    return df

In [4]:
# Load the alias/title mappings plus the entity, type and KG symbol tables
# that the rest of the notebook relies on.
input_dir = Path('/dfs/scratch0/lorr1/projects/bootleg-data/data/korealiases_title_0122/')
# Files are opened with context managers so the handles are closed right after
# parsing instead of being left for the garbage collector.
with open(input_dir / "entity_db/entity_mappings/alias2qids.json") as in_f:
    a2q = json.load(in_f)
entity_dump = EntitySymbols(load_dir=input_dir / "entity_db/entity_mappings")
emb_dir = Path('/dfs/scratch0/lorr1/projects/bootleg-data/embs')
# Three type systems (judging by the file names: hyena, wikidata, KG relations).
types_hy = TypeSymbols(entity_dump, emb_dir, max_types=3, type_vocab_file="hyena_vocab.json", type_file="hyena_types_1229.json")
types_wd = TypeSymbols(entity_dump, emb_dir, max_types=3, type_vocab_file="wikidatatitle_to_typeid_1229.json", type_file="wikidata_types_1229.json")
types_rel = TypeSymbols(entity_dump, emb_dir, max_types=50, type_vocab_file="relation_to_typeid_1229.json", type_file="kg_relation_types_1229.json")
kg_syms = KGSymbols(entity_dump, emb_dir, "kg_adj_1229.txt")
with open(input_dir / "entity_db/entity_mappings/qid2title.json") as in_f:
    q2title = json.load(in_f)
# Inverse lookup; if several QIDs share a title, only the last one seen is kept.
title2q = {v:k for k,v in q2title.items()}

Loading types from /dfs/scratch0/lorr1/projects/bootleg-data/embs/hyena_types_1229.json


Reading /dfs/scratch0/lorr1/projects/bootleg-data/embs/hyena_types_1229.json: 100%|██████████| 5832699/5832699 [00:11<00:00, 525424.32it/s]


Loading types from /dfs/scratch0/lorr1/projects/bootleg-data/embs/wikidata_types_1229.json


Reading /dfs/scratch0/lorr1/projects/bootleg-data/embs/wikidata_types_1229.json: 100%|██████████| 5832699/5832699 [00:10<00:00, 546280.42it/s]


Loading types from /dfs/scratch0/lorr1/projects/bootleg-data/embs/kg_relation_types_1229.json


Reading /dfs/scratch0/lorr1/projects/bootleg-data/embs/kg_relation_types_1229.json: 100%|██████████| 5832699/5832699 [00:11<00:00, 491232.69it/s]


Loading kg adj from /dfs/scratch0/lorr1/projects/bootleg-data/embs/kg_adj_1229.txt


100%|██████████| 29745502/29745502 [00:54<00:00, 542332.55it/s]


In [5]:
# Count how often each QID, and each (alias, QID) pair, occurs in the training
# data, then save both tables as JSON next to the data.
qid2cnt = defaultdict(int)
alias2qid2count = defaultdict(lambda: defaultdict(int))
with jsonlines.open(input_dir / "train.jsonl") as in_f:
    for line in in_f:
        for al, qid in zip(line["aliases"], line["qids"]):
            qid2cnt[qid] += 1
            alias2qid2count[al][qid] += 1
# Freeze the outer mappings into plain dicts so later lookups of unseen keys
# do not silently insert entries (inner per-alias counters are left as they are).
qid2cnt = dict(qid2cnt)
alias2qid2count = dict(alias2qid2count)
# `json` is ujson here; its writer is `dump` -- there is no `json.save`, so the
# original call raised AttributeError before anything was written.
with open(input_dir / "train_qidcnt.json", "w") as out_f:
    json.dump(qid2cnt, out_f)
with open(input_dir / "train_alias2qidcnt.json", "w") as out_f:
    json.dump(alias2qid2count, out_f)

In [6]:
# Columns shown, in this order, when displaying rows for error analysis.
columns = [
    "sentence",
    "sent_idx",
    "is_gold_label",
    "alias",
    "span",
    "gold_title",
    "gold_qid",
    "pred_title",
    "in_cand",
    "pred_qid",
    "qid_cnt",
    "pred_qid_cnt",
    "aliases",
    "wikidata_types_1229_gld",
    "wikidata_types_1229_pred",
    "cand_names",
    "cand_probs",
]

In [7]:
# Read in train data
# Load training data and split
# Reuse the cached training frame when it exists; otherwise build it from
# train.jsonl, add the whitespace-tokenised sentence, and cache it.
train_cache_file = "saved_data/kore_train_df.feather"
if os.path.exists(train_cache_file):
    train_df = feather.read_dataframe(train_cache_file)
else:
    train_df =  load_train_data(input_dir / "train.jsonl", q2title, a2q, type_symbols=[types_wd, types_hy], kg_symbols=[kg_syms])
    train_df["sentence_split"] = train_df["sentence"].apply(lambda x: x.split())
    # Make sure the cache directory exists so the (slow) load above is not
    # thrown away by a failed write on a fresh checkout.
    os.makedirs(os.path.dirname(train_cache_file), exist_ok=True)
    feather.write_dataframe(train_df, train_cache_file)

100%|██████████| 637185/637185 [03:28<00:00, 3050.71it/s] 


In [None]:
def _score_test_predictions(labels_path):
    """Score one prediction file against test.jsonl with the shared maps and symbols."""
    return score_predictions(orig_file=input_dir / 'test.jsonl',
                             pred_file=labels_path,
                             title_map=q2title,
                             cands_map=a2q,
                             type_symbols=[types_hy, types_wd, types_rel],
                             kg_symbols=[kg_syms])


# Scored prediction frames, keyed by a short model name.
mods = {}

pred_file = '/dfs/scratch1/lorr1/projects/bootleg/logs_guid/base/2021_01_23/23_45_31/2f2b98c2/test/last_model/bootleg_labels.jsonl'
boot_df = _score_test_predictions(pred_file)
mods["boot"] = boot_df

pred_file2 = '/dfs/scratch1/lorr1/projects/bootleg/logs_guid/ent_only/2021_01_23/23_48_06/75fdf972/test/last_model/bootleg_labels.jsonl'
boot_df_ent = _score_test_predictions(pred_file2)
mods["ent"] = boot_df_ent

pred_file = '/dfs/scratch1/lorr1/projects/bootleg/logs_guid/base_wl_socc_air/2021_01_30/16_39_08/ed7aec6f/test/last_model/bootleg_labels.jsonl'
guid_df = _score_test_predictions(pred_file)
mods["boot_g"] = guid_df

pred_file2 = '/dfs/scratch1/lorr1/projects/bootleg/logs_guid/ent_only_wl_socc_air/2021_01_30/16_48_10/b51ac111/test/checkpoint_2.0/bootleg_labels.jsonl'
guid_df_ent = _score_test_predictions(pred_file2)
mods["ent_g"] = guid_df_ent

In [21]:
# Add the derived analysis columns to every model's prediction frame.
for k, df in mods.items():
    if "cands" in df:
        # Each "cands" cell is unpacked by position: element 0 of every entry
        # goes to cand_names and element 1 to cand_probs. The raw column is
        # dropped afterwards, so on a re-run this branch is skipped.
        df["num_cands"] = df["cands"].apply(len)
        df["cand_names"] = df["cands"].apply(lambda x: [y[0] for y in x])
        df["cand_probs"] = df["cands"].apply(lambda x: [y[1] for y in x])
        del df["cands"]
    df["sentence_split"] = df["sentence"].apply(lambda x: x.split())
    df["span"] = df["span"].apply(lambda x: tuple(x))
    df["in_cand"] = df.apply(lambda x: x["gold_title"] in x["cand_names"], axis=1)
    # Training-set frequency of the gold / predicted QID (0 when unseen).
    df["qid_cnt"] = df["gold_qid"].apply(lambda x: qid2cnt.get(x, 0))
    # Computed once; the original repeated this exact line, doing a second
    # identical pass over the frame.
    df["pred_qid_cnt"] = df["pred_qid"].apply(lambda x: qid2cnt.get(x, 0))
    mods[k] = df

In [33]:
# Testing out LFs

def is_team_pd(row):
    """Return True if the gold title, or else the predicted title, looks like a national team.

    A title qualifies when its lowercase form contains both "national" and
    "team" and none of the excluded substrings. ``row["pred_title"]`` is only
    consulted when the gold title does not qualify.
    """
    required = ("national", "team")
    excluded = ("competition", "season", "cup", "national team nomenclature", "teamsters")

    def looks_like_national_team(title):
        lowered = title.lower()
        if not all(word in lowered for word in required):
            return False
        return not any(word in lowered for word in excluded)

    if looks_like_national_team(row["gold_title"]):
        return True
    return looks_like_national_team(row["pred_title"])

def is_soccer_team(row):
    """Return True if the gold title looks like a national football team.

    The lowercase title must contain "national", "football" and "team", and
    none of the excluded substrings listed below.
    """
    lowered = row["gold_title"].lower()
    required = ("national", "football", "team")
    excluded = (
        "competition",
        "season",
        "cup",
        "national team nomenclature",
        "teamsters",
    )
    for word in required:
        if word not in lowered:
            return False
    for word in excluded:
        if word in lowered:
            return False
    return True

def around_punc(row):
    """Return True if a quote token (`` or '') sits within two tokens of the mention.

    Looks at the two tokens immediately before ``span[0]`` and the two tokens
    starting at ``span[1]`` in ``row["sentence_split"]``.
    """
    sent_split = row["sentence_split"]
    span_l, span_r = row["span"]
    punc = {"``", "''"}
    # Clamp the left edge at 0: with span_l == 1 the unclamped start is -1,
    # which Python counts from the end of the sentence, giving an empty window
    # so the token at index 0 was never inspected.
    left_ctx = sent_split[max(span_l - 2, 0):span_l]
    right_ctx = sent_split[span_r:span_r + 2]
    return not punc.isdisjoint(left_ctx) or not punc.isdisjoint(right_ctx)

def university_ty(row, key="wikidata_types_1229_gld"):
    """Return True if any type in ``row[key]`` mentions an educational institution.

    A type counts when it contains one of the substrings below and is not one
    of the explicitly excluded types.
    """
    typs = {"university", "educational institution", "college", "school"}
    bad_typs = {"college athletic conference"}
    return any(
        any(tp in t for tp in typs) and t not in bad_typs
        for t in row[key]
    )

def airport_ty(row, key="wikidata_types_1229_gld"):
    """Return True if any type in ``row[key]`` contains the substring "airport"."""
    typs = {"airport"}
    bad_typs = {}  # no excluded types for this predicate
    return any(
        any(tp in t for tp in typs) and t not in bad_typs
        for t in row[key]
    )

def location_ty(row, key="wikidata_types_1229_gld"):
    """Return True if any type in ``row[key]`` contains "city" or "country"."""
    typs = {"city", "country"}
    bad_typs = {}  # no excluded types for this predicate
    return any(
        any(tp in t for tp in typs) and t not in bad_typs
        for t in row[key]
    )

def album_title(row):
    """Return True if the lowercase gold title contains "album" or "song"."""
    lowered = row["gold_title"].lower()
    return any(marker in lowered for marker in ("album", "song"))

def title_pren_in_sent(row):
    """Return True if the gold title's first parenthetical appears as a sentence token.

    The text between the first "(" and the following ")" is lowercased and
    compared against the whitespace-split, lowercased sentence, so a
    parenthetical containing a space can never match.
    """
    gold_title = row["gold_title"]
    if "(" not in gold_title:
        return False
    after_open = gold_title.split("(")[1]
    in_paren = after_open.split(")")[0]
    sent_tokens = row["sentence"].lower().split()
    return in_paren.lower() in sent_tokens

def title_in_sent(row):
    """Return True if a multi-word gold title equals the mention text, ignoring case.

    The mention is the slice ``span[0]:span[1]`` of the whitespace-split,
    lowercased sentence. Single-word titles always give False.
    """
    span_l, span_r = row["span"]
    gold_title = row["gold_title"]
    if len(gold_title.split()) <= 1:
        return False
    mention = " ".join(row["sentence"].lower().split()[span_l:span_r])
    return gold_title.lower() == mention

# Lowercase phrases (one to three tokens long) that, when found immediately
# before a mention, suggest the mention is a university.
# "university of" was misspelled "univeristy of" and so could not match.
univ_keywords = {"studied at", "studied at the", "studies at", "studies at the", "educated at", "educated at the",
                 "graduated from", "graduated from the", "department at", "department at the", "degree from", "degree from the",
                 "attended", "attended the", "professor at", "professor at the", "taught at", "taught at the", "university of"}
def univ_keys(row):
    """Return True if the tokens right before the mention end with a ``univ_keywords`` phrase.

    Up to three tokens preceding ``span[0]`` in ``row["sentence_split"]`` are
    inspected; each keyword is compared with as many trailing tokens as it has
    words. The comparison is case-sensitive.
    NOTE(review): keywords are lowercase but the tokens are not lowercased
    here, so capitalised text such as "Attended" does not match -- confirm
    whether that is intended.
    """
    span_l, span_r = row["span"]
    # Clamp the left edge at 0: for mentions starting at token 1 or 2 the
    # unclamped start is negative, which counts from the end of the sentence
    # and made the window empty.
    sent_left = row["sentence_split"][max(span_l - 3, 0):span_l]
    for univ_k in univ_keywords:
        if univ_k == " ".join(sent_left[-len(univ_k.split()):]):
            return True
    return False
            
        
def temp_pred(row):
    """Scratch predicate under test; currently delegates to ``is_soccer_team``.

    Swap the delegated call to try a different labeling function in the
    inspection cells that apply ``temp_pred`` row by row.
    """
    verdict = is_soccer_team(row)
    return verdict

In [36]:
# Scratch check of the labeling function wired into temp_pred, on mods["ent"]:
# flag every row, then show the mispredicted rows the labeling function fires on.
temp = mods["ent"]
# NOTE: temp is the object returned by mods["ent"], so the "pred" column is
# presumably added to that stored frame as well.
temp["pred"] = temp.apply(temp_pred, axis=1)
error_sh = len(temp[temp["pred_qid"] != temp["gold_qid"]])
pred_sh = len(temp[temp["pred"]])
to_dis = temp[(temp["pred_qid"] != temp["gold_qid"]) & (temp["pred"])]
dis_sh = len(to_dis)
# Printed: total rows, errors, errors flagged by the LF,
# share of errors flagged, share of LF hits that are errors.
print(temp.shape[0], error_sh, dis_sh, dis_sh/error_sh, dis_sh/pred_sh)
display(to_dis[columns])

285931 56784 253 0.004455480417018878 0.163860103626943


Unnamed: 0,sentence,sent_idx,is_gold_label,alias,span,gold_title,gold_qid,pred_title,in_cand,pred_qid,qid_cnt,pred_qid_cnt,aliases,wikidata_types_1229_gld,wikidata_types_1229_pred,cand_names,cand_probs
840,"In their third and final game , Russia defeated Greece 2-1 .",758228,True,greece,"(9, 10)",Greece national football team,Q134925,Greece men's national basketball team,True,Q331640,864,296,[greece],[national association football team],[national sports team],"[Greece, Greek mythology, Ancient Greece, Super League Greece, Greek Basket League, Greek War of Independence, Greece national football team, Football League Greece, Macedonia (Greece), Greek Civil War, Kingdom of Greece, Battle of Greece, Hellenic Navy, Greco-Italian War, Greek junta, Classical Greece, Axis occupation of Greece, Greece men's national basketball team, Church of Greece, Greek government-debt crisis, Hellenic Air Force, Greek cuisine, Ottoman Greece, Greece national under-21 f...","[0.0321845636, 0.0001899839, 3.82287e-05, 0.0226366799, 0.0001285177, 0.0001108148, 0.2419474572, 0.000299731, 6.55627e-05, 6.9603e-06, 2.85573e-05, 3.68239e-05, 0.000110786, 4.26064e-05, 1.95967e-05, 2.67994e-05, 1.59901e-05, 0.5141977072, 1.4953e-05, 0.0001248154, 0.0009965369, 4.45936e-05, 2.06049e-05, 0.1864584237, 2.69132e-05, 3.04932e-05, 9.57646e-05, 2.37304e-05, 1.44238e-05, 6.14033e-05]"
8975,Australia met Brazil in their second Group F game in Munich on 18 June .,781468,True,brazil,"(2, 3)",Brazil national football team,Q83459,Brazil women's national football team,True,Q757014,85,28,"[brazil, munich]",[national association football team],[women's national association football team],"[Brazil, Brazil national football team, Campeonato Brasileiro Série A, Brazilians, Brazil women's national football team, Empire of Brazil, Brazilian Navy, Brazilian Football Confederation, Military dictatorship in Brazil, Colonial Brazil, Football in Brazil, Brazil national under-20 football team, Brazil men's national basketball team, Constitution of Brazil, Music of Brazil, Brazil (1985 film), Brazil national under-23 football team, Brazil Squadron, Dutch Brazil, Brazil women's national v...","[0.073721014, 0.3629779518, 0.0003633957, 0.0001824134, 0.4986165762, 0.0002468404, 0.0004006325, 0.000205679, 0.0002003806, 0.0002112051, 0.0001742689, 0.0072335959, 0.0528873466, 8.61271e-05, 0.0002160669, 0.000265742, 9.70845e-05, 0.0001847276, 0.0001786653, 9.41582e-05, 0.0002847928, 0.0001456073, 0.0001236833, 0.0001606768, 8.91585e-05, 0.0001358843, 9.21316e-05, 0.000160872, 0.000156909, 0.0001064859]"
9315,"Nikos Tsiantakis 's appearance at the 1994 World Cup rounded off Nikos Tsiantakis 's national team career , which lasted from 1988 to 1994 , giving Nikos Tsiantakis 47 caps and 2 international goals .",782280,True,national team,"(14, 16)",Greece national football team,Q134925,Ireland national rugby union team,False,Q599903,864,261,"[tsiantakis, 1994 world cup, tsiantakis, national team, tsiantakis]",[national association football team],[national rugby union team],"[England national football team, Scotland national football team, New Zealand national rugby union team, Germany national football team, Italy national football team, Brazil national football team, United States men's national soccer team, Australia national cricket team, France national football team, Spain national football team, Argentina national football team, India national cricket team, Netherlands national football team, Wales national rugby union team, Republic of Ireland national f...","[0.0083726207, 0.0295856763, 0.0404866375, 0.0131809795, 0.007699484, 0.0250097644, 0.0552609377, 0.0012091803, 0.0258169509, 0.050889682, 0.018396182, 0.0560190268, 0.0164397955, 0.0379931815, 0.0111297704, 0.0826239362, 0.0158744939, 0.0154920388, 0.0319024324, 0.0364510231, 0.044805266, 0.0823890641, 0.0454786085, 0.0415028743, 0.0186212156, 0.0411132686, 0.0138756689, 0.018677827, 0.0130335297, 0.100668788]"
14428,"During the initial stages of Björn 's career , Kuipers refereed the 2006 UEFA European Under-17 Championship Final between Czech Republic and Russia in Luxembourg on 14 May 2006 and also the 2009 UEFA European Under-21 Championship Final between Germany and England in Sweden on 29 June 2009 .",796370,True,czech republic,"(19, 21)",Czech Republic national under-17 football team,Q3590346,Czech Republic national football team,True,Q483868,0,34,"[kuipers, kuipers, czech republic, russia, germany, england]",[national association football team],[national association football team],"[Czech Republic, Czech Extraliga, International Federation of the Phonographic Industry, Czech Republic national football team, Czech Republic men's national ice hockey team, Czech Republic national under-21 football team, Czech Republic women's national football team, Czech Republic national under-19 football team, National Basketball League (Czech Republic), Czech Republic men's national basketball team, Czech Republic Fed Cup team, Czech Republic women's national volleyball team, Czech cu...","[0.0092526423, 0.0001224372, 0.0062992978, 0.5683862567, 2.96349e-05, 0.4100238979, 0.0032897091, 0.0001240776, 1.87112e-05, 0.0001239332, 0.0001504653, 0.0001194903, 0.0001422774, 0.0001240724, 0.0001256564, 2.37694e-05, 0.0001257397, 0.0001229677, 0.0001289596, 0.0001273689, 1.61508e-05, 0.0001244609, 0.0001405966, 0.000116109, 0.0001236163, 0.0001280744, 0.0001221878, 0.0001222993, 0.0001260686, 0.0001190431]"
14429,"During the initial stages of Björn 's career , Kuipers refereed the 2006 UEFA European Under-17 Championship Final between Czech Republic and Russia in Luxembourg on 14 May 2006 and also the 2009 UEFA European Under-21 Championship Final between Germany and England in Sweden on 29 June 2009 .",796370,True,russia,"(22, 23)",Russia national under-17 football team,Q2054093,Russia national football team,False,Q726080,0,46,"[kuipers, kuipers, czech republic, russia, germany, england]",[national association football team],[national association football team],"[Russia, Soviet Union, Russian Empire, Russian language, Russians, Russian Soviet Federative Socialist Republic, Eastern Front (World War II), October Revolution, Russian Revolution, Russian Orthodox Church, 2018 FIFA World Cup, Russian Premier League, Russo-Japanese War, Russian Far East, Kievan Rus', Grand Duchy of Moscow, Russia national football team, Tsardom of Russia, French invasion of Russia, Eastern Front (World War I), Russian Navy, History of the Jews in Russia, House of Romanov, ...","[0.0652524456, 0.0028788133, 0.007022508, 0.0012403131, 0.0021393392, 0.0008865555, 0.0028360728, 0.0004036001, 0.0007091859, 0.0002885257, 0.0880803466, 0.0005675798, 0.0013077811, 0.000673459, 0.0005345064, 0.0002037979, 0.7354440093, 0.0003134633, 0.0004575374, 0.0003354663, 0.0002231573, 0.0003232238, 0.0004399779, 0.0003105888, 0.0002065456, 0.0002937042, 0.0002376389, 0.066259861, 0.0001483842, 0.0199816395]"
14430,"During the initial stages of Björn 's career , Kuipers refereed the 2006 UEFA European Under-17 Championship Final between Czech Republic and Russia in Luxembourg on 14 May 2006 and also the 2009 UEFA European Under-21 Championship Final between Germany and England in Sweden on 29 June 2009 .",796370,True,germany,"(39, 40)",Germany national under-21 football team,Q314851,Germany national football team,True,Q43310,17,194,"[kuipers, kuipers, czech republic, russia, germany, england]","[national association football team, Germany national youth football team]",[national association football team],"[Germany, Nazi Germany, East Germany, Germans, West Germany, German Empire, Holy Roman Empire, Bundesliga, Kingdom of Prussia, Germany national football team, Weimar Republic, 2006 FIFA World Cup, German Americans, GfK Entertainment charts, Allied-occupied Germany, Basketball Bundesliga, Football in Germany, History of the Jews in Germany, German Confederation, German Football Association, German Revolution of 1918–1919, Cinema of Germany, German Navy, Germany women's national football team,...","[0.0191035941, 0.0021578483, 0.0001935307, 0.0019490938, 0.0003453471, 0.0008307944, 8.2914e-05, 0.0029862113, 0.0002213697, 0.8842013478, 0.0001104208, 0.0186769925, 0.0001302877, 3.86071e-05, 9.65651e-05, 8.63319e-05, 0.0047752755, 0.0001144264, 7.72474e-05, 0.0065707397, 7.69143e-05, 1.38116e-05, 4.06329e-05, 0.0233368035, 0.0001102906, 8.84268e-05, 4.85514e-05, 2.9489e-05, 2.69776e-05, 0.0334790312]"
14431,"During the initial stages of Björn 's career , Kuipers refereed the 2006 UEFA European Under-17 Championship Final between Czech Republic and Russia in Luxembourg on 14 May 2006 and also the 2009 UEFA European Under-21 Championship Final between Germany and England in Sweden on 29 June 2009 .",796370,True,england,"(41, 42)",England national under-21 football team,Q204238,England national football team,True,Q47762,816,4499,"[kuipers, kuipers, czech republic, russia, germany, england]",[national association football team],[national association football team],"[United Kingdom, England, Great Britain, Premier League, Church of England, English people, British Empire, England national football team, James VI and I, Marylebone Cricket Club, United Kingdom of Great Britain and Ireland, Parliament of England, England cricket team, Kingdom of Great Britain, James II of England, England national rugby union team, Edward I of England, Kingdom of England, Premiership Rugby, The Football Association, England national rugby league team, Henry VII of England,...","[0.0003168454, 0.015395944, 5.83741e-05, 0.0001012913, 4.86841e-05, 2.3223e-05, 5.27191e-05, 0.9722917676, 2.9704e-06, 0.0001290851, 7.94819e-05, 2.17759e-05, 0.000116759, 2.13982e-05, 1.7137e-06, 0.0007665108, 3.4668e-06, 5.61058e-05, 0.0002273222, 0.0001306297, 0.0021931008, 8.7808e-06, 5.99584e-05, 6.33113e-05, 2.72888e-05, 0.0077509703, 2.30553e-05, 4.4147e-06, 1.05889e-05, 1.24303e-05]"
16087,Quincy played Quincy 's last game for the Dutch on 14 November 2006 in a 1–0 friendly loss to England in Alkmaar .,3733,True,england,"(19, 20)",England national under-21 football team,Q204238,England national football team,True,Q47762,816,4499,"[quincy, quincy, england, alkmaar]",[national association football team],[national association football team],"[United Kingdom, England, Great Britain, Premier League, Church of England, English people, British Empire, England national football team, James VI and I, Marylebone Cricket Club, United Kingdom of Great Britain and Ireland, Parliament of England, England cricket team, Kingdom of Great Britain, James II of England, England national rugby union team, Edward I of England, Kingdom of England, Premiership Rugby, The Football Association, England national rugby league team, Henry VII of England,...","[0.0006210138, 0.000993344, 0.0001324702, 3.58828e-05, 1.35068e-05, 6.6628e-06, 2.84198e-05, 0.9526507854, 1.1471e-06, 5.58087e-05, 3.67338e-05, 6.8733e-06, 0.0001254397, 6.7492e-06, 6.932e-07, 0.0013441764, 1.1974e-06, 9.04708e-05, 0.0009361499, 5.19927e-05, 0.0056552072, 8.8068e-06, 2.1496e-05, 2.07219e-05, 1.62278e-05, 0.0371224508, 6.4263e-06, 1.2532e-06, 3.9161e-06, 3.9945e-06]"
17154,"An England v Wales match was played at the ground in 1907 , followed by a rugby league international between England and Australia in 1911 .",6967,True,england,"(1, 2)",England national football team,Q47762,England national rugby league team,True,Q3589698,4499,1436,"[england, wales, england, australia]",[national association football team],"[national sports team, rugby league team]","[United Kingdom, England, Great Britain, Premier League, Church of England, English people, British Empire, England national football team, James VI and I, Marylebone Cricket Club, United Kingdom of Great Britain and Ireland, Parliament of England, England cricket team, Kingdom of Great Britain, James II of England, England national rugby union team, Edward I of England, Kingdom of England, Premiership Rugby, The Football Association, England national rugby league team, Henry VII of England,...","[3.1771e-06, 0.0010257283, 6.771e-07, 1.2775e-06, 5.502e-07, 4.576e-07, 4.971e-07, 8.28674e-05, 6.15e-08, 1.469e-06, 9.642e-07, 3.155e-07, 0.0003018471, 3.215e-07, 2.53e-08, 0.3806052208, 9.15e-08, 7.2364e-06, 4.7945e-06, 1.5372e-06, 0.6179069877, 3.429e-07, 5.938e-07, 6.67e-07, 6.602e-07, 4.85868e-05, 4.547e-07, 8.425e-07, 1.4038e-06, 3.094e-07]"
17155,"An England v Wales match was played at the ground in 1907 , followed by a rugby league international between England and Australia in 1911 .",6967,True,wales,"(3, 4)",Wales national football team,Q180857,Wales national rugby union team,True,Q822877,263,279,"[england, wales, england, australia]",[national association football team],[national rugby union team],"[Wales, Welsh people, Wales national football team, Charles, Prince of Wales, Wales national rugby union team, Wales national rugby league team, Senedd Cymru – Welsh Parliament, Welsh Government, University of Wales, Welsh Rugby Union, Church in Wales, BBC Cymru Wales, Henry Frederick, Prince of Wales, Wales national under-21 football team, Football Association of Wales, Frederick, Prince of Wales, Jimmy Wales, Historic counties of Wales, Wales women's national football team, Wales national ...","[0.0285712965, 0.0001857568, 0.2139480114, 0.0002317339, 0.6542015672, 0.0701854154, 0.0001610095, 8.23442e-05, 0.000294405, 0.0054920018, 4.06296e-05, 0.0001422702, 9.53413e-05, 0.0131824315, 0.0107212169, 0.0001517415, 0.0003217163, 0.0001246046, 0.0001465816, 0.0002036266, 0.0001157679, 0.0001263692, 0.0001928959, 0.0002261615, 0.0001737521, 0.000153893, 0.0001311529, 0.0001168996, 0.0001455525, 0.0001338047]"


In [37]:
# Scratch check of the labeling function wired into temp_pred, on mods["ent_g"]:
# flag every row, then show the mispredicted rows the labeling function fires on.
temp = mods["ent_g"]
# NOTE: temp is the object returned by mods["ent_g"], so the "pred" column is
# presumably added to that stored frame as well.
temp["pred"] = temp.apply(temp_pred, axis=1)
error_sh = len(temp[temp["pred_qid"] != temp["gold_qid"]])
pred_sh = len(temp[temp["pred"]])
to_dis = temp[(temp["pred_qid"] != temp["gold_qid"]) & (temp["pred"])]
dis_sh = len(to_dis)
# Printed: total rows, errors, errors flagged by the LF,
# share of errors flagged, share of LF hits that are errors.
print(temp.shape[0], error_sh, dis_sh, dis_sh/error_sh, dis_sh/pred_sh)
display(to_dis[columns])

285931 54871 322 0.005868309307284358 0.20854922279792745


Unnamed: 0,sentence,sent_idx,is_gold_label,alias,span,gold_title,gold_qid,pred_title,in_cand,pred_qid,qid_cnt,pred_qid_cnt,aliases,wikidata_types_1229_gld,wikidata_types_1229_pred,cand_names,cand_probs
840,"In their third and final game , Russia defeated Greece 2-1 .",758228,True,greece,"(9, 10)",Greece national football team,Q134925,Greece men's national basketball team,True,Q331640,864,296,[greece],[national association football team],[national sports team],"[Greece, Greek mythology, Ancient Greece, Super League Greece, Greek Basket League, Greek War of Independence, Greece national football team, Football League Greece, Macedonia (Greece), Greek Civil War, Kingdom of Greece, Battle of Greece, Hellenic Navy, Greco-Italian War, Greek junta, Classical Greece, Axis occupation of Greece, Greece men's national basketball team, Church of Greece, Greek government-debt crisis, Hellenic Air Force, Greek cuisine, Ottoman Greece, Greece national under-21 f...","[0.0657375529, 1.19382e-05, 5.51036e-05, 0.0014547963, 3.35592e-05, 8.5178e-06, 0.1008318439, 0.0001978761, 1.8055e-06, 3.987e-07, 3.39394e-05, 2.46667e-05, 8.457e-07, 1.2992e-06, 1.1934e-06, 3.30608e-05, 8.04e-07, 0.8013042212, 5.886e-07, 0.0013011417, 0.0009726551, 0.0005507836, 2.2396e-06, 0.0271249656, 1.04137e-05, 1.2586e-05, 1.70121e-05, 9.7102e-06, 1.7038e-06, 0.0002628242]"
9315,"Nikos Tsiantakis 's appearance at the 1994 World Cup rounded off Nikos Tsiantakis 's national team career , which lasted from 1988 to 1994 , giving Nikos Tsiantakis 47 caps and 2 international goals .",782280,True,national team,"(14, 16)",Greece national football team,Q134925,Portugal national football team,False,Q267245,864,95,"[tsiantakis, 1994 world cup, tsiantakis, national team, tsiantakis]",[national association football team],[national association football team],"[England national football team, Scotland national football team, New Zealand national rugby union team, Germany national football team, Italy national football team, Brazil national football team, United States men's national soccer team, Australia national cricket team, France national football team, Spain national football team, Argentina national football team, India national cricket team, Netherlands national football team, Wales national rugby union team, Republic of Ireland national f...","[0.0017670112, 0.0277295783, 1.67758e-05, 0.0584468879, 0.0496840924, 0.0485377014, 0.0777179226, 6.84148e-05, 0.0577830411, 0.0688135624, 0.0427530743, 0.0011527675, 0.0452967659, 0.0002409038, 0.0250730496, 0.008482811, 0.0484714583, 0.0560531244, 0.0829693824, 0.0654109865, 0.0119680213, 0.0077812071, 0.0085020214, 4.09329e-05, 0.0166664515, 0.0521972515, 0.0048073404, 0.0704647824, 0.0580746271, 0.0030281374]"
10170,"Geoff made Geoff 's debut for Canada in a March 1991 North American Championship match against Mexico , but since this game was not regarded official Geoff won Geoff 's first senior cap coming on as a sub in an April 1992 friendly match against China .",784592,True,mexico,"(16, 17)",Mexico national football team,Q164089,Mexico women's national football team,True,Q599923,34,6,"[aunger, aunger, canada, mexico, aunger, aunger, china]",[national association football team],[women's national association football team],"[Mexico, Mexico City, New Spain, Mexico national football team, Liga MX, Lucha libre, Mexican League, State of Mexico, Mexican cuisine, Second French intervention in Mexico, Mexican Navy, Mexico women's national football team, Mexicana Universal, Mexican Football Federation, First Mexican Empire, Autódromo Hermanos Rodríguez, Second Mexican Empire, Cinema of Mexico, Mexico national baseball team, COVID-19 pandemic in Mexico, Tigres de Quintana Roo, Roman Catholic Archdiocese of Mexico, Mexic...","[0.0011361684, 1.68692e-05, 4.448e-07, 0.2481831014, 3.295e-06, 0.0006355531, 1.04e-07, 1.0072e-06, 3.20117e-05, 5.28e-08, 6.448e-07, 0.3738825023, 1.097e-07, 9.99e-08, 5.11e-08, 4.252e-07, 5.694e-07, 1.8408e-06, 5.15e-08, 6.48e-08, 1.36e-07, 5.11e-08, 0.0007378158, 0.251003772, 0.0596038252, 4.41e-08, 5.07e-08, 3.005e-07, 2.4288e-06, 0.0647566319]"
12047,"In 1954 Cox played Cox 's last match for Scotland ; Cox captained the team at Hampden in a 4–2 defeat by England in front of 134,544 spectators .",789883,True,england,"(22, 23)",England national football team,Q47762,England national rugby union team,True,Q378628,4499,1964,"[samuel richmond cox, samuel richmond cox, samuel richmond cox, england]",[national association football team],[national rugby union team],"[United Kingdom, England, Great Britain, Premier League, Church of England, English people, British Empire, England national football team, James VI and I, Marylebone Cricket Club, United Kingdom of Great Britain and Ireland, Parliament of England, England cricket team, Kingdom of Great Britain, James II of England, England national rugby union team, Edward I of England, Kingdom of England, Premiership Rugby, The Football Association, England national rugby league team, Henry VII of England,...","[0.0001176179, 0.0130130984, 1.35568e-05, 2.27247e-05, 1.746e-07, 4.4508e-06, 5.0663e-06, 0.1812147647, 1.076e-07, 7.89939e-05, 5.6296e-06, 8.393e-07, 0.0737665817, 3.95e-07, 1.821e-07, 0.7030943036, 5.804e-07, 0.000188959, 2.87201e-05, 5.1311e-05, 0.0261645354, 0.0001288665, 2.6415e-06, 5.35478e-05, 6.7013e-05, 0.00179876, 0.0001496489, 3.7882e-06, 9.2425e-06, 1.38241e-05]"
14428,"During the initial stages of Björn 's career , Kuipers refereed the 2006 UEFA European Under-17 Championship Final between Czech Republic and Russia in Luxembourg on 14 May 2006 and also the 2009 UEFA European Under-21 Championship Final between Germany and England in Sweden on 29 June 2009 .",796370,True,czech republic,"(19, 21)",Czech Republic national under-17 football team,Q3590346,Czech Republic national football team,True,Q483868,0,34,"[kuipers, kuipers, czech republic, russia, germany, england]",[national association football team],[national association football team],"[Czech Republic, Czech Extraliga, International Federation of the Phonographic Industry, Czech Republic national football team, Czech Republic men's national ice hockey team, Czech Republic national under-21 football team, Czech Republic women's national football team, Czech Republic national under-19 football team, National Basketball League (Czech Republic), Czech Republic men's national basketball team, Czech Republic Fed Cup team, Czech Republic women's national volleyball team, Czech cu...","[0.0959945917, 1.303e-07, 5.11727e-05, 0.8036256433, 5.4594e-06, 0.0612425543, 0.0014318177, 0.0176457297, 5.376e-07, 1.267e-07, 1.1016e-06, 1.305e-07, 1.1014e-05, 1.269e-07, 1.284e-07, 1.4464e-06, 1.305e-07, 1.272e-07, 1.289e-07, 0.0066135521, 2.928e-07, 2.7962e-06, 1.0657e-06, 1.813e-07, 1.282e-07, 1.314e-07, 1.326e-07, 1.205e-07, 1.299e-07, 0.0133693274]"
14429,"During the initial stages of Björn 's career , Kuipers refereed the 2006 UEFA European Under-17 Championship Final between Czech Republic and Russia in Luxembourg on 14 May 2006 and also the 2009 UEFA European Under-21 Championship Final between Germany and England in Sweden on 29 June 2009 .",796370,True,russia,"(22, 23)",Russia national under-17 football team,Q2054093,Russia national football team,False,Q726080,0,46,"[kuipers, kuipers, czech republic, russia, germany, england]",[national association football team],[national association football team],"[Russia, Soviet Union, Russian Empire, Russian language, Russians, Russian Soviet Federative Socialist Republic, Eastern Front (World War II), October Revolution, Russian Revolution, Russian Orthodox Church, 2018 FIFA World Cup, Russian Premier League, Russo-Japanese War, Russian Far East, Kievan Rus', Grand Duchy of Moscow, Russia national football team, Tsardom of Russia, French invasion of Russia, Eastern Front (World War I), Russian Navy, History of the Jews in Russia, House of Romanov, ...","[0.0667539164, 0.0002734284, 0.0019230765, 3.97051e-05, 0.0004912685, 2.7975e-06, 4.8375e-06, 6.87e-07, 2.1022e-06, 1.01014e-05, 0.0252123605, 7.30182e-05, 1.88723e-05, 3.7612e-06, 2.71301e-05, 2.8345e-06, 0.8434243202, 1.88824e-05, 9.418e-07, 3.776e-07, 1.4865e-06, 6.092e-07, 4.999e-07, 2.96157e-05, 8.6519e-06, 9.891e-07, 1.656e-07, 0.0613581017, 6.86e-07, 0.0003147958]"
14430,"During the initial stages of Björn 's career , Kuipers refereed the 2006 UEFA European Under-17 Championship Final between Czech Republic and Russia in Luxembourg on 14 May 2006 and also the 2009 UEFA European Under-21 Championship Final between Germany and England in Sweden on 29 June 2009 .",796370,True,germany,"(39, 40)",Germany national under-21 football team,Q314851,Germany national football team,True,Q43310,17,194,"[kuipers, kuipers, czech republic, russia, germany, england]","[national association football team, Germany national youth football team]",[national association football team],"[Germany, Nazi Germany, East Germany, Germans, West Germany, German Empire, Holy Roman Empire, Bundesliga, Kingdom of Prussia, Germany national football team, Weimar Republic, 2006 FIFA World Cup, German Americans, GfK Entertainment charts, Allied-occupied Germany, Basketball Bundesliga, Football in Germany, History of the Jews in Germany, German Confederation, German Football Association, German Revolution of 1918–1919, Cinema of Germany, German Navy, Germany women's national football team,...","[0.0246327035, 0.0001320946, 4.3305e-06, 0.0008899257, 1.18918e-05, 0.0002127894, 2.4602e-06, 7.00499e-05, 3.183e-07, 0.8617281318, 1.9944e-06, 0.0064811772, 0.0003270341, 3.6106e-06, 4.732e-07, 3.241e-07, 7.28726e-05, 5.43594e-05, 3.7654e-05, 0.0006774163, 2.986e-07, 5.52684e-05, 3.539e-07, 0.0734033138, 6.43334e-05, 0.0007323723, 5.52e-08, 3.339e-07, 2.2254e-06, 0.030399872]"
14431,"During the initial stages of Björn 's career , Kuipers refereed the 2006 UEFA European Under-17 Championship Final between Czech Republic and Russia in Luxembourg on 14 May 2006 and also the 2009 UEFA European Under-21 Championship Final between Germany and England in Sweden on 29 June 2009 .",796370,True,england,"(41, 42)",England national under-21 football team,Q204238,England national football team,True,Q47762,816,4499,"[kuipers, kuipers, czech republic, russia, germany, england]",[national association football team],[national association football team],"[United Kingdom, England, Great Britain, Premier League, Church of England, English people, British Empire, England national football team, James VI and I, Marylebone Cricket Club, United Kingdom of Great Britain and Ireland, Parliament of England, England cricket team, Kingdom of Great Britain, James II of England, England national rugby union team, Edward I of England, Kingdom of England, Premiership Rugby, The Football Association, England national rugby league team, Henry VII of England,...","[2.41829e-05, 0.0082601439, 5.705e-06, 1.91783e-05, 7.17e-08, 1.10348e-05, 4.3605e-06, 0.9900675416, 4.4e-08, 8.97401e-05, 6.161e-06, 1.0371e-06, 9.98453e-05, 2.872e-07, 9e-08, 0.000165222, 6.566e-07, 3.52672e-05, 7.8743e-06, 5.87383e-05, 0.0004820441, 1.37563e-05, 3.2409e-06, 5.33147e-05, 2.8743e-06, 0.0004538528, 0.0001144304, 2.192e-07, 2.2128e-06, 1.67594e-05]"
16087,Quincy played Quincy 's last game for the Dutch on 14 November 2006 in a 1–0 friendly loss to England in Alkmaar .,3733,True,england,"(19, 20)",England national under-21 football team,Q204238,England national football team,True,Q47762,816,4499,"[quincy, quincy, england, alkmaar]",[national association football team],[national association football team],"[United Kingdom, England, Great Britain, Premier League, Church of England, English people, British Empire, England national football team, James VI and I, Marylebone Cricket Club, United Kingdom of Great Britain and Ireland, Parliament of England, England cricket team, Kingdom of Great Britain, James II of England, England national rugby union team, Edward I of England, Kingdom of England, Premiership Rugby, The Football Association, England national rugby league team, Henry VII of England,...","[3.67431e-05, 0.0020133047, 4.9033e-06, 5.16248e-05, 1.344e-07, 1.46729e-05, 6.6963e-06, 0.9794986844, 1.788e-07, 0.0002252686, 8.6814e-06, 1.0961e-06, 0.0003037298, 3.79e-07, 2.53e-07, 0.003848057, 1.8283e-06, 0.0003815729, 7.9775e-06, 0.0001651971, 0.0069889487, 8.28881e-05, 3.4163e-06, 0.0001760357, 4.91074e-05, 0.0059070573, 0.0001904289, 3.868e-07, 1.1942e-06, 2.94698e-05]"
17154,"An England v Wales match was played at the ground in 1907 , followed by a rugby league international between England and Australia in 1911 .",6967,True,england,"(1, 2)",England national football team,Q47762,England national rugby league team,True,Q3589698,4499,1436,"[england, wales, england, australia]",[national association football team],"[national sports team, rugby league team]","[United Kingdom, England, Great Britain, Premier League, Church of England, English people, British Empire, England national football team, James VI and I, Marylebone Cricket Club, United Kingdom of Great Britain and Ireland, Parliament of England, England cricket team, Kingdom of Great Britain, James II of England, England national rugby union team, Edward I of England, Kingdom of England, Premiership Rugby, The Football Association, England national rugby league team, Henry VII of England,...","[0.0001490949, 0.0011203132, 8.051e-07, 4.682e-07, 6.6e-09, 3.527e-07, 1.324e-07, 0.0001277158, 7.2e-09, 2.1275e-06, 2.042e-07, 5.59e-08, 0.0001533742, 1.88e-08, 1.13e-08, 0.3060930669, 6.64e-08, 8.768e-07, 2.7176e-06, 1.2577e-06, 0.692322135, 6.46e-07, 1.274e-07, 1.2903e-06, 3.011e-07, 1.72519e-05, 2.1212e-06, 1.1991e-06, 1.9669e-06, 3.041e-07]"


# Junk

In [110]:
# Scratch arithmetic: gap between two hard-coded counts divided by a baseline
# count, printed with the original labels (two pairs of lines).
for _tag, _gap, _baseline in (
    ("G:C, P:T", 254 - 201, 201),
    ("G:T, P:C", 216 - 161, 216),
    ("G:C, P:T", 190 - 154, 154),
    ("G:T, P:C", 151 - 106, 151),
):
    print(_tag, _gap / _baseline)

G:C, P:T 0.263681592039801
G:T, P:C 0.25462962962962965
G:C, P:T 0.23376623376623376
G:T, P:C 0.2980132450331126
