In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os, sys
sys.path.append("/dfs/scratch0/lorr1/projects/bootleg/tutorials")
from utils import score_predictions, load_train_data
import pandas as pd
import time
import numpy as np
import cytoolz as tz
import feather
import ujson as json
import robustnessgym as rg
from multiprocessing import  Pool
import jsonlines
from pathlib import Path
from tqdm import tqdm
from collections import defaultdict
from IPython.core.display import display, HTML, Markdown
from bootleg.symbols.entity_symbols import EntitySymbols
from bootleg.symbols.type_symbols import TypeSymbols
from bootleg.symbols.kg_symbols import KGSymbols
def printmd(string):
    display(Markdown(string))
tqdm.pandas()
display(HTML("<style>.container { width:90% !important; }</style>"))
pd.options.display.max_colwidth = 500
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 5000)

In [3]:
from robustnessgym import ScoreSubpopulation, Dataset

rg.Dataset.logdir = Path("/dfs/scratch0/lorr1/robustness_gym_cache")
rg.ScoreSubpopulation.logdir = Path("/dfs/scratch0/lorr1/robustness_gym_cache")

In [4]:
def accuracy(df, crc_col="gold_qid", pred_col="pred_qid"):
    total = df.shape[0]
    correct = df[df[crc_col] == df[pred_col]].shape[0]
    return correct/total

def get_incorrect(df, crc_col="gold_qid", pred_col="pred_qid"):
    return df[df[crc_col] != df[pred_col]]

def num_incorrect(df, crc_col="gold_qid", pred_col="pred_qid"):
    return df[df[crc_col] != df[pred_col]].shape[0]

def print_incorrect_over_total(df, crc_col="gold_qid", pred_col="pred_qid"):
    num_in = df[df[crc_col] != df[pred_col]].shape[0]
    total = df.shape[0]
    print(f"{num_in} / {total} = {num_in/total}")

def errors_by_type(df, type_sys, pred_col="pred_qid"):
    errors_type = defaultdict(int)
    df["correct"] = df["gold_qid"] == df[pred_col]
    for r in df.iterrows():
        row = r[1]
        if row.correct is True:
            continue
        for t in row[type_sys]:
            errors_type[t] += 1
    return errors_type

### ------------------------------------ SKIP

In [5]:
input_dir = Path('/dfs/scratch0/lorr1/projects/bootleg-data/data/korealiases_title_0122/')
a2q = json.load(open(input_dir / "entity_db/entity_mappings/alias2qids.json"))
entity_dump = EntitySymbols(load_dir=input_dir / "entity_db/entity_mappings")
emb_dir = Path('/dfs/scratch0/lorr1/projects/bootleg-data/embs')
types_hy = TypeSymbols(entity_dump, emb_dir, max_types=3, type_vocab_file="hyena_vocab.json", type_file="hyena_types_1229.json")
types_wd = TypeSymbols(entity_dump, emb_dir, max_types=3, type_vocab_file="wikidatatitle_to_typeid_1229.json", type_file="wikidata_types_1229.json")
types_rel = TypeSymbols(entity_dump, emb_dir, max_types=50, type_vocab_file="relation_to_typeid_1229.json", type_file="kg_relation_types_1229.json")
kg_syms = KGSymbols(entity_dump, emb_dir, "kg_adj_1229.txt")
q2title = json.load(open(input_dir / "entity_db/entity_mappings/qid2title.json"))
title2q = {v:k for k,v in q2title.items()}

Loading types from /dfs/scratch0/lorr1/projects/bootleg-data/embs/hyena_types_1229.json


Reading /dfs/scratch0/lorr1/projects/bootleg-data/embs/hyena_types_1229.json: 100%|██████████| 5832699/5832699 [00:13<00:00, 445031.91it/s]


Loading types from /dfs/scratch0/lorr1/projects/bootleg-data/embs/wikidata_types_1229.json


Reading /dfs/scratch0/lorr1/projects/bootleg-data/embs/wikidata_types_1229.json: 100%|██████████| 5832699/5832699 [00:11<00:00, 519627.65it/s]


Loading types from /dfs/scratch0/lorr1/projects/bootleg-data/embs/kg_relation_types_1229.json


Reading /dfs/scratch0/lorr1/projects/bootleg-data/embs/kg_relation_types_1229.json: 100%|██████████| 5832699/5832699 [00:12<00:00, 464966.20it/s]


Loading kg adj from /dfs/scratch0/lorr1/projects/bootleg-data/embs/kg_adj_1229.txt


100%|██████████| 29745502/29745502 [01:00<00:00, 493259.08it/s]


In [6]:
qid2cnt = defaultdict(int)
with jsonlines.open(input_dir / "train.jsonl") as in_f:
    for line in in_f:
        for qid in line["qids"]:
            qid2cnt[qid] += 1
qid2cnt = dict(qid2cnt)
with open(input_dir / "train_qidcnt.json", "w") as out_f:
    json.save(qid2cnt, out_f)

In [7]:
with open(input_dir / "train_qidcnt.json", "r") as in_f:
    qid2cnt = json.load(in_f)

### ------------------------------------ RUN

In [8]:
columns = ["sentence", "sent_idx", "is_gold_label", "alias", "span", "gold_title", "gold_qid", "pred_title", "in_cand", "pred_qid",
           "qid_cnt", "pred_qid_cnt", "aliases", "wikidata_types_1229_gld", "wikidata_types_1229_pred", "cand_names", "cand_probs"]

In [10]:
# Load training data and split
if os.path.exists("saved_data/kore_train_df.feather"):
    train_df = feather.read_dataframe("saved_data/kore_train_df.feather")
else:
    train_df =  load_train_data(input_dir / "train.jsonl", q2title, a2q, type_symbols=[types_wd, types_hy], kg_symbols=[kg_syms])
    train_df["sentence_split"] = train_df["sentence"].apply(lambda x: x.split())
    feather.write_dataframe(train_df, "saved_data/kore_train_df.feather")
train_df_split = np.array_split(train_df, 10)

### ------------------------------------ SKIP

In [11]:
pred_file = '/dfs/scratch1/lorr1/projects/bootleg/logs_guid/base/2021_01_24/23_42_08/7cf3dcc0/test/last_model/bootleg_labels.jsonl'

boot_df = score_predictions(orig_file=input_dir / 'test.jsonl',
                 pred_file=pred_file,
                 title_map=q2title,
                 cands_map=a2q,
                 type_symbols=[types_hy, types_wd, types_rel],
                 kg_symbols=[kg_syms])

pred_file2 = '/dfs/scratch1/lorr1/projects/bootleg/logs_guid/ent_only/2021_01_24/21_49_49/73c62b07/test/last_model/bootleg_labels.jsonl'

boot_df_ent = score_predictions(orig_file=input_dir / 'test.jsonl',
                 pred_file=pred_file2,
                 title_map=q2title,
                 cands_map=a2q,
                 type_symbols=[types_hy, types_wd, types_rel],
                 kg_symbols=[kg_syms])

100%|██████████| 79969/79969 [00:30<00:00, 2611.41it/s]
100%|██████████| 79969/79969 [00:29<00:00, 2730.40it/s]


In [12]:
if "cands" in boot_df:
    boot_df["num_cands"] = boot_df["cands"].apply(lambda x: len(x))
    boot_df["cand_names"] = boot_df["cands"].apply(lambda x: [y[0] for y in x])
    boot_df["cand_probs"] = boot_df["cands"].apply(lambda x: [y[1] for y in x])
    del boot_df["cands"]
boot_df["span"] = boot_df["span"].apply(lambda x: tuple(x))
boot_df["in_cand"] = boot_df.apply(lambda x: x["gold_title"] in x["cand_names"], axis=1)
boot_df["qid_cnt"] = boot_df["gold_qid"].apply(lambda x: qid2cnt.get(x, 0))
boot_df["pred_qid_cnt"] = boot_df["pred_qid"].apply(lambda x: qid2cnt.get(x, 0))
boot_df["pred_qid_cnt"] = boot_df["pred_qid"].apply(lambda x: qid2cnt.get(x, 0))

In [13]:
if "cands" in boot_df_ent:
    boot_df_ent["num_cands"] = boot_df_ent["cands"].apply(lambda x: len(x))
    boot_df_ent["cand_names"] = boot_df_ent["cands"].apply(lambda x: [y[0] for y in x])
    boot_df_ent["cand_probs"] = boot_df_ent["cands"].apply(lambda x: [y[1] for y in x])
    del boot_df_ent["cands"]
boot_df_ent["span"] = boot_df_ent["span"].apply(lambda x: tuple(x))
boot_df_ent["in_cand"] = boot_df_ent.apply(lambda x: x["gold_title"] in x["cand_names"], axis=1)
boot_df_ent["qid_cnt"] = boot_df_ent["gold_qid"].apply(lambda x: qid2cnt.get(x, 0))
boot_df_ent["pred_qid_cnt"] = boot_df_ent["pred_qid"].apply(lambda x: qid2cnt.get(x, 0))

In [14]:
merged = pd.merge(boot_df, boot_df_ent, on=["sentence", "sent_idx", "is_gold_label", "alias", "span", "gold_title", "gold_qid"], suffixes=("", "_ent"))
del merged["slices"]
print(boot_df.shape[0], boot_df_ent.shape[0], merged.shape[0])

285931 285931 285931


In [15]:
columns = ["sentence", "sent_idx", "is_gold_label", "alias", "span", "gold_title", "gold_qid",
           "pred_title", "in_cand", "pred_qid", "pred_title_ent", "in_cand_ent", "pred_qid_ent",
           "qid_cnt", "qid_cnt_ent", "pred_qid_cnt", "pred_qid_cnt_ent", "aliases", "wikidata_types_1229_gld",
           "wikidata_types_1229_pred", "wikidata_types_1229_pred_ent", "cand_names", "cand_probs", "cand_names_ent", "cand_probs_ent"]

In [16]:
print(merged[(merged["pred_qid"] != merged["pred_qid_ent"])].shape[0], merged[(merged["pred_qid"] != merged["pred_qid_ent"]) & (merged["pred_qid"] == merged["gold_qid"])].shape[0], merged[(merged["pred_qid"] != merged["pred_qid_ent"]) & (merged["gold_qid"] == merged["pred_qid_ent"])].shape[0])

53967 37468 4345


In [17]:
t = merged.to_dict(orient='list')
t["span"] = [list(x) for x in t['span']]
del t["wikidata_types_1229_gld_ent"]
del t["kg_relation_types_1229_gld_ent"]
del t["hyena_types_1229_gld_ent"]
del t["kg_adj_1229_gld_ent"]
del t["aliases_ent"]
del t["slices_ent"]
del t["alias_idx_ent"]
del t["all_gold_qids_ent"]
del t["gold_label_aliases_ent"]
del t["all_is_gold_labels_ent"]
del t["all_spans_ent"]

### ------------------------------------ RUN

In [18]:
if os.path.exists("saved_data/save_rg_d_full.rg"):
    rg_d_full = rg.Dataset.load_from_disk("saved_data/save_rg_d_full.rg")
else:
    rg_d_full = rg.Dataset.from_dict(t)
    rg_d_full.save_to_disk(path="saved_data/save_rg_d_full.rg")

In [19]:
# rg_d = rg.Dataset.from_dict(rg_d_full)
rg_d = rg_d_full

### NOTES
1. I couldn't figure out how to do with with RG subpopulations. I kind of thought that might be possible. I suppose it's a slice?
2. The cahced operators work on lists. I kind of assumed the operators would be chained and then iterated over row at a time. This also seems super not efficient.
3. Immediately wrote wrapper functions for printing accuracy etc
4. I still don't get how to chain slices? It didn't seem to show in provinence
5. I was ONE loading bar lol
6. I don't think I can make a test bench with out a model? I want to use my data.

In [20]:
%%time
rg_d = rg_d.filter(lambda x:(x["num_cands"] > 1) & (x["is_gold_label"]) & (x["in_cand"]))

CPU times: user 26.3 s, sys: 328 ms, total: 26.6 s
Wall time: 26.6 s


In [21]:
rg_d

RobustnessGymDataset(num_rows: 138500)

In [None]:
%%time
from robustnessgym import LengthSubpopulation
from robustnessgym import Spacy, Stanza, TextBlob

# Create the Spacy CachedOperation
spacy_op = Spacy()

# Apply it on the "text" column of a dataset
rg_d = spacy_op(batch_or_dataset=rg_d, columns=["sentence"])

In [None]:
# RG Helpers
import numpy as np

def accuracy(true: np.array, pred: np.array):
    """
    Your function for computing accuracy.    
    """
    return np.mean([t == p for t, p in zip(true, pred)])

def top_w(true: list, cand_probs: list, cand_names: list, threshold: float = 0.3):
    res = []
    for i, cand_p in enumerate(cand_probs):
        cand_p = np.array(cand_p)
        cand_n = np.array(cand_names[i])
        predicted_qs = [title2q[p] for p in cand_n[cand_p > threshold]]
        if any(true[i] in p for p in predicted_qs):
            res.append(1)
        else:
            res.append(0)
    return np.mean(np.array(res))

def print_metrics(slices, m_func="accuracy", suffix=""):
    metrics = {}
    for sl in slices:
        if m_func == "accuracy":
            metrics[sl.identifier] = str(accuracy(true=sl["gold_qid"], pred=sl[f'pred_qid{suffix}'])) + " (" + str(len(sl)) + ")"
        elif m_func == "top2weight":
            metrics[sl.identifier] = str(top_w(sl["gold_qid"], sl[f"cand_probs{suffix}"], sl[f"cand_names{suffix}"], 0.3)) + " (" + str(len(sl)) + ")"
        else:
            print(f"You don't have a metric for this")
            return None
    print(json.dumps(metrics, indent=4))

In [None]:
# RG Filters
def num_cands(batch, columns):
    assert len(columns) == 1, "Pass in a single column."
    return [cb for cb in batch[columns[0]]]

def type_select(type_str, batch, columns):
    assert len(columns) == 1, "Pass in a single column."
    return [int(any(type_str in t for t in ts)) for ts in batch[columns[0]]]

def words_select(word_set, batch, columns):
    assert len(columns) == 1, "Pass in a single column."
    res = Spacy.retrieve(batch=batch, columns=columns, proc_fns=tz.compose((lambda x: [len(word_set.intersection(y)) for y in x]), Spacy.tokens))
    return res

def is_team(batch, columns):
    assert len(columns) == 1, "Pass in a single column."
    ret = []
    for title in batch[columns[0]]:
        t = title.lower()
        in_list = ["national", "team"]
        not_in_list = ["competition", "season", "cup", "national team nomenclature", "teamsters"]
        r = all(i in t for i in in_list) and all(i not in t for i in not_in_list)
        ret.append(int(r))
    return ret

def test_is_team(batch, columns):
    assert len(columns) == 1, "Pass in a single column."
    ret = []
    for title in batch[columns[0]]:
        t = title.lower()
        in_list = ["national", "team"]
        not_in_list = ["competition", "season", "cup", "national team nomenclature", "teamsters"]
        r = all(i in t for i in in_list) and all(i not in t for i in not_in_list)
        r = r & ("football" in t)
        ret.append(int(r))
    return ret

def qid_cnt(batch, columns):
    """
    A simple function to compute the length of all examples in a batch.

    batch: a dict of lists
    columns: a list of str

    return: a list of lengths
    """
    assert len(columns) == 1, "Pass in a single column."

    # The name of the column to grab text from
    column_name = columns[0]
    qid_batch = batch[column_name]

    # Tokenize the text using .split() and calculate the number of tokens
    return [qid_cnt for qid_cnt in qid_batch]

In [None]:
%%time
# Restrict cand numbers
num_cands_sp = ScoreSubpopulation(intervals=[(1, 2), (2, 5), (5, 10), (10, 20), (20, 30)], score_fn=num_cands)
rg_d_nc, slices_nc, membership = num_cands_sp(batch_or_dataset=rg_d, columns=['num_cands'])

length_sp = LengthSubpopulation(intervals=[(0, 5), (50, 100), ("80%", "100%")])
rg_d_nc_s, slices_s, membership = length_sp(batch_or_dataset=rg_d_nc, columns=['sentence'])

qid_cnt_sp = ScoreSubpopulation(intervals=[(0, 0), (0, 10), (11, 1000), (1001, 50000000)], score_fn=qid_cnt)
rg_d_nc_q, slices_q, membership = qid_cnt_sp(batch_or_dataset=rg_d_nc, columns=['qid_cnt'])

In [None]:
sport_sp = ScoreSubpopulation(intervals=[(0, 0), (1, 1)], score_fn=test_is_team)
_, slices_s, membership = sport_sp(batch_or_dataset=rg_d, columns=['gold_title'])

In [None]:
print(f"Overall Accuracy {accuracy(true=rg_d['gold_qid'], pred=rg_d['pred_qid'])}")
# print_metrics(slices_nc)
print_metrics(slices_s)
print_metrics(slices_q)
print(f"Overall TopW {top_w(rg_d['gold_qid'], rg_d['cand_probs'], rg_d['cand_names'])}")
# print_metrics(slices_nc, m_func="top2weight")
print_metrics(slices_s, m_func="top2weight")
print_metrics(slices_q, m_func="top2weight")

In [None]:
print(f"Overall Accuracy {accuracy(true=rg_d['gold_qid'], pred=rg_d['pred_qid_ent'])}")
# display(slices_nc)
# print_metrics(slices_nc, suffix="_ent")
# display(slices_s)
# print_metrics(slices_s, suffix="_ent")
display(slices_q)
print_metrics(slices_q, suffix="_ent")
print(f"Overall TopW {top_w(rg_d['gold_qid'], rg_d['cand_probs_ent'], rg_d['cand_names_ent'])}")
# display(slices_nc)
# print_metrics(slices_nc, m_func="top2weight", suffix="_ent")
# display(slices_s)
# print_metrics(slices_s, m_func="top2weight", suffix="_ent")
display(slices_q)
print_metrics(slices_q, m_func="top2weight", suffix="_ent")

# Examining Rows

In [22]:
temp = pd.DataFrame(rg_d[:])
temp["sentence_split"] = temp["sentence"].apply(lambda x: x.split())

In [45]:
# Testing out LFs

def is_team_pd(row):
    title = row["gold_title"]
    t = title.lower()
    in_list = ["national", "team"]
    not_in_list = ["competition", "season", "cup", "national team nomenclature", "teamsters"]
    r = all(i in t for i in in_list) and all(i not in t for i in not_in_list)
    if r:
        return r
    title = row["pred_title"]
    t = title.lower()
    in_list = ["national", "team"]
    not_in_list = ["competition", "season", "cup", "national team nomenclature", "teamsters"]
    r = (all(i in t for i in in_list) and all(i not in t for i in not_in_list))
    return r

def around_punc(row):
    sent_split = row["sentence_split"]
    span_l, span_r = row["span"]
    punc = {"``", "''"}
    if len(punc.intersection(sent_split[span_l-2:span_l])) > 0 or len(punc.intersection(sent_split[span_r:span_r+2])) > 0:
        return True
    return False

def university_ty(row, key="wikidata_types_1229_gld"):
    gld_types = row[key]
    typs = {"university", "educational institution", "college", "school"}
    bad_typs = {"college athletic conference"}
    for t in gld_types:
        if any(tp in t for tp in typs) and t not in bad_typs:
            return True
    return False

def airport_ty(row, key="wikidata_types_1229_gld"):
    gld_types = row[key]
    typs = {"airport"}
    bad_typs = {}
    for t in gld_types:
        if any(tp in t for tp in typs) and t not in bad_typs:
            return True
    return False

def location_ty(row, key="wikidata_types_1229_gld"):
    gld_types = row[key]
    typs = {"city", "country"}
    bad_typs = {}
    for t in gld_types:
        if any(tp in t for tp in typs) and t not in bad_typs:
            return True
    return False

def album_title(row):
    return "album" in row["gold_title"].lower() or "song" in row["gold_title"].lower()

def title_pren_in_sent(row):
    if "(" in row["gold_title"]:
        in_paren = row["gold_title"].split("(")[1].split(")")[0]
        if in_paren.lower() in row["sentence"].lower().split():
            return True
    return False

def title_in_sent(row):
    span_l, span_r = row["span"]
    if len(row["gold_title"].split()) > 1:
        if row["gold_title"].lower() == " ".join(row["sentence"].lower().split()[span_l:span_r]):
            return True
    return False

univ_keywords = {"studied at", "studied at the", "studies at", "studies at the", "educated at", "educated at the",
                 "graduated from", "graduated from the", "department at", "department at the", "degree from", "degree from the",
                 "attended", "attended the", "professor at", "professor at the", "taught at", "taught at the", "univeristy of"}
def univ_keys(row):
    typs = row["wikidata_types_1229_gld"]
    span_l, span_r = row["span"]
    sent_left = row["sentence_split"][span_l-3:span_l]
    for univ_k in univ_keywords:
        if univ_k == " ".join(sent_left[-len(univ_k.split()):]):
            return True
    return False
            
        
def temp_pred(row):
    return airport_ty(row)

In [44]:
# What I print to test a LF
temp["pred"] = temp.apply(lambda x: temp_pred(x), axis=1)
to_dis = temp[(temp["pred_qid"] != temp["gold_qid"]) & (temp["pred"])]
error_sh = temp[(temp["pred_qid"] != temp["gold_qid"])].shape[0]
dis_sh = to_dis.shape[0]
pred_sh = temp[temp["pred"]].shape[0]
print(temp.shape[0], error_sh, dis_sh, dis_sh/error_sh, dis_sh/pred_sh)
display(to_dis[columns])

138500 8304 58 0.006984585741811176 0.453125


Unnamed: 0,sentence,sent_idx,is_gold_label,alias,span,gold_title,gold_qid,pred_title,in_cand,pred_qid,pred_title_ent,in_cand_ent,pred_qid_ent,qid_cnt,qid_cnt_ent,pred_qid_cnt,pred_qid_cnt_ent,aliases,wikidata_types_1229_gld,wikidata_types_1229_pred,wikidata_types_1229_pred_ent,cand_names,cand_probs,cand_names_ent,cand_probs_ent
6990,"Other easily reached airports are those of Frankfurt am Main ( ICE train stop ) , Dortmund ( railway station `` Holzwickede `` on the RE7 trainline ) and the low cost Weeze Airport ( coaches from Düsseldorf Hauptbahnhof ) .",796257,True,dortmund,"[16, 17]",Dortmund Airport,Q313587,Dortmund Hauptbahnhof,True,Q704394,Dortmund Hauptbahnhof,True,Q704394,5,5,98,98,"[dortmund, holzwickede, weeze airport]","[airport, commercial traffic aerodrome, airport_Q21836433]","[metro station, station located underground, station located on surface]","[metro station, station located underground, station located on surface]","[Borussia Dortmund, Dortmund, Dortmund Hauptbahnhof, Theater Dortmund, Dortmund Sparkassen Chess Meeting, Dortmund Airport, Dortmund (horse), List of Intercity-Express railway stations, Petra Dortmund]","[1.721e-07, 0.0104737803, 0.9754400253, 2.83353e-05, 0.0, 0.014049028, 2.51e-08, 8.6507e-06, 3e-10]","[Borussia Dortmund, Dortmund, Dortmund Hauptbahnhof, Theater Dortmund, Dortmund Sparkassen Chess Meeting, Dortmund Airport, Dortmund (horse), List of Intercity-Express railway stations, Petra Dortmund]","[0.022060087, 0.2427780479, 0.5963253379, 0.0362200327, 0.0418971181, 0.0040624035, 0.0490968265, 0.0035310497, 0.00402907]"
10509,"In April 1976 , Pan Am set the new record with its JFK – Tokyo route .",19129,True,tokyo,"[14, 15]",Haneda Airport,Q204853,Tokyo,True,Q1490,Tokyo,True,Q1490,9,9,581,581,"[pan am, jfk, tokyo]","[airport, commercial traffic aerodrome, international airport]","[city, prefecture of Japan, local government]","[city, prefecture of Japan, local government]","[Tokyo, 1964 Summer Olympics, University of Tokyo, Tokyo Station, Tokyo International Film Festival, Narita International Airport, Tokyo Stock Exchange, Haneda Airport, Greater Tokyo Area, Tokyo National Museum, Tokyo Racecourse, Tokyo Marathon, Tokyo City, Japan Open (tennis), Pan Pacific Open, Tokyo Prefecture, Tokyo String Quartet, Bombing of Tokyo, Tokyo subway, 2013 Toray Pan Pacific Open, Tokyo Metropolitan Symphony Orchestra, 2012 Toray Pan Pacific Open, Magnum Tokyo, 2011 Toray Pan P...","[0.688913703, 1.5633e-06, 3.724e-06, 0.0001802198, 2.61e-08, 0.131378457, 3.34329e-05, 0.1762113422, 0.0004837602, 1.18261e-05, 0.000736634, 1.6759e-06, 0.0005674174, 8.1e-08, 7.6e-09, 4.20337e-05, 1.06e-08, 5.5592e-06, 0.0013730461, 9.78e-08, 9.53e-08, 5.05e-08, 1.9e-09, 9.18e-08, 3.45e-08, 1.25e-07, 2.415e-07, 5.4657e-05, 1.24e-08, 1.386e-07]","[Tokyo, 1964 Summer Olympics, University of Tokyo, Tokyo Station, Tokyo International Film Festival, Narita International Airport, Tokyo Stock Exchange, Haneda Airport, Greater Tokyo Area, Tokyo National Museum, Tokyo Racecourse, Tokyo Marathon, Tokyo City, Japan Open (tennis), Pan Pacific Open, Tokyo Prefecture, Tokyo String Quartet, Bombing of Tokyo, Tokyo subway, 2013 Toray Pan Pacific Open, Tokyo Metropolitan Symphony Orchestra, 2012 Toray Pan Pacific Open, Magnum Tokyo, 2011 Toray Pan P...","[0.8340904117, 0.0021555724, 0.0077676387, 0.0004276538, 0.0163081884, 0.0132484147, 0.0025136124, 0.0131585225, 0.0005819688, 0.0005227287, 0.0004261829, 0.0143572427, 0.0004261755, 0.0346159302, 0.0021031098, 0.0359557979, 0.0024281675, 0.0029107626, 0.0059121768, 0.002567328, 0.0004238122, 0.0007075642, 0.000467564, 0.0014250765, 0.0003903164, 0.0004263507, 0.0018969687, 0.0007959623, 0.0005631851, 0.000425462]"
11298,"The airline operates bases at Atlantic City , Baltimore , Chicago–O'Hare , Dallas/Fort Worth , Detroit , Fort Lauderdale , Las Vegas , and Orlando .",23870,True,fort lauderdale,"[17, 19]",Fort Lauderdale–Hollywood International Airport,Q635361,"Fort Lauderdale, Florida",True,Q165972,"Fort Lauderdale, Florida",True,Q165972,0,0,43,43,"[atlantic city, baltimore, chicagoohare, dallasfort worth, detroit, fort lauderdale, las vegas, orlando]","[airport, commercial traffic aerodrome, international airport]","[city of the United States, gay village]","[city of the United States, gay village]","[Fort Lauderdale, Florida, Fort Lauderdale Strikers (1977–1983), Fort Lauderdale–Hollywood International Airport, Fort Lauderdale station, Fort Lauderdale station (Brightline)]","[0.8874346018, 6.2e-09, 0.1120671183, 0.0004829595, 1.53885e-05]","[Fort Lauderdale, Florida, Fort Lauderdale Strikers (1977–1983), Fort Lauderdale–Hollywood International Airport, Fort Lauderdale station, Fort Lauderdale station (Brightline)]","[0.9998563528, 3.74919e-05, 3.61226e-05, 3.42425e-05, 3.58054e-05]"
11299,"The airline operates bases at Atlantic City , Baltimore , Chicago–O'Hare , Dallas/Fort Worth , Detroit , Fort Lauderdale , Las Vegas , and Orlando .",23870,True,las vegas,"[20, 22]",McCarran International Airport,Q853886,Las Vegas,True,Q23768,Las Vegas,True,Q23768,13,13,304,304,"[atlantic city, baltimore, chicagoohare, dallasfort worth, detroit, fort lauderdale, las vegas, orlando]","[airport, commercial traffic aerodrome, international airport]","[administrative territorial entity, city of the United States, big city]","[administrative territorial entity, city of the United States, big city]","[Las Vegas, Las Vegas Valley, Las Vegas Strip, Paradise, Nevada, Las Vegas Motor Speedway, Las Vegas Aviators, Las Vegas, New Mexico, Las Vegas (TV series), Las Vegas Locomotives, McCarran International Airport, Tennis Channel Open, 2011 IZOD IndyCar World Championship, Las Vegas algorithm, Roman Catholic Diocese of Las Vegas, Caesars Palace Grand Prix, 2020 Pennzoil 400, 1982 Caesars Palace Grand Prix, 2007 Tennis Channel Open and the Mirage Cup, 1981 Caesars Palace Grand Prix, Las Vegas (M...","[0.5278810263, 0.29336375, 0.0009575977, 5.99391e-05, 1.02755e-05, 0.0003446559, 0.0054551899, 0.0001240745, 1.95638e-05, 0.170409143, 6.48143e-05, 4.9814e-06, 6.0814e-06, 7.98894e-05, 2.518e-07, 0.0002257314, 2.9971e-06, 4.0682e-06, 3.1784e-06, 2.65103e-05, 0.0004083035, 4.92238e-05, 9.9081e-06, 2.3987e-06, 4.89379e-05, 3.94487e-05, 5.0168e-06, 1.14804e-05, 4.08585e-05, 0.0003404919]","[Las Vegas, Las Vegas Valley, Las Vegas Strip, Paradise, Nevada, Las Vegas Motor Speedway, Las Vegas Aviators, Las Vegas, New Mexico, Las Vegas (TV series), Las Vegas Locomotives, McCarran International Airport, Tennis Channel Open, 2011 IZOD IndyCar World Championship, Las Vegas algorithm, Roman Catholic Diocese of Las Vegas, Caesars Palace Grand Prix, 2020 Pennzoil 400, 1982 Caesars Palace Grand Prix, 2007 Tennis Channel Open and the Mirage Cup, 1981 Caesars Palace Grand Prix, Las Vegas (M...","[0.5982843041, 0.2577376366, 0.0034641533, 0.0020675582, 0.049107831, 0.0006262379, 0.0008695034, 0.0025424077, 0.0006227022, 0.0015671434, 0.0018649712, 0.0066096084, 0.001129235, 0.0006016759, 0.0006385787, 0.0082005532, 0.0006299695, 0.0005947887, 0.0025453358, 0.000628308, 0.022270551, 0.0113048879, 0.0005865919, 0.0218655355, 0.0006104683, 0.0006135611, 0.0005928412, 0.0005978115, 0.0006279708, 0.0005972533]"
11300,"The airline operates bases at Atlantic City , Baltimore , Chicago–O'Hare , Dallas/Fort Worth , Detroit , Fort Lauderdale , Las Vegas , and Orlando .",23870,True,orlando,"[24, 25]",Orlando International Airport,Q929859,"Orlando, Florida",True,Q49233,"Orlando, Florida",True,Q49233,4,4,113,113,"[atlantic city, baltimore, chicagoohare, dallasfort worth, detroit, fort lauderdale, las vegas, orlando]","[neighborhood, airport, commercial traffic aerodrome]","[city of the United States, big city]","[city of the United States, big city]","[Orlando, Florida, Orlando Magic, Orlando Predators, Orlando City SC, Orlando Bloom, Orlande de Lassus, Barksdale Organization, Roland, Orlando International Airport, Greater Orlando, Joe Orlando, Orlando Renegades, Orlando Cepeda, Tubby Smith, Orlando Gibbons, Tony Orlando, Orlando Sanford International Airport, Orlando Jordan, Vittorio Emanuele Orlando, Orlando Letelier, Orlando Cabrera, Orlando: A Biography, Orlando Hernández, Epico Colón, Orlando Jones, Orlando (opera), Orlando Pace, Bob...","[0.4787504971, 7.061e-07, 1.42e-08, 8.79e-08, 6.35e-08, 5.71e-08, 0.0001761638, 4.203e-07, 0.2023696005, 0.0015083342, 2.039e-07, 4.153e-07, 2.748e-07, 2.175e-07, 6.2e-09, 2.0981e-06, 0.3171865046, 4.985e-07, 4.7e-09, 1.03e-08, 3.7e-08, 7.945e-07, 7.82e-08, 2.5059e-06, 1.948e-07, 2.24e-07, 1.062e-07, 4.12e-08, 3.3e-09, 2.88e-08]","[Orlando, Florida, Orlando Magic, Orlando Predators, Orlando City SC, Orlando Bloom, Orlande de Lassus, Barksdale Organization, Roland, Orlando International Airport, Greater Orlando, Joe Orlando, Orlando Renegades, Orlando Cepeda, Tubby Smith, Orlando Gibbons, Tony Orlando, Orlando Sanford International Airport, Orlando Jordan, Vittorio Emanuele Orlando, Orlando Letelier, Orlando Cabrera, Orlando: A Biography, Orlando Hernández, Epico Colón, Orlando Jones, Orlando (opera), Orlando Pace, Bob...","[0.87064749, 0.0043563219, 0.0001091701, 0.0001220487, 0.074602887, 0.0051748734, 0.0001553616, 0.0008271763, 0.0071458216, 8.22336e-05, 0.0001763842, 0.0001735624, 0.0001998397, 0.000195822, 0.0037086357, 0.0026325067, 0.0065333052, 0.0002976274, 0.0003071276, 0.0001452813, 0.0075335051, 0.005332211, 0.0002293828, 0.006515177, 0.0017343982, 0.0003057975, 9.99818e-05, 0.0001050767, 0.0003460777, 0.0002050003]"
11368,"Given the proximity to the current airport ( AICM ) , about 3 miles away , the opening of NAICM Texcoco would have required the complete shutdown of AICM and the immediate transfer of operations to the new airport - similar to Hong Kong and Munich .",24200,True,hong kong,"[42, 44]",Hong Kong International Airport,Q17704,Hong Kong,True,Q8646,Hong Kong,True,Q8646,7,7,1372,1372,"[naim, hong kong, munich]","[airport, commercial traffic aerodrome, international airport]","[human settlement, city, metropolitan area]","[human settlement, city, metropolitan area]","[Hong Kong, University of Hong Kong, British Hong Kong, Hong Kong Island, MTR, Government of Hong Kong, Hong Kong Stock Exchange, Hong Kong dollar, Hong Kong action cinema, Hongkongers, Cinema of Hong Kong, Hong Kong national football team, Hong Kong International Airport, Hong Kong national cricket team, Japanese occupation of Hong Kong, Kai Tak Airport, Battle of Hong Kong, Hong Kong Sevens, Hong Kong International Film Festival, Hong Kong Football Association, Hong Kong Tramways, Hong Kon...","[0.9062923789, 1.4765e-06, 0.0001316865, 2.16769e-05, 5.35197e-05, 7.531e-07, 1.39e-06, 5.5e-08, 6.5e-09, 1.12688e-05, 3e-10, 7.7e-09, 0.0557537265, 1e-10, 8.38e-08, 0.0377298221, 2.61e-08, 5.3e-09, 1.35e-08, 1.3e-09, 7.15e-08, 1.3e-09, 3.426e-07, 3.1e-08, 4.43e-08, 2.38e-08, 9e-10, 8e-10, 1.5112e-06, 1.259e-07]","[Hong Kong, University of Hong Kong, British Hong Kong, Hong Kong Island, MTR, Government of Hong Kong, Hong Kong Stock Exchange, Hong Kong dollar, Hong Kong action cinema, Hongkongers, Cinema of Hong Kong, Hong Kong national football team, Hong Kong International Airport, Hong Kong national cricket team, Japanese occupation of Hong Kong, Kai Tak Airport, Battle of Hong Kong, Hong Kong Sevens, Hong Kong International Film Festival, Hong Kong Football Association, Hong Kong Tramways, Hong Kon...","[0.9591022134, 0.0031228282, 0.0028005745, 0.00012476, 0.0051758043, 0.0009220592, 0.0003220717, 0.0004402421, 0.0029396298, 0.000233001, 0.0024297193, 0.0018782616, 0.0047800117, 0.00036321, 0.0045582666, 0.000342636, 0.0017222916, 0.0002803532, 0.0025159169, 0.0035359662, 0.000283955, 0.0001136235, 0.0001351104, 0.0002197732, 0.000132065, 0.0002722671, 0.000284071, 0.0001354389, 0.0006315645, 0.0002023358]"
11369,"Given the proximity to the current airport ( AICM ) , about 3 miles away , the opening of NAICM Texcoco would have required the complete shutdown of AICM and the immediate transfer of operations to the new airport - similar to Hong Kong and Munich .",24200,True,munich,"[45, 46]",Munich Airport,Q131402,Munich,True,Q1726,Munich,True,Q1726,44,44,12269,12269,"[naim, hong kong, munich]","[commercial traffic aerodrome, international airport]","[city, big city, urban municipality of Germany]","[city, big city, urban municipality of Germany]","[Munich, 1972 Summer Olympics, Ludwig Maximilian University of Munich, Munich Agreement, Technical University of Munich, Academy of Fine Arts, Munich, Bavarian State Opera, Munich S-Bahn, Munich Philharmonic, München Hauptbahnhof, Munich Airport, Munich (film), Bavarian International Tennis Championships, Munich (district), Gerhard Munthe, Munich, North Dakota, Munich (song), Henry Mosler, 2008 BMW Open, Munich (sport shoes), Names of European cities in different languages (M–P), Karl Münich...","[0.9242123961, 1.64997e-05, 0.0001416975, 0.0001269255, 9.6726e-06, 1.0024e-06, 5.6317e-06, 9.4636e-06, 1.9755e-06, 0.0003523008, 0.0750569701, 3.9844e-06, 1.605e-07, 1.96651e-05, 3.287e-07, 2.90049e-05, 9.05e-08, 2.252e-07, 2.708e-07, 6.268e-07, 1.5307e-06, 1.137e-07, 2.657e-07, 2.494e-07, 8.9925e-06]","[Munich, 1972 Summer Olympics, Ludwig Maximilian University of Munich, Munich Agreement, Technical University of Munich, Academy of Fine Arts, Munich, Bavarian State Opera, Munich S-Bahn, Munich Philharmonic, München Hauptbahnhof, Munich Airport, Munich (film), Bavarian International Tennis Championships, Munich (district), Gerhard Munthe, Munich, North Dakota, Munich (song), Henry Mosler, 2008 BMW Open, Munich (sport shoes), Names of European cities in different languages (M–P), Karl Münich...","[0.9267967343, 0.0014825419, 0.0018254021, 0.0014259683, 0.000211765, 0.0018987262, 0.0002678513, 0.0037259944, 0.0002970801, 0.0465894938, 0.004001108, 0.0011702544, 0.0009729807, 0.0014869443, 0.0004594149, 0.0012293998, 0.0008839797, 0.0004330299, 0.0006357462, 0.0010051425, 0.0007677989, 0.0004261426, 0.0009745492, 0.0004943803, 0.0005374913]"
23996,"C. joined the RAF as a pilot in 1922 and on 9 December 1922 C. was granted a short service commission as a probationary pilot officer , and joined No 2 Flying Training School , Duxford for flight training .",98002,True,duxford,"[35, 36]",Duxford Aerodrome,Q2317425,Imperial War Museum Duxford,True,Q1758240,Imperial War Museum Duxford,True,Q1758240,5,5,5,5,"[flying officer scott, raf, flying officer scott, no 2 flying training school, duxford]","[airbase, airport, airport for antique aircraft]",[military museum],[military museum],"[Duxford Aerodrome, Imperial War Museum Duxford, Duxford, Duxford, Oxfordshire]","[0.038250681, 0.91053617, 0.0503252409, 0.0008879305]","[Duxford Aerodrome, Imperial War Museum Duxford, Duxford, Duxford, Oxfordshire]","[0.1207337081, 0.5807937384, 0.2958570123, 0.0026156059]"
37513,"Scheduled flights are offered to Frankfurt , Munich , Düsseldorf , Vienna and Moscow .",176053,True,frankfurt,"[5, 6]",Frankfurt Airport,Q46033,Frankfurt,True,Q1794,Frankfurt,True,Q1794,34,34,479,479,"[frankfurt, munich, dusseldorf, vienna, moscow]","[airport, commercial traffic aerodrome, international airport]","[city, Free imperial city, big city]","[city, Free imperial city, big city]","[Frankfurt, Eintracht Frankfurt, Goethe University Frankfurt, International Motor Show Germany, Frankfurt (Oder), Frankfurt Airport, Oper Frankfurt, Frankfurt Galaxy, Frankfurt (Main) Hauptbahnhof, Frankfurt Stock Exchange, Frankfurt Book Fair, 1. FFC Frankfurt, Frankfurt Marathon, Free City of Frankfurt, Städtische Bühnen Frankfurt, Grand Duchy of Frankfurt, Harry Frankfurt, Frankfort, Free State, Trams in Frankfurt am Main, Master of Frankfurt, List of sovereign states in the 1780s, Frankf...","[0.8814504743, 4.42e-08, 1.9869e-06, 2.42e-08, 4.63844e-05, 0.1181300506, 9.7476e-06, 2.22e-08, 0.0003166129, 1.05383e-05, 1.812e-07, 5e-10, 1.839e-06, 9.2311e-06, 1.73347e-05, 1.604e-07, 7e-10, 4.6131e-06, 4.151e-07, 6.65e-08, 2e-10, 8.1e-09, 7e-10, 6.45e-08, 3e-10, 1.467e-07]","[Frankfurt, Eintracht Frankfurt, Goethe University Frankfurt, International Motor Show Germany, Frankfurt (Oder), Frankfurt Airport, Oper Frankfurt, Frankfurt Galaxy, Frankfurt (Main) Hauptbahnhof, Frankfurt Stock Exchange, Frankfurt Book Fair, 1. FFC Frankfurt, Frankfurt Marathon, Free City of Frankfurt, Städtische Bühnen Frankfurt, Grand Duchy of Frankfurt, Harry Frankfurt, Frankfort, Free State, Trams in Frankfurt am Main, Master of Frankfurt, List of sovereign states in the 1780s, Frankf...","[0.8714388013, 0.0035132002, 0.0309446119, 0.0034044285, 0.0059438474, 0.0637865737, 0.007554722, 2.23326e-05, 0.0086987615, 2.40948e-05, 2.26541e-05, 0.0011027756, 0.0008467466, 0.0008025407, 0.0014626598, 2.50058e-05, 2.19122e-05, 6.71101e-05, 7.50664e-05, 2.46099e-05, 4.84197e-05, 4.8274e-05, 2.37538e-05, 2.95168e-05, 4.37278e-05, 2.38258e-05]"
37514,"Scheduled flights are offered to Frankfurt , Munich , Düsseldorf , Vienna and Moscow .",176053,True,munich,"[7, 8]",Munich Airport,Q131402,Munich,True,Q1726,Munich,True,Q1726,44,44,12269,12269,"[frankfurt, munich, dusseldorf, vienna, moscow]","[commercial traffic aerodrome, international airport]","[city, big city, urban municipality of Germany]","[city, big city, urban municipality of Germany]","[Munich, 1972 Summer Olympics, Ludwig Maximilian University of Munich, Munich Agreement, Technical University of Munich, Academy of Fine Arts, Munich, Bavarian State Opera, Munich S-Bahn, Munich Philharmonic, München Hauptbahnhof, Munich Airport, Munich (film), Bavarian International Tennis Championships, Munich (district), Gerhard Munthe, Munich, North Dakota, Munich (song), Henry Mosler, 2008 BMW Open, Munich (sport shoes), Names of European cities in different languages (M–P), Karl Münich...","[0.9632036686, 5.6e-09, 1.2659e-06, 9e-10, 1.273e-07, 2.874e-07, 1.208e-07, 6.20592e-05, 1.232e-07, 0.0056099808, 0.0311189648, 1.43e-08, 1.052e-07, 5.958e-07, 9.1e-09, 2.081e-06, 7.2e-09, 1.277e-07, 3.26e-08, 1.36e-08, 4.88e-08, 2e-10, 3.8e-09, 1.64e-08, 3.231e-07]","[Munich, 1972 Summer Olympics, Ludwig Maximilian University of Munich, Munich Agreement, Technical University of Munich, Academy of Fine Arts, Munich, Bavarian State Opera, Munich S-Bahn, Munich Philharmonic, München Hauptbahnhof, Munich Airport, Munich (film), Bavarian International Tennis Championships, Munich (district), Gerhard Munthe, Munich, North Dakota, Munich (song), Henry Mosler, 2008 BMW Open, Munich (sport shoes), Names of European cities in different languages (M–P), Karl Münich...","[0.9051406384, 0.0016307706, 0.0010310336, 0.0010294174, 0.0005285335, 0.0014936625, 0.0004333598, 0.0049504461, 0.0002312438, 0.0771250799, 0.0029946014, 0.000288605, 0.0002244124, 0.0013728164, 8.88579e-05, 0.0002360889, 0.0001747792, 7.97295e-05, 0.0001300145, 0.0002314212, 0.0001125132, 6.8533e-05, 0.0002257594, 9.76718e-05, 7.99013e-05]"


In [None]:
# Sometimes it helps to look at all mentions in the sentence
display(temp[temp["sent_idx"] == 29423][columns])

In [46]:
# Or query training data for statistic information

def count_keywords(row):
    key_words = {"single"}
    if any(t in {"song", "single"} for t in row["wikidata_types_1229_gld"]) or "(song)" in row["gold_title"]:
        span_l, span_r = row["span"]
        if len(key_words.intersection(row["sentence_split"][span_l-3:span_l])) > 0:
            return 1
    return 0

def get_song_keywords(row):
    if any(t in {"song", "single"} for t in row["wikidata_types_1229_gld"]) or "(song)" in row["gold_title"]:
        span_l, span_r = row["span"]
        left_context = row["sentence_split"][span_l-2:span_l]
        return set(left_context)
    return {}

def get_univ_keywords(row):
    if university_ty(row):
        res = set()
        span_l, span_r = row["span"]
        sent_left = row["sentence_split"][span_l-3:span_l]
        sent_left_1 = row["sentence_split"][span_l-2:span_l]
        # res = set(sent_left)
        res.add(" ".join(sent_left))
        res.add(" ".join(sent_left_1))
        return res
    return set()

def get_airport_keywords(row):
    if airport_ty(row):
        res = set()
        span_l, span_r = row["span"]
        sent_left = row["sentence_split"][span_l-3:span_l]
        sent_left_1 = row["sentence_split"][span_l-2:span_l]
        # res = set(sent_left)
        res.add(" ".join(sent_left))
        res.add(" ".join(sent_left_1))
        return res
    return set()

def get_country_keywords(row):
    if location_ty(row):
        res = set()
        span_l, span_r = row["span"]
        sent_left = row["sentence_split"][span_l-3:span_l]
        sent_left_1 = row["sentence_split"][span_l-2:span_l]
        # res = set(sent_left)
        res.add(" ".join(sent_left))
        res.add(" ".join(sent_left_1))
        return res
    return set()

airport_keywords = {"based at", "international airport", "airport", "flights to", "flights from", "flight to", "flight from",
                   "hub at", "bound for", "bound to", "connections to", "route to", "route from", "arriving at", "arrive at",
                   "departing from", "depart from"}

univ_keywords = {"studied at", "studied at the", "studies at", "studies at the", "educated at", "educated at the",
                 "graduated from", "graduated from the", "department at", "department at the", "degree from", "degree from the",
                 "attended", "attended the", "professor at", "professor at the", "taught at", "taught at the", "univeristy of"}
def count_univ_keywords(row):
    typs = set()
    span_l, span_r = row["span"]
    sent_left = row["sentence_split"][span_l-3:span_l]
    for univ_k in univ_keywords:
        if univ_k == " ".join(sent_left[-len(univ_k.split()):]):
            typs.update(set(row["wikidata_types_1229_gld"]))
    return typs

def city_univ_keywords(row):
    pairs = set()
    span_l, span_r = row["span"]
    sent_left = row["sentence_split"][span_l-3:span_l]
    for univ_k in univ_keywords:
        if univ_k == " ".join(sent_left[-len(univ_k.split()):]):
            for ty in row["wikidata_types_1229_gld"]:
                if "city" in ty or "country" in ty or "municipality" in ty:
                    pairs.add(tuple([row["sent_idx"], row["alias_idx"]]))
    return pairs

result_df = train_df.progress_apply(get_country_keywords, axis=1)

100%|██████████| 2265988/2265988 [00:56<00:00, 40178.52it/s]


In [None]:
merg = pd.merge(train_df, pd.DataFrame(data = {"sent_idx": [v[0] for vs in result_df for v in vs], "alias_idx": [v[1] for vs in result_df for v in vs]}))
display(merg.sample(10))

In [47]:
res = defaultdict(int)
for d in result_df.values:
    for k in d:
        if type(d) is dict:
            res[k] += d[k]
        else:
            res[k] += 1

In [None]:
country_keywords = {"born in", "found in", "located in", "suburb of"}

In [48]:
print([[k, v] for k, v in sorted(res.items(), key=lambda x: x[1], reverse=True)][:150])

[['', 12477], [', and', 7536], ['born in', 6609], ['was born in', 5921], ['Paris ,', 4097], ['China ,', 4091], ['in the', 4064], ['moved to', 3878], [', in', 3835], ['found in', 3787], ['London ,', 3764], [') ,', 3350], ['is found in', 3114], ['based in', 2903], ['of the', 2697], [', the', 2679], ['Victoria ,', 2674], ['is a', 2544], ['city of', 2527], ['in Paris ,', 2165], ['held in', 2119], [', China ,', 2086], ['Born in', 2063], ['Yorkshire ,', 2011], ['and the', 2002], ['in London ,', 1911], ['Olympics in', 1896], ['Greece ,', 1882], ['India ,', 1842], ['England ,', 1829], ['Summer Olympics in', 1792], ['the city of', 1771], ['France ,', 1752], ['China and', 1717], ['was a', 1599], ['located in', 1519], ['Beijing ,', 1514], ['Athens ,', 1506], ['Japan ,', 1484], ['Italy ,', 1483], ['Munich ,', 1414], ['England and', 1401], [') in', 1390], [', Greece ,', 1370], ['Germany ,', 1303], ['such as', 1290], ['Chicago ,', 1277], ['died in', 1275], [', France ,', 1273], [', India ,', 1232], 

# Junk Testing

In [None]:
print(top_w(rg_d["gold_qid"], rg_d["cand_probs_ent"], rg_d["cand_names_ent"], 0.3))
print(accuracy(rg_d["gold_qid"], rg_d["pred_qid_ent"]))

In [None]:
word_set = {"that"}
words_select(word_set, rg_d[:2], ["sentence"])

In [30]:
def test_lf(row):
    t = pd.DataFrame(data = {"a": [5], "b": [len(row["gold_title"])]})
    return t

new_test = temp.progress_apply(lambda x: test_lf(x), axis=1)

100%|██████████| 138500/138500 [01:31<00:00, 1518.14it/s]


In [42]:
print(train_df.columns)
display(train_df.head(1))

Index(['sentence', 'sent_idx', 'aliases', 'span', 'slices', 'alias',
       'alias_idx', 'is_gold_label', 'gold_qid', 'gold_title', 'all_gold_qids',
       'gold_label_aliases', 'all_is_gold_labels', 'all_spans', 'cand_names',
       'wikidata_types_1229_gld', 'hyena_types_1229_gld', 'kg_adj_1229_gld',
       'sentence_split'],
      dtype='object')


Unnamed: 0,sentence,sent_idx,aliases,span,slices,alias,alias_idx,is_gold_label,gold_qid,gold_title,all_gold_qids,gold_label_aliases,all_is_gold_labels,all_spans,cand_names,wikidata_types_1229_gld,hyena_types_1229_gld,kg_adj_1229_gld,sentence_split
0,In 2005 a new rule was put into effect that there were to be no `` energy drinks `` such as Red Bull or Sobe allowed on the university campus for the duration of conference .,72195,"[red bull, sobe]","[21, 23]","[K_most_popular_all, K_most_popular_TO, K_most_popular_NS_all, K_most_popular_NS_TO, K_most_popular_NPOP_all, K_most_popular_NPOP_TO, hard_to_disambig_all, hard_to_disambig_TO, hard_to_disambig_NS_all, hard_to_disambig_NS_TO, hard_to_disambig_NPOP_all, hard_to_disambig_NPOP_TO, unif_all, unif_TO, unif_NS_all, unif_NS_TO, unif_NPOP_all, unif_NPOP_TO, 3_wd_aff_all, 3_wd_aff_TO, 3_wd_aff_NS_all, 3_wd_aff_NS_TO, 3_wd_aff_NPOP_all, 3_wd_aff_NPOP_TO, 1_wd_aff_all, 1_wd_aff_TO, 1_wd_aff_NS_all, 1_w...",red bull,0,True,Q51482,Red Bull,"[Q51482, Q7548977]","[red bull, sobe]","[True, True]","[[21, 23], [24, 25]]","[Red Bull Racing, Red Bull, Barako Bull Energy Boosters, Red Bull GmbH, Red Bull Junior Team, Red Bull Theatre, Red Bull Racing Team]","[trademark, energy drink]",[<wordnet_artifact_100021939>],[],"[In, 2005, a, new, rule, was, put, into, effect, that, there, were, to, be, no, ``, energy, drinks, ``, such, as, Red, Bull, or, Sobe, allowed, on, the, university, campus, for, the, duration, of, conference, .]"
