In [1]:
%load_ext autoreload
%autoreload 2

In [9]:
import os, sys
sys.path.append("/dfs/scratch0/lorr1/projects/bootleg-emmental/tutorials")
from utils import score_predictions, load_train_data
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
from functools import partial
import pyarrow as pa
import json
import jsonlines
from scipy import stats
from pathlib import Path
from tqdm import tqdm
from collections import defaultdict
from IPython.core.display import display, HTML, Markdown
from bootleg_emmental.symbols.entity_symbols import EntitySymbols
from bootleg_emmental.symbols.type_symbols import TypeSymbols
from bootleg_emmental.symbols.kg_symbols import KGSymbols

from nltk.stem import PorterStemmer

ps = PorterStemmer()
def printmd(string):
    display(Markdown(string))
tqdm.pandas()
display(HTML("<style>.container { width:90% !important; }</style>"))
pd.options.display.max_colwidth = 500
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 5000)

In [91]:
SPORT_WORDS = set([l.strip() for l in open("/dfs/scratch1/lorr1/projects/robogym_bootleg/sport_words.txt")])
SPORT_WORDS = set(ps.stem(w) for w in SPORT_WORDS)
SPORT_WORDS_ADJ = set([l.strip() for l in open("/dfs/scratch1/lorr1/projects/robogym_bootleg/new_sport_words.txt")])
SPORT_WORDS_ADJ = set(ps.stem(w) for w in SPORT_WORDS_ADJ)
from spacy.lang.en.stop_words import STOP_WORDS
import string
punc = [c for c in string.punctuation]

In [4]:
def accuracy(df, crc_col="gold_qid", pred_col="pred_qid"):
    total = df.shape[0]
    correct = df[df[crc_col] == df[pred_col]].shape[0]
    return correct/total if total > 0 else 0

def get_incorrect(df, crc_col="gold_qid", pred_col="pred_qid"):
    return df[df[crc_col] != df[pred_col]]

def num_incorrect(df, crc_col="gold_qid", pred_col="pred_qid"):
    return df[df[crc_col] != df[pred_col]].shape[0]

def print_incorrect_over_total(df, crc_col="gold_qid", pred_col="pred_qid"):
    num_in = df[df[crc_col] != df[pred_col]].shape[0]
    total = df.shape[0]
    print(f"{num_in} / {total} = {num_in/total}")

def errors_by_type(df, type_sys, pred_col="pred_qid"):
    errors_type = defaultdict(int)
    df["correct"] = df["gold_qid"] == df[pred_col]
    for r in df.iterrows():
        row = r[1]
        if row.correct is True:
            continue
        for t in row[type_sys]:
            errors_type[t] += 1
    return errors_type

def apply_lfs(df, lfs):
    for lf in lfs:
        df[f"lf_{lf.__name__}"] = df.progress_apply(lf, axis=1)
        subset = df[df[f"lf_{lf.__name__}"]]
        lf_acc = accuracy(subset)
        print(f"LF: {lf.__name__} Acc: {lf_acc} Supp: {subset.shape[0]} Overall Acc: {accuracy(df)} Overall Size: {df.shape[0]}")
    return df

In [5]:
# input_dir = '/dfs/scratch0/lorr1/projects/bootleg/data/wiki_0906_pg/'
# input_dir = Path('/dfs/scratch0/lorr1/projects/bootleg/data/korealiases_title_1229/')
# input_dir = Path('/dfs/scratch0/lorr1/projects/bootleg/data/personal_model_1217_title/orig_cands')
# a2q_orig = json.load(open(input_dir / "entity_db/entity_mappings/alias2qids.json"))
input_dir = Path('/dfs/scratch0/lorr1/projects/bootleg/data/sports_notitle_1229')
a2q_orig = json.load(open(input_dir / "entity_db/entity_mappings/alias2qids.json"))
# input_dir_ctx = Path('/dfs/scratch0/lorr1/projects/bootleg/data/personal_model_1217_title/contextual_cands')
# a2q_ctx = json.load(open(input_dir / "entity_db/entity_mappings/alias2qids.json"))
entity_dump = EntitySymbols(load_dir=input_dir / "entity_db/entity_mappings")
emb_dir = Path('/dfs/scratch0/lorr1/projects/bootleg/embs')
types_hy = TypeSymbols(entity_dump, emb_dir, max_types=3, type_vocab_file="hyena_vocab.json", type_file="hyena_types_0905.json")
types_wd = TypeSymbols(entity_dump, emb_dir, max_types=3, type_vocab_file="wikidata_to_typeid_0905.json", type_file="wikidata_types_0905.json")
types_rel = TypeSymbols(entity_dump, emb_dir, max_types=50, type_vocab_file="relation_to_typeid_0905.json", type_file="kg_relation_types_0905.json")
kg_syms = KGSymbols(entity_dump, emb_dir, "kg_adj_0905.txt")
wdtypes2title = json.load(open(emb_dir / "wikidatatypeid_to_title_0905.json"))
q2title = json.load(open(input_dir / "entity_db/entity_mappings/qid2title.json"))
title2q = {v:k for k,v in q2title.items()}

Loading types from /dfs/scratch0/lorr1/projects/bootleg/embs/hyena_types_0905.json


Reading /dfs/scratch0/lorr1/projects/bootleg/embs/hyena_types_0905.json: 100%|██████████| 5310039/5310039 [00:14<00:00, 359032.66it/s]


Loading types from /dfs/scratch0/lorr1/projects/bootleg/embs/wikidata_types_0905.json


Reading /dfs/scratch0/lorr1/projects/bootleg/embs/wikidata_types_0905.json: 100%|██████████| 5310039/5310039 [00:11<00:00, 463682.53it/s]


Loading types from /dfs/scratch0/lorr1/projects/bootleg/embs/kg_relation_types_0905.json


Reading /dfs/scratch0/lorr1/projects/bootleg/embs/kg_relation_types_0905.json: 100%|██████████| 5310039/5310039 [00:12<00:00, 432956.88it/s]


Loading kg adj from /dfs/scratch0/lorr1/projects/bootleg/embs/kg_adj_0905.txt


100%|██████████| 25730507/25730507 [00:48<00:00, 529085.62it/s]


In [6]:
qid2cnt = defaultdict(int)
alias2qid2count = defaultdict(lambda: defaultdict(int))
with jsonlines.open(input_dir / "train.jsonl") as in_f:
    for line in in_f:
        for al, qid in zip(line["aliases"], line["qids"]):
            qid2cnt[qid] += 1
            alias2qid2count[al][qid] += 1
qid2cnt = dict(qid2cnt)
alias2qid2count = dict(alias2qid2count)
with open(input_dir / "train_qidcnt.json", "w") as out_f:
    json.save(qid2cnt, out_f)
with open(input_dir / "train_alias2qidcnt.json", "w") as out_f:
    json.save(alias2qid2count, out_f)

In [7]:
# with open(input_dir / "filtered_data" / "train_qidcnt.json", "r") as in_f:
#     qid2cnt = json.load(in_f)

# with open(input_dir_ctx / "filtered_data" / "train_qidcnt.json", "r") as in_f:
#     qid2cnt_ctx = json.load(in_f)

In [8]:
# Read in train data
train_df = load_train_data(input_dir / "train.jsonl",
                           title_map=q2title,
                           cands_map=a2q_orig,
                           type_symbols=[types_hy, types_wd, types_rel],
                           kg_symbols=[kg_syms])

100%|██████████| 497565/497565 [02:18<00:00, 3588.28it/s]


In [10]:
pred_file = '/dfs/scratch0/lorr1/projects/bootleg-emmental/logs/robogym_sports_cnt/2021_01_16/00_05_06/5e4dab08/test/checkpoint_5.0/bootleg_labels.jsonl'

boot_ctx_df = score_predictions(orig_file=input_dir / 'test.jsonl',
                 pred_file=pred_file,
                 title_map=q2title,
                 cands_map=a2q_orig,
                 type_symbols=[types_hy, types_wd, types_rel],
                 kg_symbols=[kg_syms])

pred_file = '/dfs/scratch0/lorr1/projects/bootleg-emmental/logs/robogym_sports/2021_01_15/14_09_18/77b28a8e/test/checkpoint_5.0/bootleg_labels.jsonl'
# a2q_aug = json.load(open(input_dir / "entity_db/entity_mappings/alias2qids_aug.json"))
boot_aug_df = score_predictions(orig_file=input_dir / 'test.jsonl',
                 pred_file=pred_file,
                 title_map=q2title,
                 cands_map=a2q_orig,
                 type_symbols=[types_hy, types_wd, types_rel],
                 kg_symbols=[kg_syms])

pred_file = '/dfs/scratch0/lorr1/projects/bootleg-emmental/logs/robogym_sports_wl/2021_01_16/12_54_48/fb8eeefc/test/checkpoint_5.0/bootleg_labels.jsonl'
# a2q_aug = json.load(open(input_dir / "entity_db/entity_mappings/alias2qids_aug.json"))
boot_wl_df = score_predictions(orig_file=input_dir / 'test.jsonl',
                 pred_file=pred_file,
                 title_map=q2title,
                 cands_map=a2q_orig,
                 type_symbols=[types_hy, types_wd, types_rel],
                 kg_symbols=[kg_syms])

models = {"orig": boot_ctx_df, "aug": boot_aug_df, "wl": boot_wl_df}

100%|██████████| 62006/62006 [00:19<00:00, 3204.40it/s]
100%|██████████| 62006/62006 [00:37<00:00, 1674.48it/s]
100%|██████████| 62006/62006 [00:19<00:00, 3213.37it/s]


In [93]:
import spacy
from nltk.stem.snowball import SnowballStemmer
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
stop_words = set(nlp.Defaults.stop_words)
if "against" in stop_words:
    stop_words.remove("against")
stemmer = SnowballStemmer(language='english')
split_text_len = 3

def add_text_left(row):
    span_l, span_r = row["span"]
    split_text = row["sentence_split"]
    left_text = split_text[max(0, span_l-split_text_len):span_l]
    left_text = [stemmer.stem(t) for t in left_text]
    return left_text

def add_text_right(row):
    span_l, span_r = row["span"]
    split_text = row["sentence_split"]
    right_text = split_text[span_r:span_l+split_text_len]
    right_text = [stemmer.stem(t) for t in right_text]
    return right_text

def has_sport_word(sentence_split):
    sent = set(ps.stem(w.lower()) for w in sentence_split)
    return len(sent.intersection(SPORT_WORDS_ADJ)) > 0

# print(tail.shape, toes.shape)
models2 = {}
for k in models:
    res = models[k].copy()
    res['is_tail'] = res['slices'].apply(lambda x: 'unif_TL' in x)
    res['is_toes'] = res['slices'].apply(lambda x: 'unif_TS' in x)
    res['is_torso'] = res['slices'].apply(lambda x: 'unif_TO' in x)
    res["num_cands"] = res["cands"].apply(lambda x: len(x))
    res["qid_cnt"] = res["gold_qid"].apply(lambda x: qid2cnt.get(x, 0))
    res["correct"] = res["pred_qid"] == res["gold_qid"]
    res["sentence_split"] = res["sentence"].progress_apply(lambda x: x.split())
    res["left_ctx"] = res.progress_apply(lambda x: add_text_left(x), axis=1)
    res["right_ctx"] = res.progress_apply(lambda x: add_text_right(x), axis=1)
    res["has_sport_word"] = res["sentence_split"].progress_apply(lambda x: has_sport_word(x)) 

    res = res[(res["is_gold_label"]) & (res["num_cands"] > 1)]

    tail = res[(res['is_tail'])]
    torso = res[(res['is_torso'])]
    toes = res[(res['is_toes'])]

    print(tail.shape, toes.shape, res.shape, boot_ctx_df.shape)
    print(k)
    print("TORSO", accuracy(torso, pred_col="pred_qid"))
    print("TAIL", accuracy(tail, pred_col="pred_qid"))
    print("TOES", accuracy(toes, pred_col="pred_qid"))
    print("ALL", accuracy(res, pred_col="pred_qid"))
    
    models2[k] = res

100%|██████████| 240225/240225 [00:00<00:00, 286002.30it/s]
100%|██████████| 240225/240225 [00:10<00:00, 23579.74it/s]
100%|██████████| 240225/240225 [00:06<00:00, 38174.12it/s]
100%|██████████| 240225/240225 [01:34<00:00, 2533.19it/s]


(14257, 36) (3477, 36) (160252, 36) (240225, 26)
orig
TORSO 0.9265137914150482
TAIL 0.7545065581819457
TOES 0.6635030198446937
ALL 0.9391957666674987


100%|██████████| 240225/240225 [00:00<00:00, 285041.50it/s]
100%|██████████| 240225/240225 [00:09<00:00, 24549.43it/s]
100%|██████████| 240225/240225 [00:06<00:00, 38410.75it/s]
100%|██████████| 240225/240225 [01:38<00:00, 2428.88it/s]


(14257, 36) (3477, 36) (160252, 36) (240225, 26)
aug
TORSO 0.9263709809041946
TAIL 0.7565406466998668
TOES 0.6695427092320967
ALL 0.9391084042632853


100%|██████████| 240225/240225 [00:00<00:00, 273669.06it/s]
100%|██████████| 240225/240225 [00:09<00:00, 24315.85it/s]
100%|██████████| 240225/240225 [00:06<00:00, 36772.06it/s]
100%|██████████| 240225/240225 [01:35<00:00, 2528.32it/s]


(14257, 36) (3477, 36) (160252, 36) (240225, 26)
wl
TORSO 0.9290231761057614
TAIL 0.7553482499824647
TOES 0.6669542709232097
ALL 0.9389523999700472


In [94]:
train_res = train_df
train_res["num_cands"] = train_res["cand_names"].apply(lambda x: len(x))
train_res["qid_cnt"] = train_res["gold_qid"].apply(lambda x: qid2cnt.get(x, 0))
train_res["sentence_split"] = train_res["sentence"].progress_apply(lambda x: x.split())
train_res["left_ctx"] = train_res.progress_apply(lambda x: add_text_left(x), axis=1)
train_res["right_ctx"] = train_res.progress_apply(lambda x: add_text_right(x), axis=1)
train_res["has_sport_word"] = res["sentence_split"].progress_apply(lambda x: has_sport_word(x))

100%|██████████| 1919459/1919459 [00:06<00:00, 278235.24it/s]
100%|██████████| 1919459/1919459 [01:46<00:00, 17977.96it/s]
100%|██████████| 1919459/1919459 [00:50<00:00, 38150.19it/s]
100%|██████████| 160252/160252 [01:03<00:00, 2508.45it/s]


In [95]:
print(train_df.columns)
single_cols = ["sentence", "sent_idx", "alias", "gold_title", "pred_title", "qid_cnt", "wikidata_types_0905_gld", "kg_relation_types_0905_gld",
               "kg_adj_0905_gld", "wikidata_types_0905_pred", "kg_relation_types_0905_pred", "kg_adj_0905_pred", "cands"]

Index(['sentence', 'sent_idx', 'aliases', 'span', 'slices', 'alias',
       'alias_idx', 'is_gold_label', 'gold_qid', 'gold_title', 'all_gold_qids',
       'gold_label_aliases', 'all_is_gold_labels', 'all_spans', 'cand_names',
       'hyena_types_0905_gld', 'wikidata_types_0905_gld',
       'kg_relation_types_0905_gld', 'kg_adj_0905_gld', 'num_cands', 'qid_cnt',
       'sentence_split', 'left_ctx', 'right_ctx', 'has_sport_word'],
      dtype='object')


In [17]:
# Get pairs of sport team - country pairs from triples of KG connections

kg_triples_f = emb_dir / "kg_triples_0905.txt"
country_of_sport = "P1532"

qids_pairs = defaultdict(set)
num_lines = sum(1 for _ in open(kg_triples_f))
with open(kg_triples_f) as in_f:
    for line in tqdm(in_f, total=num_lines):
        line = line.strip().split()
        if line[1] == country_of_sport:
            ql = line[0]
            qr = line[2]
            qids_pairs[ql].add(qr)
            qids_pairs[qr].add(ql)

100%|██████████| 35179217/35179217 [00:22<00:00, 1580778.41it/s]


In [18]:
print(len(qids_pairs))

88945


In [96]:
def compute_acc_at_weight(df, weight=0.8):
    # Compute the accuracy in terms of what candidates are in its top weight probability (where the model puts weight percent of it's probability)
    total = df.shape[0]
    num_correct = 0
    for i, row in tqdm(df.iterrows(), total=df.shape[0]):
        gold = row["gold_qid"]
        cands = sorted(row["cands"], key=lambda x: x[1], reverse=True)
        cum_sum = np.cumsum(np.array(list(map(lambda x: x[1], cands))))
        # Modify so we remove the 100% mark and add 0% -> enusre we get candidates whose weights takes us over the cutoff
        # E.g., weight = 0.9 and the first choice candidate got prob 0.99. We need to keep only the first candidate
        cum_sum = np.append(np.array([0]), cum_sum[:-1])
        in_weight_cutoff = [title2q[q[0]] for q in np.array(cands)[cum_sum <= weight]]
        if gold in in_weight_cutoff:
            num_correct += 1
    return num_correct/total

def compute_type_error(df, topk=3):
    # Gets errors for model bucketized by unsure, cocky, or semi
    unif_probs = np.array([1]*topk)/topk
    result = {"correct": {"cocky": 0, "semi": 0, "unsure": 0, "all": 0}, "incorrect": {"cocky": 0, "semi": 0, "unsure": 0, "all": 0}}
    for i, row in tqdm(df.iterrows(), total=df.shape[0]):
        cands = sorted(row["cands"], key=lambda x: x[1], reverse=True)[:topk]
        if len(cands) < topk:
            continue

        correct_key = "correct" if row["pred_qid"] == row["gold_qid"] else "incorrect"
        result[correct_key]["all"] += 1
        probs = np.array(list(map(lambda x: x[1], cands)))
        probs = probs/probs.sum()
        unif_err = np.abs(probs-unif_probs).sum()/topk
        if unif_err < 0.1:
            result[correct_key]["unsure"] += 1
        elif probs.max() > 0.9:
            result[correct_key]["cocky"] += 1
        else:
            result[correct_key]["semi"] += 1
    return result

def compute_error_pairset(df, pairs):
    # Computers errors counting predictions such that the gold qid is in the country relationship pairs mined above
    total = df.shape[0]
    correct_margino_pairs = 0
    for i, row in tqdm(df.iterrows(), total=df.shape[0]):
        if row["pred_qid"] == row["gold_qid"]:
            correct_margino_pairs += 1
        else:
            chosen_title, chosen_prob = sorted(row["cands"], key=lambda x: x[1], reverse=True)[0]
            chosen_qid = title2q[chosen_title]
            gold_qid = row["gold_qid"]
            if gold_qid in pairs and row["pred_qid"] in pairs[gold_qid]:
                correct_margino_pairs += 1
    return correct_margino_pairs/total


def is_country(types):
    return any("country" in t or "island nation" in t for t in types)

def is_team(title):
    t = title.lower()
    return "national" in t and "team" in t and "competition" not in t and "season" not in t and "cup" not in t and "national team nomenclature" not in t and "teamsters" not in t

def is_subteam(title):
    title = title.lower()
    return ("women" in title or "under-" in title or "junior" in title or "men" in title)

def get_sport(title):
    title = title.lower()
    words_to_remove = ["england", "national", "association", "team", "women's", "women", "under-15",
                       "under-21", "under-18", "under-17", "under-19", "beach", "olympic", "secondary", 
                       "international", "league", "union", "competition", "teams"]
    filt = [word for word in title.split() if word.lower() not in words_to_remove]
    sport_key = all_sports.intersection(filt)
    if len(sport_key) > 1:
        print("BAD", sport_key)
    elif len(sport_key) <= 0:
        if title == "national sports team":
            return "NaN"
        print("EVEN WORSE", filt, title)    
    return list(sport_key)[0]

def get_country(title):
    title = title
    words_to_remove = set(["inline", "sevens","ice", "field", "national", "association", "team", 
                       "men's", "men", "youth", "water", "a", "women's", "women", "under-23", 
                       "under-16", "under-21", "under-18", "under-17", "under-19", "under-20", 
                       "beach", "olympic", "secondary", "international", "league", "union", 
                       "competition", "teams", "boys'", "roller", "sitting", "wheelchair",
                       "junior", "flag", "blind", "b", "c", "indoor", "rules", "long", "track"])
    filt = [word for word in title.split() if word.lower() not in words_to_remove]
    filt = [word for word in filt if word.lower() not in all_sports]
    return " ".join(filt)

def compute_sport_error_categories(df, pairs):
    error_cats = defaultdict(lambda: defaultdict(int))
    error_sents = defaultdict(lambda: defaultdict(list))
    for i, row in tqdm(df.iterrows(), total=df.shape[0]):
        gold_types = row["wikidata_types_0905_gld"]
        pred_types = row["wikidata_types_0905_pred"]
        gold_title = row["gold_title"]
        pred_title = row["pred_title"]
        if not (is_team(pred_title) or is_team(gold_title)):
            continue
        error_type = "wrong" if row["pred_qid"] != row["gold_qid"] else "right"
        has_spt_word = has_sport_word(row["sentence_split"])
        # 1. Predicted country instead of sport team and some sport team keyword
        if has_spt_word and (is_country(pred_types) and is_team(gold_title)):
            error_cats["error_1"][error_type] += 1
            error_cats["error_1"]["all"] += 1
            error_sents["error_1"]["men_idx"].append([row["sent_idx"], row["alias_idx"]])
        # 2. Predicted country instead of sport team and no keyword
        elif (not has_spt_word) and is_country(pred_types) and is_team(gold_title):
            error_cats["error_2"][error_type] += 1
            error_cats["error_2"]["all"] += 1
            error_sents["error_2"]["men_idx"].append([row["sent_idx"], row["alias_idx"]])
        # 3. Predicted sport team instead of country and some sport team keyword
        elif has_spt_word and is_country(gold_types) and is_team(pred_title):
            error_cats["error_3"][error_type] += 1
            error_cats["error_3"]["all"] += 1
            error_sents["error_3"]["men_idx"].append([row["sent_idx"], row["alias_idx"]])
        # 4. Predicted sport team instead of country and no keyword
        elif (not has_spt_word) and is_country(gold_types) and is_team(pred_title):
            error_cats["error_4"][error_type] += 1
            error_cats["error_4"]["all"] += 1
            error_sents["error_4"]["men_idx"].append([row["sent_idx"], row["alias_idx"]])
        # 5. Predicted a correct sport team but wrong gender or age group
        elif is_team(pred_title) and is_team(gold_title) and is_subteam(pred_title) and (get_sport(pred_title) == get_sport(gold_title)):
            error_cats["error_5"][error_type] += 1
            error_cats["error_5"]["all"] += 1
            error_sents["error_5"]["men_idx"].append([row["sent_idx"], row["alias_idx"]])
        # 6. Predicted a correct sport team but wrong gender or age group
        elif is_team(pred_title) and is_team(gold_title) and is_subteam(gold_title) and (get_sport(pred_title) == get_sport(gold_title)):
            error_cats["error_6"][error_type] += 1
            error_cats["error_6"]["all"] += 1
            error_sents["error_6"]["men_idx"].append([row["sent_idx"], row["alias_idx"]])
        # 7. Predicted wrong sport
        elif is_team(pred_title) and is_team(gold_title) and (get_sport(pred_title) != get_sport(gold_title)):
            error_cats["error_7"][error_type] += 1
            error_cats["error_7"]["all"] += 1
            error_sents["error_7"]["men_idx"].append([row["sent_idx"], row["alias_idx"]])
        # 8. Predicted correct sport
        elif is_team(pred_title) and is_team(gold_title) and (get_sport(pred_title) == get_sport(gold_title)):
            error_cats["error_8"][error_type] += 1
            error_cats["error_8"]["all"] += 1
            error_sents["error_8"]["men_idx"].append([row["sent_idx"], row["alias_idx"]])
        else:
            error_cats["error_9"][error_type] += 1
            error_cats["error_9"]["all"] += 1
            error_sents["error_9"]["men_idx"].append([row["sent_idx"], row["alias_idx"]])
    return error_cats, error_sents


In [97]:
all_sports = {"football", "rugby", "cricket", "tennis", "handball", "quidditch", "baseball", "korfball", "rollball", "bandy", "bobsleigh", "basketball", "athletics", "soccer", "hockey", "volleyball", "futsal", "badminton", "polo", "speedway", "netball", "lacrosse", "kabaddi", "softball"}
error_cats, error_sents = compute_sport_error_categories(models2["orig"], qids_pairs)

100%|██████████| 160252/160252 [00:18<00:00, 8824.79it/s]


In [98]:
error_cats

defaultdict(<function __main__.compute_sport_error_categories.<locals>.<lambda>()>,
            {'error_5': defaultdict(int,
                         {'right': 750, 'all': 873, 'wrong': 123}),
             'error_8': defaultdict(int,
                         {'right': 2842, 'all': 2857, 'wrong': 15}),
             'error_6': defaultdict(int, {'wrong': 191, 'all': 191}),
             'error_7': defaultdict(int, {'wrong': 136, 'all': 136}),
             'error_1': defaultdict(int, {'wrong': 151, 'all': 151}),
             'error_9': defaultdict(int, {'wrong': 67, 'all': 67}),
             'error_2': defaultdict(int, {'wrong': 65, 'all': 65}),
             'error_3': defaultdict(int, {'wrong': 154, 'all': 154}),
             'error_4': defaultdict(int, {'wrong': 47, 'all': 47})})

In [99]:
models3 = {}
for k, m in models2.items():
    buck_res = m.copy()
    buck_res["is_country_gld"] = buck_res["wikidata_types_0905_gld"].apply(lambda x: is_country(x))
    buck_res["is_country_pred"] = buck_res["wikidata_types_0905_pred"].apply(lambda x: is_country(x))
    buck_res["is_team_gld"] = buck_res["gold_title"].apply(lambda x: is_team(x))
    buck_res["is_team_pred"] = buck_res["pred_title"].apply(lambda x: is_team(x))
    buck_res["is_subteam_gld"] = buck_res["gold_title"].apply(lambda x: is_team(x) & is_subteam(x))
    buck_res["is_subteam_pred"] = buck_res["pred_title"].apply(lambda x: is_team(x) & is_subteam(x))
    buck_res["sport_gld"] = buck_res["gold_title"].apply(lambda x: "NaN" if not is_team(x) else get_sport(x))
    buck_res["sport_pred"] = buck_res["pred_title"].apply(lambda x: "NaN" if not is_team(x) else get_sport(x))
    models3[k] = buck_res

# V2 Buckets

Country Signals (XXX is country)
1. XXX's Bobbie Joe
2. Naturalized by XXX
3. Teams of XXX, Clubs in XXX
4. Tour of XXX, Cup in XXX
5. Coach of XXX
6. XXXn cricketer/...

player from YYY??

Sport Signals (XXX is team)
1. Coach of XXX
2. played XXX, vs XXX, XXX won, won by XXX, victory over XXX, defeated by XXX, YYY against XXX, XXX qualified

In [25]:
def count_specific_text(phrase, before, leng, df):
    leng = max(2, leng)
    cnt = 0
    idxs = []
    for i, row in tqdm(df.iterrows(), total=df.shape[0], position=0, leave=True):
        span_l, span_r = row["span"]
        sentence = row["sentence_split"]
        if before:
            text = sentence[max(0, span_l-leng):span_l]
        else:
            text = sentence[span_r:span_r+leng]
        if len(phrase.intersection(text)) >= len(phrase):
            cnt += 1
            idxs.append(i)
    return cnt, idxs


input_phrase = {"match against"}
before = False
leng = 4
for_par = partial(count_specific_text, input_phrase, before, leng)
temp_df = train_res
c, indexes = for_par(temp_df)
print(c)

100%|██████████| 1919459/1919459 [03:03<00:00, 10475.16it/s]

0





In [63]:
def compute_ngrams_around(df, filt_func, n=2):
    n_grams_before, n_grams_after = defaultdict(int), defaultdict(int)
    for i, row in tqdm(df.iterrows(), total=df.shape[0], position=0, leave=True):
        if filt_func(row):
            left_text = row["left_ctx"]
            right_text = row["right_ctx"]
            if len(left_text) > 0:
                n_grams_before[" ".join(left_text)] += 1 
                n_grams_before[left_text[-1]] += 1 
            if len(right_text) > 0:
                n_grams_after[" ".join(right_text)] += 1
                n_grams_after[right_text[0]] += 1
    return n_grams_before, n_grams_after

def compute_ngrams(df, filt_func):
    n_grams = defaultdict(int)
    for i, row in tqdm(df.iterrows(), total=df.shape[0], position=0, leave=True):
        if filt_func(row):
            text = row["sentence_split"]
            text = [stemmer.stem(t) for t in text]
            for t in text:
                n_grams[t] += 1
    return n_grams

In [75]:
filt_f = lambda x: is_team(x["gold_title"])
n_grams_team = compute_ngrams(train_df, filt_f)
filt_f = lambda x: True
n_grams_all = compute_ngrams(train_df, filt_f)

100%|██████████| 1919459/1919459 [12:38<00:00, 2531.41it/s]


In [27]:
filt_f = lambda x: is_country(x["wikidata_types_0905_gld"])
n_grams_before_cntry, n_grams_after_cntry = compute_ngrams_around(train_df, filt_f)

100%|██████████| 1919459/1919459 [02:57<00:00, 10833.45it/s]


In [28]:
filt_f = lambda x: is_team(x["gold_title"])
n_grams_before_team, n_grams_after_team = compute_ngrams_around(train_df, filt_f)

100%|██████████| 1919459/1919459 [02:46<00:00, 11529.71it/s]


In [90]:
all_w_team = [k for k in sorted(n_grams_team.items(), key = lambda x: x[1], reverse=True) if k[0].lower() not in STOP_WORDS and k[0].lower() not in punc]
all_w = [k[0] for k in sorted(n_grams_all.items(), key = lambda x: x[1], reverse=True) if k[0].lower() not in STOP_WORDS and k[0].lower() not in punc]
all_w_f = [a for a in all_w_team if a[0] not in all_w[:1000]]
print(all_w_f[:50])
SPORT_WORDS_ADJ = SPORT_WORDS.union([a[0] for a in all_w_f[:50]])
with open("new_sport_words.txt", "w") as out_f:
    for w in SPORT_WORDS_ADJ:
        out_f.write(w + "\n")

[('euro', 1604), ('loss', 1589), ('1–0', 1324), ('beat', 1240), ('2–1', 1079), ('substitut', 1077), ('2–0', 1063), ('minut', 995), ('drawn', 878), ('lose', 813), ('semi-fin', 793), ('1–1', 787), ('icc', 787), ('3–0', 763), ('away', 760), ('under-21', 750), ('elimin', 679), ('penalti', 657), ('odi', 651), ('3–1', 602), ('quarter-fin', 587), ('trophi', 577), ('afc', 551), ('winner', 546), ('call-up', 531), ('fail', 501), ('elig', 492), ('replac', 491), ('0–0', 483), ('advanc', 474), ('onc', 449), ('concacaf', 447), ('play-off', 443), ('twice', 442), ('wicket', 439), ('alongsid', 436), ('twenty20', 435), ('copa', 425), ('despit', 422), ('under-19', 414), ('saw', 390), ('américa', 355), ('3–2', 354), ('d', 343), ('uae', 340), ('4–0', 338), ('refere', 337), ('2–2', 334), ('fixtur', 334), ('drew', 331)]


In [29]:
before = sorted(n_grams_before_cntry.items(), key = lambda x: x[1], reverse=True)
print(before[:50])
print()
after = sorted(n_grams_after_cntry.items(), key = lambda x: x[1], reverse=True)
print(after[:150])

[(',', 241014), ('the', 132955), ('in', 82511), ('and', 56614), ('a', 43071), ('of', 33982), ('to', 20053), ('from', 14420), (') is a', 11894), (') was a', 11679), ('is found in', 10508), ('eastern', 5173), ('(', 5093), ('southern', 4810), ('by', 4779), ('northern', 4714), ('with', 3720), ('western', 3616), (', in the', 3430), ('unit state and', 3284), ('unit state ,', 3246), ('as', 3203), ('in the', 3047), ('central', 3016), ('an', 2943), ('includ', 2910), (', and the', 2517), (', franc ,', 2427), (', germani ,', 2386), ('for', 2294), (', india ,', 2257), (', china ,', 2182), ('a villag in', 2181), ('northeastern', 2136), ('former', 2082), ('is endem to', 2074), ('between', 2051), (', itali ,', 2047), ('is known from', 1803), ('southeastern', 1798), (', japan ,', 1742), ('the netherland ,', 1716), (', brazil ,', 1654), (':', 1605), (', california ,', 1539), (', canada ,', 1527), ('south-eastern', 1500), (', colombia ,', 1459), (', illinoi ,', 1418), ('move to the', 1416)]

[('.', 3515

In [30]:
before = sorted(n_grams_before_team.items(), key = lambda x: x[1], reverse=True)
print(before[:150])
print()
after = sorted(n_grams_after_team.items(), key = lambda x: x[1], reverse=True)
print(after[:150])

[('the', 8186), ('against', 8176), (',', 5137), ('and', 4235), ('for', 2201), ('to', 1219), ('over', 1039), ('with', 951), ('friend match against', 763), ('of', 697), ('by', 495), ('defeat', 474), ('repres', 438), ('debut for the', 424), ('his debut for', 423), ('between', 412), ('beat', 379), ('a friend against', 369), ('intern debut for', 290), ('host', 278), ('match against the', 262), ('in', 261), ('member of the', 251), ('play for the', 248), ('cup qualifi against', 220), (':', 205), ('up to the', 204), ('senior', 187), ('champion', 186), ('unit state and', 177), ('qualif match against', 175), ('(', 172), ('cap for the', 163), ('qualifi match against', 160), ('former', 139), ('a match against', 133), ('friend against the', 128), ('face', 128), ('as', 125), ('who play for', 123), ('to play for', 113), ('the netherland and', 112), ('play', 112), ('his', 109), ('senior debut for', 105), ('both', 104), ('a', 104), ('nation team against', 103), ('versus', 99), ('part of the', 94), ('th

# WL Functions

In [100]:
quantifiers = {"women", "under-", "junior", "men"}
quantifiers_withoutmen = {"women", "under-", "junior"}

def is_country(types):
    return any("country" in t or "island nation" in t for t in types)

def is_team(title):
    t = title.lower()
    return "national" in t and "team" in t and "competition" not in t and "season" not in t and "cup" not in t and "national team nomenclature" not in t and "teamsters" not in t

def is_subteam(title):
    title = title.lower()
    return ("women" in title or "under-" in title or "junior" in title or "men" in title)

def get_sport(title):
    title = title.lower()
    words_to_remove = ["england", "national", "association", "team", "women's", "women", "under-15",
                       "under-21", "under-18", "under-17", "under-19", "beach", "olympic", "secondary", 
                       "international", "league", "union", "competition", "teams"]
    filt = [word for word in title.split() if word.lower() not in words_to_remove]
    sport_key = all_sports.intersection(filt)
    if len(sport_key) > 1:
        print("BAD", sport_key)
    elif len(sport_key) <= 0:
        if title == "national sports team":
            return "NaN"
        print("EVEN WORSE", filt, title)    
    return list(sport_key)[0]

def get_country(title):
    title = title
    words_to_remove = set(["inline", "sevens","ice", "field", "national", "association", "team", 
                       "men's", "men", "youth", "water", "a", "women's", "women", "under-23", 
                       "under-16", "under-21", "under-18", "under-17", "under-19", "under-20", 
                       "beach", "olympic", "secondary", "international", "league", "union", 
                       "competition", "teams", "boys'", "roller", "sitting", "wheelchair",
                       "junior", "flag", "blind", "b", "c", "indoor", "rules", "long", "track"])
    filt = [word for word in title.split() if word.lower() not in words_to_remove]
    filt = [word for word in filt if word.lower() not in all_sports]
    return " ".join(filt)

def remove_quantifiers(self, title, title2q):
    tit = " ".join([t for t in title.split() if not any(q in t.lower() for q in quantifiers)])
    if tit not in title2q:
        title2 = title.replace("women", "men")
        tit2 = " ".join([t for t in title2.split() if not any(q in t.lower() for q in quantifiers_withoutmen)])
        if tit2 not in title2q:
            return None
        else:
            return tit2
    else:
        return tit

def has_quantifier(self, title):
    title = title.lower()
    return ("women" in title or "under-" in title)

def find_closest_team(cand_names, sport=None):
    for c in cand_names:
        if is_team(c):
            if sport is None:
                return c
            else:
                if get_sport(c) == sport:
                    return c
    return None

In [101]:
# [('defeat', 474),('beat', 379), ('draw with', 205), ('play against', 150), ('team of', 142), ('former', 139),
#  ('play', 112), ('versus', 99), ('defeat the', 85), ('after defeat', 66),(', defeat', 61), ('defeat by', 61),('after beat', 56), ('cup against', 55),

SPORT_BEFORE_NOKEY = {'defend champion', 'match against', 'game against', 'qualifi against', 'win against', 'loss against',
                      'draw against', 'final against', 'defeat against', 'play for', 'team against', 'goal against', 'victori against'}
def lf_no_other_words(row):
    left_ctx = row["left_ctx"]
    right_ctx = row["right_ctx"]
    orig_label = row["gold_title"]
    orig_qid = row["gold_qid"]
    orig_alias = row["alias"]
    if not is_country(row["wikidata_types_0905_gld"]):
        return None, None, None
    elif " ".join(left_ctx[-2:]) not in SPORT_BEFORE_NOKEY:
        return None, None, None
    else:   
        other_titles = [q2title[q] for q in row["all_gold_qids"]]
        new_label = None
        for t in other_titles:
            if is_team(t):
                new_label = find_closest_team(row["cand_names"], get_sport(t))
                if new_label:
                    break
        if not new_label:
            new_label = find_closest_team(row["cand_names"])
        if not new_label:
            return None, None, None
        # return qid, title, alias
        return new_label, title2q[new_label], orig_alias

SPORT_BEFORE_2 = {'win over', 'loss to', 'victori over', 'draw with', 'won by', 'lose to', 'defeat by',
                    'defeat to', 'play against', 'match between', 'team against', "lost to", 'cup against'}
def lf_no_other_words_lessaccurate(row):
    left_ctx = row["left_ctx"]
    right_ctx = row["right_ctx"]
    orig_label = row["gold_title"]
    orig_qid = row["gold_qid"]
    orig_alias = row["alias"]
    if not is_country(row["wikidata_types_0905_gld"]): # or not row["has_sport_word"]:
        return None, None, None
    elif " ".join(left_ctx[-2:]) not in SPORT_BEFORE_2:
        return None, None, None
    else:   
        other_titles = [q2title[q] for q in row["all_gold_qids"]]
        new_label = None
        for t in other_titles:
            if is_team(t):
                new_label = find_closest_team(row["cand_names"], get_sport(t))
                if new_label:
                    break
        if not new_label:
            new_label = find_closest_team(row["cand_names"])
        if not new_label:
            return None, None, None
        # return qid, title, alias
        return new_label, title2q[new_label], orig_alias

SPORT_SINGLE_WORD = {"play", "beat", "defeat"}
def lf_single_word(row):
    left_ctx = row["left_ctx"]
    right_ctx = row["right_ctx"]
    orig_label = row["gold_title"]
    orig_qid = row["gold_qid"]
    orig_alias = row["alias"]
    if not is_country(row["wikidata_types_0905_gld"]):
        return None, None, None
    elif (len(left_ctx) <= 0 or left_ctx[-1] not in SPORT_SINGLE_WORD):
        return None, None, None
    else:   
        other_titles = [q2title[q] for q in row["all_gold_qids"]]
        new_label = None
        for t in other_titles:
            if is_team(t):
                new_label = find_closest_team(row["cand_names"], get_sport(t))
                if new_label:
                    break
        if not new_label:
            new_label = find_closest_team(row["cand_names"])
        if not new_label:
            return None, None, None
        # return qid, title, alias
        return new_label, title2q[new_label], orig_alias
    

In [102]:
def is_in_lf_sport_words(row):
    if " ".join(row["left_ctx"][-2:]) in SPORT_BEFORE_NOKEY:
        return True
    return False

def is_in_lf_no_other_words_lessaccurate(row):
    if " ".join(row["left_ctx"][-2:]) in SPORT_BEFORE_2:
        return True
    return False
        
def is_in_lf_sport_single_word(row):
    if len(row["left_ctx"]) > 0 and row["left_ctx"][-1] in SPORT_SINGLE_WORD:
        return True
    return False
    
def relabel_data(df, lfs):
    new_titles = []
    new_qids = []
    new_aliases = []
    metrics = defaultdict(int)
    row_indices = defaultdict(list)
    for i, row in tqdm(df.iterrows(), total=df.shape[0], position=0, leave=True):
        new_titles.append(row["gold_title"])
        new_qids.append(row["gold_qid"])
        new_aliases.append(row["alias"])
        labelled = False
        for lf in lfs:
            if labelled:
                break
            tit, qid, al = lf(row)
            if tit is not None:
                new_titles[-1] = tit
                new_qids[-1] = qid
                new_aliases[-1] = al
                metrics[lf.__name__] += 1
                row_indices[lf.__name__].append(i)
                labelled = True
    assert len(new_titles) == len(new_qids) == len(new_aliases) == df.shape[0]
    return new_titles, new_qids, new_aliases, metrics, row_indices

In [28]:
new_ts, new_qs, new_as, mets, row_is = relabel_data(train_res, [lf_no_other_words, lf_no_other_words_lessaccurate, lf_single_word])
print(mets)

100%|██████████| 1919459/1919459 [05:02<00:00, 6353.88it/s]

defaultdict(<class 'int'>, {'lf_single_word': 175, 'lf_no_other_words_lessaccurate': 172, 'lf_no_other_words': 232})





In [40]:
import copy
train_aug = copy.deepcopy(train_res)
train_aug["new_title"] = pd.DataFrame(data = {'new_title': new_ts})
train_aug["new_qid"] = pd.DataFrame(data = {'new_qid': new_qs})
train_aug["new_alias"] = pd.DataFrame(data = {'new_alias': new_as})
for lf_name in row_is:
    temp = train_aug.iloc[row_is[lf_name]]
    print(lf_name)
    display(temp.sample(10)[["sentence", "gold_title", "gold_qid", "new_title", "new_qid"]])
    temp.to_pickle(f"./sports_{lf_name}.pkl")
train_aug["gold_qid"] = train_aug['new_qid']
train_aug["gold_title"] = train_aug['new_title']
train_aug["alias"] = train_aug['new_alias']

lf_single_word


Unnamed: 0,sentence,gold_title,gold_qid,new_title,new_qid
820181,"Despite human rights concerns , Carter continued US support for Joseph Mobutu of Zaire , who defeated Angola n-backed insurgents in conflicts known as Shaba I and Shaba II .",Angola,Q916,Angola national football team,Q192828
116797,"This third-placed finish led to a European adventure in the UEFA Cup , beating German giants Hamburger SV and Hungarians Vasas Budapest before finally going out in Yugoslavia to NK Zeljeznicar Sarajevo .",Germany,Q183,Germany national football team,Q43310
110072,"In 2017 , in Lasko , Slovenia , Karabardak won his first ever European gold medal with David Wetherill and Martin Perry after defeating Croatia in the final .",Croatia,Q224,Croatia national football team,Q134479
974023,"The United States won the gold medal after beating Great Britain in the final 3–0 thanks to a hat-trick by Jake Rozhansky , while Israel won the bronze medal after beating Mexico in a Penalty shoot-out .",United Kingdom,Q145,Great Britain national rugby league team,Q3590223
116931,Egypt beat Jordan in the final to win the gold medal and Qatar won the third place play-off for the bronze .,Jordan,Q810,Jordan national football team,Q275940
1713821,"Georgia recovered from the shocking loss to the Philippines in the previous round to convincingly beat Estonia 3½-½ , in a match with only Lela Javakhishvili failing to win her game against Monika Tsõganova .",Estonia,Q191,Estonia national football team,Q186914
1712847,In the third round they defeated Sri Lanka 3–1 with Erandi Warusawithana and Ishara Madurangi being defeated 3–0 by Silva .,Sri Lanka,Q854,Sri Lanka national cricket team,Q203092
541487,"He won the bronze medal in the Épée competition at the 2008 Summer Olympics in Beijing , China , defeating Hungarian Gábor Boczkó .",Hungary,Q28,Hungary national football team,Q170561
341634,Mohun Bagan became the first Indian club to qualify to the second round of AFC Champions League qualifiers when they defeated Singapore based club Tampines Rovers on 27 January 2016 .,Singapore,Q334,Singapore national football team,Q239528
1114395,"He defeated British heavyweight boxer Danny Williams in Pabellon Municipal , Silla , Valencia , Comunidad Valenciana , Spain on 2 December 2011 .",United Kingdom,Q145,Great Britain national rugby league team,Q3590223


lf_no_other_words_lessaccurate


Unnamed: 0,sentence,gold_title,gold_qid,new_title,new_qid
1190173,"At the 2007 World Amateur Boxing Championships he beat Daniel Kooij , Christopher Downs and Ramazan Magomedov in the preliminaries , but lost to Kazakhstan southpaw Yerkebuian Shynaliyev in the quarters .",Kazakhstan,Q232,Kazakhstan national football team,Q183468
1900150,"It was won by Denmark , a non-member of the International Roll Ball Federation , and host India placed second .",Denmark,Q35,Denmark national football team,Q131785
567600,"The competition was won by German Bundesliga side 1 FFC Frankfurt , who beat Sweden 's Umeå IK by a score of 2–0 , in the single-leg 2002 UEFA Women 's Cup Final .",Germany,Q183,Germany national football team,Q43310
793720,The event was won by British heptathlon specialist Katarina Johnson-Thompson .,United Kingdom,Q145,Great Britain national rugby league team,Q3590223
1111713,"In two matches between Korea n archers and archers from Chinese Taipei , Yun Mi-Jin was the only Korean to lose so far in the women 's competition , falling to Yuan Shu Chi .",South Korea,Q884,South Korea men's national basketball team,Q496015
292419,He was part of the Indian team which finished fourth in the `` President Cup of Kazakhstan '' after losing to Kazakhstan in the bronze medal play off by 5–3 .,Kazakhstan,Q232,Kazakhstan national football team,Q183468
38600,"He won a silver medal in the men 's 100 freestyle S4 , losing to Slovenia 's Darko Đurić by half a third of a second with a time of 1:2756 to Duric 's 1:2727 .",Slovenia,Q215,Slovenia national football team,Q172221
1140667,The 1992 World Polo Championship was played in Santiago Chile during April 1992 and was won by Argentina .,Argentina,Q414,Argentina national football team,Q79800
1601736,"He scored in his first pre-season match for his new club , a 2–0 win over Qatar 's U-21 team , on 11 July 2007 .",Qatar,Q846,Qatar national football team,Q232731
692495,The eighth edition of the World Polo Championship took place in Mexico during May 2008 and was won by Chile .,Chile,Q298,Chile national football team,Q172025


lf_no_other_words


Unnamed: 0,sentence,gold_title,gold_qid,new_title,new_qid
81884,"He started his career with AGF Aarhus , with whom he won two Danish Cup s , and most notably went on to play for German clubs Hamburger SV and MSV Duisburg , as well as Bolton Wanderers in England .",Germany,Q183,Germany national football team,Q43310
1158365,Dludlu made her first appearance for the South Africa senior national team in 2006 during a match against Mozambique .,Mozambique,Q1029,Mozambique national football team,Q254231
1475019,"After playing for the Hungarian national side through various age groups , In November 2003 Bodor made his debut for the full Hungary National side in a 1–0 loss against Estonia .",Estonia,Q191,Estonia national football team,Q186914
1230912,"He completed his comeback by showing a bowling performance in the next match against Sri Lanka when he took the wicket of Kumar Sangakkara However , after the tournament he suffered a back stress fracture during training and missed the series against Bangladesh and India .",Sri Lanka,Q854,Sri Lanka national cricket team,Q203092
82516,He played two games against Mozambique .,Mozambique,Q1029,Mozambique national football team,Q254231
1466420,Kames also began playing for United States troops during the war .,United States,Q30,United States men's national soccer team,Q164134
181718,Ankersen scored his first goal ever in Danish national team colours on 11 November 2016 in a 2018 FIFA World Cup Qualification match against Kazakhstan .,Kazakhstan,Q232,Kazakhstan national football team,Q183468
1262989,"Abroad , he started by playing for Cyprus ' Ermis Aradippou FC and FC Brașov in Romania .",Cyprus,Q229,Cyprus national football team,Q188791
1350131,"On 6 August 2009 , he scored the first goal in a 3–0 second leg Europa League win against Slovenia n Interblock Ljubljana , winning 5–0 on aggregate .",Slovenia,Q215,Slovenia national football team,Q172221
866013,Jean-Paul Eale Lutula ( born 4 October 1984 ) is a Rwanda n football er who is currently playing for Qatar i club Muaither .,Qatar,Q846,Qatar national football team,Q232731


In [112]:
# Print scores over test data
for k in models3:
    print("\n\nK", k, models3[k].shape[0])
    temp = models3[k][models3[k].apply(lambda x: is_in_lf_sport_words(x), axis=1)]
    print("lf_sport_words")
    print("total,correct,pred_count,pred_count_corrct,gold_count,pred_team")
    # display(temp[temp["is_country_gld"]].head(5))
    print(temp.shape[0], temp[temp["pred_qid"] == temp["gold_qid"]].shape[0], temp[temp["is_country_pred"]].shape[0], temp[(temp["is_country_pred"]) & (temp["pred_qid"] == temp["gold_qid"])].shape[0], temp[temp["is_country_gld"]].shape[0], temp[temp["is_team_pred"]].shape[0])
    temp = models3[k][models3[k].apply(lambda x: is_in_lf_no_other_words_lessaccurate(x), axis=1)]
    print("lf_no_other_words_lessaccurate")
    print("total,correct,pred_count,pred_count_corrct,gold_count,pred_team")
    print(temp.shape[0], temp[temp["pred_qid"] == temp["gold_qid"]].shape[0], temp[temp["is_country_pred"]].shape[0], temp[(temp["is_country_pred"]) & (temp["pred_qid"] == temp["gold_qid"])].shape[0], temp[temp["is_country_gld"]].shape[0], temp[temp["is_team_pred"]].shape[0])
    temp = models3[k][models3[k].apply(lambda x: is_in_lf_sport_single_word(x), axis=1)]
    print("lf_sport_single_word")
    print("total,correct,pred_count,pred_count_corrct,gold_count,pred_team")
    print(temp.shape[0], temp[temp["pred_qid"] == temp["gold_qid"]].shape[0], temp[temp["is_country_pred"]].shape[0], temp[(temp["is_country_pred"]) & (temp["pred_qid"] == temp["gold_qid"])].shape[0], temp[temp["is_country_gld"]].shape[0], temp[temp["is_team_pred"]].shape[0])



K orig 160252
lf_sport_words
total,correct,pred_count,pred_count_corrct,gold_count,pred_team
693 634 14 9 16 587
lf_no_other_words_lessaccurate
total,correct,pred_count,pred_count_corrct,gold_count,pred_team
370 321 20 12 20 337
lf_sport_single_word
total,correct,pred_count,pred_count_corrct,gold_count,pred_team
171 131 37 28 33 88


K aug 160252
lf_sport_words
total,correct,pred_count,pred_count_corrct,gold_count,pred_team
693 624 16 9 16 585
lf_no_other_words_lessaccurate
total,correct,pred_count,pred_count_corrct,gold_count,pred_team
370 319 23 13 20 335
lf_sport_single_word
total,correct,pred_count,pred_count_corrct,gold_count,pred_team
171 127 30 23 33 95


K wl 160252
lf_sport_words
total,correct,pred_count,pred_count_corrct,gold_count,pred_team
693 632 10 6 16 591
lf_no_other_words_lessaccurate
total,correct,pred_count,pred_count_corrct,gold_count,pred_team
370 324 15 11 20 342
lf_sport_single_word
total,correct,pred_count,pred_count_corrct,gold_count,pred_team
171 132 25 21 3

# V1 Buckets

In [104]:
   
def compute_buckets_hlp(df):
    pred_c_gold_c = df[(df["is_country_gld"]) & (df["is_country_pred"])]
    pred_s_gold_c = df[(df["is_country_gld"]) & (df["is_team_pred"])]
    pred_s_gold_s = df[(df["is_team_gld"]) & (df["is_team_pred"])]
    pred_c_gold_s = df[(df["is_team_gld"]) & (df["is_country_pred"])]
    pred_s_gold_s_crc_s = df[(df["is_team_gld"]) & (df["is_team_pred"]) & (df["sport_gld"] == df["sport_pred"])]
    pred_s_gold_s_strict = pred_s_gold_s[(~pred_s_gold_s["is_subteam_gld"]) & (~pred_s_gold_s["is_subteam_pred"])]
    pred_ss_gold_s = pred_s_gold_s[(pred_s_gold_s["is_subteam_gld"]) & (~pred_s_gold_s["is_subteam_pred"])]
    pred_s_gold_ss = pred_s_gold_s[(pred_s_gold_s["is_subteam_pred"]) & (~pred_s_gold_s["is_subteam_gld"])]
    pred_ss_gold_ss = pred_s_gold_s[(pred_s_gold_s["is_subteam_gld"]) & (pred_s_gold_s["is_subteam_pred"])]
    print("SPORT vs TEAM")
    print(pred_c_gold_c.shape[0], pred_s_gold_c.shape[0], pred_c_gold_s.shape[0], pred_s_gold_s.shape[0])
    print(pred_c_gold_c[pred_c_gold_c["correct"]].shape[0], pred_s_gold_c[pred_s_gold_c["correct"]].shape[0], pred_c_gold_s[pred_c_gold_s["correct"]].shape[0], pred_s_gold_s[pred_s_gold_s["correct"]].shape[0])
    print(">>>Rate Fine", pred_s_gold_c.shape[0]/(pred_s_gold_c.shape[0]+pred_c_gold_c.shape[0]))
    print(">>>Rate Coarse", pred_c_gold_s.shape[0]/(pred_c_gold_s.shape[0]+pred_s_gold_s.shape[0]))
    print(">>>Correct Coarse", pred_c_gold_c[pred_c_gold_c["correct"]].shape[0]/pred_c_gold_c.shape[0])
    print(">>>Correct Fine", pred_s_gold_s[pred_s_gold_s["correct"]].shape[0]/pred_s_gold_s.shape[0])

    print("SPORTS vs SUBSPORT BUCKET")
    print(pred_s_gold_s_strict.shape[0], pred_ss_gold_s.shape[0], pred_s_gold_ss.shape[0], pred_ss_gold_ss.shape[0])
    print(pred_s_gold_s_strict[pred_s_gold_s_strict["correct"]].shape[0], pred_ss_gold_s[pred_ss_gold_s["correct"]].shape[0], pred_s_gold_ss[pred_s_gold_ss["correct"]].shape[0], pred_ss_gold_ss[pred_ss_gold_ss["correct"]].shape[0])
    print(">>>Rate Fine", pred_ss_gold_s.shape[0]/(pred_ss_gold_s.shape[0]+pred_s_gold_s_strict.shape[0]))
    print(">>>Rate Coarse", pred_s_gold_ss.shape[0]/(pred_s_gold_ss.shape[0]+pred_ss_gold_ss.shape[0]))
    print(">>>Correct Coarse", pred_s_gold_s_strict[pred_s_gold_s_strict["correct"]].shape[0]/pred_s_gold_s_strict.shape[0])
    print(">>>Correct Fine", pred_ss_gold_ss[pred_ss_gold_ss["correct"]].shape[0]/pred_ss_gold_ss.shape[0])
    

def compute_buckets(df):
    compute_buckets_hlp(df)
    df2 = df[df["has_sport_word"]]
    print("**********CONDITIONED ON SPORTS WORD")
    compute_buckets_hlp(df2)

In [105]:
for k in models3:
    print("\nK", k)
    compute_buckets(models3[k])


K orig
SPORT vs TEAM
90885 201 216 4057
89771 0 0 3592
>>>Rate Fine 0.002206705750609314
>>>Rate Coarse 0.05054996489585771
>>>Correct Coarse 0.9877427518292348
>>>Correct Fine 0.8853832881439487
SPORTS vs SUBSPORT BUCKET
2916 221 62 858
2842 0 0 750
>>>Rate Fine 0.0704494740197641
>>>Rate Coarse 0.06739130434782609
>>>Correct Coarse 0.9746227709190672
>>>Correct Fine 0.8741258741258742
**********CONDITIONED ON SPORTS WORD
SPORT vs TEAM
15225 154 151 3393
15015 0 0 3022
>>>Rate Fine 0.010013654984069186
>>>Rate Coarse 0.042607223476297966
>>>Correct Coarse 0.9862068965517241
>>>Correct Fine 0.8906572354848217
SPORTS vs SUBSPORT BUCKET
2420 174 45 754
2363 0 0 659
>>>Rate Fine 0.06707787201233616
>>>Rate Coarse 0.056320400500625784
>>>Correct Coarse 0.9764462809917356
>>>Correct Fine 0.8740053050397878

K aug
SPORT vs TEAM
90670 259 165 4113
89614 0 0 3625
>>>Rate Fine 0.002848376205610971
>>>Rate Coarse 0.038569424964936885
>>>Correct Coarse 0.9883533693614205
>>>Correct Fine 0.881351

# Writing Out New Training Data

In [None]:
import random
from collections import defaultdict

def init_out_dict(row):
    return {"sentence": row["sentence"] + "", "sent_idx_unq": int(row["sent_idx"]), "aliases": [], "spans": [], "qids": [], "gold": []}

def save_a2q(a2q, file):
    with open(file, "w") as out_f:
        json.save(a2q, out_f)

def write_df_to_jsonl(df, file, a2q):
    written_sents = set()
    with open(file, "w") as out_f:
        sent_idx = df.iloc[0]["sent_idx"]
        out_d = init_out_dict(df.iloc[0])
        for i, row in tqdm(df.iterrows(), total=df.shape[0], position=0, leave=True):
            if row["sent_idx"] != sent_idx:
                out_f.write(json.dumps(out_d) + "\n")
                written_sents.add(sent_idx)
                
                out_d = init_out_dict(row)
                assert row["sent_idx"] not in written_sents, f"ROW: {row}"
                sent_idx = row["sent_idx"]
            
            # Update sentence
            out_d["aliases"].append(row["alias"])
            out_d["spans"].append(list(row["span"]))
            out_d["qids"].append(row["gold_qid"])
            out_d["gold"].append(row["is_gold_label"])
            
            # Update a2q if needed
            if row["alias"] not in a2q:
                cands_qid = [title2q[c] for c in row["cand_names"]]
                cands_scores = [100-i for i in range(len(cands_qid))]
                a2q[row["alias"]] = list(zip(cands_qid, cands_scores))
        out_f.write(json.dumps(out_d) + "\n")
        written_sents.add(sent_idx)
    return written_sents, a2q
            

def augment_candidate_maps(df):
    final_new_cands = []
    final_new_aliases = []

    new_alias_idx = 0
    metrics = defaultdict(int)
    sent_idxs = set()
    for i, row in tqdm(df.iterrows(), total=df.shape[0], position=0, leave=True):
        cand_names = [n for n in row["cand_names"]]
        cand_qids = [title2q[c] for c in cand_names]
        final_new_cands.append(cand_names)
        final_new_aliases.append(row['alias'])
        if is_team(row["gold_title"]):
            if not any([is_country(types_wd.get_types(q)) for q in cand_qids]):
                sent_idxs.add(i)
                metrics["total"] += 1
                gold_qid = row["gold_qid"]
                if gold_qid in qids_pairs:
                    other_qid = list(qids_pairs[gold_qid])[0]
                    if other_qid not in q2title:
                        metrics["not_country"] += 1
                        continue
                    country_cand = q2title[other_qid]
                else:
                    temp = get_country(row["gold_title"])
                    if temp not in title2q:
                        # print("BAD", temp, "VS", row["gold_title"])
                        metrics["bad_title"] += 1
                        continue
                    country_cand = temp
                # sometimes the country_cand isn't actually a country so it passed our "not any()" filter above 
                if country_cand in cand_names:
                    metrics["not_country"] += 1
                    continue
                if len(cand_qids) < 30:
                    cand_names.append(country_cand)
                else:
                    # Randomly remove another candidate as long as it's not the gold qid
                    idxs = list(range(len(cand_qids)))
                    to_swap = random.sample(idxs, 1)[0]
                    while cand_names[to_swap] == row["gold_title"]:
                        to_swap = random.sample(idxs, 1)[0]
                    cand_names[to_swap] = country_cand
                assert len(set(cand_names)) == len(cand_names)
                final_new_cands[-1] = cand_names
                final_new_aliases[-1] = f"alias_{row['sent_idx']}_{new_alias_idx}"
                new_alias_idx += 1
                metrics["added"] += 1


    print(new_alias_idx, len(sent_idxs), metrics)
    assert len(final_new_cands) == len(final_new_aliases) == df.shape[0]
    return final_new_cands, final_new_aliases

In [None]:
# import copy
# train_aug = copy.deepcopy(train_df)
# final_new_cands, final_new_aliases = augment_candidate_maps(train_aug)

In [None]:
# train_aug["new_cand_names"] = pd.DataFrame(data = {'new_cand_names': final_new_cands})
# train_aug["new_alias"] = pd.DataFrame(data = {'new_alias': final_new_aliases})
# train_aug["alias"] = train_aug['new_alias']
# train_aug["cand_names"] = train_aug['new_cand_names']

In [41]:
train_aug = train_aug.sort_values(by=["sent_idx", "alias_idx"])

In [42]:
written_s, new_a2q = write_df_to_jsonl(train_aug, input_dir / "train_test.jsonl", copy.deepcopy(a2q_orig))
# save_a2q(new_a2q, input_dir / "entity_db" / "entity_mappings" / "alias2qid_aug.json")
print(len(new_a2q), len(a2q_orig))

100%|██████████| 1919459/1919459 [03:47<00:00, 8439.40it/s]


2088063 2088063


# Junk

In [106]:
models3["wl"].to_pickle("./sports_notitle_1229_wl_ep5_newsport.pkl")
models3["aug"].to_pickle("./sports_notitle_1229_aug_ep5_newsport.pkl")
models3["orig"].to_pickle("./sports_notitle_1229_orig_ep5_newsport.pkl")

In [None]:
n = 2
for i, row in tqdm(train_df.iterrows(), total=train_df.shape[0], position=0, leave=True):
    if is_team(row["gold_title"]):
        span_l, span_r = row["span"]
        split_text = row["sentence"].split()
        left_text, right_text = split_text[max(0, span_l-n):span_l], split_text[span_r:span_l+n]
        left_text = [stemmer.stem(t) for t in left_text]
        right_text = [stemmer.stem(t) for t in right_text]
        if len(left_text) > 0:
            if " ".join(left_text) == "in":
                print(row["sentence"], row["alias"], row["span"])

In [9]:
new_d = {}
with open(input_dir / "train_test.jsonl", "r") as in_f:
    for line in tqdm(in_f):
        d = json.loads(line)
        assert d["sent_idx_unq"] not in new_d
        new_d[d["sent_idx_unq"]] = d

old_d = {}
with open(input_dir / "train.jsonl", "r") as in_f:
    for line in tqdm(in_f):
        d = json.loads(line)
        assert d["sent_idx_unq"] not in old_d
        old_d[d["sent_idx_unq"]] = d    

497564it [00:16, 29553.83it/s] 
497565it [00:23, 21251.37it/s]


In [52]:
temp = models3["orig"]
display(temp[~temp["correct"]][single_cols].sample(20))

Unnamed: 0,sentence,sent_idx,alias,gold_title,pred_title,qid_cnt,wikidata_types_0905_gld,kg_relation_types_0905_gld,kg_adj_0905_gld,wikidata_types_0905_pred,kg_relation_types_0905_pred,kg_adj_0905_pred,cands
145298,17 in the United States and Sweden .,376298,united states,Billboard Hot 100,United States,208,[record chart],"[publisher, topic's main category, country]",[],"[country, sovereign state, constitutional republic]","[seal description, highest judicial authority, central bank, motto, railway traffic side, official symbol, driving side, top-level Internet domain, lowest point, office held by head of state, foundational text, electrical plug type, emergency phone number, Wikimedia outline, main regulatory text, geography of topic, anthem, category for films shot at this location, head of state, executive body, public holiday, history of topic, basic form of government, topic's main Wikimedia portal, curren...",[Q34],"[(United States, 0.9997671843), (United States Navy, 1.5198e-06), (United States Army, 6.63632e-05), (Billboard Hot 100, 5.62481e-05), (Billboard 200, 5.45347e-05), (United States Air Force, 9.411e-07), (National Collegiate Athletic Association, 2.037e-07), (Americans, 1.615e-07), (Native Americans in the United States, 3.217e-07), (United States dollar, 1.636e-07), (NASA, 2.152e-07), (Recording Industry Association of America, 5.7798e-06), (Major League Soccer, 2.3429e-06), (Federal governm..."
57118,Omanthai is a little town in North Sri Lanka n district of Vavuniya .,148185,vavuniya,Vavuniya District,Vavuniya,7,[district of Sri Lanka],"[category for maps, category of associated people, language used, topic's main category, located in the administrative territorial entity, country]","[Q7089582, Q854]",[human settlement],"[category for maps, category of associated people, topic's main category, located in time zone, located in the administrative territorial entity, country]",[Q854],"[(Vavuniya, 0.9950193167), (Vavuniya District, 0.0049801408), (Vavuniya Electoral District, 4.253e-07)]"
67438,"Operated by the Cantonese , the Great Southern Hotel catered more to Chinese travellers , including celebrities from Hong Kong and China .",174793,cantonese,Cantonese people,Cantonese,13,[ethnic group],[topic's main category],[],"[language, dialect, Yuehai dialects]","[dialect of, P134, related category, indigenous to, writing system, subclass of, topic's main category, located in the administrative territorial entity, country]","[Q8646, Q148]","[(Guangdong, 0.0849142894), (Guangzhou, 0.0209850743), (Cantonese, 0.7906708717), (Cantonese people, 0.0799442977), (Cantonese cuisine, 0.0058273575), (Yue Chinese, 0.0076267505), (Hong Kong Cantonese, 0.0003305677), (Lingnan culture, 0.0055067651), (Written Cantonese, 0.0024763609), (Yuehai Yue, 0.0017175111), (Without You (Karen Mok album), 7.03e-08)]"
86677,"He first appeared in Buenos Aires ' vibrant theatre scene in 1995 and became a prolific stage actor , notably in a 1998 local production of German playwright Frank Wedekind 's `` Spring Awakening `` , and in a compressed Shakespeare production from 2004 to 2006 , which earned him Argentine ACE and Clarín Awards .",225259,theatre,Culture of Argentina,Play (theatre),0,[culture of an area],"[topic's main category, country]",[],"[drama, performance work, literary form]","[category for eponymous categories, model item, topic's main template, use, part of, subclass of, topic's main category]",[Q2549073],"[(Broadway theatre, 0.0052927393), (Play (theatre), 0.970723927), (Theatre, 0.0177563149), (Theatre director, 2.51676e-05), (Young Vic, 3.77e-08), (Theater (structure), 0.0002357829), (English Renaissance theatre, 8.9148e-06), (Theater (warfare), 2.9965e-06), (Theatre of ancient Greece, 0.0004604245), (Roman theatre (structure), 0.0001119274), (Theatre of India, 0.0010025762), (Marathi theatre, 1.6169e-05), (The Theatre, 1.2037e-06), (Experimental theatre, 0.0013631952), (Theatre Royal, Edin..."
70886,"In 1795 , after the Battle of Muizenberg in present-day Cape Town , the British occupied the colony .",183520,british,Kingdom of Great Britain,British Empire,931,"[historical country, kingdom, sovereign state]","[basic form of government, currency, legislative body, flag, coat of arms, official language, replaced by, replaces, continent, language used, capital, different from, contains administrative territorial entity, topic's main category, participant in]",[],[Empire on which the sun never sets],"[basic form of government, currency, flag, coat of arms, official language, continent, category of associated people, on focus list of Wikimedia project, capital, has part, described by source, topic's main category, participant in, shares border with, country]",[],"[(United Kingdom, 0.1777659059), (England, 0.000473031), (English language, 2.0143e-06), (Royal Navy, 8.3591e-05), (British Army, 0.0009100796), (Royal Air Force, 7.03e-08), (Great Britain, 0.0371577144), (English people, 4.84213e-05), (Parliament of the United Kingdom, 1.1897e-06), (British Empire, 0.4037128091), (East India Company, 0.0001168876), (British Raj, 0.0029415914), (Commonwealth of Nations, 3.8508e-06), (British Museum, 2.351e-07), (Mandatory Palestine, 1.12877e-05), (Presidenci..."
85270,"A number of groups , including the Canaanites , the Israelites ( who later became the Jews ) , the Babylonians , Persians , Greeks , Jews , Romans , Byzantines , Umayyads , Abbasids , Seljuk Turks , Crusaders , Mamluks , Ottomans , the British , Israelis , Jordanians , and Egyptians have controlled the region at one time or another .",221892,persians,Achaemenid Empire,Qajar dynasty,19,"[historical country, periodization, Persian Empire]","[office held by head of state, history of topic, basic form of government, currency, official language, language used, capital, named after, religion, part of, followed by, follows, described by source, topic's main category, shares border with]",[],[dynasty],"[different from, described by source, topic's main category, country]",[],"[(Iran, 0.199940294), (Sasanian Empire, 0.0108312834), (Achaemenid Empire, 0.1050380766), (Persians, 0.0504441522), (Qajar dynasty, 0.4242459536), (Safavid dynasty, 0.0663004145), (Persian Empire, 0.1190562174), (History of Iran, 0.005398728), (The Persians, 1.17873e-05), (History of Persian Egypt, 0.0187331904)]"
58814,"He served with distinction throughout Alaungpaya 's reunification campaigns , which by 1758 had reunited all of Burma , conquered Manipur , and driven out the French from Thanlyin and the British from Negrais .",152396,british,British Empire,United Kingdom,1456,[Empire on which the sun never sets],"[basic form of government, currency, flag, coat of arms, official language, continent, category of associated people, on focus list of Wikimedia project, capital, has part, described by source, topic's main category, participant in, shares border with, country]",[],"[country, sovereign state, Commonwealth realm]","[seal description, highest judicial authority, central bank, railway traffic side, driving side, top-level Internet domain, lowest point, office held by head of state, electrical plug type, emergency phone number, Wikimedia outline, geography of topic, anthem, category for films shot at this location, head of state, executive body, public holiday, history of topic, basic form of government, topic's main Wikimedia portal, currency, culture, legislative body, permanent duplicated item, highest...","[Q142, Q145]","[(United Kingdom, 0.689457953), (England, 0.0004012189), (English language, 1.00332e-05), (Royal Navy, 0.0001488081), (British Army, 1.47675e-05), (Royal Air Force, 2.755e-07), (Great Britain, 0.0898171514), (English people, 2.49146e-05), (Parliament of the United Kingdom, 1.59187e-05), (British Empire, 0.1456243545), (East India Company, 0.0056970562), (British Raj, 0.0082297772), (Commonwealth of Nations, 9.019e-07), (British Museum, 6.72e-07), (Mandatory Palestine, 3.2703e-06), (Presidenc..."
69739,Paul Campbell Robertson Bayvel ( 28 March 1949 – 14 April 2020 ) was a South African rugby union player who played for the national team as a scrum-half .,180644,national team,South Africa national rugby union team,Ireland national rugby union team,22,[national rugby union team],"[kit supplier, authority, head coach, topic's main category, sport, country]",[],[national rugby union team],"[kit supplier, authority, operating area, head coach, home venue, location, topic's main category, sport, country]",[],"[(England national football team, 0.0047039706), (Scotland national football team, 0.0022288929), (New Zealand national rugby union team, 0.1116184518), (Germany national football team, 0.0025992836), (Italy national football team, 0.0041873385), (Brazil national football team, 0.0025310554), (United States men's national soccer team, 0.0029286104), (Australia national cricket team, 0.0151709458), (France national football team, 0.003310116), (Spain national football team, 0.00470882), (Arge..."
106977,"Taiwan president Ma and China president Xi leaders met on Saturday , November 7 , 2015 in Singapore .",277407,xi,Xi Jinping,Kumo Xi,67,"[politician, lawyer, philosopher]","[list of works, ancestral home, authority, permanent duplicated item, hair color, eye color, academic degree, writing language, native language, significant event, notable work, ethnic group, military rank, mother, residence, owner of, military branch, field of work, spouse, father, work location, sibling, member of, child, political party, employer, described by source, languages spoken, written or signed, topic's main category, participant in, award received, position held, educated at, fa...",[Q148],[people],[],[],"[(Pope Pius XI, 0.0001174809), (Xi Jinping, 0.0107992627), (Louis XI of France, 5.31111e-05), (Pope Innocent XI, 1.713e-07), (Pope Clement XI, 3.009e-07), (Zhu Xi, 1.684e-07), (Pope Gregory XI, 1.473e-07), (Davao Region, 0.001408443), (Super Bowl XI, 4.06e-08), (XI Corps (Union Army), 2.0562e-06), (Kang Youwei, 3.693e-07), (King Wen of Zhou, 1.3567e-06), (Kumo Xi, 0.9643091559), (Xi River, 1.4877e-05), (Pope Benedict XI, 3.6748e-06), (11 (number), 0.0226471052), (Pope Leo XI, 2.633e-07), (PB..."
194732,"The Honors Tutorial College ( HTC ) at Ohio University in Athens , Ohio is a college in the United States with a degree -granting program incorporating all the essential features of the traditional British constituent college tutorial system of undergraduate education developed over centuries at Oxford and Cambridge .",503481,degree,Academic degree,Bachelor's degree,10,[academic title],"[different from, subclass of, topic's main category]",[],"[academic degree, academic title, undergraduate degree]","[applies to jurisdiction, said to be the same as, different from, subclass of, follows, described by source, topic's main category]","[Q30, Q145]","[(Bachelor of Arts, 0.0662042499), (Bachelor of Science, 0.0318484418), (Master's degree, 0.0253209881), (Bachelor's degree, 0.7269259095), (Honorary degree, 0.0515244268), (Academic degree, 0.0481036082), (Degree (angle), 0.0024977096), (Undergraduate degree, 0.0160487257), (Degree (graph theory), 0.0005351437), (Degree of a polynomial, 0.0208647568), (Degree (music), 0.0004936435), (Rexona, 3.9e-08), (Comparison (grammar), 4.7285e-06), (Degree of a continuous mapping, 0.0006926383), (Degre..."


In [110]:
print("G:C, P:T", (254-201)/201)
print("G:T, P:C", (216-161)/216)

print("G:C, P:T", (190-154)/154)
print("G:T, P:C", (151-106)/151)

G:C, P:T 0.263681592039801
G:T, P:C 0.25462962962962965
G:C, P:T 0.23376623376623376
G:T, P:C 0.2980132450331126
