### Load all examples
 * `generate_uids=True`: return UIDs per example
 * `tokenizer=None`: return raw (untokenized) examples

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from utils import *
# Never truncate cell contents when DataFrames are displayed.
# None is the documented "unlimited" value; the older -1 sentinel has been
# deprecated since pandas 1.0 and newer pandas releases reject it.
pd.set_option('display.max_colwidth', None)

In [None]:
from metal.mmtl.glue.glue_preprocess import load_tsv, get_task_tsv_config

NOTE: In this case, it is easier to operate over the raw sentences and labels.

In [None]:
# Look up the TSV location and column layout for the COLA dev split.
config = get_task_tsv_config('COLA', 'dev')

# Forward the file/column settings from the task config unchanged.
tsv_kwargs = {
    key: config[key]
    for key in (
        "tsv_path",
        "sent1_idx",
        "sent2_idx",
        "label_idx",
        "skip_rows",
        "label_fn",
    )
}
payload, uids = load_tsv(delimiter="\t", generate_uids=True, **tsv_kwargs)
sentences, labels = payload

# Sentences, labels and UIDs must line up one-to-one.
assert len(sentences) == len(labels) == len(uids)

### Define Proper Nouns based on Entities
Ref: https://spacy.io/api/annotation#named-entities

In [None]:
import spacy
# Load the spaCy pipeline whose entity annotations (`.ents`) drive the slice.
# NOTE(review): 'xx_ent_wiki_sm' is presumably spaCy's multi-language NER
# model with PER/ORG/LOC/MISC labels — confirm against the spaCy model docs.
nlp = spacy.load('xx_ent_wiki_sm')

def get_entities(sent, entities):
    """Return the entity spans in `sent` whose label is one of `entities`.

    Args:
        sent: raw sentence text, or None when the sentence is absent.
        entities: collection of entity labels to keep.

    Returns:
        list of spans found by `nlp` with a matching `label_`;
        an empty list when `sent` is None.
    """
    if sent is not None:
        doc = nlp(sent)
        return [span for span in doc.ents if span.label_ in entities]

    # No sentence, hence nothing to extract.
    return []

def ex_has_entities(ex, entities=("PER", "ORG", "LOC")):
    """Return True if any sentence of the example contains a selected entity.

    Args:
        ex: indexable example; `ex[0]` is sentence 1 and, when the example
            has exactly two items, `ex[1]` is sentence 2. A None sentence
            contributes no entities (see `get_entities`).
        entities: entity labels that count as proper nouns.

    Returns:
        bool: whether at least one matching entity was found.
    """
    sents = [ex[0]]
    if len(ex) == 2:
        # Bug fix: this branch used to re-scan ex[0], so entities appearing
        # only in sentence 2 were never detected.
        sents.append(ex[1])

    # any() stops at the first sentence with a hit, avoiding a needless
    # second NLP pass when sentence 1 already qualifies.
    return any(get_entities(sent, entities=entities) for sent in sents)

### Tag all examples in slice

In [None]:
from tagger import Tagger
# Tagger instance that records slice tags per example UID; it is used
# through add_tag / get_uids / get_examples in the cells that follow.
tagger = Tagger(verbose=False)

In [None]:
# Tag name under which matching UIDs are recorded in the tagger.
slice_name = 'proper_nouns'
# Predicate applied to each raw example to decide slice membership.
in_slice_fn = ex_has_entities

In [None]:
# Walk the dataset once and tag every example that falls in the slice.
for i, (example, gold, example_uid) in enumerate(zip(sentences, labels, uids)):
    hit = in_slice_fn(example)

    # Sanity check: echo every 1000th example with its slice decision.
    if not i % 1000:
        print((example_uid, example, gold), 'in_slice:', hit)
        print()

    # Examples without "proper nouns" (as defined by entities) get no tag.
    if not hit:
        continue
    tagger.add_tag(example_uid, slice_name)

In [None]:
# Report how much of the dataset landed in the slice (count and fraction).
num_ex = len(sentences)
slice_members = tagger.get_uids(slice_name)
num_in_slice = len(slice_members)
print(f"% in slice ({num_in_slice}/{num_ex}) {num_in_slice/num_ex}")

In [None]:
# Cell output: whatever the tagger holds for this slice, for a manual spot check.
tagger.get_examples(slice_name)

### Eval on slices with Uncased model

In [None]:
# Evaluation setup for the uncased single-task COLA run.
task_name = 'COLA'
# NOTE(review): model_path is assigned here but never read in this cell — the
# MetalModel below is built without loading these weights. Confirm whether
# this cell is superseded by the load_data_and_model cell that follows.
model_path = '/dfs/scratch0/mccreery/mmtl/logs/ST_bertlarge/COLA/logdir/2019_02_25/COLA_21_56_02/best_model.pth'
split = 'dev'
bert_model = "bert-large-uncased"
max_len = 200
# Keep the original example order and feed one example per batch.
dl_kwargs = {"shuffle": False, "batch_size":1}

# Build the task and its payload for the requested split; UIDs are requested
# so results can be joined back to the tagged examples.
tasks, payloads = create_glue_tasks_payloads(
    task_names=[task_name],
    bert_model=bert_model,
    max_len=max_len,
    dl_kwargs=dl_kwargs,
    splits=[split],
    max_datapoints=-1,  # presumably "no limit" — TODO confirm
    generate_uids=True,
)

model = MetalModel(tasks, verbose=False, device=0)
# Only one task/split was requested, so use the first payload's loader.
dl = payloads[0].data_loader

In [None]:
from metal.mmtl.debugging.utils import load_data_and_model, create_dataframe

# Load the uncased model together with a data loader for the dev split.
# NOTE(review): model_path here is the run directory, whereas the cased cell
# further down passes a path ending in best_model.pth — confirm which form
# load_data_and_model expects.
model_path = '/dfs/scratch0/mccreery/mmtl/logs/ST_bertlarge/COLA/logdir/2019_02_25/COLA_21_56_02/'
task_name = 'COLA'
split = 'dev'
bert_model = "bert-large-uncased"
# Rebinds `model` and `dl`, replacing any values set by earlier cells.
model, dl = load_data_and_model(model_path, [task_name], split, bert_model=bert_model)

In [None]:
# Output file for this model's per-example results; the name encodes
# task, BERT variant and split.
filepath = f'{task_name}_{bert_model}_{split}_error_analysis.tsv'

# Create DataFrame of Raw Data, Predictions, and Labels
print('Creating dataframe')
df_uncased = create_dataframe(task_name, model, dl, bert_model=bert_model)
print('Created dataframe')

# Save the DataFrame to disk (the reload happens in the next cell).
save_dataframe(df_uncased, filepath)

In [None]:
# Reload the uncased results from disk. `filepath` is reassigned by the cased
# section later on, so it must still name the uncased file when this runs.
df_uncased = load_dataframe(filepath)

In [None]:
def df_for_uids(df, uids):
    """Return the rows of `df` whose 'uid' value is one of `uids`.

    Args:
        df: DataFrame with a 'uid' column.
        uids: set or list-like of UIDs to keep.

    Returns:
        DataFrame holding only the matching rows, with the original row
        order and index labels preserved.
    """
    # Vectorized membership test: one hash-based pass instead of a
    # Python-level `x in uids` per row (a linear scan each time when `uids`
    # is a list). It also always yields a boolean mask, even for an empty frame.
    mask = df['uid'].isin(uids)
    return df[mask]

In [None]:
# UIDs tagged as belonging to the slice; reused later for the cased results.
slice_uids = tagger.get_uids(slice_name)
# Uncased results restricted to the slice.
df_uncased_in_slice = df_for_uids(df_uncased, slice_uids)

In [None]:
# First 20 in-slice rows flagged `is_wrong` for the uncased model.
df_uncased_in_slice[df_uncased_in_slice['is_wrong']].head(20)

In [None]:
# Share of in-slice rows flagged `is_wrong` for the uncased model.
uncased_slice_errors = df_uncased_in_slice[df_uncased_in_slice['is_wrong']]
uncased_error_rate = len(uncased_slice_errors) / len(df_uncased_in_slice)
print("Error rate:", uncased_error_rate)

## Eval on Slice with Cased Model

In [None]:
from metal.mmtl.debugging.utils import load_data_and_model, create_dataframe

# Load the cased model together with a data loader for the dev split.
model_path = '/dfs/scratch0/mccreery/mmtl/logs/ST_bertlarge/COLA_cased/2/logdir/2019_03_05/COLA_00_50_04/best_model.pth'
task_name = 'COLA'
split = 'dev'
bert_model = "bert-large-cased"
# Rebinds `model` and `dl`: from here on they refer to the cased run.
model,dl = load_data_and_model(model_path, [task_name], split, bert_model=bert_model)

In [None]:
# Output file for the cased results. This overwrites the `filepath` global
# that previously named the uncased file.
filepath = f'{task_name}_{bert_model}_{split}_error_analysis.tsv'

# Create DataFrame of Raw Data, Predictions, and Labels
print('Creating dataframe')
df_cased = create_dataframe(task_name, model, dl, bert_model=bert_model)
print('Created dataframe')

# Save the DataFrame to disk (the reload happens in the next cell).
save_dataframe(df_cased, filepath)

In [None]:
# Reload the cased results from the file written in the previous cell.
df_cased = load_dataframe(filepath)

In [None]:
# Cased results restricted to the same slice UIDs used for the uncased model.
df_cased_in_slice = df_for_uids(df_cased, slice_uids)

In [None]:
# First few in-slice rows flagged `is_wrong` for the cased model.
df_cased_in_slice[df_cased_in_slice['is_wrong']].head()

In [None]:
# Share of in-slice rows flagged `is_wrong` for the cased model.
cased_slice_errors = df_cased_in_slice[df_cased_in_slice['is_wrong']]
cased_error_rate = len(cased_slice_errors) / len(df_cased_in_slice)
print("Error rate:", cased_error_rate)

## Error analysis on differences in predictions!

In [None]:
# Peek at the full uncased results before comparing the two models.
df_uncased.head()

In [None]:
# Peek at the full cased results before comparing the two models.
df_cased.head()

In [None]:
# Per-model `is_wrong` masks over the in-slice rows.
uncased_wrong = df_uncased_in_slice['is_wrong']
cased_wrong = df_cased_in_slice['is_wrong']

# UIDs of in-slice examples flagged INCORRECT for each model.
incorrect_uncased_uids = df_uncased_in_slice.loc[uncased_wrong, 'uid'].to_list()
incorrect_cased_uids = df_cased_in_slice.loc[cased_wrong, 'uid'].to_list()

# UIDs of in-slice examples that are CORRECT for the cased model.
correct_cased_uids = df_cased_in_slice.loc[~cased_wrong, 'uid'].to_list()

### Which examples were "corrected" by the cased model?

In [None]:
# Wrong under the uncased model but right under the cased one.
corrected_by_casing_uids = set(incorrect_uncased_uids) & set(correct_cased_uids)

# Show the uncased rows for those examples (first 20).
corrected_uncased_rows = df_for_uids(df_uncased, corrected_by_casing_uids)
corrected_uncased_rows.head(20)

In [None]:
# Same corrected examples, shown with the cased model's rows (first 20).
df_for_uids(df_cased, corrected_by_casing_uids).head(20)

### Which examples are "still incorrect" with the cased model?

In [None]:
# Flagged wrong under both the uncased and the cased model.
still_incorrect_uids = set(incorrect_uncased_uids) & set(incorrect_cased_uids)

# Show the cased rows for those examples (first 20).
still_incorrect_rows = df_for_uids(df_cased, still_incorrect_uids)
still_incorrect_rows.head(20)