# Mask prediction approach

These are the additional resources you will need to run this notebook:
* **CONTEMPORARY BERT MODEL:** (optional) Download it from https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz, extract it, and save it in `../models/language_models/bert_models/`
* **HISTORICAL BERT MODELS:** Coming soon.
* **SENTENCE TRANSFORMERS BERT:** Download it from https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/v0.2/bert-base-nli-mean-tokens.zip, unzip it, and save it in `../models/language_models/bert_models/`

In [None]:
import torch
from pytorch_pretrained_bert import BertTokenizer,BertForMaskedLM
from sentence_transformers import SentenceTransformer
from itertools import chain
import pandas as pd
import numpy as np
import spacy
import operator
from pathlib import Path
from scipy import stats
from gensim.models.wrappers import FastText
from gensim.models import Word2Vec
from tools import animacy_detection,animacy_evaluation,processing
import unidecode
from collections import Counter
import sklearn
from tqdm import tqdm_notebook as tqdm

In [None]:
# Forwarded unchanged as the last argument of
# animacy_detection.predict_mask_animacy.
# NOTE(review): presumably True recomputes experiment files that already
# exist -- confirm in tools/animacy_detection.
overwrite = True

# =======================================================
# Scenario selection
# =======================================================

# Prediction approach: "bert_masking" | "wemb_baseline".
# "bert_masking" loads the BERT tokenizer; "wemb_baseline" loads the
# FastText model instead.
prediction_approach = "bert_masking"

# Corpus folder: "stories/" | "machines19thC/".
# Keep the trailing slash: the value is concatenated directly into file
# paths and compared against the literal "stories/".
corpus = "machines19thC/"

# Context setting: "both" | "prev" | "next" | "sent".
# Forwarded to predict_mask_animacy and embedded in the experiment file names.
context = "both"

# BERT model period: "contemporary" | "before1850" | "from1850to1875" |
# "from1875to1890" | "from1890to1900" | "timeSensitive".
# Forced to "contemporary" when the corpus is "stories/".
# NOTE(review): "timeSensitive" has no model entry of its own; presumably
# predict_mask_animacy picks a model per example -- confirm.
time_period = "contemporary"

# Boolean (True | False) forwarded to animacy_evaluation.animacy_score.
weighted = True

# Dataset scenario: "animacy" | "humanness" | "".
# Becomes the prefix of the dataset file name; ignored for "stories/".
scenario = "animacy"

# Word sense disambiguation: "bert" | "False" (both strings).
# Only "bert" loads the sentence-transformers model; any other value
# leaves the WSD model unset.
wsd = "bert"

# Words cutoff (integer): number of predictions for a given MASK.
words_cutoff = 250

In [None]:
# -------------------------------------------------------
# Instantiate default values:

# The "stories/" corpus uses the single "all_" dataset and only the
# contemporary BERT model is meaningful for it; every other corpus keeps the
# selected time period and prefixes the dataset name with the scenario.
# NOTE: `scenario` is overwritten in place, so for non-"stories/" corpora
# re-running this cell without re-running the configuration cell appends
# "_all_" a second time.
if corpus == "stories/":
    scenario = "all_"
    time_period = "contemporary"
else:
    scenario = scenario + "_all_"

# One masked language model per time period. All of them are loaded up
# front, whichever time_period was selected.
bert_models_dir = "../models/language_models/bert_models/"
bert_models = {"contemporary": BertForMaskedLM.from_pretrained('bert-base-uncased')}
for period_key, model_folder in [
    ("before1850", "FT_bert_base_uncased_before_1850"),
    ("from1850to1875", "FT_bert_base_uncased_after_1850_before_1875"),
    ("from1875to1890", "FT_bert_base_uncased_after_1875_before_1890"),
    ("from1890to1900", "FT_bert_base_uncased_after_1890_before_1900"),
]:
    bert_models[period_key] = BertForMaskedLM.from_pretrained(bert_models_dir + model_folder)

# BERT tokenizer is always the same, whatever the time period.
tokenizer = (
    BertTokenizer.from_pretrained('bert-base-uncased')
    if prediction_approach == "bert_masking"
    else None
)

# The FastText embeddings are only needed by the word-embedding baseline.
language_model = (
    FastText.load_fasttext_format('../models/language_models/fastai/cc.en.300.bin')
    if prediction_approach == "wemb_baseline"
    else None
)

# The sentence-transformers model is only loaded for BERT-based WSD.
wsdmodel = (
    SentenceTransformer('../models/language_models/bert_models/bert-base-nli-mean-tokens')
    if wsd == "bert"
    else None
)

# Load datasets:
dataset_prefix = "../data/" + corpus + scenario
dataset_testdf = pd.read_pickle(dataset_prefix + "test" + ".pkl")
dataset_traindf = pd.read_pickle(dataset_prefix + "train" + ".pkl")

In [None]:
# Pickle files holding the mask predictions for the train and test splits.
# Both names encode the full experiment configuration and differ only in
# the split label.
exp_path_prefix = "../experiments/" + corpus + scenario
exp_path_suffix = (
    "_" + context
    + "_wordsCutoff" + str(words_cutoff)
    + "_" + prediction_approach
    + "_wsd" + str(wsd)
    + "_" + time_period
    + ".pkl"
)
exp_train_path = exp_path_prefix + "train" + exp_path_suffix
exp_test_path = exp_path_prefix + "test" + exp_path_suffix

In [None]:
# Run the mask-prediction step on the train split first, then the test
# split, with identical settings for both.
for split_path, split_df in (
    (exp_train_path, dataset_traindf),
    (exp_test_path, dataset_testdf),
):
    animacy_detection.predict_mask_animacy(
        split_path,
        split_df,
        context,
        words_cutoff,
        prediction_approach,
        wsd,
        wsdmodel,
        tokenizer,
        language_model,
        bert_models,
        time_period,
        overwrite,
    )

In [None]:
# Load the pickled DataFrame stored at exp_train_path for inspection
# (presumably written by predict_mask_animacy in the previous cell -- confirm).
dataset_train_df = pd.read_pickle(exp_train_path)

In [None]:
# Preview the first rows of the loaded training-set DataFrame.
dataset_train_df.head()

In [None]:
# -------------------------------------------------------
# Find the best cutoff and threshold based on training set
#
# Evaluates every (threshold, cutoff) pair on the training-set predictions
# and writes the full grid, best f-score first, to a TSV file that the
# test-set cell reads back to pick its parameters.

dataset_train_df = pd.read_pickle(exp_train_path)
cutoff_list = [1, 2, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 200, 250]
threshold_list = list(np.arange(0, 1.05, 0.1))

# Loop-invariant inputs: extract them once instead of once per grid point.
predicted = dataset_train_df['predicted'].tolist()
scores = dataset_train_df['scores'].tolist()
y_true = dataset_train_df['animated'].tolist()

# The per-row animacy score depends on the cutoff but not on the threshold,
# so it is computed once per cutoff rather than once per grid point.
# NOTE(review): assumes animacy_evaluation.animacy_score returns the same
# value for the same arguments -- confirm in tools/animacy_evaluation.
y_pred_by_cutoff = {
    cutoff: [
        animacy_evaluation.animacy_score(row_predicted, row_scores, weighted, cutoff)
        for row_predicted, row_scores in zip(predicted, scores)
    ]
    for cutoff in cutoff_list
}

result_columns = ['threshold', 'cutoff', 'precision', 'recall', 'fscore', 'micro_fscore', 'map']

# Collect the rows in a plain list and build the DataFrame once:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call.
result_rows = []
for threshold in threshold_list:
    print(threshold)
    for exp_cutoff in cutoff_list:
        # Fresh list copies are passed, as before, so the evaluation helper
        # can never alter the cached inputs.
        precision, recall, fscore, micro_fscore, map_ = animacy_evaluation.results(
            list(y_true), list(y_pred_by_cutoff[exp_cutoff]), threshold
        )
        result_rows.append({
            'threshold': threshold,
            'cutoff': exp_cutoff,
            'precision': round(precision, 3),
            'recall': round(recall, 3),
            'fscore': round(fscore, 3),
            'micro_fscore': round(micro_fscore, 3),
            'map': round(map_, 3),
        })

df_results_train = pd.DataFrame(result_rows, columns=result_columns)

# A stable sort keeps tied f-scores in grid order, so the "best" first row
# is the same on every run.
df_results_train.sort_values(by='fscore', ascending=False, kind='mergesort').to_csv("../experiments/" + corpus + scenario + "train_" + context + "_wordsCutoff" + str(words_cutoff) + "_" + prediction_approach + "_wsd" + str(wsd) + "_weighted" + str(weighted) + "_" + time_period + ".tsv", sep="\t")

In [None]:
# -------------------------------------------------------
# Apply the best cutoff and threshold to the test set
#
# Reads the best (threshold, cutoff) pair from the training grid-search TSV,
# scores the test-set predictions with it, saves the metrics to a TSV file,
# prints a LaTeX-style table row and appends the same report to the
# corpus-level results.txt.

# Load the parameters file learned from the train set
dataset_test_df = pd.read_pickle(exp_test_path)
best_threshold = None
best_cutoff = None
parameters_file = Path("../experiments/" + corpus + scenario + "train_" + context + "_wordsCutoff" + str(words_cutoff) + "_" + prediction_approach + "_wsd" + str(wsd) + "_weighted" + str(weighted) + "_" + time_period + ".tsv")
if parameters_file.exists():
    # The file is written best f-score first, so its first row holds the
    # selected parameters. Parse it once and read both values from that row.
    best_row = pd.read_csv(parameters_file, sep="\t").iloc[0]
    best_threshold = float(best_row['threshold'])
    best_cutoff = int(best_row['cutoff'])
else:
    print("You haven't found the optimal parameters yet.")

# Apply the best cutoff and threshold to the test set
# (explicit None check: a best threshold of 0.0 is valid but falsy)
if best_threshold is not None:
    predicted = dataset_test_df['predicted'].tolist()
    scores = dataset_test_df['scores'].tolist()

    y_pred = [
        animacy_evaluation.animacy_score(row_predicted, row_scores, weighted, best_cutoff)
        for row_predicted, row_scores in zip(predicted, scores)
    ]
    y_true = dataset_test_df['animated'].tolist()

    precision, recall, fscore, micro_fscore, map_ = animacy_evaluation.results(y_true, y_pred, best_threshold)

    # Build the single-row frame directly: DataFrame.append was removed in
    # pandas 2.0.
    df_results_test = pd.DataFrame(
        [{
            'threshold': best_threshold,
            'cutoff': best_cutoff,
            'precision': round(precision, 3),
            'recall': round(recall, 3),
            'fscore': round(fscore, 3),
            'micro_fscore': round(micro_fscore, 3),
            'map': round(map_, 3),
        }],
        columns=['threshold', 'cutoff', 'precision', 'recall', 'fscore', 'micro_fscore', 'map'],
    )
    df_results_test.to_csv("../experiments/" + corpus + scenario + "test_" + context + "_wordsCutoff" + str(words_cutoff) + "_" + prediction_approach + "_wsd" + str(wsd) + "_weighted" + str(weighted) + "_" + time_period + ".tsv", sep="\t")

    # Report text shared by the console output and the results log.
    report = "\nContext: "+context+"\nPred_approach: "+str(prediction_approach)+"\nWeighted: "+str(weighted)+"\nTimePeriod: "+str(time_period)+"\nWSD: " + str(wsd) + "\n\n(t=" + str(round(best_threshold,2)) + ", c=" + str(int(best_cutoff)) + ") & " + str(round(precision,3)) + " & " + str(round(recall,3)) + " & " + str(round(fscore,3)) + " & " + str(round(map_, 3)) + " \\\\"
    print(report)

    # Append (never overwrite) so the log accumulates one entry per run.
    with open("../experiments/" + corpus + scenario + "results.txt", "a", encoding="utf-8") as fw:
        fw.write("\nCorpus: " + corpus + report + "\n\n==========================\n")