In [1]:
import torch
from pytorch_pretrained_bert import BertTokenizer,BertForMaskedLM
from sentence_transformers import SentenceTransformer
from itertools import chain
import pandas as pd
import numpy as np
import spacy
import operator
from pathlib import Path
from scipy import stats
import unidecode
from collections import Counter
import sklearn
from tqdm import tqdm_notebook as tqdm
from tools import animacy_detection

In [2]:
# Experiment configuration. Every name set in this cell is read by later cells
# (model loading, experiment path construction, prediction and scoring).

# Passed as the last argument to animacy_detection.predict_mask_animacy.
# NOTE(review): presumably controls whether an existing experiment file is
# recomputed -- confirm in tools/animacy_detection.
overwrite = False

# -------------------------------------------------------
# Choose scenario:

prediction_approach = "bert_masking"
# Options:
# * "bert_masking" (string): BERT masking approach

context = "both"
# Options:
# * "sent" (string): use the sentence where the target expression is located as input.
# * "both" (string): use the sentence where the target expression is located, plus
#                    the previous and next sentences, as input.

time_period = "before1850"
# Options (all except "timeSensitive" are also keys of the `bert_models` dict built below):
# * "contemporary" (string): bert-base-uncased (contemporary BERT)
# * "before1850" (string, only for 19thC Machines): BERT trained on pre-1850 data
# * "from1850to1875" (string, only for 19thC Machines): BERT trained on data from 1850 to 1875
# * "from1875to1890" (string, only for 19thC Machines): BERT trained on data from 1875 to 1890
# * "from1890to1900" (string, only for 19thC Machines): BERT trained on data from 1890 to 1900
# * "timeSensitive" (string, only for 19thC Machines): for each sentence, use the fine-tuned BERT model of the period to which that sentence belongs

weighted = True
# Options:
# * True (boolean): animacy values of predicted tokens are averaged weighted by their probability score.
# * False (boolean): animacy values of predicted tokens are averaged not weighted by their probability score.

wsd = "False"
# NOTE: this is the string "False", not the boolean; the model-loading cell
# compares `wsd == "bert"` and the value is embedded in the output file names.
# Options:
# * "False" (string): first sense in WordNet
# * "bert" (string): use BERT-adapted Lesk algorithm to perform sense disambiguation from WordNet

words_cutoff = 100
# Words cutoff (integer): number of predictions for a given MASK

In [3]:
# -------------------------------------------------------
# Instantiate default values: language models, tokenizer and optional WSD encoder.

# Stays None in this notebook; it is handed to predict_mask_animacy as-is.
language_model = None

# Checkpoint (model name or local directory) for each time-period key.
bert_checkpoints = {
    "contemporary": 'bert-base-uncased',
    "before1850": "../models/FT_bert_base_uncased_before_1850",
    "from1850to1875": "../models/FT_bert_base_uncased_after_1850_before_1875",
    "from1875to1890": "../models/FT_bert_base_uncased_after_1875_before_1890",
    "from1890to1900": "../models/FT_bert_base_uncased_after_1890_before_1900",
}

# Every checkpoint is loaded up front, regardless of the selected time_period,
# and the whole mapping is passed on to predict_mask_animacy.
bert_models = {
    period: BertForMaskedLM.from_pretrained(checkpoint)
    for period, checkpoint in bert_checkpoints.items()
}

# The same tokenizer is used with every model above.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# The sentence encoder is only loaded when BERT-based sense disambiguation is selected.
wsdmodel = (
    SentenceTransformer('../models/language_models/bert_models/bert-base-nli-mean-tokens')
    if wsd == "bert"
    else None
)

In [4]:
# -------------------------------------------------------
# Load dataset and build the experiment file path from the chosen scenario:
dataset_df = pd.read_pickle("../data/jsa_animacy.pkl")
exp_path = f"../experiments/{context}_wordsCutoff{words_cutoff}_{prediction_approach}_wsd{wsd}_{time_period}.pkl"

In [5]:
# Run the mask-prediction experiment with the scenario configured above.
# NOTE(review): presumably the results end up in the file at exp_path, since the
# next cell reads that path back -- confirm in tools/animacy_detection.
animacy_detection.predict_mask_animacy(
    exp_path,
    dataset_df,
    context,
    words_cutoff,
    prediction_approach,
    wsd,
    wsdmodel,
    tokenizer,
    language_model,
    bert_models,
    time_period,
    overwrite,
)

In [6]:
# Read back the predictions stored at exp_path.
predictions_df = pd.read_pickle(exp_path)

# Add one animacy-score column per cutoff value (20, then 100); the cutoff is
# forwarded as the fourth argument of animacy_detection.animacy_score.
for cutoff in (20, 100):
    predictions_df[f"animacy_score_{cutoff}pr"] = predictions_df.apply(
        lambda row: animacy_detection.animacy_score(row["predicted"], row["scores"], weighted, cutoff),
        axis=1,
    )

# Export the scored predictions as a tab-separated file without the index column.
predictions_tsv = f"../experiments/predicted_{context}_wordsCutoff{words_cutoff}_{prediction_approach}_wsd{wsd}_{time_period}.tsv"
predictions_df.to_csv(predictions_tsv, sep="\t", index=False)

### What do we do with compound nouns? E.g.:

In [None]:
import spacy

In [None]:
# Load the spaCy pipeline named "en_core_web_lg"; `nlp` is used in the next cell
# to parse an example sentence.
nlp = spacy.load("en_core_web_lg")

In [None]:
# Parse one example sentence for the compound-noun question raised above;
# the resulting `doc` is iterated token by token in the next cell.
doc = nlp("His patent expanding piston has been employed by them in steam engines.")

In [None]:
# Show, for every token of the parsed sentence, its text followed by its
# `pos_` and `dep_` attributes, separated by single spaces.
for token in doc:
    print(f"{token.text} {token.pos_} {token.dep_}")