In [None]:
# Enable IPython's autoreload extension in mode 2, so that imported modules
# (such as the local `utils` package used below) are reloaded before each
# cell is executed and edits to them are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
from  pathlib import Path

In [None]:
from utils import process_jsa
from utils import process_rsc
from utils import prepare_sents
from utils import explore_preds

#### Process the JSA corpus

In [None]:
# Process the JSA corpus: process_jsa.parse_corpus is called with the input
# path, the output path and the overwrite flag configured below.
input_path = "../../workspace/data/" # Path where JSA data is located
output_path = "data/jsa_processed/" # Output location handed to parse_corpus
overwrite = False # If False, run the code only if output has not been created.
                  # If True, run the code regardless.

process_jsa.parse_corpus(input_path, output_path, overwrite)

#### Process the RSC corpus

Data downloaded from <https://fedora.clarin-d.uni-saarland.de/rsc_v6/access.html#download>.

We are using:
* TEI-formatted corpus [v6.0.4](https://fedora.clarin-d.uni-saarland.de/rsc_v6/data/texts/Royal_Society_Corpus_open_v6.0.4_texts_tei.zip) (as separate files).
* Corresponding metadata [v6.0.4](https://fedora.clarin-d.uni-saarland.de/rsc_v6/data/Royal_Society_Corpus_open_v6.0.4_meta.tsv.zip).

In [None]:
# Process the RSC corpus: process_rsc.parse_corpus is called with the input
# path, the output path and the overwrite flag configured below.
input_path = "../../workspace/data/RSC/" # Path where RSC data is located
output_path = "data/rsc_processed/" # Output location handed to parse_corpus
overwrite = False # If False, run the code only if output has not been created.
                  # If True, run the code regardless.

process_rsc.parse_corpus(input_path, output_path, overwrite)

#### Get sentences with machines

In [None]:
# Specify the query tokens here.
# The same list is passed to prepare_sents.filter_sents_query for both the
# JSA and the RSC corpus in the cells below.
query_tokens = ["machine", "machines", "engine", "engines"]

In [None]:
# Run prepare_sents.filter_sents_query for the JSA corpus with the query
# tokens and save the resulting dataframe as a tab-separated file.
corpus = "JSA"
jsa_sents_df = prepare_sents.filter_sents_query(corpus, query_tokens)
# Note: with pandas' default index=True the dataframe index is written as the
# first column of the TSV.
jsa_sents_df.to_csv("data/jsa_processed/JSA_machines.tsv", sep="\t")

In [None]:
# Run prepare_sents.filter_sents_query for the RSC corpus with the query
# tokens, keep only rows whose "year" lies in 1783-1908 (both ends included)
# and save the result as a tab-separated file.
corpus = "RSC"
rsc_sents_df = prepare_sents.filter_sents_query(corpus, query_tokens)

# Boolean mask for the year window; Series.between is inclusive on both
# bounds by default, i.e. (year >= 1783) & (year <= 1908).
in_period = rsc_sents_df["year"].between(1783, 1908)
rsc_sents_df = rsc_sents_df[in_period]

rsc_sents_df.to_csv("data/rsc_processed/RSC_machines.tsv", sep="\t")

#### Syntactic filtering

In [None]:
# Syntactic filtering of the JSA query sentences:
#   1. store the output of prepare_sents.preprocess_pipe in column "synt",
#   2. keep only the rows for which prepare_sents.filter_sents_synt is True,
#   3. store the result of prepare_sents.find_query_deplabel in "query_label",
#   4. pickle the resulting dataframe.
#
# NOTE(review): `nlp` is neither defined nor imported in the visible cells of
# this notebook. Confirm where it is created; on a fresh kernel this cell
# raises NameError unless `nlp` is set up beforehand.
syndf = jsa_sents_df.copy()
syndf["synt"] = prepare_sents.preprocess_pipe(syndf["currentSentence"], nlp)

# Row-wise boolean mask of the sentences that pass the syntactic filter.
keep_mask = syndf.apply(
    lambda row: prepare_sents.filter_sents_synt(row.synt, row.targetExpression),
    axis=1,
)

# Take an explicit copy of the filtered rows: assigning a new column to the
# result of boolean indexing otherwise triggers pandas' SettingWithCopyWarning.
syndf = syndf[keep_mask].copy()

syndf["query_label"] = syndf.apply(
    lambda row: prepare_sents.find_query_deplabel(
        row.synt, row.maskedSentence, row.targetExpression
    ),
    axis=1,
)
syndf.to_pickle("data/jsa_processed/JSA_synparsed.pkl")

In [None]:
# Syntactic filtering of the RSC query sentences:
#   1. store the output of prepare_sents.preprocess_pipe in column "synt",
#   2. keep only the rows for which prepare_sents.filter_sents_synt is True,
#   3. store the result of prepare_sents.find_query_deplabel in "query_label",
#   4. pickle the resulting dataframe.
#
# NOTE(review): `nlp` is neither defined nor imported in the visible cells of
# this notebook. Confirm where it is created; on a fresh kernel this cell
# raises NameError unless `nlp` is set up beforehand.
syndf = rsc_sents_df.copy()
syndf["synt"] = prepare_sents.preprocess_pipe(syndf["currentSentence"], nlp)

# Row-wise boolean mask of the sentences that pass the syntactic filter.
keep_mask = syndf.apply(
    lambda row: prepare_sents.filter_sents_synt(row.synt, row.targetExpression),
    axis=1,
)

# Take an explicit copy of the filtered rows: assigning a new column to the
# result of boolean indexing otherwise triggers pandas' SettingWithCopyWarning.
syndf = syndf[keep_mask].copy()

syndf["query_label"] = syndf.apply(
    lambda row: prepare_sents.find_query_deplabel(
        row.synt, row.maskedSentence, row.targetExpression
    ),
    axis=1,
)
syndf.to_pickle("data/rsc_processed/RSC_synparsed.pkl")

#### BERT masking

In [None]:
# For every syntactically parsed dataset, add one "pred_bert_<epoch>" column
# per period-specific BERT model and pickle the enriched dataframe next to the
# input file as "<name>_pred_bert.pkl". Datasets whose output file already
# exists are skipped, so re-running the cell does not redo the predictions.

# Number of expected predictions requested from each mask pipeline.
pred_toks = 10
# Periods for which a BERT model is used.
epochs = ["1760_1850", "1850_1875", "1875_1890", "1890_1900"]

for dataset in ["data/rsc_processed/RSC_synparsed.pkl",
                "data/jsa_processed/JSA_synparsed.pkl"]:

    # Build the output path once with pathlib: only the final ".pkl" suffix of
    # the file name is replaced, whereas splitting the string on ".pkl" would
    # cut the path at the first occurrence anywhere in it.
    dataset_path = Path(dataset)
    output_file = dataset_path.with_name(dataset_path.stem + "_pred_bert.pkl")

    if output_file.is_file():
        continue

    print(dataset)

    # Load dataframe where to apply this:
    pred_df = pd.read_pickle(dataset)

    for epoch in epochs:

        print(epoch)

        # Create pipeline depending on the BERT model of the specified period
        # and the number of expected predictions:
        model_rd = explore_preds.create_mask_pipeline(epoch, pred_toks)

        # Use BERT to find most likely predictions for a mask:
        pred_df["pred_bert_" + epoch] = pred_df.apply(
            lambda row: explore_preds.bert_masking(row, model_rd), axis=1
        )

    pred_df.to_pickle(output_file)