In [1]:
import torch
import seaborn as sns

In [4]:

import pandas as pd
import transformers
from transformers import pipeline, AutoModelForMaskedLM, AutoTokenizer
from datasets import load_dataset, load_from_disk
from collections import Counter, defaultdict
from pathlib import Path
import numpy as np
from tqdm.auto import tqdm
from sklearn.linear_model import LinearRegression
sns.set()

## Year Prediction

In [4]:
test_data = load_from_disk('/datadrive_2/frozen_corpus')

In [5]:
test_data

DatasetDict({
    train: Dataset({
        features: ['year', 'nlp', 'pol', 'loc', 'sentences', 'ocr', 'length'],
        num_rows: 5234550
    })
    test: Dataset({
        features: ['year', 'nlp', 'pol', 'loc', 'sentences', 'ocr', 'length'],
        num_rows: 581857
    })
})

In [33]:
test_set = test_data['test'].shuffle(seed=42).select(range(10000))


Loading cached shuffled indices for dataset at /datadrive_2/frozen_corpus/test/cache-5b54b43a305c6ec8.arrow


In [34]:
def mask_time_token(example):
    return {'masked': '[MASK] [SEP] '+ example['sentences']}
test_set = test_set.map(mask_time_token)

Loading cached processed dataset at /datadrive_2/frozen_corpus/test/cache-8078932f57f58a96.arrow


In [35]:
test_set

Dataset({
    features: ['year', 'nlp', 'pol', 'loc', 'sentences', 'ocr', 'length', 'masked'],
    num_rows: 10000
})

In [36]:
path = Path('/datadrive_2')
checkpoints = ['bnert_time','bnert-time-st-y']
model_dict = defaultdict(dict)

for checkpoint in checkpoints:
    model_dict[checkpoint]['model'] = AutoModelForMaskedLM.from_pretrained(path / checkpoint)#.to('cuda')
    model_dict[checkpoint]['tokenizer'] = AutoTokenizer.from_pretrained(path / checkpoint)


In [37]:


def get_masked_batches(data,batch_size=128):
    return (data[i:i+batch_size]['masked'] for i in range(0,len(data), batch_size))

def get_year_prediction(data,model,tokenizer,mask_position=1):
    predictions = []
    batches = get_masked_batches(data)
    for batch in tqdm(batches):
        inputs = tokenizer(batch, return_tensors='pt', padding='max_length', max_length=256, truncation=True,)
        outputs = model(**inputs)
        #torch.cuda.empty_cache()
        predictions.extend([tokenizer.decode(i.item()) for i in outputs.logits[:,mask_position,:].argmax(dim=-1)])
    return predictions

In [None]:
#torch.cuda.empty_cache()
result_dict = {}
for ch in checkpoints:
    result_dict[ch] = get_year_prediction(test_set, 
                                          model_dict[ch]['model'],
                                          model_dict[ch]['tokenizer'])

0it [00:00, ?it/s]

In [None]:
years = np.array(test_data['train']['year'])
mc_year = Counter(years).most_common(1)[0][0]; mc_year

In [None]:
def random_baseline(example):
    predicted = int(np.random.choice(years, size=1))
    target_year = int(example['year'])
    return {'diff_random':abs(target_year-predicted)}


def majority_baseline(example):
    predicted = int(np.random.choice(years, size=1))
    return {'diff_majority':abs(1846-predicted)}

In [None]:
test_set = test_set.map(random_baseline)
test_set = test_set.map(majority_baseline)

In [18]:
results = test_set.remove_columns([ 'pol', 'loc', 'masked'])

In [19]:
results_df = results.to_pandas()

In [20]:
for ch,res in result_dict.items():
    results_df[ch] = res

In [28]:
results_df['diff-time-ts'] = results_df.apply(lambda x: abs(x.year - int(x['bnert-time-st-y'].lstrip('[').rstrip(']'))),
                                              axis=1)

In [29]:
results_df['diff-time-no-ts'] = results_df.apply(lambda x: abs(x.year - int(x['bnert_time'])),
                                              axis=1)

In [32]:
print(results_df[[c for c in results_df.columns if c.startswith('diff')]].mean(axis=0).to_latex())

\begin{tabular}{lr}
\toprule
{} &       0 \\
\midrule
diff\_random     &  18.783 \\
diff\_majority   &  12.921 \\
diff-time-ts    &   8.531 \\
diff-time-no-ts &   7.315 \\
\bottomrule
\end{tabular}



  print(results_df[[c for c in results_df.columns if c.startswith('diff')]].mean(axis=0).to_latex())


In [None]:
batches = get_sent_batches(test_set)

In [None]:
batches = list(batches)

In [None]:
len(batches[0])

In [None]:
batch = batches[0]
len(batch)

In [None]:
batch[0]

In [None]:
inputs = tokenizer(batch, return_tensors='pt', padding='longest')
outputs = model(**inputs)

In [None]:
tokenizer.decode(outputs.logits[:,1,:].argmax(dim=-1))

In [None]:
mask_filler('[MASK] [SEP] Her Majesty the Queen.')

In [None]:
years = np.array(test_data['train']['year'])
mc_year = Counter(years).most_common(1)[0][0]; mc_year

In [None]:
np.mean(test_set['diff_random'])

In [None]:
np.mean(test_set['diff_majority'])

In [None]:
def diff_first_prediction(example):
    #try:
        text = '[MASK] [MET] ' + example['sentences']
        tokenized = tokenizer(text)
        if len(tokenized['input_ids']) > 512:
            print(len(text))
            print(text)
            text = tokenizer.decode(tokenized['input_ids'][:500]) + ' [SEP]'
            print(text)
            print(len(text))
        predictions = mask_filler(text)
        target_year = int(example['year'])
        print(predictions)
        pred_year = predictions[0]['token_str'].rstrip(']').lstrip('[')
        print(pred_year)
        return {'diff':abs(target_year-int(pred_year))}


In [None]:
test_set = test_set.map(diff_first_prediction)

In [None]:
data = test_set.remove_columns(['nlp', 'pol', 'sentences',])
data = data.to_pandas()
data['diff'].mean()

In [None]:
data['diff_majority'] = test_set['diff_majority']
data['diff_random'] = test_set['diff_random']
data[['diff_majority','diff_random','diff']].plot(kind='density')

In [None]:
import pandas as pd
pd.DataFrame()

In [None]:
sns.scatterplot(x='year',y='diff',data=data)

In [None]:

X = data[['year','ocr']]
y = data['diff']
reg = LinearRegression().fit(X, y)


In [None]:
reg.score(X, y)

In [None]:
import numpy as np
np.mean(test_set['diff'])

In [None]:
predictions = mask_filler('[MASK] [SEP] Hello, my Queen.')

In [None]:
predictions

## Masking Pipeline

In [None]:
"Mr. Gladstone might be observed on the ministerial side of thehouse, making every sort of parliamentary endeavour to catch the Speaker's eye."
"Mr. Disraeli, however, preserved so much of his prerogtive as the hitherto recognised leader of her Majesty's Opposition as to obtain without difficulty the right of pre-audience."


sent = "The Prime Minister, Mr. [MASK] might be observed on the ministerial side of thehouse, making every sort of parliamentary endeavour to catch the Speaker's eye."
#sent = "Mr. Peel might be observed on the ministerial side of thehouse, making every sort of parliamentary endeavour to catch the Speaker's eye. Mr. [MASK], however, preserved so much of his prerogtive as the hitherto recognised leader of her Majesty's Opposition as to obtain without difficulty the right of pre-audience."



In [None]:

tokenizer = AutoTokenizer.from_pretrained("/datadrive_2/bnert_time")
mask_filler = pipeline(
    "fill-mask", model="/datadrive_2/bnert_time", top_k=5, tokenizer=tokenizer
)


In [None]:
text = f"1830 [SEP] {sent}"
#text = '[MASK] [SEP] His Majesty spoke to the people.'
preds = mask_filler(text)

for pred in preds:
    print(f">>> {pred['sequence']}")

## Loading Model and Dataset

### Dataset

In [None]:
cache_dir = '/datadrive_2/hf_cache/'
dataset = load_dataset("davanstrien/hmd_newspapers", cache_dir=cache_dir)


In [None]:
import re
pattern = re.compile(r'(\bprime\sminister\b)', re.I)
#pattern.findall("gladstone  d'isreali")

In [None]:
prm = dataset.filter(lambda x: len(pattern.findall(x['text'])) > 0 , num_proc=12) 

In [None]:
a = prm.filter(lambda x: x['date'].year > 1850)

In [None]:
pattern1 = re.compile(r'(\bgladstone|\bisreali\b)', re.I)
prm1 = a.filter(lambda x: len(pattern1.findall(x['text'])) > 0 , num_proc=12) 

In [None]:
prm1['train'][4]

In [None]:
preds = [mask_filler('[MASK] [SEP] '+ text[:900]) for text in prm['train']['text'][:100]]

In [None]:
#preds

In [None]:
### Model

In [None]:
model_checkpoint = "bnert"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

## Extracting Vectors