In [17]:
import torch
import seaborn as sns
import pandas as pd
import transformers
from transformers import pipeline, AutoModelForMaskedLM, AutoTokenizer
from datasets import load_dataset, load_from_disk
from collections import Counter, defaultdict
from pathlib import Path
import numpy as np
from tqdm.auto import tqdm
from sklearn.linear_model import LinearRegression
sns.set()

## Year Prediction

In [18]:
test_data = load_from_disk('/datadrive_2/frozen_corpus')

In [19]:
test_data

DatasetDict({
    train: Dataset({
        features: ['year', 'nlp', 'pol', 'loc', 'sentences', 'ocr', 'length'],
        num_rows: 5234550
    })
    test: Dataset({
        features: ['year', 'nlp', 'pol', 'loc', 'sentences', 'ocr', 'length'],
        num_rows: 581857
    })
})

In [20]:
test_data = test_data.map(lambda examples: {'sentences': [x.lower() for x in examples['sentences']]}, batched=True)

Loading cached processed dataset at /datadrive_2/frozen_corpus/train/cache-18daae07a5d7f4c8.arrow
Loading cached processed dataset at /datadrive_2/frozen_corpus/test/cache-573058092beebe7f.arrow


In [21]:
test_set = test_data['test'].shuffle(seed=42).select(range(10000))


Loading cached shuffled indices for dataset at /datadrive_2/frozen_corpus/test/cache-8bc902c778a6e7a6.arrow


In [22]:
masked_year = True
if masked_year:
    import re
    pattern = re.compile(r'\b1[789][0-9]{2}\b')
    test_set = test_set.map(lambda x: {'sentences': pattern.sub('[MASK]',x['sentences'])})

Loading cached processed dataset at /datadrive_2/frozen_corpus/test/cache-20b4c5659e03f52f.arrow


In [23]:
#test_set['sentences']

In [24]:
def mask_time_token(example,special_token='SEP'):
    return {f'masked_{special_token}': f'[MASK] [{special_token}] '+ example['sentences']}
test_set = test_set.map(mask_time_token, fn_kwargs={'special_token':'SEP'})
test_set = test_set.map(mask_time_token, fn_kwargs={'special_token':'DATE'})

Loading cached processed dataset at /datadrive_2/frozen_corpus/test/cache-882e4f56562607c5.arrow
Loading cached processed dataset at /datadrive_2/frozen_corpus/test/cache-30718c510e8e1f5c.arrow


In [25]:
checkpoints = [('bnert-time-st-y','/datadrive_2/bnert-time-st-y','SEP'),
               ('bnert-time-y','/datadrive_2/bnert-time-y','DATE'),
               ('bnert-time-y_masked_25','/datadrive_2/bnert-time-y_masked_25','DATE'),
               ('bnert-time-y_masked_75','/datadrive_2/bnert-time-y_masked_75','DATE')]

model_dict = defaultdict(dict)
for name,checkpoint, st in checkpoints:
    model_dict[name]['model'] = AutoModelForMaskedLM.from_pretrained(checkpoint)
    model_dict[name]['tokenizer'] = AutoTokenizer.from_pretrained(checkpoint)
    model_dict[name]['special_token'] = st

In [26]:
test_set[0]

{'year': 1857,
 'nlp': 2194,
 'pol': '[lib]',
 'loc': '[london]',
 'sentences': "to alexandria some daysago, after having successfully performed the objectof her mission, which was undertake', for the pur-pose of taking deep-water soundings between alex-andria, rhodes, and candia. the greatest depthof water on the line to rhodes, i am inf.: ,rmed, wasabout 1,650 fathoms, and on that between alex-andria and candia about 1,700 fathoms. the bot-tom was found to consist generally of yellow mud.the sounding apparatus, i understand, was en-tirely constrected on board.captain mansell, at the'request of the viceroy, isat present engaged in a general survey of the coastsof the red sea, to the distance of some miles oneither side of",
 'ocr': 0.9506,
 'length': 100,
 'masked_SEP': "[MASK] [SEP] to alexandria some daysago, after having successfully performed the objectof her mission, which was undertake', for the pur-pose of taking deep-water soundings between alex-andria, rhodes, and candia. the

In [27]:


def get_masked_batches(data,st,batch_size=128):
    return (data[i:i+batch_size][f'masked_{st}'] for i in range(0,len(data), batch_size))

def get_year_prediction(data,model,tokenizer,st,mask_position=1):
    predictions = []
    batches = get_masked_batches(data,st)
    for batch in tqdm(batches):
        inputs = tokenizer(batch, return_tensors='pt', padding='max_length', max_length=256, truncation=True,)
        outputs = model(**inputs)
        #torch.cuda.empty_cache()
        predictions.extend([tokenizer.decode(i.item()) for i in outputs.logits[:,mask_position,:].argmax(dim=-1)])
    return predictions

In [28]:
#torch.cuda.empty_cache()
result_dict = {}
for name, mdict in model_dict.items():
    result_dict[name] = get_year_prediction(test_set, 
                                          model_dict[name]['model'],
                                          model_dict[name]['tokenizer'],
                                          model_dict[name]['special_token'])

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [29]:
years = np.array(test_data['train']['year'])
mc_year = Counter(years).most_common(1)[0][0]; mc_year

1846

In [30]:
def random_baseline(example):
    predicted = int(np.random.choice(years, size=1))
    target_year = int(example['year'])
    return {'diff_random':abs(target_year-predicted)}


def majority_baseline(example):
    predicted = int(np.random.choice(years, size=1))
    return {'diff_majority':abs(1846-predicted)}

In [31]:
test_set = test_set.map(random_baseline)
test_set = test_set.map(majority_baseline)

Loading cached processed dataset at /datadrive_2/frozen_corpus/test/cache-84a763a1a21f22e0.arrow
Loading cached processed dataset at /datadrive_2/frozen_corpus/test/cache-a7359a50594cc2e5.arrow


In [32]:
results = test_set.remove_columns([ 'pol', 'loc', 'masked_SEP','masked_DATE'])

In [33]:
results_df = results.to_pandas()

In [34]:
for ch,res in result_dict.items():
    results_df[ch] = res

In [35]:
results_df

Unnamed: 0,year,nlp,sentences,ocr,length,diff_random,diff_majority,bnert-time-st-y,bnert-time-y,bnert-time-y_masked_25,bnert-time-y_masked_75
0,1857,2194,"to alexandria some daysago, after having succe...",0.9506,100,32,24,[1854],1854,1854,1854
1,1808,2194,of thee. coin.plaints.in all cases of recent o...,0.7470,100,44,18,[1810],1810,1802,1802
2,1864,2194,"july last,""his head drooped a little, and ther...",0.9392,100,22,28,[1864],1864,1864,1864
3,1858,2194,"golden grain. the weather has,since our last r...",0.9038,100,11,2,[1846],1846,1846,1846
4,1808,2646,"treated him with the utmost attention.4, the c...",0.9250,100,5,11,[1823],1823,1823,1823
...,...,...,...,...,...,...,...,...,...,...,...
9995,1842,2194,by the hon. member for roxburghshire against t...,0.9647,100,12,14,[1846],1846,1846,1846
9996,1803,2194,"shock ofan earthquake at christiana, in nqrway...",0.7931,100,3,14,[1820],1823,1823,1851
9997,1852,2194,house of datontion.the prisoner:—oh i good hea...,0.8132,100,3,16,[1822],1822,1820,1857
9998,1861,2194,"defendant, mr.clark, and the husband of the la...",0.9115,100,49,6,[1844],1844,1844,1844


In [36]:
for c in results_df.columns:
    if c.startswith('bnert'):
        results_df[f'diff-{c}'] = results_df.apply(lambda x: abs(x.year - int(x[c].lstrip('[').rstrip(']'))),
                                              axis=1)

In [37]:
results_df[[c for c in results_df.columns if c.startswith('diff')]].mean(axis=0).to_csv('tables/year_pred.csv')

In [38]:
print(results_df[[c for c in results_df.columns if c.startswith('diff')]].mean(axis=0).to_latex())

\begin{tabular}{lr}
\toprule
{} &        0 \\
\midrule
diff\_random                 &  19.2467 \\
diff\_majority               &  13.8074 \\
diff-bnert-time-st-y        &   9.1536 \\
diff-bnert-time-y           &   8.4410 \\
diff-bnert-time-y\_masked\_25 &   7.5401 \\
diff-bnert-time-y\_masked\_75 &   6.4239 \\
\bottomrule
\end{tabular}



  print(results_df[[c for c in results_df.columns if c.startswith('diff')]].mean(axis=0).to_latex())


# Fin.

In [None]:
batches = get_sent_batches(test_set)

In [None]:
batches = list(batches)

In [None]:
len(batches[0])

In [None]:
batch = batches[0]
len(batch)

In [None]:
batch[0]

In [None]:
inputs = tokenizer(batch, return_tensors='pt', padding='longest')
outputs = model(**inputs)

In [None]:
tokenizer.decode(outputs.logits[:,1,:].argmax(dim=-1))

In [None]:
mask_filler('[MASK] [SEP] Her Majesty the Queen.')

In [None]:
years = np.array(test_data['train']['year'])
mc_year = Counter(years).most_common(1)[0][0]; mc_year

In [None]:
np.mean(test_set['diff_random'])

In [None]:
np.mean(test_set['diff_majority'])

In [None]:
def diff_first_prediction(example):
    #try:
        text = '[MASK] [MET] ' + example['sentences']
        tokenized = tokenizer(text)
        if len(tokenized['input_ids']) > 512:
            print(len(text))
            print(text)
            text = tokenizer.decode(tokenized['input_ids'][:500]) + ' [SEP]'
            print(text)
            print(len(text))
        predictions = mask_filler(text)
        target_year = int(example['year'])
        print(predictions)
        pred_year = predictions[0]['token_str'].rstrip(']').lstrip('[')
        print(pred_year)
        return {'diff':abs(target_year-int(pred_year))}


In [None]:
test_set = test_set.map(diff_first_prediction)

In [None]:
data = test_set.remove_columns(['nlp', 'pol', 'sentences',])
data = data.to_pandas()
data['diff'].mean()

In [None]:
data['diff_majority'] = test_set['diff_majority']
data['diff_random'] = test_set['diff_random']
data[['diff_majority','diff_random','diff']].plot(kind='density')

In [None]:
import pandas as pd
pd.DataFrame()

In [None]:
sns.scatterplot(x='year',y='diff',data=data)

In [None]:

X = data[['year','ocr']]
y = data['diff']
reg = LinearRegression().fit(X, y)


In [None]:
reg.score(X, y)

In [None]:
import numpy as np
np.mean(test_set['diff'])

In [None]:
predictions = mask_filler('[MASK] [SEP] Hello, my Queen.')

In [None]:
predictions

## Masking Pipeline

In [None]:
"Mr. Gladstone might be observed on the ministerial side of thehouse, making every sort of parliamentary endeavour to catch the Speaker's eye."
"Mr. Disraeli, however, preserved so much of his prerogtive as the hitherto recognised leader of her Majesty's Opposition as to obtain without difficulty the right of pre-audience."


sent = "The Prime Minister, Mr. [MASK] might be observed on the ministerial side of thehouse, making every sort of parliamentary endeavour to catch the Speaker's eye."
#sent = "Mr. Peel might be observed on the ministerial side of thehouse, making every sort of parliamentary endeavour to catch the Speaker's eye. Mr. [MASK], however, preserved so much of his prerogtive as the hitherto recognised leader of her Majesty's Opposition as to obtain without difficulty the right of pre-audience."



In [None]:

tokenizer = AutoTokenizer.from_pretrained("/datadrive_2/bnert_time")
mask_filler = pipeline(
    "fill-mask", model="/datadrive_2/bnert_time", top_k=5, tokenizer=tokenizer
)


In [None]:
text = f"1830 [SEP] {sent}"
#text = '[MASK] [SEP] His Majesty spoke to the people.'
preds = mask_filler(text)

for pred in preds:
    print(f">>> {pred['sequence']}")

## Loading Model and Dataset

### Dataset

In [None]:
cache_dir = '/datadrive_2/hf_cache/'
dataset = load_dataset("davanstrien/hmd_newspapers", cache_dir=cache_dir)


In [None]:
import re
pattern = re.compile(r'(\bprime\sminister\b)', re.I)
#pattern.findall("gladstone  d'isreali")

In [None]:
prm = dataset.filter(lambda x: len(pattern.findall(x['text'])) > 0 , num_proc=12) 

In [None]:
a = prm.filter(lambda x: x['date'].year > 1850)

In [None]:
pattern1 = re.compile(r'(\bgladstone|\bisreali\b)', re.I)
prm1 = a.filter(lambda x: len(pattern1.findall(x['text'])) > 0 , num_proc=12) 

In [None]:
prm1['train'][4]

In [None]:
preds = [mask_filler('[MASK] [SEP] '+ text[:900]) for text in prm['train']['text'][:100]]

In [None]:
#preds

In [None]:
### Model

In [None]:
model_checkpoint = "bnert"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

## Extracting Vectors