In [7]:
import torch
import seaborn as sns
import pandas as pd
import transformers
from transformers import pipeline, AutoModelForMaskedLM, AutoTokenizer
from datasets import load_dataset, load_from_disk
from collections import Counter, defaultdict
from pathlib import Path
import numpy as np
from tqdm.auto import tqdm
from sklearn.linear_model import LinearRegression
sns.set()

## Year Prediction

In [8]:
test_data = load_from_disk('/datadrive_2/HMD_chunked_100_test')

In [88]:
test_data

Dataset({
    features: ['year', 'nlp', 'pol', 'loc', 'sentences', 'ocr'],
    num_rows: 11315511
})

In [89]:
test_data = test_data.map(lambda examples: {'sentences': [x.lower() for x in examples['sentences']]}, batched=True)

  0%|          | 0/11316 [00:00<?, ?ba/s]

In [138]:
random_select = False
if random_select:
    test_set = test_data.shuffle(seed=42).select(range(1000))
else:
    test_set = test_data.filter(lambda x: (x['year'] <= 1830) and (x['ocr'] < .5)).shuffle(seed=42).select(range(10000))

Loading cached processed dataset at /datadrive_2/HMD_chunked_100_test/cache-c79e83b2a6f33b3e.arrow
Loading cached shuffled indices for dataset at /datadrive_2/HMD_chunked_100_test/cache-42def9fedd91341f.arrow


In [139]:
masked_year = True
if masked_year:
    import re
    pattern = re.compile(r'\b1[789][0-9]{2}\b')
    test_set = test_set.map(lambda x: {'sentences': pattern.sub('[MASK]',x['sentences'])})

  0%|          | 0/10000 [00:00<?, ?ex/s]

In [140]:
#test_set['sentences']

In [141]:
def mask_time_token(example,special_token='SEP'):
    return {f'masked_{special_token}': f'[MASK] [{special_token}] '+ example['sentences']}
test_set = test_set.map(mask_time_token, fn_kwargs={'special_token':'SEP'})
test_set = test_set.map(mask_time_token, fn_kwargs={'special_token':'DATE'})

  0%|          | 0/10000 [00:00<?, ?ex/s]

  0%|          | 0/10000 [00:00<?, ?ex/s]

In [142]:
checkpoints = [('bnert-time-st-y','/datadrive_2/bnert-time-st-y','SEP'),
               ('bnert-time-y','/datadrive_2/bnert-time-y','DATE'),
               ('bnert-time-y_masked_25','/datadrive_2/bnert-time-y_masked_25','DATE'),
               ('bnert-time-y_masked_75','/datadrive_2/bnert-time-y_masked_75','DATE')]

model_dict = defaultdict(dict)
for name,checkpoint, st in checkpoints:
    model_dict[name]['model'] = AutoModelForMaskedLM.from_pretrained(checkpoint)
    model_dict[name]['tokenizer'] = AutoTokenizer.from_pretrained(checkpoint)
    model_dict[name]['special_token'] = st

In [143]:
test_set[10]

{'year': 1814,
 'nlp': 2647,
 'pol': '[rad]',
 'loc': '[london]',
 'sentences': 'a i..egitt watch, acclaiming, that he wouiki hick her ie i 1 "\'i " . 417 • \' thea.rpaanhance the awe „whom c0=7404.78•4101.0404,014.pretillimit mi. the smuts, for the sole parpoes.of re.. and take care that sha.would not ilia,.htielipildpeto4- fill: - wo\'s:. ritds of "\'bare • • •- • .treivieg, &peak& and • counting the votes for • presi. • • sling the littlw.gots. their penalties tilt bridew, ... ,;,.:. ~\' \'°.,.. ~,....ttel. ker. ..uhate\'4°,,b.l!"/,lndi•niestriii,, es44.o..b.i""bit. he they gladly paid illigketbillwittdent.; mut sat after he shalt bit chosen, the can. ala nieman called to coefgioo!s..othellplet -•- 5111% -- aikt• -.----- •-\'•',
 'ocr': 0.2851,
 'masked_SEP': '[MASK] [SEP] a i..egitt watch, acclaiming, that he wouiki hick her ie i 1 "\'i " . 417 • \' thea.rpaanhance the awe „whom c0=7404.78•4101.0404,014.pretillimit mi. the smuts, for the sole parpoes.of re.. and take care that sha.

In [144]:



def get_masked_batches(data,st,batch_size=128):
    return (data[i:i+batch_size][f'masked_{st}'] for i in range(0,len(data), batch_size))

def get_year_prediction(data,model,tokenizer,st,mask_position=1):
    predictions = []
    batches = get_masked_batches(data,st)
    for batch in tqdm(batches):
        inputs = tokenizer(batch, return_tensors='pt', padding='max_length', max_length=256, truncation=True,)
        outputs = model(**inputs)
        #torch.cuda.empty_cache()
        predictions.extend([tokenizer.decode(i.item()) for i in outputs.logits[:,mask_position,:].argmax(dim=-1)])
    return predictions

In [145]:
#torch.cuda.empty_cache()
result_dict = {}
for name, mdict in model_dict.items():
    result_dict[name] = get_year_prediction(test_set, 
                                          model_dict[name]['model'],
                                          model_dict[name]['tokenizer'],
                                          model_dict[name]['special_token'])

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [146]:
if random_select:
    years = np.array(test_data['year'])
else:
    years = np.array(test_data['year'])
mc_year = Counter(years).most_common(1)[0][0]; mc_year

1846

In [147]:
def random_baseline(example):
    predicted = int(np.random.choice(years, size=1))
    target_year = int(example['year'])
    return {'diff_random':abs(target_year-predicted)}


def majority_baseline(example):
    predicted = int(np.random.choice(years, size=1))
    return {'diff_majority':abs(1813-predicted)}

In [148]:
test_set = test_set.map(random_baseline)
test_set = test_set.map(majority_baseline)

  0%|          | 0/10000 [00:00<?, ?ex/s]

  0%|          | 0/10000 [00:00<?, ?ex/s]

In [149]:
results = test_set.remove_columns([ 'pol', 'loc', 'masked_SEP','masked_DATE'])

In [150]:
results_df = results.to_pandas()

In [133]:
for ch,res in result_dict.items():
    results_df[ch] = res

In [134]:
results_df

Unnamed: 0,year,nlp,sentences,ocr,diff_random,diff_majority,bnert-time-st-y,bnert-time-y,bnert-time-y_masked_25,bnert-time-y_masked_75
0,1819,2647,a f 1 elects of the i' plat ammerslia tile. ne...,0.4530,26,33,[1813],1819,1819,1819
1,1805,2194,"e4!•rtption, ath lai i ,141 r 45is i , 7.1 4 c...",0.3123,28,25,[1814],1813,1813,1805
2,1827,2646,my furnitore ws irma bon. east inwa c ny's ser...,0.4863,12,27,[1819],1820,1820,1812
3,1819,2647,"to. the present practise, %vela ~ is s flpfeed...",0.4844,25,39,[1814],1813,1813,1813
4,1822,2647,"4.:: um• raft,varies et -reit snaky' dot ef-se...",0.4629,7,56,[1819],1819,1819,1819
...,...,...,...,...,...,...,...,...,...,...
995,1802,2194,"iiarr ied:- on gaturday, at cbadljngton„ benja...",0.3766,27,45,[1814],1810,1814,1801
996,1811,2194,".1 a' k. i? p 1.4 g fit).ast. paraaise-fotv, r...",0.3923,58,31,[1813],1813,1813,1805
997,1813,2647,appseraggillif ftwir large po.metes - pleat* k...,0.3860,17,54,[1819],1819,1819,1814
998,1811,2194,"tt, grit • nfikets nt stainh••., •"" ad' grirat...",0.4751,39,37,[1810],1808,1810,1810


In [135]:
for c in results_df.columns:
    if c.startswith('bnert'):
        results_df[f'diff-{c}'] = results_df.apply(lambda x: abs(x.year - int(x[c].lstrip('[').rstrip(']'))),
                                              axis=1)

In [136]:
results_df.to_csv('tables/tm_year_predict_no_dates.csv')

In [137]:
print(results_df[[c for c in results_df.columns if c.startswith('diff')]].mean(axis=0).to_latex())

\begin{tabular}{lr}
\toprule
{} &       0 \\
\midrule
diff\_random                 &  32.343 \\
diff\_majority               &  33.452 \\
diff-bnert-time-st-y        &   7.523 \\
diff-bnert-time-y           &   7.087 \\
diff-bnert-time-y\_masked\_25 &   7.311 \\
diff-bnert-time-y\_masked\_75 &   7.182 \\
\bottomrule
\end{tabular}



  print(results_df[[c for c in results_df.columns if c.startswith('diff')]].mean(axis=0).to_latex())


In [38]:
print(results_df[[c for c in results_df.columns if c.startswith('diff')]].mean(axis=0).to_latex())

\begin{tabular}{lr}
\toprule
{} &       0 \\
\midrule
diff\_random                 &  19.602 \\
diff\_majority               &  13.096 \\
diff-bnert-time-st-y        &   9.420 \\
diff-bnert-time-y           &   8.235 \\
diff-bnert-time-y\_masked\_25 &   7.353 \\
diff-bnert-time-y\_masked\_75 &   6.066 \\
\bottomrule
\end{tabular}



  print(results_df[[c for c in results_df.columns if c.startswith('diff')]].mean(axis=0).to_latex())


## Inspect Results
    

In [77]:
results_df_nodates = pd.read_csv('tables/tm_year_predict_no_dates.csv')
results_df_withdates = pd.read_csv('tables/tm_year_predict_standard.csv')

In [84]:
print(pd.concat([results_df_nodates[[c for c in results_df_nodates.columns if c.startswith('diff')]].mean(axis=0),
           results_df_withdates[[c for c in results_df_nodates.columns if c.startswith('diff')]].mean(axis=0)],axis=1
                 ).round(3).to_latex())

\begin{tabular}{lrr}
\toprule
{} &       0 &       1 \\
\midrule
diff\_random                 &  19.458 &  19.602 \\
diff\_majority               &  13.929 &  13.096 \\
diff-bnert-time-st-y        &   9.744 &   9.420 \\
diff-bnert-time-y           &   8.625 &   8.235 \\
diff-bnert-time-y\_masked\_25 &   7.756 &   7.353 \\
diff-bnert-time-y\_masked\_75 &   6.317 &   6.066 \\
\bottomrule
\end{tabular}



  print(pd.concat([results_df_nodates[[c for c in results_df_nodates.columns if c.startswith('diff')]].mean(axis=0),


# Fin.

In [2]:
import pandas as pd
results_df = pd.read_csv('tables/year_pred.csv')

In [3]:
results_df[[c for c in results_df.columns if c.startswith('diff')]].mean(axis=0).to_csv('tables/year_pred.csv')

In [37]:
results_df

Unnamed: 0,year,nlp,sentences,ocr,diff_random,diff_majority,bnert-time-st-y,bnert-time-y,bnert-time-y_masked_25,bnert-time-y_masked_75,diff-bnert-time-st-y,diff-bnert-time-y,diff-bnert-time-y_masked_25,diff-bnert-time-y_masked_75
0,1859,2084,"sitenirranv, roy institute as tou ry, v. ireno...",0.6430,22,11,[1859],1859,1860,1851,0,0,1,8
1,1850,2194,"atveze,and she had a heavy beam-sea during the...",0.8156,8,12,[1853],1853,1853,1853,3,3,3,3
2,1823,2194,withsuch regulations as we have alluded to. we...,0.9192,24,11,[1844],1844,1844,1844,21,21,21,21
3,1848,2642,atthe several statutes and proceedings of the ...,0.8049,21,5,[1831],1831,1831,1831,17,17,17,17
4,1848,2194,"paid, to any address, secure fromobservation, ...",0.8958,3,15,[1844],1844,1846,1848,4,4,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1846,2194,"thumb takes the giants down apeg or two—"" from...",0.7595,0,35,[1856],1850,1849,1844,10,4,3,2
996,1823,2194,"price of raw fat, per stone of glh.--d""'6',dpr...",0.6541,31,17,[1846],1847,1847,1831,23,24,24,8
997,1847,2194,of the various disorders arising from the foll...,0.5860,7,10,[1845],1846,1845,1845,2,1,2,2
998,1845,2194,"cinctitatt7wirmeon, friday.—the queen promenad...",0.9586,37,9,[1844],1844,1844,1844,1,1,1,1


In [None]:
batches = get_sent_batches(test_set)

In [None]:
batches = list(batches)

In [None]:
len(batches[0])

In [None]:
batch = batches[0]
len(batch)

In [None]:
batch[0]

In [None]:
inputs = tokenizer(batch, return_tensors='pt', padding='longest')
outputs = model(**inputs)

In [None]:
tokenizer.decode(outputs.logits[:,1,:].argmax(dim=-1))

In [None]:
mask_filler('[MASK] [SEP] Her Majesty the Queen.')

In [None]:
years = np.array(test_data['train']['year'])
mc_year = Counter(years).most_common(1)[0][0]; mc_year

In [None]:
np.mean(test_set['diff_random'])

In [None]:
np.mean(test_set['diff_majority'])

In [None]:
def diff_first_prediction(example):
    #try:
        text = '[MASK] [MET] ' + example['sentences']
        tokenized = tokenizer(text)
        if len(tokenized['input_ids']) > 512:
            print(len(text))
            print(text)
            text = tokenizer.decode(tokenized['input_ids'][:500]) + ' [SEP]'
            print(text)
            print(len(text))
        predictions = mask_filler(text)
        target_year = int(example['year'])
        print(predictions)
        pred_year = predictions[0]['token_str'].rstrip(']').lstrip('[')
        print(pred_year)
        return {'diff':abs(target_year-int(pred_year))}


In [None]:
test_set = test_set.map(diff_first_prediction)

In [None]:
data = test_set.remove_columns(['nlp', 'pol', 'sentences',])
data = data.to_pandas()
data['diff'].mean()

In [None]:
data['diff_majority'] = test_set['diff_majority']
data['diff_random'] = test_set['diff_random']
data[['diff_majority','diff_random','diff']].plot(kind='density')

In [None]:
import pandas as pd
pd.DataFrame()

In [None]:
sns.scatterplot(x='year',y='diff',data=data)

In [None]:

X = data[['year','ocr']]
y = data['diff']
reg = LinearRegression().fit(X, y)


In [None]:
reg.score(X, y)

In [None]:
import numpy as np
np.mean(test_set['diff'])

In [None]:
predictions = mask_filler('[MASK] [SEP] Hello, my Queen.')

In [None]:
predictions

## Masking Pipeline

In [None]:
"Mr. Gladstone might be observed on the ministerial side of thehouse, making every sort of parliamentary endeavour to catch the Speaker's eye."
"Mr. Disraeli, however, preserved so much of his prerogtive as the hitherto recognised leader of her Majesty's Opposition as to obtain without difficulty the right of pre-audience."


sent = "The Prime Minister, Mr. [MASK] might be observed on the ministerial side of thehouse, making every sort of parliamentary endeavour to catch the Speaker's eye."
#sent = "Mr. Peel might be observed on the ministerial side of thehouse, making every sort of parliamentary endeavour to catch the Speaker's eye. Mr. [MASK], however, preserved so much of his prerogtive as the hitherto recognised leader of her Majesty's Opposition as to obtain without difficulty the right of pre-audience."



In [None]:

tokenizer = AutoTokenizer.from_pretrained("/datadrive_2/bnert_time")
mask_filler = pipeline(
    "fill-mask", model="/datadrive_2/bnert_time", top_k=5, tokenizer=tokenizer
)


In [None]:
text = f"1830 [SEP] {sent}"
#text = '[MASK] [SEP] His Majesty spoke to the people.'
preds = mask_filler(text)

for pred in preds:
    print(f">>> {pred['sequence']}")

## Loading Model and Dataset

### Dataset

In [None]:
cache_dir = '/datadrive_2/hf_cache/'
dataset = load_dataset("davanstrien/hmd_newspapers", cache_dir=cache_dir)


In [None]:
import re
pattern = re.compile(r'(\bprime\sminister\b)', re.I)
#pattern.findall("gladstone  d'isreali")

In [None]:
prm = dataset.filter(lambda x: len(pattern.findall(x['text'])) > 0 , num_proc=12) 

In [None]:
a = prm.filter(lambda x: x['date'].year > 1850)

In [None]:
pattern1 = re.compile(r'(\bgladstone|\bisreali\b)', re.I)
prm1 = a.filter(lambda x: len(pattern1.findall(x['text'])) > 0 , num_proc=12) 

In [None]:
prm1['train'][4]

In [None]:
preds = [mask_filler('[MASK] [SEP] '+ text[:900]) for text in prm['train']['text'][:100]]

In [None]:
#preds

In [None]:
### Model

In [None]:
model_checkpoint = "bnert"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

## Extracting Vectors