In [9]:
import torch
import seaborn as sns
import pandas as pd
import transformers
from transformers import pipeline, AutoModelForMaskedLM, AutoTokenizer
from datasets import load_dataset, load_from_disk
from collections import Counter, defaultdict
from pathlib import Path
import numpy as np
import re
from tqdm.auto import tqdm
from sklearn.linear_model import LinearRegression
sns.set()

## Political Leaning (`pol`) Prediction

In [4]:
test_data = load_from_disk('/datadrive_2/HMD_chunked_100_test')

In [5]:
test_data

Dataset({
    features: ['year', 'nlp', 'pol', 'loc', 'sentences', 'ocr'],
    num_rows: 11315511
})

In [6]:
test_data = test_data.map(lambda examples: {'sentences': [x.lower() for x in examples['sentences']]}, batched=True)

Loading cached processed dataset at /datadrive_2/HMD_chunked_100_test/cache-5f63fae813d1e558.arrow


In [117]:
# Build the 1000-example evaluation sample. With `random_select` off, only
# chunks mentioning a party keyword are kept before shuffling and sampling.
random_select = False

if not random_select:
    pol_pattern = re.compile(r'\bliberal|\bconservat|\btory\b|\btories\b',re.I)
    # The lambdas are kept verbatim so previously cached map/filter results can be reused.
    lowered = test_data.map(lambda x: {'sentences': x['sentences'].lower()}, num_proc=6)
    political = lowered.filter(lambda x: len(pol_pattern.findall(x['sentences'])) > 0)
    test_set = political.shuffle(seed=42).select(range(1000))
else:
    test_set = test_data.shuffle(seed=42).select(range(1000))
    

       

Loading cached processed dataset at /datadrive_2/HMD_chunked_100_test/cache-53ac24fc25cbbd5b.arrow


 

Loading cached processed dataset at /datadrive_2/HMD_chunked_100_test/cache-104022a30e4270c4.arrow


  

Loading cached processed dataset at /datadrive_2/HMD_chunked_100_test/cache-ba4b05635d7ee3eb.arrow
Loading cached processed dataset at /datadrive_2/HMD_chunked_100_test/cache-621153ec30285ff3.arrow


  

Loading cached processed dataset at /datadrive_2/HMD_chunked_100_test/cache-a7ecd51e15792e88.arrow
Loading cached processed dataset at /datadrive_2/HMD_chunked_100_test/cache-918abd0192f9cad2.arrow
Loading cached processed dataset at /datadrive_2/HMD_chunked_100_test/cache-66b22b24541acf77.arrow
Loading cached shuffled indices for dataset at /datadrive_2/HMD_chunked_100_test/cache-3e31e92652ddae9c.arrow


In [118]:
# masked_year = False
# if masked_year:
#     import re
#     pattern = re.compile(r'\b1[789][0-9]{2}\b')
#     test_set = test_set.map(lambda x: {'sentences': pattern.sub('[MASK]',x['sentences'])})

In [119]:
#test_set['sentences']

In [120]:
def mask_time_token(example,special_token='SEP'):
    """Prepend a masked slot and a bracketed marker token to the example text.

    Args:
        example: mapping with a 'sentences' string.
        special_token: marker name; it is wrapped in square brackets in the
            text and also used to name the output column.

    Returns:
        A one-entry dict ``{'masked_<special_token>': '[MASK] [<special_token>] ' + text}``.
    """
    column = f'masked_{special_token}'
    prefix = f'[MASK] [{special_token}] '
    return {column: prefix + example['sentences']}

def mask_combined(example, special_token='COMB'):
    """Prepend masked date, pol and loc slots to the example text.

    Args:
        example: mapping with a 'sentences' string.
        special_token: accepted for signature parity with ``mask_time_token``
            but not used; the output column is always 'masked_COMB'.

    Returns:
        A one-entry dict keyed 'masked_COMB' holding the prefixed text.
    """
    prefix = '[MASK] [DATE] [MASK] [POL] [MASK] [LOC] '
    masked_text = prefix + example['sentences']
    return {'masked_COMB': masked_text}
test_set = test_set.map(mask_time_token, fn_kwargs={'special_token':'POL'})
test_set = test_set.map(mask_combined)

  0%|          | 0/1000 [00:00<?, ?ex/s]

  0%|          | 0/1000 [00:00<?, ?ex/s]

In [121]:
# One row per model under evaluation:
# (display name, checkpoint path, marker used in the masked column name,
#  token index whose prediction is read out).
checkpoints = [
    ('bnert-pol-st', '/datadrive_2/bnert-pol-st', 'POL', 1),
    ('bnert-pol', '/datadrive_2/bnert-pol', 'POL', 1),
    ('bnert-comb', '/datadrive_2/bnert-combined', 'COMB', 3),
]

# name -> {'model', 'tokenizer', 'special_token', 'mask_position'}
model_dict = defaultdict(dict)
for name, checkpoint, special_token, mask_position in checkpoints:
    entry = model_dict[name]
    entry['model'] = AutoModelForMaskedLM.from_pretrained(checkpoint)
    entry['tokenizer'] = AutoTokenizer.from_pretrained(checkpoint)
    entry['special_token'] = special_token
    entry['mask_position'] = mask_position

In [122]:
test_set[10]

{'year': 1867,
 'nlp': 2642,
 'pol': '[lib]',
 'loc': '[london]',
 'sentences': 'by a small majority. since that timehe enjoyed the honour of the representation undisturbed.previous to last general election it was understood that hewas anxious to withdraw into private life, and that colonelclark kennedy. c. 8., whose unlooked-for death in egyptwas announced last week, contemplated standing in hisstead ; but at the desire of some of his constituency heagain offered himself, and was again returned. mr.mackie, though an adherent of the liberal party, wassomewhat conservative in his views, and gave considerabledissatisfaction even to many of his friends by the way inwhich he acted during the divisions on the reform bill ofthe',
 'ocr': 0.8552,
 'masked_POL': '[MASK] [POL] by a small majority. since that timehe enjoyed the honour of the representation undisturbed.previous to last general election it was understood that hewas anxious to withdraw into private life, and that colonelclark kenne

In [123]:


def get_masked_batches(data,st,batch_size=128):
    """Lazily yield consecutive batches of the ``masked_<st>`` column.

    Args:
        data: sliceable container with ``len()``; each slice must be
            indexable by column name.
        st: marker suffix selecting the ``masked_<st>`` column.
        batch_size: number of rows per slice; the last batch may be shorter.

    Returns:
        A generator over the column values of each slice.
    """
    column = f'masked_{st}'
    starts = range(0, len(data), batch_size)
    return (data[start:start + batch_size][column] for start in starts)

def get_year_prediction(data,model,tokenizer,st,mask_position,batch_size=128):
    """Return the model's top-1 token at ``mask_position`` for every row.

    Despite the name, this is a generic masked-token read-out: it decodes
    whatever token the model ranks highest at the given position of the
    ``masked_<st>`` text.

    Args:
        data: dataset with a ``masked_<st>`` text column.
        model: masked-LM whose output exposes ``.logits``.
        tokenizer: tokenizer matching ``model``.
        st: marker suffix selecting the ``masked_<st>`` column.
        mask_position: index into the tokenised sequence whose logits are
            read (presumably the [MASK] slot, counting any leading special
            token the tokenizer adds -- verify per checkpoint).
        batch_size: rows per forward pass.

    Returns:
        List of decoded token strings, one per row, in dataset order.
    """
    predictions = []
    # The batches come from a generator with no len(); give tqdm the total explicitly.
    n_batches = -(-len(data) // batch_size)
    batches = get_masked_batches(data, st, batch_size=batch_size)
    # Inference only: disabling autograd avoids keeping a graph for each batch.
    with torch.no_grad():
        for batch in tqdm(batches, total=n_batches):
            # Every input is padded/truncated to a fixed length of 256 tokens.
            inputs = tokenizer(batch, return_tensors='pt', padding='max_length', max_length=256, truncation=True)
            outputs = model(**inputs)
            top_ids = outputs.logits[:, mask_position, :].argmax(dim=-1)
            predictions.extend(tokenizer.decode(token_id.item()) for token_id in top_ids)
    return predictions

In [124]:
#torch.cuda.empty_cache()
# Run every loaded checkpoint over the evaluation sample.
# Maps model name -> list of decoded predictions.
result_dict = {}
for name, mdict in model_dict.items():
    result_dict[name] = get_year_prediction(
        data=test_set,
        model=mdict['model'],
        tokenizer=mdict['tokenizer'],
        st=mdict['special_token'],
        mask_position=mdict['mask_position'],
    )

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [125]:
pols = np.array(test_data['pol'])
mc_pols = Counter(pols).most_common(1)[0][0]; mc_pols

'[lib]'

In [126]:
#st = lambda x: x.lstrip('[').rstrip(']')

In [127]:
lab2string = {'[lib]':'liberal',
              '[con]':'conservative',
              '[neutr]':'neutral',
              '[none]':'none',
              '[rad]':'radical'
             }

def random_baseline(example):
    """Baseline: predict a label sampled at random from the module-level ``pols``.

    Sampling uniformly over the entries of ``pols`` means each label is
    drawn in proportion to how often it occurs there. ``example`` is not
    inspected. The draw uses NumPy's global RNG, so results vary between
    runs unless that RNG is seeded.

    Returns:
        ``{'random': <label string from lab2string>}``.
    """
    (sampled_label,) = np.random.choice(pols, size=1)
    return {'random': lab2string[sampled_label]}


def majority_baseline(example):
    """Baseline: always predict the most frequent label (module-level ``mc_pols``).

    ``example`` is not inspected. The previous version looked up the gold
    label in ``lab2string`` and discarded the result, which only served to
    raise ``KeyError`` on an unmapped gold label.

    Returns:
        ``{'majority': <label string from lab2string>}``.
    """
    return {'majority': lab2string[mc_pols]}

In [128]:
test_set = test_set.map(random_baseline)
test_set = test_set.map(majority_baseline)

  0%|          | 0/1000 [00:00<?, ?ex/s]

  0%|          | 0/1000 [00:00<?, ?ex/s]

In [129]:
test_set[0]

{'year': 1839,
 'nlp': 2194,
 'pol': '[lib]',
 'loc': '[london]',
 'sentences': "such mea-sures, that triumphant majority which must carry themthrough every species of tory tribulatinn..—(cheer.s.)these are the views that i advocate—this theorpoislie.iytthat i would pursue. is it a dangerous.poliey,not recommended. by its moderation, by its good sense.?and if so, are we to support that government whichopposed to its recomnition or do you sanction hostility.to a government 'which 'will not adopt those.views,even though it may lead to a catastrophe of which theeffects are speculative ? i consider the prospect beforeus is bright, and that we have only to pursue a steadycourse of well doing, and that:your representatives oughtto enforce such sentiments as",
 'ocr': 0.9769,
 'masked_POL': "[MASK] [POL] such mea-sures, that triumphant majority which must carry themthrough every species of tory tribulatinn..—(cheer.s.)these are the views that i advocate—this theorpoislie.iytthat i would pursu

In [130]:
results = test_set.remove_columns(['nlp', 'loc'])

In [131]:
results_df = results.to_pandas()

In [132]:
for ch,res in result_dict.items():
    results_df[ch] = res

In [133]:
results_df

Unnamed: 0,year,pol,sentences,ocr,masked_POL,masked_COMB,random,majority,bnert-pol-st,bnert-pol,bnert-comb
0,1839,[lib],"such mea-sures, that triumphant majority which...",0.9769,"[MASK] [POL] such mea-sures, that triumphant m...",[MASK] [DATE] [MASK] [POL] [MASK] [LOC] such m...,liberal,liberal,[lib],liberal,liberal
1,1810,[none],m4t u 'ation'phis day was published. in two ha...,0.7388,[MASK] [POL] m4t u 'ation'phis day was publish...,[MASK] [DATE] [MASK] [POL] [MASK] [LOC] m4t u ...,liberal,liberal,[none],none,none
2,1842,[con],"8s and 15s 6doriginal testimonials, gratis, wi...",0.9446,[MASK] [POL] 8s and 15s 6doriginal testimonial...,[MASK] [DATE] [MASK] [POL] [MASK] [LOC] 8s and...,liberal,liberal,[lib],liberal,liberal
3,1841,[lib],"american corn.—(loud cries of ""hear, hear,"" an...",0.9324,"[MASK] [POL] american corn.—(loud cries of ""he...",[MASK] [DATE] [MASK] [POL] [MASK] [LOC] americ...,liberal,liberal,[lib],liberal,liberal
4,1844,[lib],"all offices, except in cases otherwise pro-d i...",0.7492,"[MASK] [POL] all offices, except in cases othe...",[MASK] [DATE] [MASK] [POL] [MASK] [LOC] all of...,liberal,liberal,[lib],liberal,liberal
...,...,...,...,...,...,...,...,...,...,...,...
995,1860,[lib],the personal interview about to takeplace betw...,0.9564,[MASK] [POL] the personal interview about to t...,[MASK] [DATE] [MASK] [POL] [MASK] [LOC] the pe...,liberal,liberal,[lib],liberal,liberal
996,1837,[lib],"he could bring himself to,,,siderations. in mu...",0.8137,"[MASK] [POL] he could bring himself to,,,sider...",[MASK] [DATE] [MASK] [POL] [MASK] [LOC] he cou...,liberal,liberal,[lib],liberal,liberal
997,1852,[lib],"- . .for h idwith theg qualities., s n sal _po...",0.6146,[MASK] [POL] - . .for h idwith theg qualities....,[MASK] [DATE] [MASK] [POL] [MASK] [LOC] - . .f...,liberal,liberal,[lib],liberal,liberal
998,1839,[lib],"challengedby the counsel for the prosecution),...",0.8009,[MASK] [POL] challengedby the counsel for the ...,[MASK] [DATE] [MASK] [POL] [MASK] [LOC] challe...,liberal,liberal,[lib],liberal,liberal


In [134]:
# Map bracketed label tokens (e.g. '[lib]') to plain words in the gold column
# and in the one model column that still carries brackets. Values that are
# not keys of lab2string are left as they are. A single non-mutating call
# replaces the two inplace=True mutations.
results_df = results_df.replace({"bnert-pol-st": lab2string, "pol": lab2string})

In [135]:
results_df

Unnamed: 0,year,pol,sentences,ocr,masked_POL,masked_COMB,random,majority,bnert-pol-st,bnert-pol,bnert-comb
0,1839,liberal,"such mea-sures, that triumphant majority which...",0.9769,"[MASK] [POL] such mea-sures, that triumphant m...",[MASK] [DATE] [MASK] [POL] [MASK] [LOC] such m...,liberal,liberal,liberal,liberal,liberal
1,1810,none,m4t u 'ation'phis day was published. in two ha...,0.7388,[MASK] [POL] m4t u 'ation'phis day was publish...,[MASK] [DATE] [MASK] [POL] [MASK] [LOC] m4t u ...,liberal,liberal,none,none,none
2,1842,conservative,"8s and 15s 6doriginal testimonials, gratis, wi...",0.9446,[MASK] [POL] 8s and 15s 6doriginal testimonial...,[MASK] [DATE] [MASK] [POL] [MASK] [LOC] 8s and...,liberal,liberal,liberal,liberal,liberal
3,1841,liberal,"american corn.—(loud cries of ""hear, hear,"" an...",0.9324,"[MASK] [POL] american corn.—(loud cries of ""he...",[MASK] [DATE] [MASK] [POL] [MASK] [LOC] americ...,liberal,liberal,liberal,liberal,liberal
4,1844,liberal,"all offices, except in cases otherwise pro-d i...",0.7492,"[MASK] [POL] all offices, except in cases othe...",[MASK] [DATE] [MASK] [POL] [MASK] [LOC] all of...,liberal,liberal,liberal,liberal,liberal
...,...,...,...,...,...,...,...,...,...,...,...
995,1860,liberal,the personal interview about to takeplace betw...,0.9564,[MASK] [POL] the personal interview about to t...,[MASK] [DATE] [MASK] [POL] [MASK] [LOC] the pe...,liberal,liberal,liberal,liberal,liberal
996,1837,liberal,"he could bring himself to,,,siderations. in mu...",0.8137,"[MASK] [POL] he could bring himself to,,,sider...",[MASK] [DATE] [MASK] [POL] [MASK] [LOC] he cou...,liberal,liberal,liberal,liberal,liberal
997,1852,liberal,"- . .for h idwith theg qualities., s n sal _po...",0.6146,[MASK] [POL] - . .for h idwith theg qualities....,[MASK] [DATE] [MASK] [POL] [MASK] [LOC] - . .f...,liberal,liberal,liberal,liberal,liberal
998,1839,liberal,"challengedby the counsel for the prosecution),...",0.8009,[MASK] [POL] challengedby the counsel for the ...,[MASK] [DATE] [MASK] [POL] [MASK] [LOC] challe...,liberal,liberal,liberal,liberal,liberal


In [136]:
results_df.to_csv('tables/pol_predict_random.csv')

In [137]:
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score

# Macro-averaged F1 / precision / recall of each prediction column against the gold 'pol' labels.
# Maps system name -> {metric name -> score}.
result_table = defaultdict(dict)
for m in ['random','majority','bnert-pol-st','bnert-pol','bnert-comb']:
    for metric in [f1_score, precision_score, recall_score]:
        # Some labels are never predicted (e.g. by the majority baseline), which
        # makes per-class precision undefined. zero_division=0 scores those
        # classes as 0, the same value the default uses, without the warning.
        result_table[m][metric.__name__] = metric(
            results_df['pol'], results_df[m], average='macro', zero_division=0
        )

  _warn_prf(average, modifier, msg_start, len(result))


In [138]:
print(pd.DataFrame(result_table).T.round(3).to_latex())

\begin{tabular}{lrrr}
\toprule
{} &  f1\_score &  precision\_score &  recall\_score \\
\midrule
random       &     0.204 &            0.203 &         0.210 \\
majority     &     0.172 &            0.151 &         0.200 \\
bnert-pol-st &     0.467 &            0.709 &         0.397 \\
bnert-pol    &     0.469 &            0.668 &         0.412 \\
bnert-comb   &     0.470 &            0.713 &         0.402 \\
\bottomrule
\end{tabular}



  print(pd.DataFrame(result_table).T.round(3).to_latex())


In [72]:
# Absolute error in years between the gold 'year' and each bnert* prediction column.
# NOTE(review): this cell expects the bnert* columns to hold year strings such as
# '[1859]' or '1859'. With the pol predictions built earlier in this notebook
# ('liberal', ...) the int conversion raises ValueError -- confirm which
# results_df this cell is meant to run against.
for c in results_df.columns:
    if c.startswith('bnert'):
        # Vectorised equivalent of the row-wise lstrip('[') / rstrip(']') / int().
        predicted_year = results_df[c].str.lstrip('[').str.rstrip(']').astype(int)
        results_df[f'diff-{c}'] = (results_df['year'] - predicted_year).abs()

In [73]:
results_df.to_csv('tables/tm_year_predict_no_dates.csv')

In [74]:
print(results_df[[c for c in results_df.columns if c.startswith('diff')]].mean(axis=0).to_latex())

\begin{tabular}{lr}
\toprule
{} &       0 \\
\midrule
diff\_random                 &  19.458 \\
diff\_majority               &  13.929 \\
diff-bnert-time-st-y        &   9.744 \\
diff-bnert-time-y           &   8.625 \\
diff-bnert-time-y\_masked\_25 &   7.756 \\
diff-bnert-time-y\_masked\_75 &   6.317 \\
\bottomrule
\end{tabular}



  print(results_df[[c for c in results_df.columns if c.startswith('diff')]].mean(axis=0).to_latex())


In [38]:
print(results_df[[c for c in results_df.columns if c.startswith('diff')]].mean(axis=0).to_latex())

\begin{tabular}{lr}
\toprule
{} &       0 \\
\midrule
diff\_random                 &  19.602 \\
diff\_majority               &  13.096 \\
diff-bnert-time-st-y        &   9.420 \\
diff-bnert-time-y           &   8.235 \\
diff-bnert-time-y\_masked\_25 &   7.353 \\
diff-bnert-time-y\_masked\_75 &   6.066 \\
\bottomrule
\end{tabular}



  print(results_df[[c for c in results_df.columns if c.startswith('diff')]].mean(axis=0).to_latex())


## Inspect Results
    

In [77]:
results_df_nodates = pd.read_csv('tables/tm_year_predict_no_dates.csv')
results_df_withdates = pd.read_csv('tables/tm_year_predict_standard.csv')

In [84]:
# Mean absolute year error per system: column 0 = no-dates run, column 1 = with-dates run.
# Each frame is indexed with its own diff* columns. The second frame was
# previously indexed with the first frame's column list, a copy-paste slip
# that raises KeyError as soon as the two files differ in columns.
diff_means = [
    frame[[c for c in frame.columns if c.startswith('diff')]].mean(axis=0)
    for frame in (results_df_nodates, results_df_withdates)
]
print(pd.concat(diff_means, axis=1).round(3).to_latex())

\begin{tabular}{lrr}
\toprule
{} &       0 &       1 \\
\midrule
diff\_random                 &  19.458 &  19.602 \\
diff\_majority               &  13.929 &  13.096 \\
diff-bnert-time-st-y        &   9.744 &   9.420 \\
diff-bnert-time-y           &   8.625 &   8.235 \\
diff-bnert-time-y\_masked\_25 &   7.756 &   7.353 \\
diff-bnert-time-y\_masked\_75 &   6.317 &   6.066 \\
\bottomrule
\end{tabular}



  print(pd.concat([results_df_nodates[[c for c in results_df_nodates.columns if c.startswith('diff')]].mean(axis=0),


# Fin.

In [2]:
import pandas as pd
results_df = pd.read_csv('tables/year_pred.csv')

In [3]:
# Write the per-system mean error to its own file. The previous target,
# 'tables/year_pred.csv', is the file results_df was read from, so running
# this cell overwrote the per-example predictions with the summary.
results_df[[c for c in results_df.columns if c.startswith('diff')]].mean(axis=0).to_csv('tables/year_pred_summary.csv')

In [37]:
results_df

Unnamed: 0,year,nlp,sentences,ocr,diff_random,diff_majority,bnert-time-st-y,bnert-time-y,bnert-time-y_masked_25,bnert-time-y_masked_75,diff-bnert-time-st-y,diff-bnert-time-y,diff-bnert-time-y_masked_25,diff-bnert-time-y_masked_75
0,1859,2084,"sitenirranv, roy institute as tou ry, v. ireno...",0.6430,22,11,[1859],1859,1860,1851,0,0,1,8
1,1850,2194,"atveze,and she had a heavy beam-sea during the...",0.8156,8,12,[1853],1853,1853,1853,3,3,3,3
2,1823,2194,withsuch regulations as we have alluded to. we...,0.9192,24,11,[1844],1844,1844,1844,21,21,21,21
3,1848,2642,atthe several statutes and proceedings of the ...,0.8049,21,5,[1831],1831,1831,1831,17,17,17,17
4,1848,2194,"paid, to any address, secure fromobservation, ...",0.8958,3,15,[1844],1844,1846,1848,4,4,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1846,2194,"thumb takes the giants down apeg or two—"" from...",0.7595,0,35,[1856],1850,1849,1844,10,4,3,2
996,1823,2194,"price of raw fat, per stone of glh.--d""'6',dpr...",0.6541,31,17,[1846],1847,1847,1831,23,24,24,8
997,1847,2194,of the various disorders arising from the foll...,0.5860,7,10,[1845],1846,1845,1845,2,1,2,2
998,1845,2194,"cinctitatt7wirmeon, friday.—the queen promenad...",0.9586,37,9,[1844],1844,1844,1844,1,1,1,1


In [None]:
batches = get_sent_batches(test_set)

In [None]:
batches = list(batches)

In [None]:
len(batches[0])

In [None]:
batch = batches[0]
len(batch)

In [None]:
batch[0]

In [None]:
inputs = tokenizer(batch, return_tensors='pt', padding='longest')
outputs = model(**inputs)

In [None]:
tokenizer.decode(outputs.logits[:,1,:].argmax(dim=-1))

In [None]:
mask_filler('[MASK] [SEP] Her Majesty the Queen.')

In [None]:
years = np.array(test_data['train']['year'])
mc_year = Counter(years).most_common(1)[0][0]; mc_year

In [None]:
np.mean(test_set['diff_random'])

In [None]:
np.mean(test_set['diff_majority'])

In [None]:
def diff_first_prediction(example):
    """Absolute error in years between the gold year and the top fill-mask prediction.

    Relies on the module-level ``tokenizer`` and ``mask_filler``.

    Args:
        example: mapping with 'sentences' (text) and 'year'.

    Returns:
        ``{'diff': abs(gold_year - predicted_year)}``.

    Raises:
        ValueError: if the top predicted token is not an integer once the
            surrounding square brackets are stripped.
    """
    text = '[MASK] [MET] ' + example['sentences']
    # Tokenise WITHOUT special tokens. The previous version decoded ids that
    # included the leading [CLS] and then appended a literal ' [SEP]', so the
    # pipeline's own tokenisation produced duplicated special tokens.
    token_ids = tokenizer(text, add_special_tokens=False)['input_ids']
    # Same thresholds as before, on content tokens: inputs over 512 tokens
    # including two special tokens (assumed -- TODO confirm for this tokenizer)
    # are cut back to 500 tokens.
    if len(token_ids) > 510:
        text = tokenizer.decode(token_ids[:500])
    predictions = mask_filler(text)
    target_year = int(example['year'])
    # The top token may come bracketed (e.g. '[1859]'); strip the brackets.
    pred_year = predictions[0]['token_str'].rstrip(']').lstrip('[')
    return {'diff': abs(target_year - int(pred_year))}


In [None]:
test_set = test_set.map(diff_first_prediction)

In [None]:
data = test_set.remove_columns(['nlp', 'pol', 'sentences',])
data = data.to_pandas()
data['diff'].mean()

In [None]:
data['diff_majority'] = test_set['diff_majority']
data['diff_random'] = test_set['diff_random']
data[['diff_majority','diff_random','diff']].plot(kind='density')

In [None]:
import pandas as pd
pd.DataFrame()

In [None]:
sns.scatterplot(x='year',y='diff',data=data)

In [None]:

X = data[['year','ocr']]
y = data['diff']
reg = LinearRegression().fit(X, y)


In [None]:
reg.score(X, y)

In [None]:
import numpy as np
np.mean(test_set['diff'])

In [None]:
predictions = mask_filler('[MASK] [SEP] Hello, my Queen.')

In [None]:
predictions

## Masking Pipeline

In [None]:
"Mr. Gladstone might be observed on the ministerial side of thehouse, making every sort of parliamentary endeavour to catch the Speaker's eye."
"Mr. Disraeli, however, preserved so much of his prerogtive as the hitherto recognised leader of her Majesty's Opposition as to obtain without difficulty the right of pre-audience."


sent = "The Prime Minister, Mr. [MASK] might be observed on the ministerial side of thehouse, making every sort of parliamentary endeavour to catch the Speaker's eye."
#sent = "Mr. Peel might be observed on the ministerial side of thehouse, making every sort of parliamentary endeavour to catch the Speaker's eye. Mr. [MASK], however, preserved so much of his prerogtive as the hitherto recognised leader of her Majesty's Opposition as to obtain without difficulty the right of pre-audience."



In [None]:

tokenizer = AutoTokenizer.from_pretrained("/datadrive_2/bnert_time")
mask_filler = pipeline(
    "fill-mask", model="/datadrive_2/bnert_time", top_k=5, tokenizer=tokenizer
)


In [None]:
text = f"1830 [SEP] {sent}"
#text = '[MASK] [SEP] His Majesty spoke to the people.'
preds = mask_filler(text)

for pred in preds:
    print(f">>> {pred['sequence']}")

## Loading Model and Dataset

### Dataset

In [None]:
cache_dir = '/datadrive_2/hf_cache/'
dataset = load_dataset("davanstrien/hmd_newspapers", cache_dir=cache_dir)


In [None]:
import re
pattern = re.compile(r'(\bprime\sminister\b)', re.I)
#pattern.findall("gladstone  d'isreali")

In [None]:
prm = dataset.filter(lambda x: len(pattern.findall(x['text'])) > 0 , num_proc=12) 

In [None]:
a = prm.filter(lambda x: x['date'].year > 1850)

In [None]:
# Articles naming Gladstone or Disraeli. The original second alternative,
# 'isreali', is a misspelling that never matches "Disraeli" or "D'Israeli".
# It is kept as a third alternative so anything matched before still matches.
pattern1 = re.compile(r"(\bgladstone|\bd'?israeli\b|\bisreali\b)", re.I)
prm1 = a.filter(lambda x: len(pattern1.findall(x['text'])) > 0 , num_proc=12) 

In [None]:
prm1['train'][4]

In [None]:
preds = [mask_filler('[MASK] [SEP] '+ text[:900]) for text in prm['train']['text'][:100]]

In [None]:
#preds

In [None]:
### Model

In [None]:
model_checkpoint = "bnert"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

## Extracting Vectors