Code adapted from this [Stack Overflow question](https://stackoverflow.com/questions/70464428/how-to-calculate-perplexity-of-a-sentence-using-huggingface-masked-language-mode) on calculating the perplexity of a sentence with a Hugging Face masked language model.

In [247]:
from transformers import AutoModelForMaskedLM, AutoTokenizer
from datasets import load_from_disk
from collections import defaultdict
import torch
import pandas as pd
import numpy as np


In [276]:
# Read the frozen corpus from local disk and keep only its 'test' split.
CORPUS_PATH = '/datadrive_2/frozen_corpus'
dataset = load_from_disk(CORPUS_PATH)
test_data = dataset['test']

In [277]:
def lowercase_sentences(examples):
    """Return the batch's 'sentences' column with every string lower-cased."""
    return {'sentences': [text.lower() for text in examples['sentences']]}


# Applied batch-wise, so `examples['sentences']` is a list of strings.
test_data = test_data.map(lowercase_sentences, batched=True)

Loading cached processed dataset at /datadrive_2/frozen_corpus/test/cache-573058092beebe7f.arrow


In [278]:
def pred_data(example):
    """Build the three year-prefixed variants of a sentence.

    Args:
        example: mapping with a 'year' value and a 'sentences' string.

    Returns:
        dict with
        - 'st_year_sep': '[<year>] [SEP] <sentence>'
        - 'year_sep':    '<year> [SEP] <sentence>'
        - 'year_date':   '<year> [DATE] <sentence>'
    """
    year, text = example['year'], example['sentences']
    prefixes = {
        'st_year_sep': f'[{year}]' + ' [SEP] ',
        'year_sep': str(year) + ' [SEP] ',
        'year_date': str(year) + ' [DATE] ',
    }
    return {column: prefix + text for column, prefix in prefixes.items()}
    
# Add the three year-prefixed input columns to every example, using 6 worker processes.
test_data = test_data.map(function=pred_data, num_proc=6)

          

#1:   0%|          | 0/96976 [00:00<?, ?ex/s]

#0:   0%|          | 0/96977 [00:00<?, ?ex/s]

  

#2:   0%|          | 0/96976 [00:00<?, ?ex/s]

#3:   0%|          | 0/96976 [00:00<?, ?ex/s]

#4:   0%|          | 0/96976 [00:00<?, ?ex/s]

#5:   0%|          | 0/96976 [00:00<?, ?ex/s]

In [282]:
# Evaluate on a fixed random subsample so runs are repeatable.
# NOTE(review): the per-example maps above run on the full split before this
# subsample is taken; sampling first would presumably give the same rows at far
# lower cost — confirm before reordering.
SAMPLE_SEED = 42
N_EVAL_EXAMPLES = 2500
shuffled_test = test_data.shuffle(seed=SAMPLE_SEED)
test_data = shuffled_test.select(range(N_EVAL_EXAMPLES))

In [283]:
# Show the subsampled dataset's feature names and row count.
test_data

Dataset({
    features: ['year', 'nlp', 'pol', 'loc', 'sentences', 'ocr', 'length', 'st_year_sep', 'year_sep', 'year_date'],
    num_rows: 2500
})

In [284]:
# Inspect the first example, including the year-prefixed columns added by pred_data.
test_data[0]

{'year': 1867,
 'nlp': 2194,
 'pol': '[lib]',
 'loc': '[london]',
 'sentences': 'articlesbelonging to his employers. in answer to the pre-sident of the court before which he has just beentried as to what was his motive for committing therobbery, the prisoner said " nothing else butgormandism." in truth, the search of his lodgingswent far to prove the truth of his assertion, for thepolice found there chocolate, sardines, figs, pre-serves, and other good eatables belonging to hismasters, and also a box of excellent cigars, whichwould afford a pleasant smoke after a relish. it wasevident also that the prisoner was delicate in his habitsand did not eat with his fingers, for plenty of forksand',
 'ocr': 0.9695,
 'length': 100,
 'st_year_sep': '[1867] [SEP] articlesbelonging to his employers. in answer to the pre-sident of the court before which he has just beentried as to what was his motive for committing therobbery, the prisoner said " nothing else butgormandism." in truth, the search of 

In [285]:
# One entry per model to evaluate:
#   (short name, checkpoint path, separator token, dataset column fed to the model)
# Disabled baseline: ('distilbert','distilbert-base-uncased','[SEP]','year_sep')
checkpoints = [
    ('hmd_distilbert', '/datadrive_2/bnert-hmd', '[SEP]', 'year_sep'),
    ('bnert-time-st-y', '/datadrive_2/bnert-time-st-y', '[SEP]', 'st_year_sep'),
    ('bnert-time-y', '/datadrive_2/bnert-time-y', '[DATE]', 'year_date'),
    ('bnert-time-y_masked_25', '/datadrive_2/bnert-time-y_masked_25', '[DATE]', 'year_date'),
    ('bnert-time-y_masked_75', '/datadrive_2/bnert-time-y_masked_75', '[DATE]', 'year_date'),
]

# Maps short name -> {'model', 'tokenizer', 'sentences'}.
# `st` (the separator token) is unpacked but not stored: it is currently unused.
model_dict = defaultdict(dict)
for name, checkpoint, st, sent_col in checkpoints:
    model_dict[name].update({
        'model': AutoModelForMaskedLM.from_pretrained(checkpoint),
        'tokenizer': AutoTokenizer.from_pretrained(checkpoint),
        'sentences': sent_col,
    })

In [286]:
def pseudo_perplexity(example, sent_col, name, model, tokenizer):
    """Score one example by masking each scored token in turn.

    The text in ``example[sent_col]`` is encoded (truncated to 64 token ids) and
    copied once per scored position; in copy ``i`` exactly one token id is
    replaced by ``tokenizer.mask_token_id``. Labels are -100 everywhere except
    at the masked position, so only masked positions are labelled. The
    ``.loss`` the model returns for that batch is exponentiated.

    Positions 0, 1, 2 and the final position are never masked.
    NOTE(review): presumably these are [CLS], the year token, the separator and
    the closing [SEP]; that only holds if the year prefix encodes to a single
    token — confirm for each tokenizer.

    Args:
        example: mapping that contains the text under ``sent_col``.
        sent_col: key of the text column to score.
        name: model label, used to build the output key.
        model: callable taking ``(input_ids, labels=...)`` and returning an
            object with a ``.loss`` tensor.
        tokenizer: object exposing ``encode(...)`` and ``mask_token_id``.

    Returns:
        ``{f'loss_{name}': value}`` where ``value`` is ``exp(loss)``. Despite
        the key name this is the exponentiated loss, not the raw loss. The
        value is NaN when the encoded input has four tokens or fewer, because
        there is then no position left to mask.
    """
    tensor_input = tokenizer.encode(example[sent_col], return_tensors='pt', truncation=True, max_length=64)
    seq_len = tensor_input.size(-1)

    # Three leading positions and one trailing position are excluded from scoring.
    n_scored = seq_len - 4
    if n_scored < 1:
        # Nothing to mask: a zero or negative repeat count would otherwise give
        # an empty batch or raise, aborting the whole Dataset.map call.
        return {f'loss_{name}': float('nan')}

    repeat_input = tensor_input.repeat(n_scored, 1)
    # Superdiagonal ones: row i marks column i + 1. Dropping the first two and
    # last two rows leaves rows that mark columns 3 .. seq_len - 2.
    mask = torch.ones(seq_len - 1).diag(1)[2:-2]
    masked_input = repeat_input.masked_fill(mask == 1, tokenizer.mask_token_id)
    # -100 marks positions that should not contribute to the loss.
    labels = repeat_input.masked_fill(masked_input != tokenizer.mask_token_id, -100)
    with torch.inference_mode():
        loss = model(masked_input, labels=labels).loss
    return {f'loss_{name}': np.exp(loss.item())}

In [287]:
# Score the subsample with every loaded model; each pass adds one
# 'loss_<name>' column to test_data.
for name, ndict in model_dict.items():
    print(f'Evaluating {name}')
    scoring_kwargs = {
        'sent_col': ndict['sentences'],
        'name': name,
        'model': ndict['model'],
        'tokenizer': ndict['tokenizer'],
    }
    # Single process on purpose: num_proc is left unset here.
    test_data = test_data.map(pseudo_perplexity, fn_kwargs=scoring_kwargs)

Evaluating hmd_distilbert


  0%|          | 0/2500 [00:00<?, ?ex/s]

Evaluating bnert-time-st-y


  0%|          | 0/2500 [00:00<?, ?ex/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Evaluating bnert-time-y_masked_25


  0%|          | 0/2500 [00:00<?, ?ex/s]

Evaluating bnert-time-y_masked_75


  0%|          | 0/2500 [00:00<?, ?ex/s]

In [288]:
# Drop metadata and the model-input helper columns that the analysis does not use.
HELPER_COLUMNS = ['nlp', 'loc', 'length', 'st_year_sep', 'year_sep', 'year_date']
results = test_data.remove_columns(HELPER_COLUMNS)

In [289]:
# Convert the scored dataset to a pandas DataFrame for the tabular analysis below.
results_df = results.to_pandas()

In [290]:
# Sanity check: (rows, columns) of the results frame.
results_df.shape

(2500, 9)

In [291]:
# Strip the surrounding square brackets from the `pol` labels, e.g. '[lib]' -> 'lib'.
# The vectorised .str accessor replaces a per-row lambda and passes missing
# values through unchanged instead of raising on them.
results_df['pol'] = results_df['pol'].str.lstrip('[').str.rstrip(']')

In [292]:
# Column-wise total of every 'loss*' column, one value per evaluated model.
loss_columns = [column for column in results_df.columns if column.startswith('loss')]
results_df.loc[:, loss_columns].sum(axis=0)

loss_hmd_distilbert            82337.838676
loss_bnert-time-st-y           79230.924210
loss_bnert-time-y              78720.447952
loss_bnert-time-y_masked_25    77423.510455
loss_bnert-time-y_masked_75    77552.556247
dtype: float64

In [293]:
# Persist the per-example scores before the frame gets extra derived columns.
# NOTE(review): the tables/ directory presumably has to exist already — confirm.
RESULTS_CSV = 'tables/pseudo_perplexity_2500ex_64.csv'
results_df.to_csv(RESULTS_CSV)

In [294]:
# Decade of each example's year, e.g. 1867 -> 1860.
# Integer arithmetic instead of slicing the first three characters of the year
# string: it is vectorised and stays correct for years that are not 4 digits long.
results_df['dec'] = (results_df['year'] // 10) * 10

In [295]:

# One-hot encode `pol`; the column is replaced by one 'pol_<label>' column per value.
# NOTE(review): results_df is rebound here, so re-running this cell on the
# already-encoded frame will presumably fail because 'pol' no longer exists.
results_df = pd.get_dummies(results_df, columns=['pol'])

In [296]:
# List the columns after adding `dec` and the `pol_*` indicator columns.
results_df.columns

Index(['year', 'sentences', 'ocr', 'loss_hmd_distilbert',
       'loss_bnert-time-st-y', 'loss_bnert-time-y',
       'loss_bnert-time-y_masked_25', 'loss_bnert-time-y_masked_75', 'dec',
       'pol_con', 'pol_lib', 'pol_neutr', 'pol_none', 'pol_rad'],
      dtype='object')

# Fin.

In [297]:
import statsmodels.api as sm

import statsmodels.formula.api as smf


In [302]:
# Regress one model's per-example score on OCR quality and decade.
# The score column is copied to `tm25` first so the formula can refer to it
# by a plain identifier (the original column name contains hyphens).
TARGET_COLUMN = "loss_bnert-time-y_masked_25"
results_df['tm25'] = results_df[TARGET_COLUMN]

mod = smf.ols(
    formula='tm25 ~ ocr + dec ',
    data=results_df,
)
res = mod.fit()
# NOTE(review): the recorded summary shows strongly skewed, heavy-tailed
# residuals; modelling the log of the score may be a better fit — confirm.
res.summary()

0,1,2,3
Dep. Variable:,tm25,R-squared:,0.318
Model:,OLS,Adj. R-squared:,0.317
Method:,Least Squares,F-statistic:,581.0
Date:,"Tue, 30 Aug 2022",Prob (F-statistic):,6.82e-208
Time:,20:19:17,Log-Likelihood:,-13367.0
No. Observations:,2500,AIC:,26740.0
Df Residuals:,2497,BIC:,26760.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,103.7773,108.775,0.954,0.340,-109.521,317.076
ocr,-259.3065,7.824,-33.144,0.000,-274.648,-243.965
dec,0.0798,0.060,1.330,0.184,-0.038,0.197

0,1,2,3
Omnibus:,2430.149,Durbin-Watson:,1.999
Prob(Omnibus):,0.0,Jarque-Bera (JB):,147735.521
Skew:,4.585,Prob(JB):,0.0
Kurtosis:,39.527,Cond. No.,197000.0


In [88]:
# def score_random_mask(model, tokenizer, sentence,meta_pos=None):
#     tensor_input = tokenizer.encode(sentence, return_tensors='pt')
#     #print(tensor_input)
#     repeat_input = torch.clone(tensor_input)
#     #print(repeat_input)
#     sum_mask,i = 0,0
#     while sum_mask == 0:
#         mask = torch.tensor(np.random.binomial(1, .15, repeat_input.shape[1]))
#         sum_mask = sum(mask)
        
#     if meta_pos:
#         mask[meta_pos] = 0
#     masked_input = repeat_input.masked_fill(mask == 1, tokenizer.mask_token_id)
#     print(masked_input)
#     labels = repeat_input.masked_fill( masked_input != tokenizer.mask_token_id, -100)
#     print(labels)
#     with torch.inference_mode():
#         loss = model(masked_input, labels=labels).loss
#     return np.exp(loss.item())

In [None]:
# from transformers import Trainer

In [None]:
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=lm_datasets["train"],
#     eval_dataset=lm_datasets["test"],
#     data_collator=data_collator,
# )