In [14]:
from transformers import AutoModelForMaskedLM, AutoTokenizer
from datasets import load_from_disk
from collections import defaultdict
import torch
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [15]:
dataset = load_from_disk('/datadrive_2/frozen_corpus')

test_data = dataset['test']

In [16]:
test_data = test_data.map(lambda examples: {'sentences': [x.lower() for x in examples['sentences']]}, batched=True)

Loading cached processed dataset at /datadrive_2/frozen_corpus/test/cache-573058092beebe7f.arrow


In [29]:
lab2token = {'[lib]':'liberal', '[con]':'conservative', '[none]':'none', '[rad]':'radical', '[neutr]':'neutral'}

def pred_data(example):
    return {'st_pol_sent': f'{example["pol"]}' + ' [POL] ' + example['sentences'] ,
            'pol_sent': lab2token[example['pol']] + ' [POL] ' + example['sentences'] ,
         
    }
    
test_data = test_data.map(pred_data , num_proc=6)

            

#0:   0%|          | 0/417 [00:00<?, ?ex/s]

#1:   0%|          | 0/417 [00:00<?, ?ex/s]

#2:   0%|          | 0/417 [00:00<?, ?ex/s]

#3:   0%|          | 0/417 [00:00<?, ?ex/s]

#4:   0%|          | 0/416 [00:00<?, ?ex/s]

#5:   0%|          | 0/416 [00:00<?, ?ex/s]

In [46]:
test_data = test_data.shuffle(seed=42).select(range(1000))

In [47]:
test_data

Dataset({
    features: ['year', 'nlp', 'pol', 'loc', 'sentences', 'ocr', 'length', 'st_pol_sent', 'pol_sent', 'loss_bnert-pol-st', 'loss_bnert-pol'],
    num_rows: 1000
})

In [48]:
test_data[0]

{'year': 1849,
 'nlp': 2194,
 'pol': '[lib]',
 'loc': '[london]',
 'sentences': "don't you cry for me,i'm going to moses' warehouse, as the best that i can see.i'll also buy a paletot,for ordinary wear,and sure i am that i shall getau out-and-out affair.moses and son i hear have gota plentiful supply.and none can equal it, 'us said,so rivals don't you try.no, poor rivals, don't you try for me.i'm going to moses' warehouse, as the best that i can see.coat, vest, and trowsers, boots and hat,i'll buy without demur,my lady, too, shall have a muff,for i must think of her.i know the prices here are low,although the goods rank high,and none can contradict",
 'ocr': 0.8797,
 'length': 100,
 'st_pol_sent': "[lib] [POL] don't you cry for me,i'm going to moses' warehouse, as the best that i can see.i'll also buy a paletot,for ordinary wear,and sure i am that i shall getau out-and-out affair.moses and son i hear have gota plentiful supply.and none can equal it, 'us said,so rivals don't you try.no,

In [49]:
checkpoints = [#('distilbert','distilbert-base-uncased','[SEP]','year_sep'),
               #('hmd_distilbert','/datadrive_2/bnert-hmd','[SEP]','year_sep'),
               ('bnert-pol-st','/datadrive_2/bnert-pol-st','[POL]','st_pol_sent'),
               ('bnert-pol','/datadrive_2/bnert-pol','[POL]','pol_sent'),
               #('bnert-time-y_masked_25','/datadrive_2/bnert-time-y_masked_25','[DATE]','year_date'),
               #('bnert-time-y_masked_75','/datadrive_2/bnert-time-y_masked_75','[DATE]','year_date')
]

model_dict = defaultdict(dict)
for name,checkpoint, st, sent_col in checkpoints:
    model_dict[name]['model'] = AutoModelForMaskedLM.from_pretrained(checkpoint)
    model_dict[name]['tokenizer'] = AutoTokenizer.from_pretrained(checkpoint)
    #model_dict[name]['special_token'] = st
    model_dict[name]['sentences'] = sent_col

Code adapted from this [Stack Overflow](
https://stackoverflow.com/questions/70464428/how-to-calculate-perplexity-of-a-sentence-using-huggingface-masked-language-mode)

In [50]:
def pseudo_perplexity(example, sent_col, name, model, tokenizer):
    tensor_input = tokenizer.encode(example[sent_col], return_tensors='pt',truncation=True, max_length=128)
    #print(tensor_input.shape)
    #if with_meta:
    repeat_input = tensor_input.repeat(tensor_input.size(-1)-4, 1)
    mask = torch.ones(tensor_input.size(-1) - 1).diag(1)[2:-2]
    #else:
    #    repeat_input = tensor_input.repeat(tensor_input.size(-1)-2, 1)
    #    mask = torch.ones(tensor_input.size(-1) - 1).diag(1)[:-2]
    masked_input = repeat_input.masked_fill(mask == 1, tokenizer.mask_token_id)
    labels = repeat_input.masked_fill( masked_input != tokenizer.mask_token_id, -100)
    with torch.inference_mode():
        loss = model(masked_input, labels=labels).loss
    return {f'loss_{name}':np.exp(loss.item())}

In [51]:
for name, ndict in model_dict.items():
    print(f'Evaluating {name}')
    test_data = test_data.map(pseudo_perplexity, 
                              #num_proc=3,
                              fn_kwargs={'sent_col':ndict['sentences'],
                                        'name': name,
                                        'model':ndict['model'],
                                        'tokenizer':ndict['tokenizer']  
                                   }
                             )

Evaluating bnert-pol-st


  0%|          | 0/1000 [00:00<?, ?ex/s]

Evaluating bnert-pol


  0%|          | 0/1000 [00:00<?, ?ex/s]

In [52]:
results = test_data.remove_columns(['nlp','loc','length'])

In [53]:
results_df = results.to_pandas()

In [54]:
results_df.shape

(1000, 8)

In [55]:
results_df['pol'] = results_df.pol.apply(lambda x: x.lstrip('[').rstrip(']'))

In [58]:
results_df[[c for c in results_df.columns if c.startswith('loss')]].std(axis=0)

loss_bnert-pol-st    44.998926
loss_bnert-pol       44.995695
dtype: float64

In [304]:
!ls tables

classsify_pol_regex	  pseudo_perplexity_1000ex_l28.csv  year_pred.csv
classsify_pol_with_regex  pseudo_perplexity_2500ex_64.csv


In [305]:
results_df_128 = pd.read_csv('tables/pseudo_perplexity_1000ex_l28.csv')
results_df_64 = pd.read_csv('tables/pseudo_perplexity_2500ex_64.csv')

In [293]:
#results_df.to_csv('tables/pseudo_perplexity_2500ex_64.csv')

In [306]:
results_df_64['dec'] = results_df_64.year.apply(lambda x: int(str(x)[:3]+'0'))
results_df_128['dec'] = results_df_128.year.apply(lambda x: int(str(x)[:3]+'0'))

In [327]:
scores_64 = results_df_64[[c for c in results_df_64.columns if c.startswith('loss')]].sum(axis=0) 
scores_128 = results_df_128[[c for c in results_df_128.columns if c.startswith('loss')]].sum(axis=0) 


In [328]:
print(pd.concat([scores_64,scores_128],axis=1).round(0).to_latex())

\begin{tabular}{lrr}
\toprule
{} &        0 &         1 \\
\midrule
loss\_hmd\_distilbert         &  82338.0 &   25724.0 \\
loss\_bnert-time-st-y        &  79231.0 &   25030.0 \\
loss\_bnert-time-y           &  78720.0 &   24970.0 \\
loss\_bnert-time-y\_masked\_25 &  77424.0 &   24589.0 \\
loss\_bnert-time-y\_masked\_75 &  77553.0 &   24630.0 \\
loss\_distilbert             &      NaN &  229192.0 \\
\bottomrule
\end{tabular}



  print(pd.concat([scores_64,scores_128],axis=1).round(0).to_latex())


In [324]:
#results_df_64

In [322]:

results_df_64 = pd.get_dummies(results_df_64, columns=['pol'])
#results_df_64

KeyError: "None of [Index(['pol'], dtype='object')] are in the [columns]"

In [325]:
results_df_64['tm25'] = results_df_64["loss_bnert-time-y_masked_25"]
mod = smf.ols(formula='tm25 ~ ocr + dec + pol_con + pol_lib + pol_neutr + pol_none + pol_rad', data=results_df_64)
res = mod.fit()
res.summary()

0,1,2,3
Dep. Variable:,tm25,R-squared:,0.323
Model:,OLS,Adj. R-squared:,0.321
Method:,Least Squares,F-statistic:,197.9
Date:,"Wed, 31 Aug 2022",Prob (F-statistic):,1.11e-206
Time:,14:06:49,Log-Likelihood:,-13357.0
No. Observations:,2500,AIC:,26730.0
Df Residuals:,2493,BIC:,26770.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-160.4724,110.271,-1.455,0.146,-376.706,55.761
ocr,-254.6803,8.052,-31.629,0.000,-270.470,-238.891
dec,0.2405,0.073,3.305,0.001,0.098,0.383
pol_con,-35.1428,22.617,-1.554,0.120,-79.492,9.206
pol_lib,-37.0012,22.908,-1.615,0.106,-81.922,7.920
pol_neutr,-46.6745,24.122,-1.935,0.053,-93.976,0.627
pol_none,-25.4700,20.880,-1.220,0.223,-66.415,15.475
pol_rad,-16.1839,21.308,-0.760,0.448,-57.966,25.598

0,1,2,3
Omnibus:,2439.137,Durbin-Watson:,2.01
Prob(Omnibus):,0.0,Jarque-Bera (JB):,151150.049
Skew:,4.605,Prob(JB):,0.0
Kurtosis:,39.962,Cond. No.,3e+19


# Fin.

In [88]:
# def score_random_mask(model, tokenizer, sentence,meta_pos=None):
#     tensor_input = tokenizer.encode(sentence, return_tensors='pt')
#     #print(tensor_input)
#     repeat_input = torch.clone(tensor_input)
#     #print(repeat_input)
#     sum_mask,i = 0,0
#     while sum_mask == 0:
#         mask = torch.tensor(np.random.binomial(1, .15, repeat_input.shape[1]))
#         sum_mask = sum(mask)
        
#     if meta_pos:
#         mask[meta_pos] = 0
#     masked_input = repeat_input.masked_fill(mask == 1, tokenizer.mask_token_id)
#     print(masked_input)
#     labels = repeat_input.masked_fill( masked_input != tokenizer.mask_token_id, -100)
#     print(labels)
#     with torch.inference_mode():
#         loss = model(masked_input, labels=labels).loss
#     return np.exp(loss.item())

In [None]:
# from transformers import Trainer

In [None]:
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=lm_datasets["train"],
#     eval_dataset=lm_datasets["test"],
#     data_collator=data_collator,
# )