In [1]:
import torch
import seaborn as sns
import pandas as pd
import transformers
import evaluate
from transformers import pipeline, AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset, load_from_disk
from collections import Counter, defaultdict
from transformers import DataCollatorWithPadding
from pathlib import Path
import numpy as np
from sklearn.metrics import f1_score, classification_report, accuracy_score
from tqdm.auto import tqdm
import re
from nltk.tokenize import sent_tokenize
from sklearn.linear_model import LinearRegression
sns.set()

In [2]:
# Hugging Face cache directory on the data drive (defined here; not passed to
# any call in this cell).
cache_dir = '/datadrive_2/hf_cache/'

# Load the saved HMD_context dataset from disk and keep only its 'train' split.
# (An earlier, now disabled, variant loaded '/datadrive_2/' and used its 'test' split.)
dataset = load_from_disk("/datadrive_2/HMD_context")['train']

In [3]:
# Show the dataset summary (feature names and row count).
dataset

Dataset({
    features: ['text', 'title', 'location', 'date', 'ocr_quality_mean', 'year', 'length', 'month', 'nlp', 'pol', 'loc'],
    num_rows: 2787067
})

## Classify by Political Leaning

In [4]:
# Case-insensitive keyword filter used to select politically flavoured sentences.
# Every alternative must start at a word boundary; only 'tory' and 'tories' are
# also anchored at the end, so the other stems match longer words as well
# (e.g. 'political', 'conservative', 'liberality').
pol_pattern = re.compile(
    r'\bliberal|\bconservat|\bministers?|\btory\b|\btories\b|\bgovernments?|\bpolitic',
    flags=re.IGNORECASE,
)

In [5]:
# Sanity check: list every keyword hit in a sample sentence.
pol_pattern.findall('liberal governments do not fire their political ministers minister')

['liberal', 'governments', 'politic', 'ministers', 'minister']

In [6]:
def sent_split(x):
    """Explode a batch of articles into the sentences that hit a political keyword.

    Args:
        x: batch dict whose 'year', 'pol', 'loc', 'ocr_quality_mean', 'nlp' and
           'text' entries are parallel lists (one item per article).

    Returns:
        {'data': [...]} with one record per sentence matched by ``pol_pattern``.
        Each record holds the lower-cased sentence, its whitespace-token count
        (computed on the original casing) and the article's metadata.
    """
    records = []
    articles = zip(x['year'], x['pol'], x['loc'], x['ocr_quality_mean'], x['nlp'], x['text'])
    for year, pol, loc, ocr, nlp, text in articles:
        for sentence in sent_tokenize(text):
            # Skip sentences without any keyword hit.
            if pol_pattern.search(sentence) is None:
                continue
            records.append({
                'sentence': sentence.lower(),
                'length': len(sentence.split()),
                'pol': pol,
                'loc': loc,
                'year': year,
                'ocr': ocr,
                'nlp': nlp,
            })
    return {'data': records}

# Apply sent_split batch-wise; all original article columns are dropped, so only
# the new 'data' column (one record per matching sentence) remains.
test_data = dataset.map(sent_split,batched=True, remove_columns=dataset.column_names)

Loading cached processed dataset at /datadrive_2/HMD_context/train/cache-ba07e195c8ec9d10.arrow


In [7]:
# Flatten the nested 'data' records into top-level 'data.<field>' columns.
test_data = test_data.flatten()

In [8]:
# Inspect a single flattened example.
test_data[10]

{'data.length': 90,
 'data.loc': '[liverpool]',
 'data.nlp': 2083,
 'data.ocr': 0.985,
 'data.pol': '[neutr]',
 'data.sentence': "but we must yet be on the alert.her majesty's government has, with praise-worthy alacrity, already directed, by an order incouncil, that the provisions of the act for the pre-vention of epidemic, indemic, and contagious dis-eases, be immediately put in force through thewhole of great britain; and liverpool is not behind-hand in giving effect to so necessary, so indispen-sable a precaution.the very attention which has in this emergencyto be given to such a subject as that of the publichealth, may result in a permanent improvement ofthe sanitary state of our town.",
 'data.year': 1853}

In [9]:
# Keep sentences longer than 25 whitespace tokens, shuffle them with a fixed
# seed and take the first 15,000 as the working sample.
long_sentences = test_data.filter(lambda x: x['data.length'] > 25)
data = long_sentences.shuffle(seed=42).select(range(15000))

Loading cached processed dataset at /datadrive_2/HMD_context/train/cache-a30da7bf93707de5.arrow
Loading cached shuffled indices for dataset at /datadrive_2/HMD_context/train/cache-ef1d6e941b6c7082.arrow


In [10]:
def pred_data(example, add_field='data.year'):
    """Build the metadata-prefixed text variants for one example.

    Args:
        example: row dict containing 'data.sentence' and the ``add_field`` key.
        add_field: name of the field whose value is prepended to the sentence.

    Returns:
        Dict with three strings:
        'st_year_sep' -> '[<value>] [SEP] <sentence>'
        'year_sep'    -> '<value> [SEP] <sentence>'
        'year_date'   -> '<value> [DATE] <sentence>'
    """
    sentence = example['data.sentence']
    plain_value = str(example[add_field])
    return {
        'st_year_sep': f'[{example[add_field]}] [SEP] ' + sentence,
        'year_sep': f'{plain_value} [SEP] ' + sentence,
        'year_date': f'{plain_value} [DATE] ' + sentence,
    }
    
# Add the three prefixed text columns to every example, using 6 worker processes.
data = data.map(pred_data , num_proc=6)

       

Loading cached processed dataset at /datadrive_2/HMD_context/train/cache-f8d0db47c344d25a.arrow


  

Loading cached processed dataset at /datadrive_2/HMD_context/train/cache-798f6926ed263cdc.arrow


 

Loading cached processed dataset at /datadrive_2/HMD_context/train/cache-673e9f15eeac8d91.arrow
Loading cached processed dataset at /datadrive_2/HMD_context/train/cache-0552d9181d587ab8.arrow


  

Loading cached processed dataset at /datadrive_2/HMD_context/train/cache-8053603b608b107e.arrow
Loading cached processed dataset at /datadrive_2/HMD_context/train/cache-67771e8f0317eefe.arrow


In [11]:
# Confirm the size of the working sample.
len(data)

15000

In [12]:
# Integer class id for each political-leaning tag stored in 'data.pol';
# ids follow the order in which the tags are listed here.
lab2code = {
    tag: code
    for code, tag in enumerate(('[con]', '[lib]', '[rad]', '[neutr]', '[none]'))
}
# Number of classes, passed as num_labels when the classifiers are loaded.
num_labels = len(lab2code)
# Attach the integer 'label' column by looking up each example's 'data.pol' tag
# (an unknown tag raises KeyError).
data = data.map(lambda x: {'label': lab2code[x['data.pol']]})

Loading cached processed dataset at /datadrive_2/HMD_context/train/cache-29691a666b941845.arrow


In [13]:
# Inspect the first example of the labelled sample.
data[0]

{'data.length': 120,
 'data.loc': '[london]',
 'data.nlp': 2194,
 'data.ocr': 0.9156,
 'data.pol': '[lib]',
 'data.sentence': "l'pon payment of thearrear of the premiums, the assured will be entitled to a fall parti-cipation of the profits.the assured may at all times borrow an amount equal to the valueof the policy, on its security.in all the transactions of the company the utmost liberality willbe exercised.life assurances will be effected with all the advantages to theassured afforded by the most respectable established companies.a table of premiums will shortly be published.tontine annuities.- • -who from the hopethis company, in subinittini the subject of tontine annuities tothe public, is intttenegdenbgyevtihtey,baenlideftntehadtestihreereer aearesemanandyafflperasersiee,as they progress throughb life, will gladly avail themselves of thism .d eaonfai nivneesitemaserntg;---inzrawhich they will secure a large remunera-tiveooffttheir nominee.guaranteed to them by the companyduring t

In [14]:
# Hold out 20% of the sample as the final test set (fixed seed).
test_size = int(len(data)*.2)
train_test = data.train_test_split(test_size=test_size, seed=1984)
test_set = train_test['test']

# Split 15% off the remaining rows; the 'test' part of train_val is the
# validation split handed to the Trainer as eval_dataset.
remaining = train_test['train']
val_size = int(len(remaining)*.15)
train_val = remaining.train_test_split(test_size=val_size, seed=1984)

Loading cached split indices for dataset at /datadrive_2/HMD_context/train/cache-1445f72e23a5c03c.arrow and /datadrive_2/HMD_context/train/cache-b3b647c8217a2d35.arrow
Loading cached split indices for dataset at /datadrive_2/HMD_context/train/cache-bae83ef1b0045f65.arrow and /datadrive_2/HMD_context/train/cache-c691a7871720a3af.arrow


In [15]:
# Show the train/validation splits and their columns.
train_val

DatasetDict({
    train: Dataset({
        features: ['data.length', 'data.loc', 'data.nlp', 'data.ocr', 'data.pol', 'data.sentence', 'data.year', 'st_year_sep', 'year_sep', 'year_date', 'label'],
        num_rows: 10200
    })
    test: Dataset({
        features: ['data.length', 'data.loc', 'data.nlp', 'data.ocr', 'data.pol', 'data.sentence', 'data.year', 'st_year_sep', 'year_sep', 'year_date', 'label'],
        num_rows: 1800
    })
})

In [16]:
# One row per model to fine-tune:
# (short name, checkpoint id or local path, special token, text column fed to the model)
checkpoints = [
    ('distilbert', 'distilbert-base-uncased', '[SEP]', 'year_sep'),
    ('hmd_distilbert', '/datadrive_2/bnert-hmd', '[SEP]', 'year_sep'),
    ('bnert-time-st-y', '/datadrive_2/bnert-time-st-y', '[SEP]', 'st_year_sep'),
    ('bnert-time-y', '/datadrive_2/bnert-time-y', '[DATE]', 'year_date'),
    ('bnert-time-y_masked_25', '/datadrive_2/bnert-time-y_masked_25', '[DATE]', 'year_date'),
    ('bnert-time-y_masked_75', '/datadrive_2/bnert-time-y_masked_75', '[DATE]', 'year_date'),
    ('bnert-pol-st', '/datadrive_2/bnert-pol-st', '[SEP]', 'year_sep'),
    ('bnert-pol', '/datadrive_2/bnert-pol', '[SEP]', 'year_sep'),
]

# name -> {'model': classifier, 'tokenizer': tokenizer, 'sentences': text column}
model_dict = defaultdict(dict)
for name, checkpoint, st, sent_col in checkpoints:
    entry = model_dict[name]
    # Sequence classifier with one output per political-leaning class.
    entry['model'] = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels)
    entry['tokenizer'] = AutoTokenizer.from_pretrained(checkpoint)
    # The special token (st) is listed for reference only and is not stored.
    entry['sentences'] = sent_col

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'pre_classifi

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at /datadrive_2/bnert-pol and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# Drop metadata columns that are not needed for fine-tuning. Only columns that
# are still present are removed, so re-running this cell does not raise on
# already-removed columns.
unused_cols = ['data.nlp', 'data.ocr', 'data.length', 'data.loc']
stale_cols = [col for col in unused_cols if col in train_val['train'].column_names]
if stale_cols:
    train_val = train_val.remove_columns(stale_cols)
train_val

DatasetDict({
    train: Dataset({
        features: ['data.pol', 'data.sentence', 'data.year', 'st_year_sep', 'year_sep', 'year_date', 'label'],
        num_rows: 10200
    })
    test: Dataset({
        features: ['data.pol', 'data.sentence', 'data.year', 'st_year_sep', 'year_sep', 'year_date', 'label'],
        num_rows: 1800
    })
})

In [18]:
#def add_text_col(example,source):
#    return {'text' : example[source]}

def preprocess_function(examples, target_col):
    """Tokenize the ``target_col`` field of ``examples`` with truncation enabled.

    Uses the module-level ``tokenizer``, which must be assigned before this
    function is called (the training loop sets it per model).
    """
    texts = examples[target_col]
    return tokenizer(texts, truncation=True)

In [19]:
# name -> {'f1_macro', 'f1_micro', 'accuracy'} measured on the held-out test set.
result_dict = defaultdict(dict)

for name, mdict in model_dict.items():
    print(f'Creating a model for {name}')
    # `tokenizer` is deliberately a notebook-level name: preprocess_function
    # reads it as a global.
    tokenizer = mdict['tokenizer']
    model = mdict['model']
    sent_col = mdict['sentences']
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Tokenize into loop-local variables instead of overwriting train_val, so
    # the source splits keep their original columns between iterations and the
    # cell can be re-run safely.
    tokenized_train_val = train_val.map(preprocess_function, fn_kwargs={'target_col': sent_col})

    training_args = TrainingArguments(
        seed=1984,
        output_dir=f"./results_{name}",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=5,
        weight_decay=0.01,
    )

    # NOTE(review): the validation split is passed as eval_dataset, but no
    # evaluation schedule is set in TrainingArguments and trainer.evaluate() is
    # never called here, so it appears to be unused during training - confirm.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_val["train"],
        eval_dataset=tokenized_train_val["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    trainer.train()

    # Persist the fine-tuned classifier together with its tokenizer.
    model.save_pretrained(f'/datadrive_2/{name}-pol')
    tokenizer.save_pretrained(f"/datadrive_2/{name}-pol")

    # Score on the held-out test set, tokenized with this model's tokenizer
    # (again without overwriting test_set itself).
    tokenized_test = test_set.map(preprocess_function, fn_kwargs={'target_col': sent_col})
    predictions = trainer.predict(tokenized_test)
    y_pred = np.argmax(predictions.predictions, axis=-1)
    y_true = predictions.label_ids

    # scikit-learn metrics take (y_true, y_pred) in that order.
    result_dict[name]['f1_macro'] = f1_score(y_true, y_pred, average='macro')
    result_dict[name]['f1_micro'] = f1_score(y_true, y_pred, average='micro')
    result_dict[name]['accuracy'] = accuracy_score(y_true, y_pred)

Creating a model for distilbert


  0%|          | 0/10200 [00:00<?, ?ex/s]

  0%|          | 0/1800 [00:00<?, ?ex/s]

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: data.year, year_sep, data.sentence, st_year_sep, data.pol, year_date. If data.year, year_sep, data.sentence, st_year_sep, data.pol, year_date are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10200
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 3190


Step,Training Loss
500,0.5957
1000,0.4573
1500,0.3999
2000,0.344
2500,0.2635
3000,0.1968


Saving model checkpoint to ./results_distilbert/checkpoint-500
Configuration saved in ./results_distilbert/checkpoint-500/config.json
Model weights saved in ./results_distilbert/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results_distilbert/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results_distilbert/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ./results_distilbert/checkpoint-1000
Configuration saved in ./results_distilbert/checkpoint-1000/config.json
Model weights saved in ./results_distilbert/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./results_distilbert/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./results_distilbert/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to ./results_distilbert/checkpoint-1500
Configuration saved in ./results_distilbert/checkpoint-1500/config.json
Model weights saved in ./results_distilbert/checkpoint-1500/pytorch_model.bin
tok

  0%|          | 0/3000 [00:00<?, ?ex/s]

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: data.length, data.year, year_sep, data.sentence, data.loc, st_year_sep, data.pol, year_date, data.ocr, data.nlp. If data.length, data.year, year_sep, data.sentence, data.loc, st_year_sep, data.pol, year_date, data.ocr, data.nlp are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 3000
  Batch size = 16


Creating a model for hmd_distilbert


  0%|          | 0/10200 [00:00<?, ?ex/s]

  0%|          | 0/1800 [00:00<?, ?ex/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: data.year, year_sep, data.sentence, st_year_sep, data.pol, year_date. If data.year, year_sep, data.sentence, st_year_sep, data.pol, year_date are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10200
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 3190


Step,Training Loss
500,0.5969
1000,0.4416
1500,0.3776
2000,0.3081
2500,0.2377
3000,0.1742


Saving model checkpoint to ./results_hmd_distilbert/checkpoint-500
Configuration saved in ./results_hmd_distilbert/checkpoint-500/config.json
Model weights saved in ./results_hmd_distilbert/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results_hmd_distilbert/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results_hmd_distilbert/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ./results_hmd_distilbert/checkpoint-1000
Configuration saved in ./results_hmd_distilbert/checkpoint-1000/config.json
Model weights saved in ./results_hmd_distilbert/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./results_hmd_distilbert/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./results_hmd_distilbert/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to ./results_hmd_distilbert/checkpoint-1500
Configuration saved in ./results_hmd_distilbert/checkpoint-1500/config.json
Model weights saved in ./results_

  0%|          | 0/3000 [00:00<?, ?ex/s]

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: data.length, data.year, year_sep, data.sentence, data.loc, st_year_sep, data.pol, year_date, data.ocr, data.nlp. If data.length, data.year, year_sep, data.sentence, data.loc, st_year_sep, data.pol, year_date, data.ocr, data.nlp are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 3000
  Batch size = 16


Creating a model for bnert-time-st-y


  0%|          | 0/10200 [00:00<?, ?ex/s]

  0%|          | 0/1800 [00:00<?, ?ex/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: data.year, year_sep, data.sentence, st_year_sep, data.pol, year_date. If data.year, year_sep, data.sentence, st_year_sep, data.pol, year_date are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10200
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 3190


Step,Training Loss
500,0.5863
1000,0.4481
1500,0.383
2000,0.3159
2500,0.236
3000,0.1693


Saving model checkpoint to ./results_bnert-time-st-y/checkpoint-500
Configuration saved in ./results_bnert-time-st-y/checkpoint-500/config.json
Model weights saved in ./results_bnert-time-st-y/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results_bnert-time-st-y/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results_bnert-time-st-y/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ./results_bnert-time-st-y/checkpoint-1000
Configuration saved in ./results_bnert-time-st-y/checkpoint-1000/config.json
Model weights saved in ./results_bnert-time-st-y/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./results_bnert-time-st-y/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./results_bnert-time-st-y/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to ./results_bnert-time-st-y/checkpoint-1500
Configuration saved in ./results_bnert-time-st-y/checkpoint-1500/config.json
Model weights saved i

  0%|          | 0/3000 [00:00<?, ?ex/s]

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: data.length, data.year, year_sep, data.sentence, data.loc, st_year_sep, data.pol, year_date, data.ocr, data.nlp. If data.length, data.year, year_sep, data.sentence, data.loc, st_year_sep, data.pol, year_date, data.ocr, data.nlp are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 3000
  Batch size = 16


Creating a model for bnert-time-y


  0%|          | 0/10200 [00:00<?, ?ex/s]

  0%|          | 0/1800 [00:00<?, ?ex/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: data.year, year_sep, data.sentence, st_year_sep, data.pol, year_date. If data.year, year_sep, data.sentence, st_year_sep, data.pol, year_date are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10200
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 3190


Step,Training Loss
500,0.5691
1000,0.4361
1500,0.3641
2000,0.294
2500,0.2192
3000,0.1607


Saving model checkpoint to ./results_bnert-time-y/checkpoint-500
Configuration saved in ./results_bnert-time-y/checkpoint-500/config.json
Model weights saved in ./results_bnert-time-y/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results_bnert-time-y/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results_bnert-time-y/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ./results_bnert-time-y/checkpoint-1000
Configuration saved in ./results_bnert-time-y/checkpoint-1000/config.json
Model weights saved in ./results_bnert-time-y/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./results_bnert-time-y/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./results_bnert-time-y/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to ./results_bnert-time-y/checkpoint-1500
Configuration saved in ./results_bnert-time-y/checkpoint-1500/config.json
Model weights saved in ./results_bnert-time-y/checkpoint-

  0%|          | 0/3000 [00:00<?, ?ex/s]

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: data.length, data.year, year_sep, data.sentence, data.loc, st_year_sep, data.pol, year_date, data.ocr, data.nlp. If data.length, data.year, year_sep, data.sentence, data.loc, st_year_sep, data.pol, year_date, data.ocr, data.nlp are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 3000
  Batch size = 16


Creating a model for bnert-time-y_masked_25


  0%|          | 0/10200 [00:00<?, ?ex/s]

  0%|          | 0/1800 [00:00<?, ?ex/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: data.year, year_sep, data.sentence, st_year_sep, data.pol, year_date. If data.year, year_sep, data.sentence, st_year_sep, data.pol, year_date are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10200
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 3190


Step,Training Loss
500,0.5691
1000,0.4327
1500,0.3559
2000,0.2874
2500,0.213
3000,0.1561


Saving model checkpoint to ./results_bnert-time-y_masked_25/checkpoint-500
Configuration saved in ./results_bnert-time-y_masked_25/checkpoint-500/config.json
Model weights saved in ./results_bnert-time-y_masked_25/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results_bnert-time-y_masked_25/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results_bnert-time-y_masked_25/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ./results_bnert-time-y_masked_25/checkpoint-1000
Configuration saved in ./results_bnert-time-y_masked_25/checkpoint-1000/config.json
Model weights saved in ./results_bnert-time-y_masked_25/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./results_bnert-time-y_masked_25/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./results_bnert-time-y_masked_25/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to ./results_bnert-time-y_masked_25/checkpoint-1500
Configuration saved i

  0%|          | 0/3000 [00:00<?, ?ex/s]

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: data.length, data.year, year_sep, data.sentence, data.loc, st_year_sep, data.pol, year_date, data.ocr, data.nlp. If data.length, data.year, year_sep, data.sentence, data.loc, st_year_sep, data.pol, year_date, data.ocr, data.nlp are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 3000
  Batch size = 16


Creating a model for bnert-time-y_masked_75


  0%|          | 0/10200 [00:00<?, ?ex/s]

  0%|          | 0/1800 [00:00<?, ?ex/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: data.year, year_sep, data.sentence, st_year_sep, data.pol, year_date. If data.year, year_sep, data.sentence, st_year_sep, data.pol, year_date are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10200
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 3190


Step,Training Loss
500,0.5677
1000,0.4296
1500,0.3544
2000,0.2875
2500,0.2191
3000,0.1563


Saving model checkpoint to ./results_bnert-time-y_masked_75/checkpoint-500
Configuration saved in ./results_bnert-time-y_masked_75/checkpoint-500/config.json
Model weights saved in ./results_bnert-time-y_masked_75/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results_bnert-time-y_masked_75/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results_bnert-time-y_masked_75/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ./results_bnert-time-y_masked_75/checkpoint-1000
Configuration saved in ./results_bnert-time-y_masked_75/checkpoint-1000/config.json
Model weights saved in ./results_bnert-time-y_masked_75/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./results_bnert-time-y_masked_75/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./results_bnert-time-y_masked_75/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to ./results_bnert-time-y_masked_75/checkpoint-1500
Configuration saved i

  0%|          | 0/3000 [00:00<?, ?ex/s]

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: data.length, data.year, year_sep, data.sentence, data.loc, st_year_sep, data.pol, year_date, data.ocr, data.nlp. If data.length, data.year, year_sep, data.sentence, data.loc, st_year_sep, data.pol, year_date, data.ocr, data.nlp are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 3000
  Batch size = 16


Creating a model for bnert-pol-st


  0%|          | 0/10200 [00:00<?, ?ex/s]

  0%|          | 0/1800 [00:00<?, ?ex/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: data.year, year_sep, data.sentence, st_year_sep, data.pol, year_date. If data.year, year_sep, data.sentence, st_year_sep, data.pol, year_date are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10200
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 3190


Step,Training Loss
500,0.5812
1000,0.4363
1500,0.3677
2000,0.3028
2500,0.2332
3000,0.1719


Saving model checkpoint to ./results_bnert-pol-st/checkpoint-500
Configuration saved in ./results_bnert-pol-st/checkpoint-500/config.json
Model weights saved in ./results_bnert-pol-st/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results_bnert-pol-st/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results_bnert-pol-st/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ./results_bnert-pol-st/checkpoint-1000
Configuration saved in ./results_bnert-pol-st/checkpoint-1000/config.json
Model weights saved in ./results_bnert-pol-st/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./results_bnert-pol-st/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./results_bnert-pol-st/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to ./results_bnert-pol-st/checkpoint-1500
Configuration saved in ./results_bnert-pol-st/checkpoint-1500/config.json
Model weights saved in ./results_bnert-pol-st/checkpoint-

  0%|          | 0/3000 [00:00<?, ?ex/s]

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: data.length, data.year, year_sep, data.sentence, data.loc, st_year_sep, data.pol, year_date, data.ocr, data.nlp. If data.length, data.year, year_sep, data.sentence, data.loc, st_year_sep, data.pol, year_date, data.ocr, data.nlp are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 3000
  Batch size = 16


Creating a model for bnert-pol


  0%|          | 0/10200 [00:00<?, ?ex/s]

  0%|          | 0/1800 [00:00<?, ?ex/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: data.year, year_sep, data.sentence, st_year_sep, data.pol, year_date. If data.year, year_sep, data.sentence, st_year_sep, data.pol, year_date are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10200
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 3190


Step,Training Loss
500,0.592
1000,0.4296
1500,0.36
2000,0.2972
2500,0.226
3000,0.1695


Saving model checkpoint to ./results_bnert-pol/checkpoint-500
Configuration saved in ./results_bnert-pol/checkpoint-500/config.json
Model weights saved in ./results_bnert-pol/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results_bnert-pol/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results_bnert-pol/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ./results_bnert-pol/checkpoint-1000
Configuration saved in ./results_bnert-pol/checkpoint-1000/config.json
Model weights saved in ./results_bnert-pol/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./results_bnert-pol/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./results_bnert-pol/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to ./results_bnert-pol/checkpoint-1500
Configuration saved in ./results_bnert-pol/checkpoint-1500/config.json
Model weights saved in ./results_bnert-pol/checkpoint-1500/pytorch_model.bin
tokenizer config

  0%|          | 0/3000 [00:00<?, ?ex/s]

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: data.length, data.year, year_sep, data.sentence, data.loc, st_year_sep, data.pol, year_date, data.ocr, data.nlp. If data.length, data.year, year_sep, data.sentence, data.loc, st_year_sep, data.pol, year_date, data.ocr, data.nlp are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 3000
  Batch size = 16


In [20]:
# One row per model, one column per metric.
results_df = pd.DataFrame.from_dict(result_dict, orient='index')

In [21]:
# Emit the metrics table, rounded to three decimals, as LaTeX source.
print(results_df.round(3).to_latex())

\begin{tabular}{lrrr}
\toprule
{} &  f1\_macro &  f1\_micro &  accuracy \\
\midrule
distilbert             &     0.481 &     0.829 &     0.829 \\
hmd\_distilbert         &     0.549 &     0.833 &     0.833 \\
bnert-time-st-y        &     0.562 &     0.835 &     0.835 \\
bnert-time-y           &     0.550 &     0.843 &     0.843 \\
bnert-time-y\_masked\_25 &     0.568 &     0.840 &     0.840 \\
bnert-time-y\_masked\_75 &     0.523 &     0.831 &     0.831 \\
bnert-pol-st           &     0.545 &     0.830 &     0.830 \\
bnert-pol              &     0.545 &     0.835 &     0.835 \\
\bottomrule
\end{tabular}



  print(results_df.round(3).to_latex())


In [22]:
# Create the output directory first so the results of the long training run are
# not lost to a missing folder.
# NOTE(review): the file name has no '.csv' extension and spells 'classsify'
# with three s's; it is kept unchanged in case other code reads this exact path.
Path('tables').mkdir(parents=True, exist_ok=True)
results_df.to_csv('tables/classsify_pol_regex')