In [1]:
import torch
import seaborn as sns
import pandas as pd
import transformers
import evaluate
from transformers import pipeline, AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset, load_from_disk
from collections import Counter, defaultdict
from transformers import DataCollatorWithPadding
from pathlib import Path
import numpy as np
from sklearn.metrics import f1_score, classification_report, accuracy_score
from tqdm.auto import tqdm
import re
from nltk.tokenize import sent_tokenize
from sklearn.linear_model import LinearRegression
sns.set()

In [2]:
#pd.read_csv('../animacy.csv')

In [3]:
# Read the tab-separated animacy annotations through the generic "csv" loader.
# The displayed result is a DatasetDict with a single 'train' split.
from datasets import load_dataset

annotations_path = "../LwM-nlp-animacy-annotations-machines19thC.tsv"
dataset = load_dataset("csv", data_files=annotations_path, sep='\t')

Using custom data configuration default-f545cf31ffaba532
Reusing dataset csv (/home/kbeelen/.cache/huggingface/datasets/csv/default-f545cf31ffaba532/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)


  0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
# Display the loaded object to check its splits, column names and row count.
dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'Date', 'Sentence', 'SentenceCtxt', 'SentenceId', 'TargetExpression', 'animacy', 'humanness'],
        num_rows: 594
    })
})

In [5]:
# Keep only the 'train' split (the only split the loader produced).
# DatasetDict is a dict subclass while a single Dataset is not, so this guard
# makes the cell safe to re-run: a second execution leaves the already
# unwrapped Dataset untouched instead of failing on dataset['train'].
if isinstance(dataset, dict):
    dataset = dataset['train']

## Classify Animacy

In [6]:
def pred_data(example, add_field='Date'):
    """Build the three metadata-prefixed input variants for one example.

    Parameters
    ----------
    example : mapping
        A single row; must contain ``add_field`` and ``'Sentence'``.
    add_field : str
        Name of the column whose value is prepended to the sentence
        (defaults to ``'Date'``).

    Returns
    -------
    dict
        ``'st_year_sep'``: ``"[<value>] [SEP] <sentence>"``
        ``'year_sep'``:    ``"<value> [SEP] <sentence>"``
        ``'year_date'``:   ``"<value> [DATE] <sentence>"``
    """
    meta_value = example[add_field]
    sentence = example['Sentence']

    variants = {}
    variants['st_year_sep'] = f'[{meta_value}]' + ' [SEP] ' + sentence
    variants['year_sep'] = str(meta_value) + ' [SEP] ' + sentence
    variants['year_date'] = str(meta_value) + ' [DATE] ' + sentence
    return variants
    
# Add the three prefixed text columns produced by pred_data to every row,
# using 6 worker processes for the map.
dataset = dataset.map(pred_data , num_proc=6)

       

Loading cached processed dataset at /home/kbeelen/.cache/huggingface/datasets/csv/default-f545cf31ffaba532/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-b1749679b2d123b3.arrow


 

Loading cached processed dataset at /home/kbeelen/.cache/huggingface/datasets/csv/default-f545cf31ffaba532/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-dedb322f8239241e.arrow


 

Loading cached processed dataset at /home/kbeelen/.cache/huggingface/datasets/csv/default-f545cf31ffaba532/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-3c8f76cedee53320.arrow


 

Loading cached processed dataset at /home/kbeelen/.cache/huggingface/datasets/csv/default-f545cf31ffaba532/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-9407077a15a1c6bd.arrow


  

Loading cached processed dataset at /home/kbeelen/.cache/huggingface/datasets/csv/default-f545cf31ffaba532/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-cd985a8ed73dab56.arrow
Loading cached processed dataset at /home/kbeelen/.cache/huggingface/datasets/csv/default-f545cf31ffaba532/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-18fbd086368f089c.arrow


In [7]:
# Spot-check one formatted input: row 1 of the bracketed-year variant.
dataset['st_year_sep'][1]

'[1880] [SEP] Immured in a convent, debarred from life-giving air and light, and the beauty of life, we cease to be living, feeling, thinking girls and women, we become mere ***machines*** who blindly obey the head that directs us.'

In [8]:
# Number of annotated rows available before splitting.
len(dataset)

594

In [9]:
# Map each distinct `animacy` value to an integer class id and store it in a
# `label` column (the column name the Trainer reads targets from).
# The values are sorted before enumerating so the id assignment does not
# depend on which class happens to appear first in the file; this matters
# because the 'binary' F1 reported later treats class id 1 as the positive class.
# NOTE(review): assumes the animacy values are mutually comparable (all ints
# or all strings) - confirm against the TSV.
animacy_values = sorted(dataset.unique('animacy'))
lab2code = {label: i for i, label in enumerate(animacy_values)}
num_labels = len(lab2code)
dataset = dataset.map(lambda x: {'label': lab2code[x['animacy']]})

Loading cached processed dataset at /home/kbeelen/.cache/huggingface/datasets/csv/default-f545cf31ffaba532/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-620defd230eb9dde.arrow


In [10]:
# Inspect the dataset again to see which columns it now carries.
dataset

Dataset({
    features: ['Unnamed: 0', 'Date', 'Sentence', 'SentenceCtxt', 'SentenceId', 'TargetExpression', 'animacy', 'humanness', 'st_year_sep', 'year_sep', 'year_date', 'label'],
    num_rows: 594
})

In [11]:
# Two-stage split, both stages seeded identically:
#   1. hold out 30% of all rows as the test set;
#   2. take 5% of the remaining rows as a small validation set.
# Sizes are truncated to whole row counts before being passed on.
split_seed = 42

test_size = int(len(dataset) * .3)
train_test = dataset.train_test_split(test_size=test_size, seed=split_seed)
test_set = train_test['test']

remaining_train = train_test['train']
val_size = int(len(remaining_train) * .05)
train_val = remaining_train.train_test_split(test_size=val_size, seed=split_seed)

Loading cached split indices for dataset at /home/kbeelen/.cache/huggingface/datasets/csv/default-f545cf31ffaba532/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-988d90cc8db4b97a.arrow and /home/kbeelen/.cache/huggingface/datasets/csv/default-f545cf31ffaba532/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-a9deefc933d27db9.arrow
Loading cached split indices for dataset at /home/kbeelen/.cache/huggingface/datasets/csv/default-f545cf31ffaba532/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-d699096f8ae6a9de.arrow and /home/kbeelen/.cache/huggingface/datasets/csv/default-f545cf31ffaba532/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-244440c5d9be8e17.arrow


In [12]:
# Persist the train/test partition to disk.
# NOTE(review): this writes `train_test` (the first split), not the
# train/validation split, and the target is an absolute, machine-specific path.
train_test.save_to_disk('/datadrive_2/animacy_split')

Loading cached processed dataset at /home/kbeelen/.cache/huggingface/datasets/csv/default-f545cf31ffaba532/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-20e2afe38581158e.arrow
Loading cached processed dataset at /home/kbeelen/.cache/huggingface/datasets/csv/default-f545cf31ffaba532/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-262c34e8d3b7c2d6.arrow


In [13]:
# One entry per model to fine-tune:
#   (short name, checkpoint id or path, separator token, input text column)
checkpoints = [
    ('distilbert', 'distilbert-base-uncased', '[SEP]', 'year_sep'),
    ('hmd_distilbert', '/datadrive_2/bnert-hmd', '[SEP]', 'year_sep'),
    ('bnert-time-st-y', '/datadrive_2/bnert-time-st-y', '[SEP]', 'st_year_sep'),
    ('bnert-time-y', '/datadrive_2/bnert-time-y', '[DATE]', 'year_date'),
    ('bnert-time-y_masked_25', '/datadrive_2/bnert-time-y_masked_25', '[DATE]', 'year_date'),
    ('bnert-time-y_masked_75', '/datadrive_2/bnert-time-y_masked_75', '[DATE]', 'year_date'),
    ('bnert-pol-st', '/datadrive_2/bnert-pol-st', '[SEP]', 'year_sep'),
    ('bnert-pol', '/datadrive_2/bnert-pol', '[SEP]', 'year_sep'),
]

# Load a classification model and its tokenizer for every checkpoint up front
# and remember which text column each one should be fed.
# The separator token from the table is informational only; it is not stored.
model_dict = defaultdict(dict)
for name, checkpoint, _special_token, sent_col in checkpoints:
    classifier = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels)
    checkpoint_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model_dict[name] = {
        'model': classifier,
        'tokenizer': checkpoint_tokenizer,
        'sentences': sent_col,
    }

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at /datadrive_2/bnert-pol and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Show the train/validation DatasetDict and the size of each part.
train_val

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'Date', 'Sentence', 'SentenceCtxt', 'SentenceId', 'TargetExpression', 'animacy', 'humanness', 'st_year_sep', 'year_sep', 'year_date', 'label'],
        num_rows: 396
    })
    test: Dataset({
        features: ['Unnamed: 0', 'Date', 'Sentence', 'SentenceCtxt', 'SentenceId', 'TargetExpression', 'animacy', 'humanness', 'st_year_sep', 'year_sep', 'year_date', 'label'],
        num_rows: 20
    })
})

In [15]:
# Drop the annotation/bookkeeping columns that are not needed for training,
# then display what is left.
columns_to_drop = [
    'Unnamed: 0',
    'SentenceCtxt',
    'SentenceId',
    'TargetExpression',
    'animacy',
    'humanness',
]
train_val = train_val.remove_columns(columns_to_drop)
train_val

DatasetDict({
    train: Dataset({
        features: ['Date', 'Sentence', 'st_year_sep', 'year_sep', 'year_date', 'label'],
        num_rows: 396
    })
    test: Dataset({
        features: ['Date', 'Sentence', 'st_year_sep', 'year_sep', 'year_date', 'label'],
        num_rows: 20
    })
})

In [16]:
#def add_text_col(example,source):
#    return {'text' : example[source]}

def preprocess_function(examples, target_col, tokenizer_override=None):
    """Tokenize the text stored under ``target_col`` with truncation enabled.

    Parameters
    ----------
    examples : mapping
        A row (or batch of rows) containing ``target_col``.
    target_col : str
        Name of the column holding the text to tokenize.
    tokenizer_override : callable, optional
        Tokenizer to use. When omitted, the notebook-level ``tokenizer``
        variable is used, exactly as before, so existing calls keep working.

    Returns
    -------
    Whatever the tokenizer returns for ``examples[target_col]``.
    """
    # The global is only looked up when no tokenizer is passed explicitly,
    # so the function can also be used (and tested) without notebook state.
    active_tokenizer = tokenizer if tokenizer_override is None else tokenizer_override
    return active_tokenizer(examples[target_col], truncation=True)

In [17]:
# Fine-tune every loaded checkpoint on the training split, save the result,
# and score it on the held-out test set. Metrics end up in
# result_dict[<model name>][<metric name>].
result_dict = defaultdict(dict)

for name, mdict in model_dict.items():
    print(f'Creating a model for {name}')
    # `tokenizer` must remain a notebook-level name: preprocess_function
    # reads it as a global when datasets.map invokes it below.
    tokenizer = mdict['tokenizer']
    model = mdict['model']
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    sent_col = mdict['sentences']

    # Tokenize into per-model copies instead of overwriting the shared
    # `train_val` / `test_set`, so each model starts from the same untokenized
    # data and no tokenizer-specific columns carry over between iterations.
    train_val_tokenized = train_val.map(preprocess_function, fn_kwargs={'target_col': sent_col})

    training_args = TrainingArguments(
        output_dir=f"../results_{name}",
        seed=1984,
        learning_rate=1e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=5,
        weight_decay=0.01,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_val_tokenized["train"],
        eval_dataset=train_val_tokenized["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    trainer.train()

    # Model and tokenizer are written to the same directory.
    save_dir = f'/datadrive_2/bnert-{name}-animacy'
    model.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)

    test_set_tokenized = test_set.map(preprocess_function, fn_kwargs={'target_col': sent_col})
    predictions = trainer.predict(test_set_tokenized)
    y_true = predictions.label_ids
    preds = np.argmax(predictions.predictions, axis=-1)

    # scikit-learn metrics take (y_true, y_pred) in that order. F1 and accuracy
    # give the same value either way, but the documented order keeps the
    # code correct if an asymmetric metric (precision, recall) is added later.
    result_dict[name]['f1_binary'] = f1_score(y_true, preds, average='binary')
    result_dict[name]['f1_macro'] = f1_score(y_true, preds, average='macro')
    result_dict[name]['f1_micro'] = f1_score(y_true, preds, average='micro')
    result_dict[name]['accuracy'] = accuracy_score(y_true, preds)

Loading cached processed dataset at /home/kbeelen/.cache/huggingface/datasets/csv/default-f545cf31ffaba532/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-75de57af7530c59c.arrow
Loading cached processed dataset at /home/kbeelen/.cache/huggingface/datasets/csv/default-f545cf31ffaba532/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-0dcc26c71e076065.arrow


Creating a model for distilbert


The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: year_sep, Sentence, year_date, Date, st_year_sep. If year_sep, Sentence, year_date, Date, st_year_sep are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 396
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 250


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


Configuration saved in /datadrive_2/bnert-distilbert-animacy/config.json
Model weights saved in /datadrive_2/bnert-distilbert-animacy/pytorch_model.bin
tokenizer config file saved in /datadrive_2/bnert-distilbert-animacy/tokenizer_config.json
Special tokens file saved in /datadrive_2/bnert-distilbert-animacy/special_tokens_map.json
Loading cached processed dataset at /home/kbeelen/.cache/huggingface/datasets/csv/default-f545cf31ffaba532/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-17fffae25a4018bd.arrow
The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: Unnamed: 0, SentenceId, SentenceCtxt, TargetExpression, humanness, year_sep, Sentence, year_date, animacy, Date, st_year_sep. If Unnamed: 0, SentenceId, SentenceCtxt, TargetExpression, humanness, year_sep, Sentence, year_date, 

Loading cached processed dataset at /home/kbeelen/.cache/huggingface/datasets/csv/default-f545cf31ffaba532/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-9301a155df6f8e0d.arrow
Loading cached processed dataset at /home/kbeelen/.cache/huggingface/datasets/csv/default-f545cf31ffaba532/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-fe576c773ef4248a.arrow
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: year_sep, Sentence, year_date, Date, st_year_sep. If year_sep, Sentence, year_date, Date, st_year_sep are not expected by `DistilBertForSequ

Creating a model for hmd_distilbert


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


Configuration saved in /datadrive_2/bnert-hmd_distilbert-animacy/config.json
Model weights saved in /datadrive_2/bnert-hmd_distilbert-animacy/pytorch_model.bin
tokenizer config file saved in /datadrive_2/bnert-hmd_distilbert-animacy/tokenizer_config.json
Special tokens file saved in /datadrive_2/bnert-hmd_distilbert-animacy/special_tokens_map.json
Loading cached processed dataset at /home/kbeelen/.cache/huggingface/datasets/csv/default-f545cf31ffaba532/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-0bd28c6e0a658c45.arrow
The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: Unnamed: 0, SentenceId, SentenceCtxt, TargetExpression, humanness, year_sep, Sentence, year_date, animacy, Date, st_year_sep. If Unnamed: 0, SentenceId, SentenceCtxt, TargetExpression, humanness, year_sep, Sente

Loading cached processed dataset at /home/kbeelen/.cache/huggingface/datasets/csv/default-f545cf31ffaba532/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-eaa9975b99fce62b.arrow
Loading cached processed dataset at /home/kbeelen/.cache/huggingface/datasets/csv/default-f545cf31ffaba532/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-c2e9fa588bbd36b9.arrow
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: year_sep, Sentence, year_date, Date, st_year_sep. If year_sep, Sentence, year_date, Date, st_year_sep are not expected by `DistilBertForSequ

Creating a model for bnert-time-st-y


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


Configuration saved in /datadrive_2/bnert-bnert-time-st-y-animacy/config.json
Model weights saved in /datadrive_2/bnert-bnert-time-st-y-animacy/pytorch_model.bin
tokenizer config file saved in /datadrive_2/bnert-bnert-time-st-y-animacy/tokenizer_config.json
Special tokens file saved in /datadrive_2/bnert-bnert-time-st-y-animacy/special_tokens_map.json
Loading cached processed dataset at /home/kbeelen/.cache/huggingface/datasets/csv/default-f545cf31ffaba532/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-7d55045e23dc2177.arrow
The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: Unnamed: 0, SentenceId, SentenceCtxt, TargetExpression, humanness, year_sep, Sentence, year_date, animacy, Date, st_year_sep. If Unnamed: 0, SentenceId, SentenceCtxt, TargetExpression, humanness, year_sep, S

Loading cached processed dataset at /home/kbeelen/.cache/huggingface/datasets/csv/default-f545cf31ffaba532/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-26144a288eb105d2.arrow
Loading cached processed dataset at /home/kbeelen/.cache/huggingface/datasets/csv/default-f545cf31ffaba532/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-1fab93d4fed73bc5.arrow
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: year_sep, Sentence, year_date, Date, st_year_sep. If year_sep, Sentence, year_date, Date, st_year_sep are not expected by `DistilBertForSequ

Creating a model for bnert-time-y


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


Configuration saved in /datadrive_2/bnert-bnert-time-y-animacy/config.json
Model weights saved in /datadrive_2/bnert-bnert-time-y-animacy/pytorch_model.bin
tokenizer config file saved in /datadrive_2/bnert-bnert-time-y-animacy/tokenizer_config.json
Special tokens file saved in /datadrive_2/bnert-bnert-time-y-animacy/special_tokens_map.json
Loading cached processed dataset at /home/kbeelen/.cache/huggingface/datasets/csv/default-f545cf31ffaba532/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-22a345cc45f9e99a.arrow
The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: Unnamed: 0, SentenceId, SentenceCtxt, TargetExpression, humanness, year_sep, Sentence, year_date, animacy, Date, st_year_sep. If Unnamed: 0, SentenceId, SentenceCtxt, TargetExpression, humanness, year_sep, Sentence, yea

Loading cached processed dataset at /home/kbeelen/.cache/huggingface/datasets/csv/default-f545cf31ffaba532/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-849e366a6fe228de.arrow
Loading cached processed dataset at /home/kbeelen/.cache/huggingface/datasets/csv/default-f545cf31ffaba532/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-4056cfe1d33a3863.arrow
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: year_sep, Sentence, year_date, Date, st_year_sep. If year_sep, Sentence, year_date, Date, st_year_sep are not expected by `DistilBertForSequ

Creating a model for bnert-time-y_masked_25


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


Configuration saved in /datadrive_2/bnert-bnert-time-y_masked_25-animacy/config.json
Model weights saved in /datadrive_2/bnert-bnert-time-y_masked_25-animacy/pytorch_model.bin
tokenizer config file saved in /datadrive_2/bnert-bnert-time-y_masked_25-animacy/tokenizer_config.json
Special tokens file saved in /datadrive_2/bnert-bnert-time-y_masked_25-animacy/special_tokens_map.json
Loading cached processed dataset at /home/kbeelen/.cache/huggingface/datasets/csv/default-f545cf31ffaba532/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-7c1a8ac77c46c123.arrow
The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: Unnamed: 0, SentenceId, SentenceCtxt, TargetExpression, humanness, year_sep, Sentence, year_date, animacy, Date, st_year_sep. If Unnamed: 0, SentenceId, SentenceCtxt, TargetExpres

Loading cached processed dataset at /home/kbeelen/.cache/huggingface/datasets/csv/default-f545cf31ffaba532/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-95fd9f6e08f9a851.arrow
Loading cached processed dataset at /home/kbeelen/.cache/huggingface/datasets/csv/default-f545cf31ffaba532/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-49bb7480bfc1c774.arrow
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: year_sep, Sentence, year_date, Date, st_year_sep. If year_sep, Sentence, year_date, Date, st_year_sep are not expected by `DistilBertForSequ

Creating a model for bnert-time-y_masked_75


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


Configuration saved in /datadrive_2/bnert-bnert-time-y_masked_75-animacy/config.json
Model weights saved in /datadrive_2/bnert-bnert-time-y_masked_75-animacy/pytorch_model.bin
tokenizer config file saved in /datadrive_2/bnert-bnert-time-y_masked_75-animacy/tokenizer_config.json
Special tokens file saved in /datadrive_2/bnert-bnert-time-y_masked_75-animacy/special_tokens_map.json
Loading cached processed dataset at /home/kbeelen/.cache/huggingface/datasets/csv/default-f545cf31ffaba532/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-2342c5b08d9d0741.arrow
The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: Unnamed: 0, SentenceId, SentenceCtxt, TargetExpression, humanness, year_sep, Sentence, year_date, animacy, Date, st_year_sep. If Unnamed: 0, SentenceId, SentenceCtxt, TargetExpres

Loading cached processed dataset at /home/kbeelen/.cache/huggingface/datasets/csv/default-f545cf31ffaba532/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-dfd9aea49fe4b5cc.arrow
Loading cached processed dataset at /home/kbeelen/.cache/huggingface/datasets/csv/default-f545cf31ffaba532/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-5b1b49cd97ef32d9.arrow
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: year_sep, Sentence, year_date, Date, st_year_sep. If year_sep, Sentence, year_date, Date, st_year_sep are not expected by `DistilBertForSequ

Creating a model for bnert-pol-st


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


Configuration saved in /datadrive_2/bnert-bnert-pol-st-animacy/config.json
Model weights saved in /datadrive_2/bnert-bnert-pol-st-animacy/pytorch_model.bin
tokenizer config file saved in /datadrive_2/bnert-bnert-pol-st-animacy/tokenizer_config.json
Special tokens file saved in /datadrive_2/bnert-bnert-pol-st-animacy/special_tokens_map.json
Loading cached processed dataset at /home/kbeelen/.cache/huggingface/datasets/csv/default-f545cf31ffaba532/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-ce1e15633ba2b67a.arrow
The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: Unnamed: 0, SentenceId, SentenceCtxt, TargetExpression, humanness, year_sep, Sentence, year_date, animacy, Date, st_year_sep. If Unnamed: 0, SentenceId, SentenceCtxt, TargetExpression, humanness, year_sep, Sentence, yea

Loading cached processed dataset at /home/kbeelen/.cache/huggingface/datasets/csv/default-f545cf31ffaba532/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-a1e106b5d96a888d.arrow
Loading cached processed dataset at /home/kbeelen/.cache/huggingface/datasets/csv/default-f545cf31ffaba532/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-eefbfe5f7b5bfc2d.arrow
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: year_sep, Sentence, year_date, Date, st_year_sep. If year_sep, Sentence, year_date, Date, st_year_sep are not expected by `DistilBertForSequ

Creating a model for bnert-pol


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


Configuration saved in /datadrive_2/bnert-bnert-pol-animacy/config.json
Model weights saved in /datadrive_2/bnert-bnert-pol-animacy/pytorch_model.bin
tokenizer config file saved in /datadrive_2/bnert-bnert-pol-animacy/tokenizer_config.json
Special tokens file saved in /datadrive_2/bnert-bnert-pol-animacy/special_tokens_map.json
Loading cached processed dataset at /home/kbeelen/.cache/huggingface/datasets/csv/default-f545cf31ffaba532/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-74585550206ae520.arrow
The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: Unnamed: 0, SentenceId, SentenceCtxt, TargetExpression, humanness, year_sep, Sentence, year_date, animacy, Date, st_year_sep. If Unnamed: 0, SentenceId, SentenceCtxt, TargetExpression, humanness, year_sep, Sentence, year_date, anim

In [18]:
# Turn the nested metrics dict into a table: orient='index' makes each outer
# key (model name) a row and each inner key (metric name) a column.
results_df = pd.DataFrame.from_dict(result_dict, orient='index')

In [19]:
# Emit the metrics table, rounded to three decimals, as LaTeX tabular source.
print(results_df.round(3).to_latex())

\begin{tabular}{lrrrr}
\toprule
{} &  f1\_binary &  f1\_macro &  f1\_micro &  accuracy \\
\midrule
distilbert             &      0.752 &     0.809 &     0.826 &     0.826 \\
hmd\_distilbert         &      0.741 &     0.808 &     0.831 &     0.831 \\
bnert-time-st-y        &      0.762 &     0.816 &     0.831 &     0.831 \\
bnert-time-y           &      0.769 &     0.828 &     0.848 &     0.848 \\
bnert-time-y\_masked\_25 &      0.777 &     0.831 &     0.848 &     0.848 \\
bnert-time-y\_masked\_75 &      0.756 &     0.817 &     0.837 &     0.837 \\
bnert-pol-st           &      0.763 &     0.823 &     0.843 &     0.843 \\
bnert-pol              &      0.752 &     0.815 &     0.837 &     0.837 \\
\bottomrule
\end{tabular}



  print(results_df.round(3).to_latex())


# Fin.