In [1]:
import torch
import seaborn as sns
import pandas as pd
import transformers
import evaluate
from transformers import pipeline, AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset, load_from_disk
from collections import Counter, defaultdict
from transformers import DataCollatorWithPadding
from pathlib import Path
import numpy as np
from sklearn.metrics import f1_score, classification_report, accuracy_score
from tqdm.auto import tqdm
import re
from nltk.tokenize import sent_tokenize
from sklearn.linear_model import LinearRegression
sns.set()

In [10]:
#pd.read_csv('../animacy.csv')

In [1]:
from datasets import load_dataset
dataset = load_dataset("csv", data_files="../revolution.csv",sep=',')

Using custom data configuration default-36ee4eefb063a0df


Downloading and preparing dataset csv/default to /home/kbeelen/.cache/huggingface/datasets/csv/default-36ee4eefb063a0df/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /home/kbeelen/.cache/huggingface/datasets/csv/default-36ee4eefb063a0df/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [6]:
dataset['meta']

["{'date': -3661459200000, 'location': 'Liverpool, Merseyside, England', 'ocr_quality_mean': 0.9283, 'title': 'The Northern Daily Times.', 'score': 0.4916961789}",
 "{'date': -3661286400000, 'location': 'Liverpool, Merseyside, England', 'ocr_quality_mean': 0.9762000000000001, 'title': 'The Northern Daily Times.', 'score': 0.4904969335}",
 "{'date': -3666988800000, 'location': 'Liverpool, Merseyside, England', 'ocr_quality_mean': 0.9552, 'title': 'The Northern Daily Times.', 'score': 0.48738962410000003}",
 "{'date': -3663792000000, 'location': 'Liverpool, Merseyside, England', 'ocr_quality_mean': 0.888, 'title': 'The Northern Daily Times.', 'score': 0.513350904}",
 "{'date': -3662496000000, 'location': 'Liverpool, Merseyside, England', 'ocr_quality_mean': 0.9436, 'title': 'The Northern Daily Times.', 'score': 0.48617035150000004}",
 "{'date': -3668630400000, 'location': 'Liverpool, Merseyside, England', 'ocr_quality_mean': 0.9444, 'title': 'The Northern Daily Times.', 'score': 0.521285

In [3]:
dataset = dataset['train']

## Classify Animacy

In [4]:
def pred_data(example,add_field='Date'):
    return {'st_year_sep': f'[{example[add_field]}]' + ' [SEP] ' + example['text'] ,
     'year_sep': str(example[add_field]) + ' [SEP] ' + example['text'] ,
     'year_date': str(example[add_field]) + ' [DATE] ' + example['text'] 
        
    }
    
dataset = dataset.map(pred_data , num_proc=6)

           

#0:   0%|          | 0/24 [00:00<?, ?ex/s]

KeyError: 'Date'

In [38]:
dataset['st_year_sep'][1]

'[1880] [SEP] Immured in a convent, debarred from life-giving air and light, and the beauty of life, we cease to be living, feeling, thinking girls and women, we become mere ***machines*** who blindly obey the head that directs us.'

In [25]:
len(dataset)

594

In [26]:
lab2code = {label:i for i,label in enumerate(dataset.unique('animacy'))}
num_labels = len(lab2code)
dataset = dataset.map(lambda x: {'label': lab2code[x['animacy']]})

  0%|          | 0/594 [00:00<?, ?ex/s]

In [27]:
dataset

Dataset({
    features: ['Unnamed: 0', 'Date', 'Sentence', 'SentenceCtxt', 'SentenceId', 'TargetExpression', 'animacy', 'humanness', 'st_year_sep', 'year_sep', 'year_date', 'label'],
    num_rows: 594
})

In [28]:
test_size = int(len(dataset)*.2)
train_test = dataset.train_test_split(test_size=test_size , seed=42)
test_set = train_test['test']
val_size = int(len(train_test['train'])*.1)
train_val =  train_test['train'].train_test_split(test_size=val_size,seed=42))

In [29]:
checkpoints = [('distilbert','distilbert-base-uncased','[SEP]','year_sep'),
               ('hmd_distilbert','/datadrive_2/bnert-hmd','[SEP]','year_sep'),
               ('bnert-time-st-y','/datadrive_2/bnert-time-st-y','[SEP]','st_year_sep'),
               ('bnert-time-y','/datadrive_2/bnert-time-y','[DATE]','year_date'),
               ('bnert-time-y_masked_25','/datadrive_2/bnert-time-y_masked_25','[DATE]','year_date'),
               ('bnert-time-y_masked_75','/datadrive_2/bnert-time-y_masked_75','[DATE]','year_date')]

model_dict = defaultdict(dict)
for name,checkpoint, st, sent_col in checkpoints:
    model_dict[name]['model'] = AutoModelForSequenceClassification.from_pretrained(checkpoint,num_labels=num_labels)
    model_dict[name]['tokenizer'] = AutoTokenizer.from_pretrained(checkpoint)
    #model_dict[name]['special_token'] = st
    model_dict[name]['sentences'] = sent_col

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'pre_clas

In [31]:
train_val

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'Date', 'Sentence', 'SentenceCtxt', 'SentenceId', 'TargetExpression', 'animacy', 'humanness', 'st_year_sep', 'year_sep', 'year_date', 'label'],
        num_rows: 429
    })
    test: Dataset({
        features: ['Unnamed: 0', 'Date', 'Sentence', 'SentenceCtxt', 'SentenceId', 'TargetExpression', 'animacy', 'humanness', 'st_year_sep', 'year_sep', 'year_date', 'label'],
        num_rows: 47
    })
})

In [32]:
train_val = train_val.remove_columns(['Unnamed: 0','SentenceCtxt', 'SentenceId', 'TargetExpression','animacy', 'humanness'])
train_val

DatasetDict({
    train: Dataset({
        features: ['Date', 'Sentence', 'st_year_sep', 'year_sep', 'year_date', 'label'],
        num_rows: 429
    })
    test: Dataset({
        features: ['Date', 'Sentence', 'st_year_sep', 'year_sep', 'year_date', 'label'],
        num_rows: 47
    })
})

In [39]:
#def add_text_col(example,source):
#    return {'text' : example[source]}

def preprocess_function(examples, target_col):
    return tokenizer(examples[target_col], truncation=True)

In [44]:
result_dict = defaultdict(dict)

for name, mdict in model_dict.items():
    print(f'Creating a model for {name}')
    tokenizer = model_dict[name]['tokenizer']
    model = model_dict[name]['model']
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    sent_col = model_dict[name]['sentences']
    
    #train_val = train_val.map(add_text_col,fn_kwargs={'source': sent_col})
    train_val = train_val.map(preprocess_function,fn_kwargs={'target_col': sent_col})
    
    training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
        )

    trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_val["train"],
    eval_dataset=train_val["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
        )


    trainer.train()
    
    
    test_set = test_set.map(preprocess_function,fn_kwargs={'target_col': sent_col})
    predictions = trainer.predict(test_set)
    preds = np.argmax(predictions.predictions, axis=-1)
    result_dict[name]['f1_macro'] = f1_score(preds,predictions.label_ids,average='macro')
    result_dict[name]['f1_micro'] = f1_score(preds,predictions.label_ids,average='micro')
    result_dict[name]['accuracy']  = accuracy_score(preds,predictions.label_ids)

Creating a model for distilbert


  0%|          | 0/429 [00:00<?, ?ex/s]

  0%|          | 0/47 [00:00<?, ?ex/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: year_date, st_year_sep, year_sep, Date, Sentence. If year_date, st_year_sep, year_sep, Date, Sentence are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 429
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 81


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




  0%|          | 0/118 [00:00<?, ?ex/s]

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: humanness, animacy, year_sep, SentenceId, st_year_sep, year_date, Date, Sentence, TargetExpression, SentenceCtxt, Unnamed: 0. If humanness, animacy, year_sep, SentenceId, st_year_sep, year_date, Date, Sentence, TargetExpression, SentenceCtxt, Unnamed: 0 are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 118
  Batch size = 16


Creating a model for hmd_distilbert


  0%|          | 0/429 [00:00<?, ?ex/s]

  0%|          | 0/47 [00:00<?, ?ex/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: year_date, st_year_sep, year_sep, Date, Sentence. If year_date, st_year_sep, year_sep, Date, Sentence are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 429
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 81


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




  0%|          | 0/118 [00:00<?, ?ex/s]

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: humanness, animacy, year_sep, SentenceId, st_year_sep, year_date, Date, Sentence, TargetExpression, SentenceCtxt, Unnamed: 0. If humanness, animacy, year_sep, SentenceId, st_year_sep, year_date, Date, Sentence, TargetExpression, SentenceCtxt, Unnamed: 0 are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 118
  Batch size = 16


Creating a model for bnert-time-st-y


  0%|          | 0/429 [00:00<?, ?ex/s]

  0%|          | 0/47 [00:00<?, ?ex/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: year_date, st_year_sep, year_sep, Date, Sentence. If year_date, st_year_sep, year_sep, Date, Sentence are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 429
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 81


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




  0%|          | 0/118 [00:00<?, ?ex/s]

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: humanness, animacy, year_sep, SentenceId, st_year_sep, year_date, Date, Sentence, TargetExpression, SentenceCtxt, Unnamed: 0. If humanness, animacy, year_sep, SentenceId, st_year_sep, year_date, Date, Sentence, TargetExpression, SentenceCtxt, Unnamed: 0 are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 118
  Batch size = 16


Creating a model for bnert-time-y


  0%|          | 0/429 [00:00<?, ?ex/s]

  0%|          | 0/47 [00:00<?, ?ex/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: year_date, st_year_sep, year_sep, Date, Sentence. If year_date, st_year_sep, year_sep, Date, Sentence are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 429
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 81


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




  0%|          | 0/118 [00:00<?, ?ex/s]

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: humanness, animacy, year_sep, SentenceId, st_year_sep, year_date, Date, Sentence, TargetExpression, SentenceCtxt, Unnamed: 0. If humanness, animacy, year_sep, SentenceId, st_year_sep, year_date, Date, Sentence, TargetExpression, SentenceCtxt, Unnamed: 0 are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 118
  Batch size = 16


Creating a model for bnert-time-y_masked_25


  0%|          | 0/429 [00:00<?, ?ex/s]

  0%|          | 0/47 [00:00<?, ?ex/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: year_date, st_year_sep, year_sep, Date, Sentence. If year_date, st_year_sep, year_sep, Date, Sentence are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 429
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 81


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




  0%|          | 0/118 [00:00<?, ?ex/s]

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: humanness, animacy, year_sep, SentenceId, st_year_sep, year_date, Date, Sentence, TargetExpression, SentenceCtxt, Unnamed: 0. If humanness, animacy, year_sep, SentenceId, st_year_sep, year_date, Date, Sentence, TargetExpression, SentenceCtxt, Unnamed: 0 are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 118
  Batch size = 16


Creating a model for bnert-time-y_masked_75


  0%|          | 0/429 [00:00<?, ?ex/s]

  0%|          | 0/47 [00:00<?, ?ex/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: year_date, st_year_sep, year_sep, Date, Sentence. If year_date, st_year_sep, year_sep, Date, Sentence are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 429
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 81


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




  0%|          | 0/118 [00:00<?, ?ex/s]

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: humanness, animacy, year_sep, SentenceId, st_year_sep, year_date, Date, Sentence, TargetExpression, SentenceCtxt, Unnamed: 0. If humanness, animacy, year_sep, SentenceId, st_year_sep, year_date, Date, Sentence, TargetExpression, SentenceCtxt, Unnamed: 0 are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 118
  Batch size = 16


In [45]:
results_df = pd.DataFrame.from_dict(result_dict, orient='index')

In [46]:
print(results_df.round(3).to_latex())

\begin{tabular}{lrrr}
\toprule
{} &  f1\_macro &  f1\_micro &  accuracy \\
\midrule
distilbert             &     0.752 &     0.771 &     0.771 \\
hmd\_distilbert         &     0.705 &     0.737 &     0.737 \\
bnert-time-st-y        &     0.724 &     0.763 &     0.763 \\
bnert-time-y           &     0.724 &     0.754 &     0.754 \\
bnert-time-y\_masked\_25 &     0.713 &     0.746 &     0.746 \\
bnert-time-y\_masked\_75 &     0.735 &     0.763 &     0.763 \\
\bottomrule
\end{tabular}



  print(results_df.round(3).to_latex())


In [203]:
!mkdir tables

In [204]:
results_df.to_csv('tables/classsify_pol_wo_regex')