In [151]:
import torch
import seaborn as sns
import pandas as pd
import transformers
import evaluate
from transformers import pipeline, AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset, load_from_disk
from collections import Counter, defaultdict
from transformers import DataCollatorWithPadding
from pathlib import Path
import numpy as np
from sklearn.metrics import f1_score, classification_report, accuracy_score
from tqdm.auto import tqdm
import re
from nltk.tokenize import sent_tokenize
from sklearn.linear_model import LinearRegression
sns.set()

In [9]:
dataset = load_from_disk('/datadrive_2/frozen_corpus')
test_data = dataset['test']

In [13]:
test_data

Dataset({
    features: ['year', 'nlp', 'pol', 'loc', 'sentences', 'ocr', 'length'],
    num_rows: 581857
})

## Classify by Political Leaning

In [10]:
pol_pattern = re.compile(r'\bliberal|\bconservat|\bministers?|\btory\b|\btories\b|\bgovernments?',re.I)

In [11]:
pol_pattern.findall('liberal governments do not fire their ministers')

['liberal', 'governments', 'ministers']

In [17]:
def sent_split(x):
     return {'data': [
                {'sentence':s.lower(),
                 'length': len(s.split()),
                 'pol': p, 'loc':l, 'year':y, 'ocr':o,'nlp':n} 
                     for y,p,l,o,n,t in zip(x['year'],x['pol'],x['loc'],x['ocr'],x['nlp'],x['sentences']) 
                      for s in sent_tokenize(t) if pol_pattern.findall(s)
                 ]
            }

test_data = test_data.map(sent_split,batched=True, remove_columns=test_data.column_names)

  0%|          | 0/582 [00:00<?, ?ba/s]

In [18]:
test_data = test_data.flatten()

In [20]:
test_data

Dataset({
    features: ['data.length', 'data.loc', 'data.nlp', 'data.ocr', 'data.pol', 'data.sentence', 'data.year'],
    num_rows: 65710
})

In [182]:
data = test_data.shuffle(seed=42).select(range(10000))

Loading cached shuffled indices for dataset at /datadrive_2/frozen_corpus/test/cache-b6ac1b4ffdd661cc.arrow


In [183]:
def pred_data(example,add_field='data.year'):
    return {'st_year_sep': f'[{example[add_field]}]' + ' [SEP] ' + example['data.sentence'] ,
     'year_sep': str(example[add_field]) + ' [SEP] ' + example['data.sentence'] ,
     'year_date': str(example[add_field]) + ' [DATE] ' + example['data.sentence'] 
        
    }
    
data = data.map(pred_data , num_proc=6)

       

Loading cached processed dataset at /datadrive_2/frozen_corpus/test/cache-e4d9b85ecddbe30c.arrow


 

Loading cached processed dataset at /datadrive_2/frozen_corpus/test/cache-800a8f5ff77e2b1f.arrow


 

Loading cached processed dataset at /datadrive_2/frozen_corpus/test/cache-7aabe4805d1a6c7d.arrow


 

Loading cached processed dataset at /datadrive_2/frozen_corpus/test/cache-ce31827302df7a19.arrow


  

Loading cached processed dataset at /datadrive_2/frozen_corpus/test/cache-24e8b25c3f85ce18.arrow
Loading cached processed dataset at /datadrive_2/frozen_corpus/test/cache-1e1fc25b31fa37fe.arrow


In [184]:
len(data)

10000

In [185]:
lab2code = {label:i for i,label in enumerate(data.unique('data.pol'))}
num_labels = len(lab2code)
data = data.map(lambda x: {'label': lab2code[x['data.pol']]})

  0%|          | 0/10000 [00:00<?, ?ex/s]

In [186]:
data

Dataset({
    features: ['data.length', 'data.loc', 'data.nlp', 'data.ocr', 'data.pol', 'data.sentence', 'data.year', 'st_year_sep', 'year_sep', 'year_date', 'label'],
    num_rows: 10000
})

In [187]:
test_size = int(len(data)*.1)
train_test = data.train_test_split(test_size=test_size)
test_set = train_test['test']
val_size = int(len(train_test['train'])*.1)
train_val =  train_test['train'].train_test_split(test_size=val_size)

In [188]:
checkpoints = [('distilbert','distilbert-base-uncased','[SEP]','year_sep'),
               ('hmd_distilbert','/datadrive_2/bnert-hmd','[SEP]','year_sep'),
               ('bnert-time-st-y','/datadrive_2/bnert-time-st-y','[SEP]','st_year_sep'),
               ('bnert-time-y','/datadrive_2/bnert-time-y','[DATE]','year_date'),
               ('bnert-time-y_masked_25','/datadrive_2/bnert-time-y_masked_25','[DATE]','year_date'),
               ('bnert-time-y_masked_75','/datadrive_2/bnert-time-y_masked_75','[DATE]','year_date')]

model_dict = defaultdict(dict)
for name,checkpoint, st, sent_col in checkpoints:
    model_dict[name]['model'] = AutoModelForSequenceClassification.from_pretrained(checkpoint,num_labels=num_labels)
    model_dict[name]['tokenizer'] = AutoTokenizer.from_pretrained(checkpoint)
    #model_dict[name]['special_token'] = st
    model_dict[name]['sentences'] = sent_col

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /home/kbeelen/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weig

loading weights file /datadrive_2/bnert-time-st-y/pytorch_model.bin
Some weights of the model checkpoint at /datadrive_2/bnert-time-st-y were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at /datadrive_2/bnert-time-st-y and a

loading file /datadrive_2/bnert-time-y_masked_75/vocab.txt
loading file /datadrive_2/bnert-time-y_masked_75/tokenizer.json
loading file /datadrive_2/bnert-time-y_masked_75/added_tokens.json
loading file /datadrive_2/bnert-time-y_masked_75/special_tokens_map.json
loading file /datadrive_2/bnert-time-y_masked_75/tokenizer_config.json


In [189]:
train_val = train_val.remove_columns(['data.nlp', 'data.ocr','data.length', 'data.loc'])
train_val

DatasetDict({
    train: Dataset({
        features: ['data.pol', 'data.sentence', 'data.year', 'st_year_sep', 'year_sep', 'year_date', 'label'],
        num_rows: 8100
    })
    test: Dataset({
        features: ['data.pol', 'data.sentence', 'data.year', 'st_year_sep', 'year_sep', 'year_date', 'label'],
        num_rows: 900
    })
})

In [190]:
#def add_text_col(example,source):
#    return {'text' : example[source]}

def preprocess_function(examples, target_col):
    return tokenizer(examples[target_col], truncation=True)

In [None]:
result_dict = defaultdict(dict)

for name, mdict in model_dict.items():
    print(f'Creating a model for {name}')
    tokenizer = model_dict[name]['tokenizer']
    model = model_dict[name]['model']
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    sent_col = model_dict[name]['sentences']
    
    #train_val = train_val.map(add_text_col,fn_kwargs={'source': sent_col})
    train_val = train_val.map(preprocess_function,fn_kwargs={'target_col': sent_col})
    
    training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
        )

    trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_val["train"],
    eval_dataset=train_val["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
        )


    trainer.train()
    
    
    test_set = test_set.map(preprocess_function,fn_kwargs={'target_col': sent_col})
    predictions = trainer.predict(test_set)
    preds = np.argmax(predictions.predictions, axis=-1)
    result_dict[name]['f1_macro'] = f1_score(preds,predictions.label_ids,average='macro')
    result_dict[name]['f1_micro'] = f1_score(preds,predictions.label_ids,average='micro')
    result_dict[name]['accuracy']  = accuracy_score(preds,predictions.label_ids)

Creating a model for distilbert


  0%|          | 0/8100 [00:00<?, ?ex/s]

  0%|          | 0/900 [00:00<?, ?ex/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: st_year_sep, year_sep, data.year, data.pol, data.sentence, year_date. If st_year_sep, year_sep, data.year, data.pol, data.sentence, year_date are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 8100
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2535


Step,Training Loss
500,0.582
1000,0.4559
1500,0.3881
2000,0.3019
2500,0.2304


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-1500
Configuration saved in ./results/checkpoint-1500/config.json
Model weights saved in ./results/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1500/special_toke

  0%|          | 0/1000 [00:00<?, ?ex/s]

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: data.ocr, st_year_sep, data.loc, year_sep, data.nlp, data.length, data.year, data.pol, data.sentence, year_date. If data.ocr, st_year_sep, data.loc, year_sep, data.nlp, data.length, data.year, data.pol, data.sentence, year_date are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 1000
  Batch size = 16


Creating a model for hmd_distilbert


  0%|          | 0/8100 [00:00<?, ?ex/s]

  0%|          | 0/900 [00:00<?, ?ex/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: st_year_sep, year_sep, data.year, data.pol, data.sentence, year_date. If st_year_sep, year_sep, data.year, data.pol, data.sentence, year_date are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 8100
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2535


Step,Training Loss
500,0.5825
1000,0.4442
1500,0.3547
2000,0.2657
2500,0.2027


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-1500
Configuration saved in ./results/checkpoint-1500/config.json
Model weights saved in ./results/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1500/special_toke

  0%|          | 0/1000 [00:00<?, ?ex/s]

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: data.ocr, st_year_sep, data.loc, year_sep, data.nlp, data.length, data.year, data.pol, data.sentence, year_date. If data.ocr, st_year_sep, data.loc, year_sep, data.nlp, data.length, data.year, data.pol, data.sentence, year_date are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 1000
  Batch size = 16


Creating a model for bnert-time-st-y


  0%|          | 0/8100 [00:00<?, ?ex/s]

  0%|          | 0/900 [00:00<?, ?ex/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: st_year_sep, year_sep, data.year, data.pol, data.sentence, year_date. If st_year_sep, year_sep, data.year, data.pol, data.sentence, year_date are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 8100
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2535


Step,Training Loss
500,0.5725
1000,0.4478
1500,0.364
2000,0.2794


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to ./results/checkpoint-1500
Configuration saved in ./results/checkpoint-1500/config.json
Model weights saved in ./results/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-1500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1500/special_toke

In [None]:
results_df = pd.DataFrame.from_dict(result_dict, orient='index')

In [177]:
results_df

Unnamed: 0,f1_macro,f1_micro,accuracy
distilbert,0.253306,0.71,0.71
hmd_distilbert,0.253306,0.71,0.71
bnert-time-st-y,0.332185,0.77,0.77
bnert-time-y,0.326749,0.77,0.77
bnert-time-y_masked_25,0.312792,0.75,0.75
bnert-time-y_masked_75,0.327077,0.76,0.76
