In [1]:
import torch
import seaborn as sns
import pandas as pd
import transformers
import evaluate
from transformers import pipeline, AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset, load_from_disk
from collections import Counter, defaultdict
from transformers import DataCollatorWithPadding
from pathlib import Path
import numpy as np
from sklearn.metrics import f1_score, classification_report, accuracy_score
from tqdm.auto import tqdm
import re
from nltk.tokenize import sent_tokenize
from sklearn.linear_model import LinearRegression
# Switch matplotlib to seaborn's default theme.
sns.set()

In [2]:
# Hugging Face cache location (not referenced again in the cells visible here).
cache_dir = '/datadrive_2/hf_cache/'

# Load the 100-token-chunked corpus from disk.
# (An earlier variant loaded '/datadrive_2/' and used its 'test' split.)
corpus_path = "/datadrive_2/HMD_chunked_100_test/"
dataset = load_from_disk(corpus_path)

# Show the dataset summary (features and row count).
dataset

Dataset({
    features: ['year', 'nlp', 'pol', 'loc', 'sentences', 'ocr'],
    num_rows: 11315511
})

## Classify by Political Leaning

In [3]:
# Case-insensitive matcher for party vocabulary: any word starting with
# "liberal" or "conservat", plus the whole words "tory" / "tories".
pol_pattern = re.compile(
    r'\bliberal|\bconservat|\btory\b|\btories\b',
    flags=re.IGNORECASE,
)

In [4]:
# Sanity check: the prefixes match word-initially, so 'conservative' yields 'conservat'.
pol_pattern.findall('liberal governments do not fire their conservative political ministers minister')

['liberal', 'conservat']

In [5]:
# def sent_split(x):
#      return {'data': [
#                 {'sentence':s.lower(),
#                  'length': len(s.split()),
#                  'pol': p, 'loc':l, 'year':y, 'ocr':o,'nlp':n} 
#                      for y,p,l,o,n,t in zip(x['year'],x['pol'],x['loc'],x['ocr_quality_mean'],x['nlp'],x['text']) 
#                       for s in sent_tokenize(t) 
#                          if pol_pattern.findall(s)
#                  ]
#             }

# test_data = dataset.map(sent_split,batched=True, remove_columns=dataset.column_names)

In [6]:
# Switch: True keeps only chunks that mention party vocabulary (see pol_pattern);
# False samples from the whole corpus.
political_vocab = True

# Lower-casing is needed in both modes, so it is done once instead of once per branch.
lowercased = dataset.map(lambda x: {'sentences': x['sentences'].lower()}, num_proc=6)

if political_vocab:
    # search() stops at the first hit; there is no need to collect every match
    # just to test whether one exists.
    candidates = lowercased.filter(lambda x: pol_pattern.search(x['sentences']) is not None)
else:
    candidates = lowercased

shuffled = candidates.shuffle(seed=42)

# Sample up to 15000 rows. Capping at the available length keeps the cell from
# failing when fewer than 15000 rows survive the filter.
test_data = shuffled.select(range(min(15000, len(shuffled))))

       

Loading cached processed dataset at /datadrive_2/HMD_chunked_100_test/cache-6f3d8b9a1021878e.arrow


 

Loading cached processed dataset at /datadrive_2/HMD_chunked_100_test/cache-b3061892df3a5bbe.arrow


 

Loading cached processed dataset at /datadrive_2/HMD_chunked_100_test/cache-2d79338e1133aae6.arrow


 

Loading cached processed dataset at /datadrive_2/HMD_chunked_100_test/cache-24d2a5525e368df0.arrow


  

Loading cached processed dataset at /datadrive_2/HMD_chunked_100_test/cache-06bf12bcec6754fc.arrow
Loading cached processed dataset at /datadrive_2/HMD_chunked_100_test/cache-a090791e087fa494.arrow
Loading cached processed dataset at /datadrive_2/HMD_chunked_100_test/cache-b005ec092fb2e6fc.arrow
Loading cached shuffled indices for dataset at /datadrive_2/HMD_chunked_100_test/cache-576b39d14e3bb9a9.arrow


In [7]:
# Inspect one sampled record (index 11) to eyeball the lower-cased text and metadata.
test_data[11]

{'year': 1865,
 'nlp': 2194,
 'pol': '[lib]',
 'loc': '[london]',
 'sentences': 'not tell what kind of a man they werefighting. how the conservatives and extreme ra-dicals could unite in supporting mr. cobbett hadbeen a puzzle to him ever since he took a part in thepolitics of oldham. the time would soon arrivewhen they would have to think of politics, and ofthe promotion of liberal principles, by which hemeant principles in accordance with the legislationof the last 30 years. after the results they had seen,he did not think that any of them would like to goback. (hear.) opponents who had fought againstthat great and good man who had lately departedfrom them, were',
 'ocr': 0.9807}

In [8]:
#data = test_data.filter(lambda x: x['data.length'] > 25).shuffle(seed=42).select(range(15000))

In [9]:
def pred_data(example, add_field='year'):
    """Build the three metadata-prefixed variants of one text chunk.

    Args:
        example: mapping holding the chunk text under 'sentences' and the
            metadata value under `add_field`.
        add_field: key of the metadata value to prepend (default 'year').

    Returns:
        dict with three strings:
            'st_year_sep': '[<value>] [SEP] <sentences>'
            'year_sep':    '<value> [SEP] <sentences>'
            'year_date':   '<value> [DATE] <sentences>'
    """
    stamp = str(example[add_field])
    body = example['sentences']

    variants = {}
    variants['st_year_sep'] = '[' + stamp + ']' + ' [SEP] ' + body
    variants['year_sep'] = stamp + ' [SEP] ' + body
    variants['year_date'] = stamp + ' [DATE] ' + body
    return variants
    
# Add the three prefixed-text columns produced by pred_data to every record.
data = test_data.map(pred_data , num_proc=6)

        

Loading cached processed dataset at /datadrive_2/HMD_chunked_100_test/cache-e90d0f0b765edf88.arrow
Loading cached processed dataset at /datadrive_2/HMD_chunked_100_test/cache-2f0fd14b8ff167b5.arrow


  

Loading cached processed dataset at /datadrive_2/HMD_chunked_100_test/cache-fe8323efe8a187d2.arrow
Loading cached processed dataset at /datadrive_2/HMD_chunked_100_test/cache-c9daafb7635b3c9d.arrow


 

Loading cached processed dataset at /datadrive_2/HMD_chunked_100_test/cache-78b5a10eeb251f17.arrow


 

Loading cached processed dataset at /datadrive_2/HMD_chunked_100_test/cache-8b198bca3e71b49c.arrow


In [10]:
# Confirm the size of the sampled working set.
len(data)

15000

In [11]:
# Integer class id for each political-leaning tag; the id is the tag's position in this list.
lab2code = {
    tag: idx
    for idx, tag in enumerate(('[con]', '[lib]', '[rad]', '[neutr]', '[none]'))
}
# Size of the classification head.
num_labels = len(lab2code)
# Attach the integer 'label' column; a 'pol' value missing from lab2code raises KeyError.
data = data.map(lambda x: {'label': lab2code[x['pol']]})

Loading cached processed dataset at /datadrive_2/HMD_chunked_100_test/cache-ebfbc9adbfcb7f5c.arrow


In [12]:
# Inspect the first record after the prefixed-text and label columns were added.
data[0]

{'year': 1839,
 'nlp': 2194,
 'pol': '[lib]',
 'loc': '[london]',
 'sentences': "such mea-sures, that triumphant majority which must carry themthrough every species of tory tribulatinn..—(cheer.s.)these are the views that i advocate—this theorpoislie.iytthat i would pursue. is it a dangerous.poliey,not recommended. by its moderation, by its good sense.?and if so, are we to support that government whichopposed to its recomnition or do you sanction hostility.to a government 'which 'will not adopt those.views,even though it may lead to a catastrophe of which theeffects are speculative ? i consider the prospect beforeus is bright, and that we have only to pursue a steadycourse of well doing, and that:your representatives oughtto enforce such sentiments as",
 'ocr': 0.9769,
 'st_year_sep': "[1839] [SEP] such mea-sures, that triumphant majority which must carry themthrough every species of tory tribulatinn..—(cheer.s.)these are the views that i advocate—this theorpoislie.iytthat i would purs

In [13]:
# Hold out 20% of the sample as the final test set.
test_size = int(.2 * len(data))
train_test = data.train_test_split(test_size=test_size, seed=1984)
test_set = train_test['test']

# Carve 15% of the remaining rows off as the validation split.
# In `train_val`, the 'test' key therefore holds the validation data.
remaining = train_test['train']
val_size = int(.15 * len(remaining))
train_val = remaining.train_test_split(test_size=val_size, seed=1984)

Loading cached split indices for dataset at /datadrive_2/HMD_chunked_100_test/cache-0183f1f5fa427ac2.arrow and /datadrive_2/HMD_chunked_100_test/cache-20456e9c23bc1a6c.arrow
Loading cached split indices for dataset at /datadrive_2/HMD_chunked_100_test/cache-ee6a50bc1b107305.arrow and /datadrive_2/HMD_chunked_100_test/cache-078c3d4a3ccd6457.arrow


In [14]:
# Show the train / validation split sizes and columns.
train_val

DatasetDict({
    train: Dataset({
        features: ['year', 'nlp', 'pol', 'loc', 'sentences', 'ocr', 'st_year_sep', 'year_sep', 'year_date', 'label'],
        num_rows: 10200
    })
    test: Dataset({
        features: ['year', 'nlp', 'pol', 'loc', 'sentences', 'ocr', 'st_year_sep', 'year_sep', 'year_date', 'label'],
        num_rows: 1800
    })
})

In [15]:
# One tuple per model to fine-tune: (name, checkpoint, special token, input column).
# Only uncommented entries are run; the others are kept as a record of earlier runs.
checkpoints = [
    #('distilbert','distilbert-base-uncased','[SEP]','year_sep'),
    #('hmd_distilbert','/datadrive_2/bnert-hmd','[SEP]','year_sep'),
    #('bnert-time-st-y','/datadrive_2/bnert-time-st-y','[SEP]','st_year_sep'),
    #('bnert-time-y','/datadrive_2/bnert-time-y','[DATE]','year_date'),
    #('bnert-time-y_masked_25','/datadrive_2/bnert-time-y_masked_25','[DATE]','year_date'),
    #('bnert-time-y_masked_75','/datadrive_2/bnert-time-y_masked_75','[DATE]','year_date'),
    #('bnert-pol-st','/datadrive_2/bnert-pol-st','[SEP]','year_sep'),
    #('bnert-pol','/datadrive_2/bnert-pol','[SEP]','year_sep')
    ('bnert-comb', '/datadrive_2/bnert-combined', '[POL]', 'year_sep'),
]

# name -> {'model': ..., 'tokenizer': ..., 'sentences': <input column>}
model_dict = defaultdict(dict)
for name, checkpoint, st, sent_col in checkpoints:
    entry = model_dict[name]
    # Classification head sized to the number of political-leaning classes.
    entry['model'] = AutoModelForSequenceClassification.from_pretrained(
        checkpoint,
        num_labels=num_labels,
    )
    entry['tokenizer'] = AutoTokenizer.from_pretrained(checkpoint)
    # `st` (the special token) is unpacked but deliberately not stored.
    entry['sentences'] = sent_col

Some weights of the model checkpoint at /datadrive_2/bnert-combined were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at /datadrive_2/bnert-combined and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pr

In [16]:
# Drop metadata columns that are not used for training, then show the result.
unused_columns = ['nlp', 'ocr', 'loc']
train_val = train_val.remove_columns(unused_columns)
train_val

DatasetDict({
    train: Dataset({
        features: ['year', 'pol', 'sentences', 'st_year_sep', 'year_sep', 'year_date', 'label'],
        num_rows: 10200
    })
    test: Dataset({
        features: ['year', 'pol', 'sentences', 'st_year_sep', 'year_sep', 'year_date', 'label'],
        num_rows: 1800
    })
})

In [17]:
#def add_text_col(example,source):
#    return {'text' : example[source]}

def preprocess_function(examples, target_col):
    """Tokenise the text stored under `target_col`.

    Relies on a notebook-level `tokenizer` being defined before the call.

    Args:
        examples: mapping from column name to text (as passed by `Dataset.map`).
        target_col: name of the column whose text is tokenised.

    Returns:
        Whatever the tokenizer returns for that text, with truncation enabled.
    """
    text = examples[target_col]
    return tokenizer(text, truncation=True)

In [18]:
# Test-set metrics per model name, filled in by the loop below.
result_dict = defaultdict(dict)

# Snapshot the datasets as they are when this cell starts, so every checkpoint is
# tokenised from the same input rather than from the previous iteration's output.
splits_in = train_val
test_in = test_set

for name, mdict in model_dict.items():
    print(f'Creating a model for {name}')
    # `tokenizer` has to be a notebook-level name: preprocess_function reads it as a global.
    tokenizer = mdict['tokenizer']
    model = mdict['model']
    sent_col = mdict['sentences']
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # train_val is still rebound, so later cells keep seeing the tokenised splits.
    train_val = splits_in.map(preprocess_function, fn_kwargs={'target_col': sent_col})

    training_args = TrainingArguments(
        seed=1984,
        output_dir=f"./results_{name}",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=5,
        weight_decay=0.01,
    )

    # NOTE(review): the validation split is passed as eval_dataset, but the run log
    # shows training loss only, so it appears to be unused during training. Confirm.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_val["train"],
        eval_dataset=train_val["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    trainer.train()

    # Persist the fine-tuned model and its tokenizer side by side.
    save_dir = f'/datadrive_2/{name}-pol'
    model.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)

    # Score on the held-out test set.
    test_set = test_in.map(preprocess_function, fn_kwargs={'target_col': sent_col})
    predictions = trainer.predict(test_set)
    preds = np.argmax(predictions.predictions, axis=-1)
    gold = predictions.label_ids

    # scikit-learn metrics take (y_true, y_pred) in that order.
    result_dict[name]['f1_macro'] = f1_score(gold, preds, average='macro')
    result_dict[name]['f1_micro'] = f1_score(gold, preds, average='micro')
    result_dict[name]['accuracy'] = accuracy_score(gold, preds)

Creating a model for bnert-comb


  0%|          | 0/10200 [00:00<?, ?ex/s]

  0%|          | 0/1800 [00:00<?, ?ex/s]

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: st_year_sep, pol, year_sep, year_date, sentences, year. If st_year_sep, pol, year_sep, year_date, sentences, year are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10200
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 3190


Step,Training Loss
500,0.5684
1000,0.377
1500,0.306
2000,0.2342
2500,0.1741
3000,0.1231


Saving model checkpoint to ./results_bnert-comb/checkpoint-500
Configuration saved in ./results_bnert-comb/checkpoint-500/config.json
Model weights saved in ./results_bnert-comb/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results_bnert-comb/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results_bnert-comb/checkpoint-500/special_tokens_map.json
Saving model checkpoint to ./results_bnert-comb/checkpoint-1000
Configuration saved in ./results_bnert-comb/checkpoint-1000/config.json
Model weights saved in ./results_bnert-comb/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./results_bnert-comb/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./results_bnert-comb/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to ./results_bnert-comb/checkpoint-1500
Configuration saved in ./results_bnert-comb/checkpoint-1500/config.json
Model weights saved in ./results_bnert-comb/checkpoint-1500/pytorch_model.bin
tok

  0%|          | 0/3000 [00:00<?, ?ex/s]

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: st_year_sep, loc, pol, year_sep, ocr, sentences, year_date, nlp, year. If st_year_sep, loc, pol, year_sep, ocr, sentences, year_date, nlp, year are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 3000
  Batch size = 16


In [19]:
# One row per model, one column per metric.
results_df = pd.DataFrame.from_dict(result_dict, orient='index')

In [20]:
# Print the metrics, rounded to three decimals, as a LaTeX table.
print(results_df.round(3).to_latex())

\begin{tabular}{lrrr}
\toprule
{} &  f1\_macro &  f1\_micro &  accuracy \\
\midrule
bnert-comb &     0.691 &      0.85 &      0.85 \\
\bottomrule
\end{tabular}



  print(results_df.round(3).to_latex())


In [21]:
# to_csv does not create missing folders, so make sure the output directory exists first.
Path('tables').mkdir(parents=True, exist_ok=True)
# File name kept exactly as before so anything that already reads it still works.
results_df.to_csv('tables/classsify_pol_regex.csv')