In [1]:
import torch
import pandas as pd
import transformers
from transformers import DataCollatorForLanguageModeling
from transformers import AutoModelForMaskedLM
from transformers import AutoTokenizer
from datasets import load_dataset,load_from_disk
from transformers import Trainer
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize

We set a directory for caching the dataset files, otherwise the `/home` directory will overflow.

In [2]:
cache_dir = '/datadrive_2/hf_cache/'

## Load Dataset and filter

Import the datasets from the HuggingFace hub.

In [None]:
dataset = load_dataset("davanstrien/hmd_newspapers", cache_dir=cache_dir)
dataset

Because some files seem to lack a date, we filter first to avoid errors later on.

In [None]:
dataset = dataset.filter(lambda x: (x['date'] is not None) \
                         or (x['ocr_quality_mean'] is not None), num_proc=6)
dataset

## Link Data

In this section of the notebook we focus on further enriching an contextualizing the data by adding the political leaning as a variable. We also add the NLP identifier (used by FMP) which can be used later for adding metadata.

We will map the political leaning to special tokens added to the tokenizer instance below.

In [None]:
# create a mapping from NLP to political leaning
# this is created semi-manually for this limited dataset
nlp2pol = {2083:'neutral',
 2084:'neutral',
 2085:'neutral',
 2088:'conservative',
 2089:'conservative',
 2090:'conservative',
 2194:'liberal',
 2244:'none',
 2642:'liberal',
 2643:'conservative', # not found
 2644:'conservative',
 2645:'conservative',
 2646:'none', # https://en.wikipedia.org/wiki/The_Star_(1788)
 2647:'radical', # https://www.britishnewspaperarchive.co.uk/titles/statesman-london
}

pol2code = {'none':'[none]','neutral':'[neutr]','conservative':'[con]','liberal':'[lib]','radical':'[rad]'}

loc2code = {'Liverpool, Merseyside, England':'[liverpool]', 'London, England':'[london]'}

Because the original data lacks an NLP identifier for now, we add it via the title. For this we first need to create a mapping between NLP and titles for each publication year.

In [None]:
import requests
from collections import defaultdict
nlp_dict = defaultdict(dict)
url = 'https://raw.githubusercontent.com/Living-with-machines/hmd_url_generator/main/HMD_title_urls.json'
data = requests.get(url).json()
for newspaper, year_dict in data.items():
    for year, info_dict in year_dict.items():
        if info_dict['fname'].endswith('zip'):
            nlp_dict[newspaper][int(year)] = int(info_dict['fname'].split('_')[1])
nlp_dict.keys()

To match the titles between the `dataset` and the `nlp_dict` we need just one more dictionary.

In [None]:
dtitle2title = {'The Northern Daily Times.':'The Northern Daily Times etc',
 'Northern Times.':'The Northern Daily Times etc',
 'The Daily Times.':'The Northern Daily Times etc',
 'The Liverpool Standard and General Commercial Advertiser.':'The Liverpool Standard etc',
 'The Liverpool Standard, and General Advertiser.':'The Liverpool Standard etc',
 'The Sun.':'The Sun',
 'Colored News.':'Colored News',
 'The Express.':'The Express',
 'The National Register.':'National Register.',
 'The Press.':'The Press.',
 'Star.':'The Star',
 'The Statesman.':'The Statesman'
}

## Prepare Chunked Dataset

The code below extends information on each article by
- splitting the date into year and month
- add an NLP identifier as well codes for location and political leaning

In [None]:
def add_cols(example):

    nlp = nlp_dict[dtitle2title[example['title']]][example['date'].year]
    
    return {'year': example['date'].year,
            'length': len(example['text'].split()),
            'month': example['date'].month,
            'nlp' : nlp,
            'pol': pol2code[nlp2pol[nlp]],
            'loc': loc2code[example['location']]
           }


dataset = dataset.map(add_cols , num_proc=6)


In [None]:
dataset.save_to_disk('/datadrive_2/HMD_context')

In [None]:
lengths = pd.Series(dataset['train']['length'])
lengths.plot(kind='hist',bins=100)

In [None]:
sum(lengths > 100) / len(lengths)

In [None]:
dataset_long = dataset.filter(lambda x: x['length'] >= 100, num_proc=6)

In [None]:
len(dataset['train']),len(dataset_long['train'])

After replacing the columns with codes we can remove some information from the dataset.

In [None]:
dataset_long = dataset_long['train'].remove_columns(['title', 'location', 'date'])

In [None]:
#dataset_small = dataset.shuffle(seed=0).select(range(100_000))

In [None]:
dataset_long 

In [None]:
#include_cols = []

# def sent_split(x):
#      return {'data': [
#                 {'sentence':s.lower(),
#                  'length': len(s.split()),
#                  'pol': p, 'loc':l, 'year':y, 'ocr':o,'nlp':n} 
#                      for y,p,l,o,n,t in zip(x['year'],x['pol'],x['loc'],x['ocr_quality_mean'],x['nlp'],x['text']) 
#                       for s in sent_tokenize(t)]
            
            
#             }
     
def batched_prepare(examples, chunk_size=100):
    sentList, polList, locList, ocrList, yearList, nlpList = [],[],[],[],[],[]
    
    for text, pol, loc, ocr, year, nlp in zip(examples['text'],
                                         examples['pol'],
                                         examples['loc'],
                                         examples['ocr_quality_mean'],
                                         examples['year'],
                                         examples['nlp']):
        text = text.split()
        chunks = [' '.join(text[i:i+chunk_size]) for i in range(0,len(text),chunk_size)]
        
        sentList.extend(chunks)
        #lengthList.extend([len(s.split()) for s in sentences])
        polList.extend([pol]*len(chunks))
        locList.extend([loc]*len(chunks))
        ocrList.extend([ocr]*len(chunks))
        yearList.extend([year]*len(chunks))
        nlpList.extend([nlp]*len(chunks))
        
    return {"sentences": sentList, "loc": locList, "pol": polList,
             "ocr": ocrList, "year": yearList, 'nlp': nlpList, 
           }

            
chunked_corpus = dataset_long.map(batched_prepare, 
                              batched=True, 
                              num_proc=12,
                              remove_columns=dataset_long.column_names
                             )


In [None]:
chunked_corpus[30002]

In [None]:
len(chunked_corpus),len(dataset['train'])

In [None]:
# sent_corpus_flat = sent_corpus.flatten()
# sent_corpus_flat

In [None]:
chunked_corpus_split = chunked_corpus.train_test_split(
                        test_size=.1, seed=42)


In [None]:
chunked_corpus_split.save_to_disk('/datadrive_2/HMD_chunked_100_context')

### End of Dataset preparation

## Add special tokens to tokenizer

In [None]:
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
chunked_corpus_split = load_from_disk('/datadrive_2/HMD_chunked_100_context'); chunked_corpus_split

In [None]:
# #special year tokens
# added_tokens = []
# for mf in ['pol','loc']:
#     added_tokens.extend(chunked_corpus_split['train'].unique(mf))
    
        
# added_tokens.extend([f"[{e}]" for e in chunked_corpus_split['train'].unique('year')])
# tokenizer.add_tokens(added_tokens, )

# special_tokens =['[MET]','[DATE]','[POL]','[LOC]']
# metadata_tokens = {f'additional_special_tokens': special_tokens}
# tokenizer.add_special_tokens(metadata_tokens)

In [None]:
# no special year tokens
special_tokens =['[DATE]','[POL]','[LOC]']
metadata_tokens = {f'additional_special_tokens': special_tokens}
tokenizer.add_special_tokens(metadata_tokens)

In [None]:
tokenizer.save_pretrained("distilbert-hmd-uncased-combined")

In [None]:
chunked_corpus_split

## Further data processing

In [None]:
train_val_split = chunked_corpus_split['train'].train_test_split(test_size=.65, seed=42)

In [None]:
#train_val_split['test'].save_to_disk('/datadrive_2/HMD_chunked_100_test')

In [None]:

train_val_split = train_val_split['train'].train_test_split(test_size=.1, seed=42)

In [None]:
train_val_split

In [None]:
train_val_split = train_val_split.map(lambda example: {'length': [len(x.split()) for x in example['sentences']]}, 
                                        batched=True, num_proc=12)

In [None]:
train_val_split = train_val_split.filter(lambda x: x['length'] > 50, num_proc=6); train_val_split

In [None]:
sum(train_val_split['train']['length'])

In [None]:
train_val_split.save_to_disk('/datadrive_2/frozen_corpus')

# Change Preprocessing/Training From Here

In [None]:
train_val_split = load_from_disk('/datadrive_2/frozen_corpus')
tokenizer = AutoTokenizer.from_pretrained("distilbert-hmd-uncased-combined")

In [None]:
len(tokenizer)

In [None]:
train_val_split

In [None]:
def prepend_year_as_special_token(examples):
    return {'sent_context': [f'[{y}] [SEP] {s}' for y,s in zip(examples['year'],examples['sentences'])]}

#def prepend_pol_as_special_token(examples):
#    return {'sent_context': [f'{p} [SEP] {s}' for p,s in zip(examples['pol'],examples['sentences'])]}

def prepend_year(examples):
    return {'sent_context': [f'{y} [DATE] {s}' for y,s in zip(examples['year'],examples['sentences'])]}

def no_prepend(examples):
    return {'sent_context': [s for s in examples['sentences']]}

code2pol = {'[none]':'none','[neutr]':'neutral','[con]':'conservative','[lib]':'liberal','[rad]':'radical'}
code2loc = {'[london]':'london','[liverpool]':'liverpool'}

def prepend_combined(examples):
    return {'sent_context': [f'{y} [DATE] {code2pol[p]} [POL] {code2loc[l]} [LOC] {s}' for y,p,l,s in zip(examples['year'],
                                                              examples['pol'],
                                                              examples['loc'],
                                                              examples['sentences'])]}


training_corpus = train_val_split.map(prepend_combined, batched=True, num_proc=6)

In [None]:
training_corpus

In [None]:
print(tokenizer.tokenize(training_corpus['train'][10]['sent_context']))

In [None]:
def tokenize_sents(examples):
    return tokenizer(examples['sent_context'],
             truncation=True,
             padding='max_length',
             max_length=256,
             return_overflowing_tokens=False)
    

tokenized_contextualized = training_corpus.map(tokenize_sents,
                                                   batched=True,
                                                   num_proc=12,
                                                   remove_columns=training_corpus['train'].column_names)


In [None]:
tokenizer.decode(tokenized_contextualized['train'][10]['input_ids'])

In [None]:
tokenizer

In [None]:

tokenized_contextualized = tokenized_contextualized.map(
                lambda x: {'labels': [y.copy() for y in x['input_ids']]},
                batched=True,
                num_proc=12)


In [None]:
tokenized_contextualized

In [None]:
tokenized_contextualized.save_to_disk('/datadrive_2/frozen_backup')

# Training

In [3]:
tokenized_contextualized = load_from_disk('/datadrive_2/frozen_backup')
tokenizer = AutoTokenizer.from_pretrained("distilbert-hmd-uncased-combined")

In [4]:
tokenized_contextualized

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 5234550
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 581857
    })
})

In [5]:

#data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=.15)

In [6]:
from transformers import default_data_collator
def masking_metadata_collator(features):
    #print(features)
    for feature in features:
        
        input_ids = np.array(feature["input_ids"])
        no_pad = input_ids[np.where(input_ids > 0)[0]]
        labels = np.array(feature["labels"])
        new_labels = np.array([-100]*len(no_pad))
        mask = np.random.binomial(1, .15, (len(no_pad),))
        mask[1] = np.random.binomial(1, .75, size=1)
        idxs = np.where(mask)
        input_ids[idxs] = tokenizer.mask_token_id
        
        feature['input_ids'] = input_ids
        
    return default_data_collator(features)

def masking_combined_metadata_collator(features):
    #print(features)
    for feature in features:
        
        input_ids = np.array(feature["input_ids"])
        no_pad = input_ids[np.where(input_ids > 0)[0]]
        labels = np.array(feature["labels"])
        new_labels = np.array([-100]*len(no_pad))
        mask = np.random.binomial(1, .15, (len(no_pad),))
        mask[1] = np.random.binomial(1, .25, size=1)
        mask[3] = np.random.binomial(1, .25, size=1)
        mask[5] = np.random.binomial(1, .25, size=1)
        idxs = np.where(mask)
        input_ids[idxs] = tokenizer.mask_token_id
        
        feature['input_ids'] = input_ids
        
    return default_data_collator(features)

In [7]:
tokenizer.decode(tokenized_contextualized['train'][1000]['input_ids'])

'[CLS] 1850 [DATE] liberal [POL] london [LOC] of unusual importance, that itaffected in a great degree the future peace of thisem - pire, and he bored it would receive from the! lousetheconsideration it deserved. sir george grey was not in the least disposedto treat with indifference the question b fore the! louse ; nor did he think the house would ho indifferentto any practical measure on the subject. — ( hear, hear. ) the bill now before the house contemplatedthe extension of the summary jurisdiction possessed bymagistrates with regard to juvenile offenders. thatwas nothing more nor less then to leave the jaw pre - cisely es it was under the juvenile offendersact, introduced by his honourable friend themember ler [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]

In [8]:
model_checkpoint = "distilbert-base-uncased"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
model.resize_token_embeddings(len(tokenizer))

Embedding(30525, 768)

In [9]:
from transformers import TrainingArguments

batch_size = 64
# Show the training loss with every epoch
logging_steps = len(tokenized_contextualized["train"]) // batch_size
print(logging_steps)
model_name = model_checkpoint.split("/")[-1]

81789


In [10]:
model_name

'distilbert-base-uncased'

In [11]:
training_args = TrainingArguments(
    num_train_epochs=1,
    dataloader_drop_last=True,
    output_dir=f"/datadrive_2/{model_name}-combined-finetuned",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    save_steps=10000,
    #logging_steps=2000,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=False,
    fp16=True,
    logging_steps=logging_steps,
)

In [13]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_contextualized["train"],
    eval_dataset=tokenized_contextualized["test"],
    data_collator=masking_combined_metadata_collator,
)

Using cuda_amp half precision backend


In [14]:
trainer.train()

***** Running training *****
  Num examples = 5234550
  Num Epochs = 1
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 81789
  batch[k] = torch.tensor([f[k] for f in features])


Epoch,Training Loss,Validation Loss
1,0.3415,0.313347


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceed

TrainOutput(global_step=81789, training_loss=0.34152094108009634, metrics={'train_runtime': 92757.5027, 'train_samples_per_second': 56.433, 'train_steps_per_second': 0.882, 'total_flos': 3.469454447905014e+17, 'train_loss': 0.34152094108009634, 'epoch': 1.0})

In [15]:
model.save_pretrained('/datadrive_2/bnert-combined')
tokenizer.save_pretrained("/datadrive_2/bnert-combined")

Configuration saved in /datadrive_2/bnert-combined/config.json
Model weights saved in /datadrive_2/bnert-combined/pytorch_model.bin
tokenizer config file saved in /datadrive_2/bnert-combined/tokenizer_config.json
Special tokens file saved in /datadrive_2/bnert-combined/special_tokens_map.json


('/datadrive_2/bnert-combined/tokenizer_config.json',
 '/datadrive_2/bnert-combined/special_tokens_map.json',
 '/datadrive_2/bnert-combined/vocab.txt',
 '/datadrive_2/bnert-combined/added_tokens.json',
 '/datadrive_2/bnert-combined/tokenizer.json')

In [16]:
model.save_pretrained('/datadrive_2/bnert-combined_backup')
tokenizer.save_pretrained("/datadrive_2/bnert-combined_backup")

Configuration saved in /datadrive_2/bnert-combined_backup/config.json
Model weights saved in /datadrive_2/bnert-combined_backup/pytorch_model.bin
tokenizer config file saved in /datadrive_2/bnert-combined_backup/tokenizer_config.json
Special tokens file saved in /datadrive_2/bnert-combined_backup/special_tokens_map.json


('/datadrive_2/bnert-combined_backup/tokenizer_config.json',
 '/datadrive_2/bnert-combined_backup/special_tokens_map.json',
 '/datadrive_2/bnert-combined_backup/vocab.txt',
 '/datadrive_2/bnert-combined_backup/added_tokens.json',
 '/datadrive_2/bnert-combined_backup/tokenizer.json')

# Fin.

In [None]:
# #samples= [tokenized_contextualized_small[:3]]
# samples= [tokenized_contextualized_split['train'][i] for i in range(10)]
# batch = data_collator(samples)

# for chunk in batch["input_ids"]:
#     print(f"\n'>>> {tokenizer.decode(chunk)}'")

In [None]:
model.save_pretrained('/datadrive_2/bnert-time-y-backup')
tokenizer.save_pretrained("/datadrive_2/bnert-time-y-backup")

In [None]:
#tokenizer.save_pretrained("/datadrive_2/bnert_time")

In [None]:
model_new = AutoModelForMaskedLM.from_pretrained('bnert_2')

In [None]:
from transformers import pipeline

mask_filler = pipeline(
    "fill-mask", model="/datadrive_2/test"
)

In [None]:
mask_filler("[CLS] [MASK] [SEP] confident it will meet with that warm and generalsupport which its just and sound policy alike demand. by thus according to the people of ireland")

In [None]:
sent_contextualized['train'][200000]['sentences']

In [None]:
text = '1820 [SEP] 2 [SEP] 0 [SEP] the Prime Minister is Mr. [MASK].'
preds = mask_filler(text)

for pred in preds:
    print(f">>> {pred['sequence']}")


In [None]:
text = '1820 [SEP] 3 [SEP] 0 [SEP] there is plenty of [MASK] machines.'
preds = mask_filler(text)

for pred in preds:
    print(f">>> {pred['sequence']}")
#preds

In [None]:
text = '1820 [SEP] 6 [SEP] 14 [SEP] He was involved in a [MASK] accident.'
preds = mask_filler(text)

for pred in preds:
    print(f">>> {pred['sequence']}")

In [None]:
preds

In [None]:
'1820 6 14 he was involved in a serious accident.'

In [None]:
text = '1810 [SEP] 2 [SEP] 1 [SEP] [MASK] Majesty.'
preds = mask_filler(text)

for pred in preds:
    print(f">>> {pred['sequence']}")

In [None]:
text = '[MASK] [SEP] 2 [SEP] 1 [SEP] The war between Denmark and Germany took a deadly toll.'
preds = mask_filler(text)

for pred in preds:
    print(f">>> {pred['sequence']}")

In [None]:
text = '[MASK] [SEP] 6 [SEP] 14 [SEP] The war between France and Germany took a deadly toll.'
preds = mask_filler(text)

for pred in preds:
    print(f">>> {pred['sequence']}")

In [None]:
text = '1880 [SEP] 6 [SEP] 14 [SEP] The war in [MASK] took a deadly toll.'
preds = mask_filler(text)

for pred in preds:
    print(f">>> {pred['sequence']}")

In [None]:
text = '1880 [SEP] 2 [SEP] 14 [SEP] The revolution in [MASK].'
preds = mask_filler(text)

for pred in preds:
    print(f">>> {pred['sequence']}")

In [None]:
text = '1880 [SEP] [MASK] [SEP] 1 [SEP] liberal progress opinion liberal progress opinion liberal progress opinion.'
preds = mask_filler(text)

for pred in preds:
    print(f">>> {pred['sequence']}")

In [None]:
pol2id

In [None]:
text = '1870 [SEP] 0 [SEP] 0 [SEP] The train is leaving in [MASK].'
preds = mask_filler(text)

for pred in preds:
    print(f">>> {pred['sequence']}")

In [None]:
pol2id, loc2id

In [None]:
text = '1830 [SEP] 0 [SEP] 1 [SEP] The train is heading towards [MASK].'
preds = mask_filler(text)

for pred in preds:
    print(f">>> {pred['sequence'].upper()}")

In [None]:
text = '1830 [SEP] 0 [SEP] [MASK] [SEP] This paper is published in Manchester.'
preds = mask_filler(text)

for pred in preds:
    print(f">>> {pred['sequence'].upper()}")

In [None]:
inputs = tokenizer('1830 [SEP] 0 [SEP] 1 [SEP] The train is heading towards [MASK].', return_tensors="pt")
outputs = model(**inputs.to('cuda'))

outputs.logits.shape

In [None]:
from transformers import AutoModelForTokenClassification

In [None]:
tmodel = AutoModelForTokenClassification.from_pretrained('bnert_2')

In [None]:
tmodel.to('cuda')

In [None]:
inputs = tokenizer('1830 [SEP] 0 [SEP] 1 [SEP] The train is heading towards [MASK].', return_tensors="pt")
outputs = tmodel(**inputs.to('cuda'))

outputs.logits.shape

In [None]:


fpipe = pipeline('feature-extraction',model='bnert_2')

In [None]:
features1 = fpipe('1880 [SEP] 0 [SEP] 1 [SEP] The train is heading towards london.')
pen1 = features1[0][-2]

In [None]:
features2 = fpipe('1820 [SEP] 0 [SEP] 0 [SEP] The train is heading towards london.')
pen2 = features2[0][-2]

In [None]:
from scipy.spatial.distance import cosine
cosine(pen1,pen2)

## To revisit later

In [None]:
from transformers import default_data_collator
def masking_metadata_collator(features):
    #print(features)
    for feature in features:
        
        input_ids = np.array(feature["input_ids"])
        no_pad = input_ids[np.where(input_ids > 0)[0]]
        labels = np.array(feature["labels"])
        new_labels = np.array([-100]*len(no_pad))
        mask = np.random.binomial(1, .15, (len(no_pad),))
        mask[1] = np.random.binomial(1, .75, size=1)
        idxs = np.where(mask)
        input_ids[idxs] = tokenizer.mask_token_id
        
        feature['input_ids'] = input_ids
        
    return default_data_collator(features)


#samples= [tokenized_contextualized_small[:3]]
samples= [tokenized_contextualized['train'][i] for i in range(10)]
batch = masking_sentence(samples)

for chunk in batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")