<div style="font-variant: small-caps; 
      font-weight: normal; 
      font-size: 35px; 
      text-align: center; 
      padding: 15px; 
      margin: 10px;">
      Language Modeling
  </div> 
  
<div style="
      font-weight: normal; 
      font-size: 25px; 
      text-align: center; 
      padding: 15px; 
      margin: 10px;">
      Albert Small on custom task - Clinical Trials CTTI
  </div> 


  <div style="
      font-size: 15px; 
      line-height: 12px; 
      text-align: center; 
      padding: 15px; 
      margin: 10px;">
  Jean-baptiste AUJOGUE
  </div> 

  
  <div style=" float:right; 
      font-size: 12px; 
      line-height: 12px; 
  padding: 10px 15px 8px;">
  December 2022
  </div>

<a id="TOC"></a>

#### Table Of Content

1. [Dataset](#data) <br>
2. [ALBERT finetuning](#albert) <br>
3. [Inference](#inference) <br>



#### Reference

- Huggingface full list of [tutorial notebooks](https://github.com/huggingface/transformers/tree/main/notebooks) (see also [here](https://huggingface.co/docs/transformers/main/notebooks#pytorch-examples))
- Huggingface full list of [training scripts](https://github.com/huggingface/transformers/tree/main/examples/pytorch)
- Huggingface [tutorial notebook](https://github.com/huggingface/notebooks/blob/main/examples/language_modeling_from_scratch.ipynb) on language models
- Huggingface [course](https://huggingface.co/course/chapter7/3?fw=tf) on language models
- Huggingface [training script](https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.py) on language models
- Albert [original training protocol](https://github.com/google-research/albert)

In [2]:
# Enable IPython's autoreload extension (mode 2), so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import os
import sys
from itertools import chain

import pandas as pd
from datasets import (
    Dataset, 
    Features, 
    Value,
    load_from_disk,
)

import torch
import transformers
from transformers import (
    AutoTokenizer, 
    AutoModelForMaskedLM, 
    TrainingArguments, 
    Trainer,
    pipeline,
    set_seed,
)

  from .autonotebook import tqdm as notebook_tqdm


#### Transformers settings

In [4]:
# Display the installed transformers version, for reproducibility of this run.
transformers.__version__

'4.22.2'

In [5]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

In [6]:
# Fix the random seed through transformers' `set_seed` helper,
# so that repeated runs of the training are comparable.
set_seed(42)

#### Custom paths & imports

In [7]:
# The parent of the current working directory is taken as the repository root;
# every other location is derived from it.
path_to_repo = os.path.dirname(os.getcwd())
# Folder holding the corpus .tsv file and the cached datasets (under 'tmp').
path_to_data = os.path.join(path_to_repo, 'datasets', 'clinical trials CTTI')
# Folder from which tokenizers / models are loaded and to which they are saved.
path_to_save = os.path.join(path_to_repo, 'models', 'mlm')
# Folder of local source code, added to `sys.path` in the next cell.
path_to_src  = os.path.join(path_to_repo, 'src')

In [8]:
# Put the local `src` folder first on the import path, so that the project
# package imported in the next cell can be resolved.
sys.path.insert(0, path_to_src)

In [None]:
from bertools.tasks.mlm import CustomDataCollatorForLanguageModeling

#### Constants

In [9]:
# Base name of the corpus: used for the .tsv file name and for the cached dataset folder.
dataset_name = 'clinical-trials-ctti'
# Sub-folder of `path_to_save` holding the starting tokenizer and model.
base_model_name = "albert-small-ctti"
# Sub-folder of `path_to_save` where the trained model is written and later reloaded.
final_model_name = "albert-small-dlm-ctti"

<a id="data"></a>

# 1. Dataset

[Table of content](#TOC)

We generate a collection of instances of the `datasets.Dataset` class. 

Note that these are different from the fairly generic `torch.utils.data.Dataset` class. 

## 1.1 Load Clinical Trials corpus

[Table of content](#TOC)

In [None]:
def remove_empty_lines(examples):
    """Drop the rows of a batch whose 'text' is empty or whitespace-only.

    Meant to be used with `Dataset.map(..., batched = True)`.

    Parameters
    ----------
    examples : mapping of str -> list
        Batch of columns; must contain a 'text' column of strings. All columns
        are expected to have the same number of rows.

    Returns
    -------
    The same `examples` object, with every column restricted to the rows whose
    'text' is non-empty and not made only of whitespace.
    """
    # Decide once, per row, whether it is kept.
    keep = [len(t) > 0 and not t.isspace() for t in examples['text']]

    # Apply the same mask to every column: filtering 'text' alone would leave
    # the other columns longer than 'text' and produce an inconsistent batch.
    for column in list(examples.keys()):
        examples[column] = [
            value for value, kept in zip(examples[column], keep) if kept
        ]
    return examples

In [12]:
# Read the tab-separated trials table and replace missing values by empty strings.
trials_file = os.path.join(path_to_data, f'{dataset_name}.tsv')
df_trials = pd.read_csv(trials_file, sep = "\t").fillna('')
df_trials.shape

(430108, 7)

In [13]:
# Flatten the three free-text columns into a single list of stripped texts,
# keeping only those of at least 50 characters once stripped.
text_columns = ['Summary', 'Description', 'IE_criteria']
texts = []
for row in df_trials[text_columns].values.tolist():
    for raw in row:
        stripped = raw.strip()
        if len(stripped) >= 50:
            texts.append(stripped)
len(texts)

1041969

In [14]:
# Wrap the texts into a `datasets.Dataset` with a single string column named 'text'.
text_features = Features({'text': Value(dtype = 'string')})
dataset = Dataset.from_dict({'text': texts}, features = text_features)

In [None]:
# Drop empty / whitespace-only texts, batch by batch.
# NOTE(review): `texts` was already restricted to stripped strings of at least
# 50 characters, so this pass looks like a no-op here — confirm before removing.
dataset = dataset.map(remove_empty_lines, batched = True)

## 1.2 Build Clinical-Albert-small tokenizer

[Table of content](#TOC)


In [16]:
def batch_iterator(dataset, batch_size = 512):
    """Yield the 'text' column of `dataset` by consecutive slices of rows.

    Parameters
    ----------
    dataset : sized object supporting slicing, whose slices can be indexed by 'text'.
    batch_size : int
        Maximum number of rows per yielded batch (the last one may be shorter).
    """
    n_rows = len(dataset)
    for start in range(0, n_rows, batch_size):
        stop = start + batch_size
        batch = dataset[start:stop]
        yield batch['text']

In [17]:
# Load the tokenizer stored next to the base model.
tokenizer_dir = os.path.join(path_to_save, base_model_name, 'tokenizer')
tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)

## 1.3 Tokenize corpus

[Table of content](#TOC)


In [19]:
def tokenize_batch(examples):
    """Tokenize the 'text' column of a batch, also returning the special-tokens mask."""
    return tokenizer(examples["text"], return_special_tokens_mask = True)


# Tokenize the whole corpus by batches; the raw 'text' column is dropped from the result.
tokenized_dataset = dataset.map(
    tokenize_batch,
    remove_columns = ["text"],
    batched = True,
)

In contrast to the raw text dataset, the tokenized dataset depends on the tokenizer, and is therefore _model-specific_.

_Note_: the argument `remove_columns = ["text"]` is mandatory, in order for each item of the dataset to have the same length.

## 1.4 Form blocks of constant length

[Table of content](#TOC)


In [20]:
def group_texts(examples, block_size):
    """Concatenate a batch of tokenized texts and cut it into blocks of `block_size` tokens.

    Meant to be used with `Dataset.map(..., batched = True)`.

    Parameters
    ----------
    examples : mapping of str -> list of sequences
        Batch of token-level columns; an 'input_ids' column is required.
        A raw 'text' column, if present, is ignored.
    block_size : int
        Number of tokens per block.

    Returns
    -------
    dict
        One list of blocks per token-level column, plus a 'labels' column
        holding an independent copy of the 'input_ids' blocks.
        When the batch holds at least `block_size` tokens, the trailing
        remainder is dropped; a batch shorter than `block_size` is returned
        as a single, shorter block.
    """
    # Only token-level columns can be concatenated and chunked: a raw 'text'
    # column would be chained character by character, so it is left out.
    keys = [k for k in examples.keys() if k != 'text']

    # Concatenate all texts, column by column.
    concatenated_examples = {k: list(chain(*examples[k])) for k in keys}
    total_length = len(concatenated_examples[keys[0]])

    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size

    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }

    # Copy each block, so that later in-place edits of the labels
    # cannot leak into the inputs (and vice versa).
    result["labels"] = [list(block) for block in result["input_ids"]]
    return result

In [21]:
# mlm_dataset = tokenized_dataset.map(
#    lambda examples: group_texts(examples, block_size = 512), batched = True
# )
# mlm_dataset.save_to_disk(os.path.join(path_to_data, 'tmp', dataset_name))

In [26]:
# Location of the cached dataset of fixed-length blocks.
lm_dataset_path = os.path.join(path_to_data, 'tmp', dataset_name)

# Build the blocks once and cache them on disk: without this guard, a fresh
# "Restart & Run All" fails whenever the cache folder does not exist yet.
if not os.path.isdir(lm_dataset_path):
    tokenized_dataset.map(
        lambda examples: group_texts(examples, block_size = 512), batched = True
    ).save_to_disk(lm_dataset_path)

lm_dataset = load_from_disk(lm_dataset_path)
len(lm_dataset)

421077

In [1]:
# print(tokenizer.decode(lm_dataset[0]["input_ids"]), tokenizer.decode(lm_dataset[0]["labels"]))

In [2]:
# print(lm_dataset[0])

<a id="albert"></a>

# 2. ALBERT-small training

[Table of content](#TOC)

#### Tested combinations

- 1.4M parameter model: converges fast (1 epoch) towards confusion score~=2.2. Issue : Finetuning of NER on Chia hard, stuck to high training error and/or provides evaluation errors
- 3.5M parameter model: gets stuck at confusion score~=5.9. Training args : block_size = 512, bs = 16, lr = 1e-4, grad_acc_step = 4, warmup_step = 500, num_layer = 8

## 2.1 Build Clinical-Albert-small model

[Table of content](#TOC)

In [27]:
# `from_pretrained` has no `device` argument: unknown keyword arguments are
# forwarded to the configuration / model constructor, which does not accept it.
# Load the weights first, then move the model to the target device.
model = AutoModelForMaskedLM.from_pretrained(os.path.join(path_to_save, base_model_name, 'model'))
model = model.to(device)

In [28]:
# Display the number of parameters of the loaded model.
model.num_parameters()

2584584

## 2.2. Model training

[Table of content](#TOC)

`Albert-base-v2` training parameters as provided in https://github.com/google-research/albert/blob/master/run_pretraining.py : 
- max_predictions_per_seq = `20`
- train_batch_size = `4096`
- optimizer = `"lamb"`
- learning_rate = `0.00176`
- poly_power = `1.0`
- num_train_steps = `125000`
- num_warmup_steps = `3125`
- start_warmup_step = `0`
- iterations_per_loop = `1000`

The original optimizer is `lamb`, which was designed for very large batch size, see the [Lamb paper](https://arxiv.org/pdf/1904.00962.pdf), but we use here the default [AdamW](https://huggingface.co/docs/transformers/main_classes/optimizer_schedules#transformers.AdamW) optimizer with [linear learning rate decay](https://huggingface.co/docs/transformers/v4.23.1/en/main_classes/optimizer_schedules#transformers.get_linear_schedule_with_warmup), as specified in the [Trainer class documentation](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.Trainer.optimizers). See the [AdamW paper](https://arxiv.org/pdf/1711.05101.pdf).

In [35]:
# Per-device batch size, used for both training and evaluation in `TrainingArguments` below.
batch_size = 16

In [36]:
# Switch the model to training mode (`train()` returns the module itself).
model = model.train()

In [40]:
checkpoint_dir = os.path.join(path_to_save, '_checkpoints')

args = TrainingArguments(
    output_dir = checkpoint_dir,
    # optimisation schedule
    num_train_epochs = 1,
    learning_rate = 5e-4,
    warmup_steps = 1500,
    # batching
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    gradient_accumulation_steps = 1,
    # no evaluation and no intermediate checkpoint; loss logged every 100 steps
    evaluation_strategy = "no",
    save_strategy = 'no',
    logging_steps = 100,
    # reproducibility
    seed = 42,
    data_seed = 23,
)

In [84]:
# Collator in charge of turning dataset items into training batches.
# NOTE(review): `task_proportions` are presumably relative weights of the
# collator's tasks, normalised internally — confirm in bertools.tasks.mlm.
data_collator = CustomDataCollatorForLanguageModeling(
    tokenizer = tokenizer,
    task_proportions = (2, 2, 2, 4),
)

trainer = Trainer(
    model = model,
    args = args,
    data_collator = data_collator,
    train_dataset = lm_dataset,
)

tensor([0.2000, 0.2000, 0.2000, 0.4000])


Some remarks:

- The `data_collator` is the object used to batch elements of the training & evaluation datasets.
- The `tokenizer` is given to the data collator, which uses it to pad the inputs to the maximum length when batching. Passing it to the `Trainer` as well (`tokenizer = tokenizer`) would additionally have it saved along the model, which makes it easier to rerun an interrupted training or reuse the fine-tuned model.

In [None]:
# Launch the training run configured in `args` (long-running cell).
trainer.train()

In [24]:
# Move the trained model back to the CPU before saving it.
model = model.to('cpu')

In [25]:
# Save the trained model together with its tokenizer: the inference section
# reloads both from `<final_model_name>/model` and `<final_model_name>/tokenizer`,
# and the tokenizer folder would otherwise never be created.
final_model_dir = os.path.join(path_to_save, final_model_name)
model.save_pretrained(os.path.join(final_model_dir, 'model'))
tokenizer.save_pretrained(os.path.join(final_model_dir, 'tokenizer'))

Configuration saved in C:\Users\jb\Desktop\NLP\Internal - Transformers for NLP\saves\MLM\clinical-trials-albert-small\model\config.json
Model weights saved in C:\Users\jb\Desktop\NLP\Internal - Transformers for NLP\saves\MLM\clinical-trials-albert-small\model\pytorch_model.bin


<a id="inference"></a>

# 3. Inference

[Table of content](#TOC)

In [None]:
# Reload the final tokenizer and model from disk, independently of the training session.
final_model_dir = os.path.join(path_to_save, final_model_name)
tokenizer = AutoTokenizer.from_pretrained(os.path.join(final_model_dir, 'tokenizer'))
model = AutoModelForMaskedLM.from_pretrained(os.path.join(final_model_dir, 'model'))

In [39]:
# Fill-mask inference pipeline (PyTorch backend) built on the reloaded model and tokenizer.
mlm = pipeline('fill-mask', model = model, tokenizer = tokenizer, framework = 'pt')

In [40]:
# Reference sentence, kept unmasked under its own name so that it can be
# compared with the predictions instead of being overwritten right away.
sent_original = 'Polyneuropathy of other causes, including but not limited to hereditary demyelinating neuropathies, neuropathies secondary to infection or systemic disease, diabetic neuropathy, drug- or toxin-induced neuropathies, multifocal motor neuropathy, monoclonal gammopathy of uncertain significance, lumbosacral radiculoplexus neuropathy, pure sensory CIDP and acquired demyelinating symmetric (DADS) neuropathy (also known as distal CIDP).'
# Same sentence with four words replaced by the tokenizer's mask token.
sent = f'Polyneuropathy of other causes, including but not limited to {mlm.tokenizer.mask_token} demyelinating neuropathies,  {mlm.tokenizer.mask_token} secondary to infection or systemic {mlm.tokenizer.mask_token}, diabetic neuropathy, drug- or toxin-induced neuropathies, multifocal motor {mlm.tokenizer.mask_token}, monoclonal gammopathy of uncertain significance, lumbosacral radiculoplexus neuropathy, pure sensory CIDP and acquired demyelinating symmetric (DADS) neuropathy (also known as distal CIDP).'
# Top-5 candidates for each masked position.
mlm(sent, top_k = 5)

[[{'score': 0.2646515965461731,
   'token': 6,
   'token_str': 'the',
   'sequence': '[CLS] polyneuropathy of other causes, including but not limited to the demyelinating neuropathies,[MASK] secondary to infection or systemic[MASK], diabetic neuropathy, drug- or toxin-induced neuropathies, multifocal motor[MASK], monoclonal gammopathy of uncertain significance, lumbosacral radiculoplexus neuropathy, pure sensory cidp and acquired demyelinating symmetric (dads) neuropathy (also known as distal cidp).[SEP]'},
  {'score': 0.07012559473514557,
   'token': 105,
   'token_str': 'other',
   'sequence': '[CLS] polyneuropathy of other causes, including but not limited to other demyelinating neuropathies,[MASK] secondary to infection or systemic[MASK], diabetic neuropathy, drug- or toxin-induced neuropathies, multifocal motor[MASK], monoclonal gammopathy of uncertain significance, lumbosacral radiculoplexus neuropathy, pure sensory cidp and acquired demyelinating symmetric (dads) neuropathy (als

[Table of content](#TOC)