### Finetune Albert with downstream tasks
#### Overview
This notebook illustrates fine-tuning a pretrained ALBERT model on a custom corpus, using a token classification task as the example. 
The notebook uses the transformers repo from the Hugging Face team and is implemented in PyTorch.
#### Steps
1. Clone transformers repo from Huggingface team
2. Install required dependencies as per transformers repo
3. Create this notebook in the transformers/src folder

### Note
This notebook was not specifically tested on the toy data. It was tested on real application data with the same input format, for a similar token classification task, so treat the toy-data paths below as a reference only.

In [None]:
# At the time of writing, Hugging Face did not provide an
# AlbertForTokenClassification class, hence we write our own definition below.
from transformers.modeling_albert import AlbertModel, load_tf_weights_in_albert, AlbertPreTrainedModel
from transformers.configuration_albert import AlbertConfig
from transformers.tokenization_bert import BertTokenizer
import torch.nn as nn
from torch.nn import CrossEntropyLoss
class AlbertForTokenClassification(AlbertPreTrainedModel):
    """ALBERT encoder with a dropout + linear head applied to every token.

    The encoder is passed in already constructed (and, in this notebook,
    already loaded with pretrained weights) instead of being built here.
    """

    def __init__(self, albert, config):
        """Store the given encoder and build the classification head.

        Args:
            albert: the encoder module whose first output is fed to the head.
            config: configuration providing ``num_labels``,
                ``hidden_dropout_prob`` and ``hidden_size``.
        """
        super().__init__(config)
        self.num_labels = config.num_labels

        # Keep the registration order (encoder, dropout, classifier) so the
        # parameter ordering seen by optimizers and state dicts is unchanged.
        self.albert = albert
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
    ):
        """Run the encoder and return the classifier logits.

        ``labels`` is accepted for call-site compatibility but is not used:
        no loss is computed here, the caller is expected to do that.
        """
        encoder_kwargs = dict(
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )
        encoder_outputs = self.albert(input_ids, **encoder_kwargs)

        # First encoder output -- presumably the per-token hidden states
        # (batch, seq_len, hidden_size); TODO confirm against AlbertModel.
        token_states = encoder_outputs[0]

        return self.classifier(self.dropout(token_states))

Create tokenizer, load Albert model

In [None]:
# Build the tokenizer and a bare AlbertModel, then load the TensorFlow
# pretraining checkpoint into it.
VOCAB_FILE = "models_toy/vocab.txt" # This is the vocab file output from Build Vocab step
CONFIG_FILE = "models_toy/albert_config.json"
# NOTE(review): this is an absolute path, unlike the relative "models_toy/..."
# paths above -- confirm it points at the intended checkpoint.
ALBERT_PRETRAIN_CHECKPOINT = "/models/model.ckpt" # This is the model checkpoint output from Albert Pretrain step
tokenizer = BertTokenizer(vocab_file=VOCAB_FILE)
config = AlbertConfig.from_json_file(CONFIG_FILE)
model = AlbertModel(config)
# `model` is rebound to whatever load_tf_weights_in_albert returns.
model = load_tf_weights_in_albert(model, config,ALBERT_PRETRAIN_CHECKPOINT)
# If the only variables that could not be initialized belong to the MLM and
# sentence order prediction heads, the reported error can be ignored: those
# heads are not needed for the AlbertForTokenClassification model built here.

Load input files and create input torch tensors, dataloaders

In [None]:
# CSV files with the training and validation examples; both are read with
# pandas.read_csv further down in this cell.
TRAIN_FILE = "data_toy/dish_name_train.csv"
EVAL_FILE = "data_toy/dish_name_val.csv"

import numpy as np
def label_sent(name_tokens, sent_tokens):
    """Mark every occurrence of ``name_tokens`` inside ``sent_tokens``.

    Scans the sentence left to right; each non-overlapping position where the
    full name token sequence appears is labelled 1, everything else 0.

    Args:
        name_tokens: sequence of tokens making up the name to look for.
        sent_tokens: sequence of tokens of the sentence to label.

    Returns:
        A list of plain ints (0 or 1) with exactly ``len(sent_tokens)``
        entries. If the name is empty or longer than the sentence, the list
        is all zeros.
    """
    name_tokens = list(name_tokens)
    sent_tokens = list(sent_tokens)
    n_name, n_sent = len(name_tokens), len(sent_tokens)

    # Start from "no match anywhere" so the result always has one label per
    # sentence token, even when scanning stops early near the end.
    label = [0] * n_sent
    if n_name == 0 or n_name > n_sent:
        return label

    i = 0
    # A match needs n_name tokens, so positions closer than that to the end
    # of the sentence can never start one.
    while i + n_name <= n_sent:
        if sent_tokens[i:i + n_name] == name_tokens:
            label[i:i + n_name] = [1] * n_name
            # Jump past the match: occurrences are non-overlapping.
            i += n_name
        else:
            i += 1
    return label

import pandas as pd


def load_labelled_reviews(csv_path):
    """Read a review CSV and add token and label columns.

    Adds three columns to the frame read from ``csv_path``:
    ``review_tokens`` (tokenized ``review``), ``dish_name_tokens``
    (tokenized dish name) and ``review_labels`` (output of ``label_sent``
    for each row).

    Args:
        csv_path: path of the CSV file to read.

    Returns:
        The DataFrame with the added columns.
    """
    df = pd.read_csv(csv_path)
    df['review_tokens'] = df.review.apply(tokenizer.tokenize)
    # NOTE(review): the raw dish name is read from a column that is itself
    # called `dish_name_tokens` and then overwritten with its tokens --
    # confirm the CSV really uses that column name.
    df['dish_name_tokens'] = df.dish_name_tokens.apply(tokenizer.tokenize)
    # The two arguments were previously missing the separating comma, which
    # made the whole cell a SyntaxError.
    df['review_labels'] = df.apply(
        lambda row: label_sent(row['dish_name_tokens'], row['review_tokens']),
        axis=1)
    return df


df_data_train = load_labelled_reviews(TRAIN_FILE)
df_data_val = load_labelled_reviews(EVAL_FILE)


from keras.preprocessing.sequence import pad_sequences
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Every sequence is padded / truncated (at the end) to MAX_LEN entries.
MAX_LEN = 50
BATCH_SIZE = 1

# ---- training split -------------------------------------------------------
tr_token_ids = [tokenizer.convert_tokens_to_ids(tokens)
                for tokens in df_data_train['review_tokens']]
tr_inputs = pad_sequences(tr_token_ids, maxlen=MAX_LEN, dtype="long",
                          padding="post", truncating="post")
tr_tags = pad_sequences(df_data_train['review_labels'], maxlen=MAX_LEN,
                        dtype="long", padding="post", truncating="post")
# Mask is 1.0 wherever the token id is positive and 0.0 elsewhere, so the
# zero-padded positions can be ignored later.
tr_masks = [[float(token_id > 0) for token_id in row] for row in tr_inputs]

tr_inputs = torch.tensor(tr_inputs)
tr_tags = torch.tensor(tr_tags)
tr_masks = torch.tensor(tr_masks)

train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)

# ---- validation split -----------------------------------------------------
val_token_ids = [tokenizer.convert_tokens_to_ids(tokens)
                 for tokens in df_data_val['review_tokens']]
val_inputs = pad_sequences(val_token_ids, maxlen=MAX_LEN, dtype="long",
                           padding="post", truncating="post")
val_tags = pad_sequences(df_data_val['review_labels'], maxlen=MAX_LEN,
                         dtype="long", padding="post", truncating="post")
# Same masking rule as for the training split.
val_masks = [[float(token_id > 0) for token_id in row] for row in val_inputs]

val_inputs = torch.tensor(val_inputs)
val_tags = torch.tensor(val_tags)
val_masks = torch.tensor(val_masks)

val_data = TensorDataset(val_inputs, val_masks, val_tags)
val_sampler = RandomSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=BATCH_SIZE)

Create Albert for Token Classification model, initialized with albert pretrained weights

Create the optimizer. With FULL_FINETUNING enabled, the pretrained ALBERT weights are updated together with the classification layer; otherwise only the classification (fine-tuning) layer is updated.

In [None]:
# Wrap the pretrained encoder with the token classification head.
model_tokenclassification = AlbertForTokenClassification(model, config)
from torch.optim import Adam
LEARNING_RATE = 0.000001
# True: update the encoder and the classifier; False: classifier only.
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model_tokenclassification.named_parameters())
    # Parameters whose name contains one of these substrings get no decay.
    no_decay = ['bias', 'gamma', 'beta']
    # The option must be spelled 'weight_decay': torch.optim.Adam silently
    # ignores unknown keys such as 'weight_decay_rate', which left every
    # group at the default weight decay of 0.
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
else:
    param_optimizer = list(model_tokenclassification.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
optimizer = Adam(optimizer_grouped_parameters, lr=LEARNING_RATE)

Start Training and Evaluation

After every epoch, the results are saved to the outputs folder, so training can resume from the last saved epoch if it is stopped and restarted.

The training below uses multiple GPUs if they are available.

In [None]:
# from torch.utils.tensorboard import SummaryWriter
import time
import os.path
import torch.nn as nn
EPOCH = 5
MAX_GRAD_NORM = 1.0
ALBERT_FINETUNE_CHECKPOINT = "outputs/finetune_checkpoint_5epoch_50neg_1e-5lr"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Skip the wrap if the model is already a DataParallel, so re-running this
# cell in the same kernel does not nest wrappers.
if torch.cuda.device_count() > 1 and not isinstance(model_tokenclassification, nn.DataParallel):
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    model_tokenclassification = nn.DataParallel(model_tokenclassification)
model_tokenclassification.to(device)


def masked_loss_and_accuracy(logits, input_mask, labels, loss_fct):
    """Compute loss and accuracy over the non-padded positions only.

    Args:
        logits: model output; flattened to (-1, number of classes).
        input_mask: mask tensor; positions equal to 1 are kept.
        labels: gold label tensor, flattened alongside the mask.
        loss_fct: callable taking (logits, labels) and returning the loss.

    Returns:
        (loss, accuracy): the loss tensor (still attached to the graph) and
        the accuracy over the kept positions as a Python float.
    """
    active = input_mask.view(-1) == 1
    active_logits = logits.view(-1, logits.size(-1))[active]
    active_labels = labels.view(-1)[active]
    loss = loss_fct(active_logits, active_labels)
    predictions = active_logits.detach().argmax(dim=1)
    accuracy = (predictions == active_labels).float().mean().item()
    return loss, accuracy


if os.path.isfile(ALBERT_FINETUNE_CHECKPOINT):
    print(f"--- Load from checkpoint ---")
    # map_location lets a checkpoint written on GPU be resumed on CPU (and
    # vice versa).
    # NOTE(review): the state dict is saved from the possibly
    # DataParallel-wrapped model, so resuming presumably requires the same
    # wrapped/unwrapped setup as when it was saved -- confirm.
    checkpoint = torch.load(ALBERT_FINETUNE_CHECKPOINT, map_location=device)
    model_tokenclassification.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']
    loss = checkpoint['loss']
    train_losses = checkpoint['train_losses']
    train_acc = checkpoint['train_acc']
    val_losses = checkpoint['val_losses']
    val_acc = checkpoint['val_acc']

else:
    epoch = -1
    train_losses, train_acc, val_losses, val_acc = [], [], [], []

# Make sure the checkpoint directory exists up front, so the save at the end
# of the first epoch cannot fail after all the training work is done.
checkpoint_dir = os.path.dirname(ALBERT_FINETUNE_CHECKPOINT)
if checkpoint_dir:
    os.makedirs(checkpoint_dir, exist_ok=True)

# The loss object is stateless; build it once instead of once per batch.
loss_fct = CrossEntropyLoss()

print(f"--- Resume/Start training ---")
for i in range(epoch+1, EPOCH):
    print(f"--- epoch: {i} ---")
    start_time = time.time()

    # TRAIN loop
    model_tokenclassification.train()
    tr_loss, tr_acc, nb_tr_steps = 0, 0, 0
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # forward pass
        b_outputs = model_tokenclassification(b_input_ids, token_type_ids=None,
                     attention_mask=b_input_mask, labels=b_labels)

        # Only the active (non-padded) positions contribute.
        loss, acc = masked_loss_and_accuracy(b_outputs, b_input_mask, b_labels, loss_fct)

        # Store plain floats so the history (and the checkpoint) does not
        # hold on to device tensors.
        train_losses.append(loss.item())
        train_acc.append(acc)
        # backward pass
        loss.backward()
        # track train loss
        tr_loss += loss.item()
        tr_acc += acc
        nb_tr_steps += 1
        # gradient clipping -- on the full classification model, not just the
        # bare encoder `model`, so the classifier gradients are clipped too
        torch.nn.utils.clip_grad_norm_(parameters=model_tokenclassification.parameters(),
                                       max_norm=MAX_GRAD_NORM)
        # update parameters
        optimizer.step()
        # Reset gradients of the whole model; zeroing only `model` left the
        # classifier gradients accumulating from one step to the next.
        model_tokenclassification.zero_grad()

    # print train loss per epoch
    print(f"Train loss: {(tr_loss/nb_tr_steps)}")
    print(f"Train Accuracy: {(tr_acc/nb_tr_steps)}")
    print(f"Train Time: {(time.time()-start_time)/60} mins")

    # VALIDATION on validation set
    start_time = time.time()
    model_tokenclassification.eval()
    eval_loss, eval_acc = 0, 0
    nb_eval_steps = 0
    for batch in val_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            b_outputs = model_tokenclassification(b_input_ids, token_type_ids=None,
                         attention_mask=b_input_mask, labels=b_labels)
            loss, acc = masked_loss_and_accuracy(b_outputs, b_input_mask, b_labels, loss_fct)

        eval_loss += loss.item()
        eval_acc += acc
        nb_eval_steps += 1
    eval_loss = eval_loss/nb_eval_steps
    eval_acc = eval_acc/nb_eval_steps
    val_losses.append(eval_loss)
    val_acc.append(eval_acc)
    print(f"Validation loss: {eval_loss}")
    print(f"Validation Accuracy: {(eval_acc)}")
    print(f"Validation Time: {(time.time()-start_time)/60} mins")


    print(f"--- Save to checkpoint ---")
    torch.save({
        'epoch': i,
        'model_state_dict': model_tokenclassification.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        # Loss of the last processed batch, stored as a plain float.
        'loss': loss.item(),
        'train_losses': train_losses,
        'train_acc': train_acc,
        'val_losses': val_losses,
        'val_acc': val_acc}
        , ALBERT_FINETUNE_CHECKPOINT)

For prediction

In [None]:
def predict(texts):
    """Predict a 0/1 label for every token position of each text.

    Each text is tokenized, converted to ids, padded / truncated at the end
    to ``MAX_LEN`` and run through ``model_tokenclassification`` in batches
    of ``BATCH_SIZE``. The predicted class (argmax over the logits) is
    multiplied by the attention mask, so masked positions are always 0.

    Args:
        texts: iterable of raw text strings.

    Returns:
        (labels, tokenized_texts): ``labels`` is an int array with one row of
        ``MAX_LEN`` entries per text; ``tokenized_texts`` is the list of
        token lists (not truncated to ``MAX_LEN``).
    """
    tokenized_texts = [tokenizer.tokenize(txt) for txt in texts]
    if not tokenized_texts:
        # Nothing to predict; np.concatenate would raise on an empty list.
        return np.zeros((0, MAX_LEN), dtype=int), tokenized_texts

    input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                              maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
    # 1.0 where the token id is positive, 0.0 elsewhere.
    attention_mask = [[float(i>0) for i in ii] for ii in input_ids]

    input_ids = torch.tensor(input_ids)
    attention_mask = torch.tensor(attention_mask)

    dataset = TensorDataset(input_ids, attention_mask)
    # Sequential order keeps predictions aligned with the input texts.
    datasampler = SequentialSampler(dataset)
    dataloader = DataLoader(dataset, sampler=datasampler, batch_size=BATCH_SIZE)

    # Inference must run with dropout disabled, whatever mode the model was
    # left in by the caller.
    model_tokenclassification.eval()

    predicted_labels = []

    for batch in dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask = batch

        with torch.no_grad():
            logits = model_tokenclassification(b_input_ids, token_type_ids=None,
                           attention_mask=b_input_mask)

        batch_classes = np.argmax(logits.detach().cpu().numpy(), axis=2)
        batch_mask = b_input_mask.detach().cpu().numpy()
        predicted_labels.append(np.multiply(batch_classes, batch_mask))
    # np.concatenate(predicted_labels), to flatten list of arrays of batch_size * max_len into list of arrays of max_len
    return np.concatenate(predicted_labels).astype(int), tokenized_texts

# Run the model over every validation review and keep one predicted label
# row per review in a new DataFrame column.
texts = df_data_val.review.values
# The second return value (the tokenized texts) is not needed here.
predicted_labels, _ = predict(texts)
# list(...) splits the 2-D array into one 1-D array per DataFrame row.
df_data_val['predicted_review_label'] = list(predicted_labels)

def get_dish_candidate_names(predicted_label, tokenized_text):
    """Extract the distinct token spans that were predicted as positive.

    Consecutive positions with a label greater than 0 form one span; the
    tokens of each span are joined with single spaces.

    Args:
        predicted_label: numpy array of per-position labels.
        tokenized_text: sequence of tokens indexed by the same positions.

    Returns:
        A sorted numpy array of the unique span strings, or ``None`` when no
        position has a label greater than 0.
    """
    positive_idx = np.where(predicted_label > 0)[0]
    if len(positive_idx) == 0:
        return None

    # Cut the index list wherever two neighbouring indices are not adjacent.
    cut_points = np.where(np.diff(positive_idx) != 1)[0] + 1
    spans = np.split(positive_idx, cut_points)

    candidates = [" ".join(np.take(tokenized_text, span)) for span in spans]
    # np.unique both removes duplicated names and sorts them.
    return np.unique(candidates)
# For every validation row, turn the predicted label row into the candidate
# dish names found in that review (None when nothing was predicted).
df_data_val['candidate_name']=df_data_val.apply(lambda row: get_dish_candidate_names(row.predicted_review_label, row.review_tokens)
                                                , axis=1)