## Augmenting character-level embedding to BERT

**Char CNN already exists. Why is this special?**
- BERT does not fully handle out-of-vocabulary (OOV) words (especially nonsensical ones, such as those caused by typos), and character-level embedding can be an effective way to handle them.
- May also be used for typo detection/fixing.
- The masking method used to train BERT has never been applied to character-level embeddings.

In [None]:
from torch.utils.data import DataLoader
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd

In [None]:
# Vocab: list index == token id. The four special tokens come first, followed
# by the characters the model can represent.
id_to_char = [
    '<PAD>', '<CLS>', '<SEP>', '<MASK>',
    ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    ':', ';', '=', '?', '@',
    '[', '\\', ']', '^', '_', '`',
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    '{', '|', '}', '~',
]
char_to_id = {char: idx for idx, char in enumerate(id_to_char)}

# Load preprocessed data (CBERT data preprocessing.ipynb)
# Contains: input_ids, encoder_mask, word_idx, num_words, num_chars
# Each column is converted to one tensor and then split into a list of
# per-example rows.
charYelp = pd.read_pickle("./dataset/charYelp_train")
input_ids = list(torch.tensor(charYelp["input_ids"].values.tolist()))
encoder_mask = list(torch.tensor(charYelp["encoder_mask"].values.tolist()))

# Validation split; the commented paths are alternative validation files.
charYelp_val = pd.read_pickle("./dataset/charYelp_validation")
#charYelp_val = pd.read_pickle("./dataset/charYelp_validation_medium")
#charYelp_val = pd.read_pickle("./dataset/charYelp_validation_large")
input_ids_val = list(torch.tensor(charYelp_val["input_ids"].values.tolist()))
encoder_mask_val = list(torch.tensor(charYelp_val["encoder_mask"].values.tolist()))

In [None]:
# Create dataloaders: each example is an (input_ids, encoder_mask) pair.
# The training and validation loaders share the same batching options.
loader_options = dict(batch_size=48, shuffle=True, num_workers=4)
yelp_dataframe = list(zip(input_ids, encoder_mask))
train_iter = DataLoader(yelp_dataframe, **loader_options)
yelp_val_dataframe = list(zip(input_ids_val, encoder_mask_val))
val_iter = DataLoader(yelp_val_dataframe, **loader_options)

In [None]:
# Number of special tokens at the front of id_to_char (<PAD>, <CLS>, <SEP>, <MASK>).
num_spechar = 4
# Count of ordinary (non-special) characters. generateData uses it to sample
# replacement characters; the CBERT model itself is built with len(id_to_char).
vocab_size = len(id_to_char)-num_spechar
# Sequence length handed to EmbeddingLayer as max_seq_len by CBERT.
data_max_len = 256
from torch.nn import ModuleList
import copy

class EmbeddingLayer(nn.Module):
    """Character embedding plus learned absolute position embedding.

    The two embeddings are summed, layer-normalised and passed through dropout.

    Args:
        vocab_size: number of rows in the character embedding table.
        embed_size: embedding dimension.
        pad_idx: token id passed as ``padding_idx`` to the character embedding.
        max_seq_len: number of rows in the position embedding table, i.e. the
            longest sequence this layer accepts.
        drop_prob: dropout probability applied to the output.
    """

    def __init__(self, vocab_size, embed_size, pad_idx, max_seq_len, drop_prob):
        super().__init__()
        self.max_seq_len = max_seq_len
        self.char_embedding = nn.Embedding(vocab_size, embed_size, padding_idx=pad_idx)
        self.position_embedding = nn.Embedding(max_seq_len, embed_size)
        self.LayerNorm = nn.LayerNorm(embed_size, eps=1e-7)
        self.dropout = nn.Dropout(drop_prob)  # 0.1

    def forward(self, input_ids):
        """Embed ``input_ids`` of shape (batch, seq_len), seq_len <= max_seq_len.

        Returns:
            Tensor of shape (batch, seq_len, embed_size).

        Raises:
            ValueError: if the sequence is longer than ``max_seq_len``.
        """
        seq_len = input_ids.size(1)
        if seq_len > self.max_seq_len:
            raise ValueError(
                "sequence length %d exceeds max_seq_len %d" % (seq_len, self.max_seq_len)
            )
        # Position ids follow the actual sequence length, so batches shorter
        # than max_seq_len work as well as full-length ones.
        position_ids = torch.arange(seq_len, dtype=torch.long, device=input_ids.device)
        position_ids = position_ids.unsqueeze(0).expand_as(input_ids)

        words_embeddings = self.char_embedding(input_ids)
        position_embeddings = self.position_embedding(position_ids)

        embeddings = words_embeddings + position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings  # (batchSize, sequenceLength, hidden_size)

class CBERT(nn.Module):
    """Character-level BERT-style encoder with a per-position vocabulary head.

    Pipeline: EmbeddingLayer -> TransformerEncoder (batch-first) -> Linear to
    ``vocab_size`` -> LogSoftmax over the last dimension.
    """

    def __init__(self, vocab_size, embed_size, dim_feedforward, num_heads, num_layers, pad_idx):
        super().__init__()
        # Sequence length comes from the module-level data_max_len; dropout is fixed at 0.1.
        self.embedding_layer = EmbeddingLayer(
            vocab_size, embed_size, pad_idx, max_seq_len=data_max_len, drop_prob=0.1
        )
        layer_template = nn.TransformerEncoderLayer(
            d_model=embed_size,
            nhead=num_heads,
            dim_feedforward=dim_feedforward,
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(layer_template, num_layers=num_layers)
        self.prediction_layer = nn.Linear(embed_size, vocab_size)
        self.log_softmax = nn.LogSoftmax(dim=2)

    def forward(self, x, mask):
        """Return per-position log-probabilities over the vocabulary.

        ``x`` holds token ids; ``mask`` is forwarded unchanged as the encoder's
        ``src_key_padding_mask``.
        """
        hidden = self.transformer_encoder(self.embedding_layer(x), src_key_padding_mask=mask)
        return self.log_softmax(self.prediction_layer(hidden))

def generateData(iid, mask, device='cpu'):
    """Create a corrupted input and its prediction target for masked-character training.

    Each position of ``iid`` draws one uniform random number and is then

    * replaced by ``<MASK>`` with probability 0.12 (0.15 * 0.8),
    * replaced by a random non-special character with probability 0.015 (0.15 * 0.1),
    * left unchanged but still used as a target with probability 0.015,
    * otherwise left unchanged and excluded from the target.

    Selection does not look at ``mask`` or at the token type, so special
    tokens and padding positions can be selected as well.

    Args:
        iid: integer tensor of token ids indexed as (batch, position). It is
            not modified; the target is built on a copy.
        mask: tensor of the same shape; positions where it equals 0 are counted
            as real tokens when checking that at least one target exists.
        device: device on which the returned tensors are created.

    Returns:
        ``(input_ids, target)``: the corrupted ids, and a copy of ``iid`` in
        which every non-selected position is set to ``<PAD>`` so that a loss
        with ``ignore_index=<PAD>`` skips it.
    """
    pad_id = char_to_id['<PAD>']
    # Work on copies so the caller's batch is left untouched.
    target = iid.clone().detach().to(device=device)
    input_ids = target.clone()

    rand = torch.rand(target.shape, device=device)
    mask_mask = rand < 0.12  # MASK: 0.15*0.8
    swap_mask = rand > 1 - 0.015  # SWAP: 0.15*0.1
    same_mask = (rand >= 0.12) & (rand <= 0.12 + 0.015)  # NOTHING (but predict)

    input_ids[mask_mask] = char_to_id['<MASK>']
    # Random replacement ids drawn from the non-special part of the vocabulary,
    # i.e. [num_spechar, num_spechar + vocab_size).
    swap_char = torch.randint(num_spechar, num_spechar + vocab_size, target.shape, device=device)
    input_ids[swap_mask] = swap_char[swap_mask]

    used_mask = mask_mask | swap_mask | same_mask
    # Prevention of zero targets, which leads to nan loss: if no real
    # (mask == 0) position was selected, force position [0, 1] to be a target.
    if not (used_mask & (mask.to(device=device) == 0)).any():
        used_mask[0, 1] = True

    # padding all unmasked words on target; loss function ignores <PAD>.
    target[~used_mask] = pad_id

    return (input_ids, target)

In [None]:
def train(model, optimizer, loss_f, train_iter, num_epochs, device='cpu', prnt_intv=1, debug=False):
    """Train ``model`` on masked-character prediction.

    Every batch from ``train_iter`` is corrupted anew by ``generateData``
    before it is fed to the model.

    Args:
        model: module called as ``model(x, mask)``; its output is permuted
            with ``[0, 2, 1]`` before being passed to ``loss_f``.
        optimizer: optimizer that updates the model parameters.
        loss_f: loss called as ``loss_f(y_hat, y)``.
        train_iter: iterable yielding ``(input_ids, mask)`` batches.
        num_epochs: number of passes over ``train_iter``.
        device: device the model and the batches are moved to.
        prnt_intv: print the epoch metrics every ``prnt_intv`` epochs.
        debug: when true, print the decoded input, target and prediction of
            every sequence in every batch (positions that are not targets are
            shown as ``_`` in the prediction).

    The printed loss is the mean of the per-batch losses of the epoch; the
    printed accuracy is the fraction of non-<PAD> targets predicted correctly.
    """
    model = model.to(device=device)
    model.train()
    for epoch in range(num_epochs):
        train_loss_sum = torch.zeros(1, device=device)
        train_acc_sum = torch.zeros(1, device=device)
        num_pred = torch.zeros(1, device=device)
        num_batches = 0
        for iid, mask in train_iter:
            iid, mask = iid.to(device=device), mask.to(device=device)
            optimizer.zero_grad()
            x, y = generateData(iid, mask, device)
            # Move the class dimension to position 1, as the loss expects.
            y_hat = model(x, mask).permute([0, 2, 1])

            loss = loss_f(y_hat, y)
            loss.backward()
            # Clip the total gradient norm to 1e-5 ("clip1e-5" in the hyperparameter log).
            nn.utils.clip_grad_norm_(model.parameters(), 1e-5)
            optimizer.step()

            with torch.no_grad():
                is_target = y != 0  # id 0 (<PAD>) marks positions the model is not asked to predict
                pred = torch.argmax(y_hat, dim=1)
                train_loss_sum += loss.float()
                train_acc_sum += torch.sum((pred == y) & is_target)
                num_pred += torch.sum(is_target)
                num_batches += 1

            # Debugging purposes
            if debug:
                print(loss)
                shown_pred = pred.masked_fill(~is_target, 0)
                for i, p, t in zip(x, shown_pred, y):
                    print("INPUT:", end=' ')
                    print_decoded_ids(i)
                    print("TARGET:", end=' ')
                    print_decoded_ids(t)
                    print("PREDICTION:", end=' ')
                    print_decoded_ids(p)

        if (epoch+1) % prnt_intv == 0:
            # The loss is already averaged inside each batch, so average over
            # batches rather than over sequences.
            print("Epoch:%d Loss:%f, TrainAcc:%f" % (epoch+1, train_loss_sum/num_batches, train_acc_sum/num_pred))

def evaluate(model, loss_f, val_iter, device='cpu'):
    """Print the validation loss and accuracy of ``model`` over ``val_iter``.

    Each batch is corrupted by ``generateData`` (random masking), so the
    numbers vary slightly from call to call. The model is left in eval mode.

    Args:
        model: module called as ``model(x, mask)``.
        loss_f: loss called as ``loss_f(y_hat, y)``.
        val_iter: iterable yielding ``(input_ids, mask)`` batches.
        device: device the model and the batches are moved to.

    The printed loss is the mean of the per-batch losses; the printed accuracy
    is the fraction of non-<PAD> targets predicted correctly.
    """
    model = model.to(device=device)
    model.eval()
    loss_sum = torch.zeros(1, device=device)
    acc_sum = torch.zeros(1, device=device)
    num_pred = torch.zeros(1, device=device)
    num_batches = 0
    with torch.no_grad():
        for iid, mask in val_iter:
            iid, mask = iid.to(device=device), mask.to(device=device)
            x, y = generateData(iid, mask, device)
            # Move the class dimension to position 1, as the loss expects.
            y_hat = model(x, mask).permute([0, 2, 1])

            loss = loss_f(y_hat, y)

            is_target = y != 0  # id 0 (<PAD>) marks positions the model is not asked to predict
            pred = torch.argmax(y_hat, dim=1)
            loss_sum += loss.float()
            acc_sum += torch.sum((pred == y) & is_target)
            num_pred += torch.sum(is_target)
            num_batches += 1

    # The loss is already averaged inside each batch, so average over batches
    # rather than over sequences.
    print("Val_Loss:%f, Val_Acc:%f" % (loss_sum/num_batches, acc_sum/num_pred))
            
def print_decoded_ids(ids):
    """Print a sequence of token ids as text on a single line.

    ``<PAD>`` (id 0) and any id outside the vocabulary, such as the -1
    sentinel used for positions the model is not asked to predict, are shown
    as ``_``. Every other id is shown as its ``id_to_char`` entry.
    """
    pieces = []
    for c in ids:
        idx = int(c)
        # A negative id must not wrap around to the end of the vocabulary list.
        pieces.append(id_to_char[idx] if 0 < idx < len(id_to_char) else '_')
    print(''.join(pieces))
    
def show_sample(model, train_iter, loss_f, device='cpu', test_iter=None):
    """Print decoded input/target/prediction rows for the first batch of ``train_iter``.

    The model is put in eval mode. After the rows, the batch loss and the
    accuracy over non-<PAD> targets are printed. ``test_iter`` is accepted but
    not used. Nothing is printed when ``train_iter`` yields no batch.
    """
    model.eval()
    first_batch = next(iter(train_iter), None)
    if first_batch is None:
        return
    iid, mask = first_batch
    with torch.no_grad():
        iid, mask = iid.to(device=device), mask.to(device=device)
        x, y = generateData(iid, mask, device)
        y_hat = model(x, mask).permute([0, 2, 1])
        pred = torch.argmax(y_hat, dim=1)
        loss = loss_f(y_hat, y)

        for source_row, pred_row, target_row in zip(x, pred, y):
            print("INPUT:", end=' ')
            print_decoded_ids(source_row)
            print("TARGET:", end=' ')
            print_decoded_ids(target_row)
            print("PREDICTION:", end=' ')
            print_decoded_ids(pred_row)
        print("loss:", loss)
        # Positions whose target is 0 are not predicted; exclude them from the accuracy.
        pred[y == 0] = -1
        print("acc:", torch.sum(pred == y) / torch.sum(y != 0))
            
def init_weights(layer):
    """Xavier-uniform initialise the weight matrix of ``layer`` (for ``Module.apply``).

    Modules without a ``weight``, with ``weight`` set to ``None``, or with a
    weight of fewer than two dimensions are left untouched. For an
    ``nn.Embedding`` with a ``padding_idx``, the padding row is reset to zero
    afterwards so the padding token keeps its all-zero vector.
    """
    weight = getattr(layer, 'weight', None)
    if weight is None or weight.dim() <= 1:
        return
    torch.nn.init.xavier_uniform_(weight)
    if isinstance(layer, torch.nn.Embedding) and layer.padding_idx is not None:
        with torch.no_grad():
            weight[layer.padding_idx].fill_(0)

In [None]:
######################
# Hyperparameter log #
######################
#
# Data: Yelp[1:2]
# Model: embed384, ff1024, heads12, layers6
# Hyperparam: lr1e-5, clip1e-5
# Epochs to reach first 1.0: 1810 / around 1400
#
# Data: Yelp[1:2] (same masking)
# Model: embed64, ff256, heads8, layers6
# Hyperparam: lr1e-5, clip1e-5
# Epochs to reach first 1.0: 860
#
# Data: Yelp[1:2]
# Model: embed64, ff256, heads8, layers6
# Hyperparam: lr1e-5, clip1e-5
# Epochs to reach first 1.0: failed
#
# Data: Yelp[1:2]
# Model: embed128, ff512, heads8, layers6
# Hyperparam: lr1e-5, clip1e-5
# Epochs to reach first 1.0: more than 2000
#
# Data: Yelp[1:2]
# Model: embed64, ff256, heads8, layers12
# Hyperparam: lr1e-5, clip1e-5
# Epochs to reach first 1.0: failed
#
# Data: Yelp[1:2]
# Model: embed768, ff2048, heads12, layers6
# Hyperparam: lr1e-5, clip1e-5
# Epochs to reach first 1.0: 

In [None]:
# Build the character BERT over the full vocabulary (special tokens included),
# apply init_weights to every sub-module and move the result to the GPU.
model = CBERT(
    vocab_size=len(id_to_char),
    embed_size=768,
    dim_feedforward=2048,
    num_heads=12,
    num_layers=6,
    pad_idx=char_to_id['<PAD>'],
).apply(init_weights).to(device='cuda')

In [None]:
# Load checkpoint
# Restores the weights stored under 'model_state_dict' into the current `model`,
# then moves the model to the GPU.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
LOADPATH = "./models/yelp6_cp12.pt"
checkpoint = torch.load(LOADPATH)
model.load_state_dict(checkpoint['model_state_dict'])
model = model.to(device='cuda')

In [None]:
# One training epoch with a fresh AdamW optimizer (lr 1e-5, no weight decay),
# followed by one validation pass. Targets equal to <PAD> are ignored by the loss.
optimizer = optim.AdamW(model.parameters(), lr=1e-5, weight_decay = 0.0)
loss_f = nn.NLLLoss(ignore_index=char_to_id['<PAD>'])
train(model, optimizer, loss_f, train_iter, num_epochs=1, device='cuda', prnt_intv=1)
evaluate(model, loss_f, val_iter, device='cuda')

In [None]:
# 50 rounds of one epoch each, printing a decoded sample batch after every round.
# Uses the `optimizer` and `loss_f` left over from a previously executed cell;
# the sample is drawn from train_iter, i.e. from the training data.
for i in range(50):
    train(model, optimizer, loss_f, train_iter, num_epochs=1, device='cuda', prnt_intv=1)
    show_sample(model, train_iter, loss_f, device='cuda')

In [None]:
# 50 epochs in a single train() call. A new AdamW optimizer is created here,
# so no optimizer state is carried over from earlier cells.
optimizer = optim.AdamW(model.parameters(), lr=1e-5, weight_decay = 0.0)
loss_f = nn.NLLLoss(ignore_index=char_to_id['<PAD>'])
train(model, optimizer, loss_f, train_iter, num_epochs=50, device='cuda', prnt_intv=1)

In [None]:
# Train in 100 rounds of 5 epochs; after every round the weights are saved to
# a numbered checkpoint file and the model is validated.
# NOTE: lr is only logged each round. It stays constant, so no learning-rate
# schedule is actually applied here.
lr = 5e-5
optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=0.0)
loss_f = nn.NLLLoss(ignore_index=char_to_id['<PAD>'])
for i in range(100):
    print(f"Training iter {i}, lr {lr}")
    train(model, optimizer, loss_f, train_iter, num_epochs=5, device='cuda', prnt_intv=1)
    # Save model (round index zero-padded to two digits)
    PATH = f"./models/sample_cp{i:02d}.pt"
    torch.save({'model_state_dict': model.state_dict()}, PATH)
    evaluate(model, loss_f, val_iter, device='cuda')

In [None]:
# Save the current model weights; only the state dict is stored, under the key
# 'model_state_dict' (the same key the checkpoint-loading cell reads).
PATH = "./models/sample.pt"
torch.save({
    'model_state_dict' : model.state_dict(),
}, PATH)

In [None]:
# Print decoded input/target/prediction rows for one validation batch.
loss_f = nn.NLLLoss(ignore_index=char_to_id['<PAD>'])
show_sample(model, val_iter, loss_f, device='cuda')

## *Overfitting Test*

In [None]:
# Print one example pair: the first row of model_input up to the first id 0,
# then the first row of target with id 0 (<PAD>) shown as '_'.
# NOTE(review): model_input is not defined in the visible part of this file, and
# target is only assigned in a later cell; confirm where they come from.
print("===INPUT===")
for c in model_input[0]:
    if(c==0): break  # stop at the first <PAD>
    print(id_to_char[c], end='')
print("\n===TARGET===")
for c in target[0]:
    if c==0:
        print('_', end='')  # position that is not a prediction target
    else:
        print(id_to_char[c], end='')

# e.g.
# ===INPUT===
# <CLS>i have <MASK><MASK>n into th<MASK>s <MASK>ro<MASK>lem with man<MASK> other<MASK><MASK>octo<MASK>s a?d i <MASK>us<MASK> don't get<MASK><MASK><MASK>.<SEP>
# ===TARGET===
# ________ru_________i__p__b_________m__y______ d____r___n____j__t__________ it___________________________________________________________________________________________________________________________________________________________________________________

In [None]:
class DummyModel(nn.Module):
    """Small stand-alone transformer used for the overfitting test.

    Character embedding (no position embedding) -> 6-layer, 8-head
    TransformerEncoder (batch-first) -> linear projection to vocabulary size.
    ``forward`` returns the projection output directly, without a softmax.
    """

    def __init__(self):
        super().__init__()
        n_tokens = len(id_to_char)
        hidden_size = 768  # same as BERT
        self.embedding_layer = nn.Embedding(
            n_tokens, hidden_size, padding_idx=char_to_id['<PAD>']
        )
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=hidden_size, nhead=8, batch_first=True),
            num_layers=6,
        )
        self.prediction_layer = nn.Linear(hidden_size, n_tokens)

    def forward(self, x, mask):
        """Encode token ids ``x`` using ``mask`` as ``src_key_padding_mask``."""
        hidden_states = self.transformer_encoder(
            self.embedding_layer(x), src_key_padding_mask=mask
        )
        return self.prediction_layer(hidden_states)

In [None]:
# Model, optimizer and loss for the overfitting test.
dummy_model = DummyModel()
optimizer = optim.Adam(dummy_model.parameters(), lr=1e-3, weight_decay = 0.0)
# DummyModel.forward returns raw logits (no log-softmax), so the loss must apply
# log-softmax itself: CrossEntropyLoss does, whereas NLLLoss expects
# log-probabilities as input. <PAD> targets are ignored, as before.
loss_f = nn.CrossEntropyLoss(ignore_index=char_to_id['<PAD>'])

In [None]:
# Inputs for the overfitting loop in the following cells.
# NOTE(review): in this file input_ids and encoder_mask are built as Python lists
# of tensors, but the following cells read test_input.shape and pass both
# straight to the model. Presumably they were single stacked tensors when this
# cell was written; confirm.
test_input = input_ids
test_mask = encoder_mask

In [None]:
# The target is the input itself with column 2 zeroed (id 0), which prevents the
# model from learning index 2. It does not change between steps, so it is
# built once before the loop.
train_mask = torch.ones(test_input.shape, dtype=torch.long)
train_mask[:, 2] = 0
target = test_input * train_mask

# 50 optimisation steps on the same batch, printing the loss at every step.
for i in range(50):
    optimizer.zero_grad()
    test_output = dummy_model(test_input, test_mask)
    pred_output = test_output.permute([0, 2, 1])
    loss = loss_f(pred_output, target)
    print(loss)
    loss.backward()
    optimizer.step()

In [None]:
# Forward pass with the current dummy_model to inspect its output; no backward
# pass or optimizer step follows in this cell.
optimizer.zero_grad()
test_output = dummy_model(test_input, test_mask)
pred_output = test_output.permute([0,2,1])  # swap dimensions 1 and 2, as in the training loop

In [None]:
# Index of the highest score along dim 1 of pred_output.
result = torch.argmax(pred_output,dim=1)

In [None]:
# Inspect the raw model output at index [0][0].
print(test_output[0][0])

In [None]:
# Inspect the input shape and the argmax result at index 2.
print(test_input.shape)
print(result[2])

In [None]:
# Inspect the raw data.
# NOTE(review): the two lines use different indices (2 and 0), so they do not
# describe the same example; confirm this is intended.
print(input_ids[2])
print(encoder_mask[0])

## *The cells below are not used*

In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
# Load the pretrained "bert-base-uncased" tokenizer and masked-LM model.
# NOTE: this rebinds the global name `model`, replacing whatever model was bound
# to it before this cell ran.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")

In [None]:
# Split every review into lower-cased sentences: cut at '.', '?' or '!', skip
# spaces at the start of a sentence, and keep only pieces longer than one
# character. Text after the last terminator of a review is dropped.
# NOTE(review): train_yelp_data is not defined in the visible part of this file.
refined_data = []
for text in train_yelp_data['text']:
    text = text.lower()
    i = 0  # start index of the sentence currently being scanned
    for j, c in enumerate(text):
        if i==j and c==' ':
            i+=1  # move the start past a leading space
        if c in ['.','?','!']:
            if j-i>1:
                refined_data.append(text[i:j])  # the terminator at j is not included
            i=j+1
print(len(refined_data))
#print(refined_data)

# Turn every sentence into a token list: <CLS>, one token per character (with
# each space replaced by the `space` token), then <SEP>.
space = "<SPACE>"
data = []
for sentence in refined_data:
    d = ["<CLS>"]
    for c in sentence:
        if c == ' ':
            d.append(space)
        else:
            d.append(c)
    d.append("<SEP>")
    data.append(d)
    
# Map tokens to ids. This rebinds the global input_ids to a list of Python lists.
# NOTE(review): with id_to_char as defined at the top of this file, "<SPACE>" is
# not a key of char_to_id (the space character is stored as ' '), so this lookup
# would raise KeyError for any sentence containing a space. Confirm which
# vocabulary this cell was written for.
input_ids = []
for sentence in data:
    d = [char_to_id[c] for c in sentence]
    input_ids.append(d)

In [None]:
# For overfitting
# Took me 3 days to overfit a SINGLE sentence with the SAME masking :)
def generateData(iid, mask, device='cpu'):
    """Deterministic variant of generateData that always masks the same positions.

    When this cell is executed it replaces the random-masking ``generateData``
    defined earlier in the file.

    Args:
        iid: integer tensor of token ids whose last dimension is the sequence
            position. It is not modified; the target is built on a copy.
        mask: unused; kept so the signature matches the random-masking version.
        device: device on which the returned tensors are created.

    Returns:
        ``(input_ids, target)``: ``iid`` with the fixed positions replaced by
        ``<MASK>``, and a copy of ``iid`` in which every other position is set
        to ``<PAD>``.
    """
    fixed_positions = [1, 3, 6, 8, 10, 15, 18, 20, 25, 30, 49]

    # Work on copies so the caller's tensor is left untouched.
    target = iid.clone().detach().to(device=device)
    input_ids = target.clone()

    # Build the mask from the actual input shape, so any batch size and any
    # sequence length work; positions beyond the sequence end are skipped.
    seq_len = target.shape[-1]
    columns = torch.tensor(
        [p for p in fixed_positions if p < seq_len],
        dtype=torch.long,
        device=target.device,
    )
    mask_mask = torch.zeros(target.shape, dtype=torch.bool, device=target.device)
    mask_mask[..., columns] = True

    input_ids[mask_mask] = char_to_id['<MASK>']
    # Only the masked positions remain as targets; the loss ignores <PAD>.
    target[~mask_mask] = char_to_id['<PAD>']

    return (input_ids, target)