In [170]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer
import random
import copy
import time

In [171]:
from transformers import XLMTokenizer, XLMWithLMHeadModel

model_checkpoint = 'xlm-mlm-enfr-1024'
tokenizer = XLMTokenizer.from_pretrained(model_checkpoint)
pre_model = XLMWithLMHeadModel.from_pretrained(model_checkpoint)

Some weights of XLMWithLMHeadModel were not initialized from the model checkpoint at xlm-mlm-enfr-1024 and are newly initialized: ['transformer.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Load data here

In [172]:
data_en = open("data/hansards.e", encoding='utf-8').read().split('\n')
data_fr = open("data/hansards.f", encoding='utf-8').read().split('\n')


raw_data_en = {'en': [line for line in data_en]}
raw_data_fr = {'fr': [line for line in data_fr]}

df_en = pd.DataFrame(raw_data_en, columns = ['en'])
df_fr = pd.DataFrame(raw_data_fr, columns = ['fr'])

df_small = df[['en', 'fr']][:100]

# Preprocess data

In [173]:
"""
# Tokenizer
#model_checkpoint = 't5-small'
#tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, model_max_length=512)

# Function for mapping data from strings to tokens
# s_key = source key, t_key = target_key, max_length = max sentence length
def preprocess_data(df, s_key, t_key, max_length):
    s = [sentence for sentence in df[s_key]]
    t = [sentence for sentence in df[t_key]]
    
    model_input = tokenizer(s, max_length=max_length, truncation=True, padding=True, return_tensors='pt') 
    
    with tokenizer.as_target_tokenizer():
        target_tokens = tokenizer(t, truncation=True, padding=True, max_length=max_length, return_tensors='pt') 
        
    model_input['target'] = target_tokens['input_ids']
   
    return model_input
"""



def tokenize_data(texts, tokenizer, maxlen=512):
    enc_di = tokenizer.batch_encode_plus(
        texts, 
        return_token_type_ids=False,
        padding=True,
        truncation=True,
        return_tensors='pt',
        max_length=maxlen
    )
    
    return enc_di


# Tokenize data
en_data = tokenize_data(df_small['en'], tokenizer)
fr_data = tokenize_data(df_small['fr'], tokenizer)

In [174]:
# Example of tokenize output
#print(en_data['input_ids'][1])
#print(df_small['en'][1]) # "the" occures on idx 0
#print(df_small['en'][2]) # "the" occures on idx 13
#print(en_data['input_ids'][2])

## All transformer classes

In [175]:
class Embedder(nn.Module):
    def __init__(self, vocab_size, d_model):
        super().__init__()
        
        self.embed = nn.Embedding(vocab_size, d_model)
        
    def forward(self, x):
        return self.embed(x)


class PositionalEncoder(nn.Module):
    def __init__(self, d_model, max_seq_len=512):
        super().__init__()
        
        self.d_model = d_model
        
        # Allocate memory to 
        pe = torch.zeros((max_seq_len, d_model))
        
        ### From attention is all you need ###
        for pos in range(max_seq_len):
            for i in range(0,d_model,2):
                pe[pos, i] = np.sin(pos/10000**(2*i/self.d_model))
                pe[pos, i+1] = np.cos(pos/10000**(2*i/self.d_model))
        # Fixed positional encoding
        pe.requires_grad = False
        #pe = pe.unsqueeze(0) # Make pe into [batch size x seq_len x d_model]
        self.register_buffer('pe',pe)
        
    def forward(self,x):

        # Make embeddings larger
        x = x*np.sqrt(self.d_model)
        # Get sequence length
        seq_len = x.size(1)
        
        pe = self.pe.clone()
        pe = pe.unsqueeze(0)
        
        x = x + torch.autograd.Variable(pe[:,:seq_len], 
        requires_grad=False)
        return x


def Attention(Q, K, V, d_k, mask=None, dropout=None):

    vals = (Q @ K.transpose(-2,-1))/np.sqrt(d_k)
    
    # Mask the scores if mask is specified. Model cannot see into future if masked.
    if mask is not None:
        mask = mask.unsqueeze(1)
        vals = vals.masked_fill(mask, -1000) # changed from 1e-4 
    # vals = vals if mask is None else vals.masked_fill_(mask, 1e-4)
    
    softmax = nn.Softmax(dim=-1)
    vals = softmax(vals)
    
    # apply dropout if specified
    vals = vals if dropout is None else dropout(vals)
    
    out =  vals @ V
    return out


class MultiHeadAttention(nn.Module):
    def __init__(self, n_heads, d_model, d_k, dropout=.1, relative = False):
        super().__init__()

        self.n_heads = n_heads
        self.d_model = d_model
        # self.seq_len = seq_len
        self.d_k = d_k
        
        self.linearQ = nn.Linear(d_model, d_model)
        self.linearK = nn.Linear(d_model, d_model)
        self.linearV = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(d_model, d_model)
    
    # d_model = 512
    # n_heads = 8
    # seq_len = 20
    
    # [20,512] --> [20, 8, 64]
    ## If batch size is used, say of 128:
    ## out = [128, 20, 8, 64]
    
    # Input = Matrix of dim [bs x seq_len x d_model]
    def split_heads(self, t):
        return t.reshape(t.size(0), -1, self.n_heads, int(self.d_k))
    # Output = Matrix of dim [bs x seq_len x n_heads x d_k]
    
    def forward(self, Q, K, V, mask = None):
        
        Q = self.linearQ(Q)
        K = self.linearK(K)
        V = self.linearV(V)
        
        Q, K, V = [self.split_heads(t) for t in (Q, K, V)] 
        Q, K, V = [t.transpose(1,2) for t in (Q, K, V)] # reshape to [bs x n_heads x seq_len x d_k]
        
        # Compute Attention
        vals = Attention(Q, K, V, self.d_k, mask, self.dropout)
        
        # Reshape to [bs x seq_len x d_model]
        vals = vals.transpose(1,2).contiguous().view(vals.size(0), -1, self.d_model)
       
        out = self.out(vals) # linear
        return out
    
class FeedForwardNetwork(nn.Module):
    def __init__(self, d_model, d_ff, dropout=.1):
        super().__init__()
        
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        
        return self.linear2(self.dropout(F.relu(self.linear1(x))))
    

        
class EncoderLayer(nn.Module):
    def __init__(self, n_heads, d_model, d_ff, d_k, dropout=.1):
        
        super().__init__()
        self.attention = MultiHeadAttention(n_heads, d_model, d_k, dropout)
        self.ffns = FeedForwardNetwork(d_model, d_ff, dropout)
        
        self.layer_norm1 = nn.LayerNorm(d_model)
        self.layer_norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
    
    def forward(self, x, mask=None):
        # See "Attention is all you need" to follow code structure
        x2 = self.dropout1(self.attention(x, x, x, mask))
        x = self.layer_norm1(x) + self.layer_norm1(x2)
        
        x2 = self.dropout2(self.ffns(x))
        x = x + self.layer_norm2(x2)
    
        return x
    

class DecoderLayer(nn.Module):
    def __init__(self, n_heads, d_model, d_ff, d_k, dropout=.1):
        super().__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        
        self.attention = MultiHeadAttention(n_heads, d_model, d_k, dropout)
        self.ffns = FeedForwardNetwork(d_model, d_ff, dropout)
        
        # Batch Normalization
        self.layer_norm1 = nn.LayerNorm(d_model)
        self.layer_norm2 = nn.LayerNorm(d_model)
        
        # Dropout
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)
        
        # self.linear = nn.Linear()
        
    def forward(self, x, e_out, source_mask, target_mask):
        
        # See "Attention is all you need" to follow code structure
        ## part 1
        x2 = self.layer_norm1(x) # Norm
        x = self.dropout1(self.attention.forward(x2, x2, x2, target_mask)) # Masked MHA, target
        x = x2 + self.layer_norm1(x) # Add & Norm
        
        ## part 2
        x3 = self.dropout2(self.attention.forward(x, e_out, e_out, source_mask)) # MHA on encoder output
        x2 = self.dropout2(self.attention.forward(x, x, x)) #MHA continued in decoder
        x = self.layer_norm1(x3) + self.layer_norm1(x2) + self.layer_norm1(x) # Add & Norm
        
        ## part 3
        x2 = self.dropout3(self.ffns.forward(x)) ## Feed forward
        x = x + self.layer_norm2(x2) # add
        # x = self.norm3(x) # norm (!!!CHECK IF THIS IS EQUIVALENT!!!)
        return x

def cloneLayers(module, n_layers):
    return nn.ModuleList([copy.deepcopy(module) for i in range(n_layers)])

class Encoder(nn.Module):
    def __init__(self, vocab_size, d_model, d_ff, d_k, n_layers, n_heads, dropout=.1):
        super().__init__()
        self.n_layers = n_layers
        self.embedder = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model)
        self.e_layers = cloneLayers(EncoderLayer(n_heads, d_model, d_ff, d_k), n_layers)
        
    def forward(self, source, mask=None):
        x = self.embedder.forward(source)
        x = self.pe.forward(x)
        for i in range(self.n_layers):
            x = self.e_layers[i](x, mask)
        
        return x

class Decoder(nn.Module):
    def __init__(self, vocab_size, d_model, d_ff, d_k, n_layers, n_heads, dropout=.1):
        super().__init__()
        self.n_layers = n_layers
        self.embedder = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model)
        self.d_layers = cloneLayers(DecoderLayer(n_heads, d_model, d_ff, d_k), n_layers)
        
    
    def forward(self, trg, e_out, source_mask, target_mask):
        x = self.embedder.forward(trg)
        x = self.pe.forward(x)
        
        for i in range(self.n_layers):
            x = self.d_layers[i](x, e_out, source_mask, target_mask)
        
        return x
        
class AlignmentLayer(nn.Module):
    def __init__(self, source_vocab_size, target_vocab_size, d_model, d_ff, d_k, n_layers, n_heads):
        super().__init__()
        self.MHA = MultiHeadAttention(n_heads, d_model, d_k)
        self.e = Encoder(source_vocab_size, d_model,d_ff, d_k, n_layers, n_heads)
        self.d = Decoder(target_vocab_size, d_model,d_ff, d_k, n_layers, n_heads)
        self.linear_f = nn.Linear(d_model, target_vocab_size) 
        
    def forward(self, source, target, source_mask, target_mask):
        e_out = self.e.forward(source, source_mask)
        d_out = self.d.forward(target, e_out, source_mask, target_mask)
        
        out = self.MHA.forward(d_out, e_out, e_out)
        
        
        
class Transformer(nn.Module):
    def __init__(self, source_vocab_size, target_vocab_size, d_model,d_ff, d_k, n_layers, n_heads):
        super().__init__()
        self.e = Encoder(source_vocab_size, d_model,d_ff, d_k, n_layers, n_heads)
        self.d = Decoder(target_vocab_size, d_model,d_ff, d_k, n_layers, n_heads)
        self.linear_f = nn.Linear(d_model, target_vocab_size)
        
    def forward(self, source, target, source_mask, target_mask):
        e_out = self.e.forward(source, source_mask)
        d_out = self.d.forward(target, e_out, source_mask, target_mask)
        
        out = self.linear_f(d_out)
        return out
        


In [176]:
def get_target_mask(size, target):
    mask = (torch.triu(torch.ones(size, size)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float(0)).masked_fill(mask == 1, float(1))
    
    # Find out where the target pad starts
    trg_pad = (target==0).nonzero()
    
    # Check if there is no padding in sentence
    if len(trg_pad) == 0:
        stop_idx = size
        
    else:
        stop_idx = trg_pad[0][1].item()
        mask[stop_idx:, :] = -69
        
    return mask.unsqueeze(0) > 0, stop_idx

In [177]:
def get_source_mask(size, source):
    src_pad = (source==2).nonzero()
    
    if len(src_pad) == 0:
        stop_idx = size
        
    else:
        stop_idx = src_pad[0][1].item()
        
    mask = source.clone()
    # Mask all padding with -inf
    mask[:,stop_idx:] = 1
    # Convert everything before stop_idx to zero
    mask[:,:stop_idx] = 0
    
    mask = mask.unsqueeze(0) > 0
    
    return mask

In [178]:
#print("Testing masking function for new tokenizer...")

#en1 = en_data['input_ids'][1]
#en1.size()
#pad = (en1==2).nonzero()
#pad[0].item()
#en1_msk = get_source_mask(en1.size(), en1.unsqueeze(0))
#en1_new = en1


#print("source:\n", en1)
#en1_new = en1_new.masked_fill(en1_msk, 1e-5)
#print("mask:\n", en1_msk)
#en1_new = en1_new.masked_fill(en1_msk, -1)
#print("after masking:\n", en1_new)


#fr1 = fr_data['input_ids'][1]
#fr1_msk = get_source_mask(fr1.size(), fr1.unsqueeze(0))
#fr1_new = fr1

#fr1_new = fr1_new.masked_fill(fr1_msk, -1)
#fr1_new

# Define train function

In [186]:
print_every = 10
def train_model(model, source_data, target_data, epochs, verbose=False):
    print("Training transformer")
    model.train()
    start = time.time()
    total_loss = 0
    loss_list = []
    
    source_all = source_data['input_ids']
    target_all = target_data['input_ids']
    
    # loop over epochs
    count = 0
    for epoch in range(epochs):
        print("epoch", epoch+1)
        # loop over all sentences
        for i in range(len(source_all)):
            if verbose and i % print_every == 0:
                print("sentece", i)
            
            # unsqueeze to avoid dim mismatch between embedder and pe
            src = source_all[i].unsqueeze(0) 
            trg = target_all[i].unsqueeze(0)
            
            # target input, remove last word
            trg_input = trg[:, :-1]
            
            # get targets
            y = trg[:, 1:].contiguous().view(-1)
            
            src_mask = get_source_mask(src.size(1), src)
            trg_mask, trg_stop_idx = get_target_mask(trg_input.size(1), trg_input)
            
            preds = model.forward(src, trg_input, src_mask, trg_mask)
            optim.zero_grad()    
            loss = F.cross_entropy(preds.view(-1, preds.size(-1)), y)
            loss.backward()
            optim.step()
            total_loss += loss.item()
            loss_list.append(loss.item())
            
            count += 1
            if verbose and i % print_every == 0:
                print("time:",np.round(time.time()-start, 2), "\navg loss:", np.round(total_loss/count, 2), "\ntotal loss:", np.round(total_loss,2)) 
    

In [187]:
### Define arguments ### (same as in "Attention is all you need")
d_model = 1024 # Dimension of embeddings
n_heads = 8 # Number of heads for MHA
d_k = d_model/n_heads # dimension of keys (d_model / n_heads)
d_ff = d_model*4 # DON'T CHANGE!!! (be careful)
vocab_size = len(df) # Number of (unique) words in dataset
n_layers = 6 # Number of model layers
epochs = 1


print("!!!REMEMBER TO CHANGE EPOCHS!!!")


!!!REMEMBER TO CHANGE EPOCHS!!!


# Define model and optimizer

In [188]:
pre_vocab_size = 64139
model = Transformer(pre_vocab_size, pre_vocab_size, d_model, d_ff, d_k, n_layers, n_heads)

lr = 0.00001 # 0.0001 default in "AIAYN"
optim = torch.optim.Adam(model.parameters(), lr=lr, betas=(0.9, 0.98), eps=1e-9)

# Get pre-trained model weights

In [189]:
#print("before weight initialization:")
#print(model.state_dict().values())

In [190]:
pre_model_weights = pre_model.state_dict()
sd = model.state_dict().copy()
model_weights = model.state_dict()

count = 0

with torch.no_grad():
    for j in ['e.e_layers', 'd.d_layers']:
        for i in range(n_layers):
            # Embedding and Positional Encoding
            sd['e.embedder.embed.weight'] = pre_model_weights['transformer.embeddings.weight']
            sd['e.pe.pe'] = pre_model_weights['transformer.position_embeddings.weight']

            # Attention layers
            sd[f'{j}.{i}.attention.linearQ.weight']  = pre_model_weights[f'transformer.attentions.{i}.q_lin.weight']
            sd[f'{j}.{i}.attention.linearQ.bias']    = pre_model_weights[f'transformer.attentions.{i}.q_lin.bias']
            sd[f'{j}.{i}.attention.linearK.weight']  = pre_model_weights[f'transformer.attentions.{i}.k_lin.weight']
            sd[f'{j}.{i}.attention.linearK.bias']    = pre_model_weights[f'transformer.attentions.{i}.k_lin.bias']
            sd[f'{j}.{i}.attention.linearV.weight']  = pre_model_weights[f'transformer.attentions.{i}.v_lin.weight']
            sd[f'{j}.{i}.attention.linearV.bias']    = pre_model_weights[f'transformer.attentions.{i}.v_lin.bias']
            sd[f'{j}.{i}.attention.out.weight']   = pre_model_weights[f'transformer.attentions.{i}.out_lin.weight']
            sd[f'{j}.{i}.attention.out.bias']     = pre_model_weights[f'transformer.attentions.{i}.out_lin.bias']

            # Feed forwards
            sd[f'{j}.{i}.ffns.linear1.weight'] = pre_model_weights[f'transformer.ffns.{i}.lin1.weight']
            sd[f'{j}.{i}.ffns.linear1.bias'] = pre_model_weights[f'transformer.ffns.{i}.lin1.bias']
            sd[f'{j}.{i}.ffns.linear2.weight'] = pre_model_weights[f'transformer.ffns.{i}.lin2.weight']
            sd[f'{j}.{i}.ffns.linear2.bias'] = pre_model_weights[f'transformer.ffns.{i}.lin2.bias']

            # Layer_norm1 = attention
            sd[f'{j}.{i}.layer_norm1.weight'] = pre_model_weights[f'transformer.layer_norm1.{i}.weight']
            sd[f'{j}.{i}.layer_norm1.bias'] = pre_model_weights[f'transformer.layer_norm1.{i}.bias']

            #Layer_norm2 = FFN
            sd[f'{j}.{i}.layer_norm2.weight'] = pre_model_weights[f'transformer.layer_norm2.{i}.weight']
            sd[f'{j}.{i}.layer_norm2.bias'] = pre_model_weights[f'transformer.layer_norm2.{i}.bias']

            # prediction layer
            sd['linear_f.weight'] = pre_model_weights['pred_layer.proj.weight']
            sd['linear_f.bias'] = pre_model_weights['pred_layer.proj.bias']
            
            
            # fix
            count += 1
            if count >= len(list(model.state_dict().keys())):
                break
        



In [191]:
model.load_state_dict(sd)

<All keys matched successfully>

# Train the model

In [192]:
train_model(model, en_data, fr_data, epochs, True)

Training transformer
epoch 1
sentece 0
time: 14.06 
avg loss: 78.32 
total loss: 78.32
sentece 10
time: 152.0 
avg loss: 94.6 
total loss: 1040.56
sentece 20
time: 234.43 
avg loss: 95.92 
total loss: 2014.32
sentece 30
time: 303.38 
avg loss: 84.81 
total loss: 2629.01
sentece 40
time: 391.86 
avg loss: 77.99 
total loss: 3197.78
sentece 50
time: 487.13 
avg loss: 75.98 
total loss: 3874.77
sentece 60
time: 592.34 
avg loss: 72.58 
total loss: 4427.33
sentece 70
time: 684.17 
avg loss: 67.44 
total loss: 4788.12
sentece 80
time: 784.44 
avg loss: 63.51 
total loss: 5144.44
sentece 90
time: 876.19 
avg loss: 60.35 
total loss: 5491.66


In [None]:
filename = 'weights\EN-FR_translation_weights.pt'
torch.save(model.state_dict(), filename)

# Get model weights manually

In [56]:
model_weights = {}
for name, param in model.named_parameters():
    model_weights[name] = param.detach().numpy()
    



In [196]:
model_weights

OrderedDict([('e.embedder.embed.weight',
              tensor([[-0.0242, -0.0930,  0.0500,  ...,  0.1343, -0.0258, -0.0163],
                      [-0.0104, -0.0273,  0.0954,  ...,  0.0880, -0.0083,  0.0041],
                      [ 0.0391, -0.0052,  0.0927,  ...,  0.0465,  0.0390,  0.0123],
                      ...,
                      [-0.0285, -0.0183, -0.0318,  ...,  0.0191,  0.0552, -0.0374],
                      [-0.0028,  0.0298,  0.0414,  ..., -0.0129, -0.0147, -0.0737],
                      [ 0.0210, -0.0002,  0.0307,  ..., -0.0009,  0.0030, -0.0063]])),
             ('e.pe.pe',
              tensor([[-0.0242, -0.0930,  0.0500,  ...,  0.1343, -0.0258, -0.0163],
                      [-0.0104, -0.0273,  0.0954,  ...,  0.0880, -0.0083,  0.0041],
                      [ 0.0391, -0.0052,  0.0927,  ...,  0.0465,  0.0390,  0.0123],
                      ...,
                      [-0.0285, -0.0183, -0.0318,  ...,  0.0191,  0.0552, -0.0374],
                      [-0.0028,  0.02