In [6]:
!pip install transformers sacremoses

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [25]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import XLMTokenizer, XLMWithLMHeadModel, AutoTokenizer
import random
import copy
import time
import re
import os.path as osp
import os

In [8]:
model_checkpoint = 'xlm-mlm-enfr-1024'
tokenizer = XLMTokenizer.from_pretrained(model_checkpoint)
pre_model = XLMWithLMHeadModel.from_pretrained(model_checkpoint)

Some weights of XLMWithLMHeadModel were not initialized from the model checkpoint at xlm-mlm-enfr-1024 and are newly initialized: ['transformer.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Load data here

In [10]:
!pwd

/content


In [11]:
data_en = open("/content/drive/MyDrive/Colab Notebooks/mt_hw2/hansards.e", encoding='utf-8').read().split('\n')
data_fr = open("/content/drive/MyDrive/Colab Notebooks/mt_hw2/hansards.f", encoding='utf-8').read().split('\n')


raw_data = {'en': [line for line in data_en], 'fr': [line for line in data_fr]}

df = pd.DataFrame(raw_data)

# Preprocess data

In [12]:
"""
# Tokenizer
#model_checkpoint = 't5-small'
#tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, model_max_length=512)

# Function for mapping data from strings to tokens
# s_key = source key, t_key = target_key, max_length = max sentence length
def preprocess_data(df, s_key, t_key, max_length):
    s = [sentence for sentence in df[s_key]]
    t = [sentence for sentence in df[t_key]]
    
    model_input = tokenizer(s, max_length=max_length, truncation=True, padding=True, return_tensors='pt') 
    
    with tokenizer.as_target_tokenizer():
        target_tokens = tokenizer(t, truncation=True, padding=True, max_length=max_length, return_tensors='pt') 
        
    model_input['target'] = target_tokens['input_ids']
   
    return model_input
"""
def tokenize_data(texts, tokenizer, maxlen=512):
    enc_di = tokenizer.batch_encode_plus(
        texts, 
        return_token_type_ids=False,
        padding=True,
        truncation=True,
        return_tensors='pt',
        max_length=maxlen
    )
    
    return enc_di


# Tokenize data
en_data = tokenize_data(df['en'], tokenizer)
fr_data = tokenize_data(df['fr'], tokenizer)

# Function for mapping data from strings to tokens
# s_key = source key, t_key = target_key, max_length = max sentence length
def preprocess_data(df, s_key, t_key, max_length):
    df = df[df[s_key].apply(lambda x: isinstance(x, str))]
    df = df[df[t_key].apply(lambda x: isinstance(x, str))]

    df[s_key] = df[s_key].map(pad_punct)
    df[t_key] = df[t_key].map(pad_punct)

    s = list(df[s_key].values)
    t = list(df[t_key].values)
    
    model_input = tokenizer(s, max_length=max_length, truncation=True, padding=True, return_tensors='pt') 
    
    with tokenizer.as_target_tokenizer():
        target_tokens = tokenizer(t, truncation=True, padding=True, max_length=max_length, return_tensors='pt') 
        
    model_input['target'] = target_tokens['input_ids']
    
    return model_input, df

def pad_punct(s):
    s = re.sub('([.,!?()])', r' \1 ', s)
    return re.sub('\s{2,}', ' ', s)




In [13]:
class EnFrDataset(Dataset):
    def __init__(self, df, src_col='en', trg_col='fr'):
        self.df = df
        self.src_col = src_col
        self.trg_col = trg_col

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        source = self.df[self.src_col][idx]
        target = self.df[self.trg_col][idx]
        return source, target

In [14]:
# Example of tokenize output
#print(en_data['input_ids'][1])
#print(df_small['en'][1]) # "the" occures on idx 0
#print(df_small['en'][2]) # "the" occures on idx 13
#print(en_data['input_ids'][2])

## All transformer classes

In [15]:
class Embedder(nn.Module):
    def __init__(self, vocab_size, d_model):
        super().__init__()
        
        self.embed = nn.Embedding(vocab_size, d_model)
        
    def forward(self, x):
        return self.embed(x)


class PositionalEncoder(nn.Module):
    def __init__(self, d_model, max_seq_len=512):
        super().__init__()
        
        self.d_model = d_model
        
        # Allocate memory to 
        pe = torch.zeros((max_seq_len, d_model))
        
        ### From attention is all you need ###
        for pos in range(max_seq_len):
            for i in range(0,d_model,2):
                pe[pos, i] = np.sin(pos/10000**(2*i/self.d_model))
                pe[pos, i+1] = np.cos(pos/10000**(2*i/self.d_model))
        # Fixed positional encoding
        pe.requires_grad = False
        #pe = pe.unsqueeze(0) # Make pe into [batch size x seq_len x d_model]
        self.register_buffer('pe',pe)
        
    def forward(self,x):

        # Make embeddings larger
        x = x*np.sqrt(self.d_model)
        # Get sequence length
        seq_len = x.size(1)
        
        pe = self.pe.clone()
        pe = pe.unsqueeze(0)
        
        x = x + torch.autograd.Variable(pe[:,:seq_len], 
        requires_grad=False)
        return x


def Attention(Q, K, V, d_k, mask=None, dropout=None):

    vals = (Q @ K.transpose(-2,-1))/np.sqrt(d_k)
    
    # Mask the scores if mask is specified. Model cannot see into future if masked.
    if mask is not None:
        mask = mask.unsqueeze(1)
        vals = vals.masked_fill(mask, 1e-9)
    # vals = vals if mask is None else vals.masked_fill_(mask, 1e-4)
    
    softmax = nn.Softmax(dim=-1)
    vals = softmax(vals)
    
    # apply dropout if specified
    vals = vals if dropout is None else dropout(vals)
    
    out =  vals @ V
    return out


class MultiHeadAttention(nn.Module):
    def __init__(self, n_heads, d_model, d_k, dropout=.1, relative = False):
        super().__init__()

        self.n_heads = n_heads
        self.d_model = d_model
        # self.seq_len = seq_len
        self.d_k = d_k
        
        self.linearQ = nn.Linear(d_model, d_model)
        self.linearK = nn.Linear(d_model, d_model)
        self.linearV = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(d_model, d_model)
    
    # d_model = 512
    # n_heads = 8
    # seq_len = 20
    
    # [20,512] --> [20, 8, 64]
    ## If batch size is used, say of 128:
    ## out = [128, 20, 8, 64]
    
    # Input = Matrix of dim [bs x seq_len x d_model]
    def split_heads(self, t):
        return t.reshape(t.size(0), -1, self.n_heads, int(self.d_k))
    # Output = Matrix of dim [bs x seq_len x n_heads x d_k]
    
    def forward(self, Q, K, V, mask = None):
        
        Q = self.linearQ(Q)
        K = self.linearK(K)
        V = self.linearV(V)
        
        Q, K, V = [self.split_heads(t) for t in (Q, K, V)] 
        Q, K, V = [t.transpose(1,2) for t in (Q, K, V)] # reshape to [bs x n_heads x seq_len x d_k]
        
        # Compute Attention
        vals = Attention(Q, K, V, self.d_k, mask, self.dropout)
        
        # Reshape to [bs x seq_len x d_model]
        vals = vals.transpose(1,2).contiguous().view(vals.size(0), -1, self.d_model)
       
        out = self.out(vals) # linear
        return out
    
class FeedForwardNetwork(nn.Module):
    def __init__(self, d_model, d_ff, dropout=.1):
        super().__init__()
        
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        
        return self.linear2(self.dropout(F.relu(self.linear1(x))))
    

        
class EncoderLayer(nn.Module):
    def __init__(self, n_heads, d_model, d_ff, d_k, dropout=.1):
        
        super().__init__()
        self.attention = MultiHeadAttention(n_heads, d_model, d_k, dropout)
        self.ffns = FeedForwardNetwork(d_model, d_ff, dropout)
        
        self.layer_norm1 = nn.LayerNorm(d_model)
        self.layer_norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
    
    def forward(self, x, mask=None):
        # See "Attention is all you need" to follow code structure
        x2 = self.dropout1(self.attention(x, x, x, mask))
        x = self.layer_norm1(x) + self.layer_norm1(x2)
        
        x2 = self.dropout2(self.ffns(x))
        x = x + self.layer_norm2(x2)
    
        return x
    

class DecoderLayer(nn.Module):
    def __init__(self, n_heads, d_model, d_ff, d_k, dropout=.1):
        super().__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        
        self.attention = MultiHeadAttention(n_heads, d_model, d_k, dropout)
        self.ffns = FeedForwardNetwork(d_model, d_ff, dropout)
        
        # Batch Normalization
        self.layer_norm1 = nn.LayerNorm(d_model)
        self.layer_norm2 = nn.LayerNorm(d_model)
        
        # Dropout
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)
        
        # self.linear = nn.Linear()
        
    def forward(self, x, e_out, source_mask, target_mask):
        
        # See "Attention is all you need" to follow code structure
        ## part 1
        x2 = self.layer_norm1(x) # Norm
        x = self.dropout1(self.attention.forward(x2, x2, x2, target_mask)) # Masked MHA, target
        x = x2 + self.layer_norm1(x) # Add & Norm
        
        ## part 2
        x3 = self.dropout2(self.attention.forward(x, e_out, e_out, source_mask)) # MHA on encoder output
        x2 = self.dropout2(self.attention.forward(x, x, x)) #MHA continued in decoder
        x = self.layer_norm1(x3) + self.layer_norm1(x2) + self.layer_norm1(x) # Add & Norm
        
        ## part 3
        x2 = self.dropout3(self.ffns.forward(x)) ## Feed forward
        x = x + self.layer_norm2(x2) # add
        # x = self.norm3(x) # norm (!!!CHECK IF THIS IS EQUIVALENT!!!)
        return x

def cloneLayers(module, n_layers):
    return nn.ModuleList([copy.deepcopy(module) for i in range(n_layers)])

class Encoder(nn.Module):
    def __init__(self, vocab_size, d_model, d_ff, d_k, n_layers, n_heads, dropout=.1):
        super().__init__()
        self.n_layers = n_layers
        self.embedder = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model)
        self.e_layers = cloneLayers(EncoderLayer(n_heads, d_model, d_ff, d_k), n_layers)
        
    def forward(self, source, mask=None):
        x = self.embedder.forward(source)
        x = self.pe.forward(x)
        for i in range(self.n_layers):
            x = self.e_layers[i](x, mask)
        
        return x

class Decoder(nn.Module):
    def __init__(self, vocab_size, d_model, d_ff, d_k, n_layers, n_heads, dropout=.1):
        super().__init__()
        self.n_layers = n_layers
        self.embedder = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model)
        self.d_layers = cloneLayers(DecoderLayer(n_heads, d_model, d_ff, d_k), n_layers)
        
    
    def forward(self, trg, e_out, source_mask, target_mask):
        x = self.embedder.forward(trg)
        x = self.pe.forward(x)
        
        for i in range(self.n_layers):
            x = self.d_layers[i](x, e_out, source_mask, target_mask)
        
        return x
        
class AlignmentLayer(nn.Module):
    def __init__(self, source_vocab_size, target_vocab_size, d_model, d_ff, d_k, n_layers, n_heads):
        super().__init__()
        self.MHA = MultiHeadAttention(n_heads, d_model, d_k)
        self.e = Encoder(source_vocab_size, d_model,d_ff, d_k, n_layers, n_heads)
        self.d = Decoder(target_vocab_size, d_model,d_ff, d_k, n_layers, n_heads)
        self.linear_f = nn.Linear(d_model, target_vocab_size) 
        
    def forward(self, source, target, source_mask, target_mask):
        e_out = self.e.forward(source, source_mask)
        d_out = self.d.forward(target, e_out, source_mask, target_mask)
        
        out = self.MHA.forward(d_out, e_out, e_out)
        
        
        
class Transformer(nn.Module):
    def __init__(self, source_vocab_size, target_vocab_size, d_model,d_ff, d_k, n_layers, n_heads):
        super().__init__()
        self.e = Encoder(source_vocab_size, d_model,d_ff, d_k, n_layers, n_heads)
        self.d = Decoder(target_vocab_size, d_model,d_ff, d_k, n_layers, n_heads)
        self.linear_f = nn.Linear(d_model, target_vocab_size)
        
    def forward(self, source, target, source_mask, target_mask):
        e_out = self.e.forward(source, source_mask)
        d_out = self.d.forward(target, e_out, source_mask, target_mask)
        
        out = self.linear_f(d_out)
        return out
        


In [16]:
def get_target_mask(size, target):
    mask = (torch.triu(torch.ones(size, size)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float(0)).masked_fill(mask == 1, float(1))
    
    # Find out where the target pad starts
    trg_pad = (target==0).nonzero()
    
    # Check if there is no padding in sentence
    if len(trg_pad) == 0:
        stop_idx = size
        
    else:
        stop_idx = trg_pad[0][1].item()
        mask[stop_idx:, :] = -69
        
    return mask.unsqueeze(0) > 0

In [17]:
def get_source_mask(size, source):
    src_pad = (source==2).nonzero()
    
    if len(src_pad) == 0:
        stop_idx = size
        
    else:
        stop_idx = src_pad[0][1].item()
        
    mask = source.clone()
    # Mask all padding with -inf
    mask[:,stop_idx:] = 1
    # Convert everything before stop_idx to zero
    mask[:,:stop_idx] = 0
    
    mask = mask.unsqueeze(0) > 0
    
    return mask

In [18]:
#print("Testing masking function for new tokenizer...")

#en1 = en_data['input_ids'][1]
#en1.size()
#pad = (en1==2).nonzero()
#pad[0].item()
#en1_msk = get_source_mask(en1.size(), en1.unsqueeze(0))
#en1_new = en1


#print("source:\n", en1)
#en1_new = en1_new.masked_fill(en1_msk, 1e-5)
#print("mask:\n", en1_msk)
#en1_new = en1_new.masked_fill(en1_msk, -1)
#print("after masking:\n", en1_new)


#fr1 = fr_data['input_ids'][1]
#fr1_msk = get_source_mask(fr1.size(), fr1.unsqueeze(0))
#fr1_new = fr1

#fr1_new = fr1_new.masked_fill(fr1_msk, -1)
#fr1_new

# Define train function

In [19]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def train_model(model, dataloader, epochs, print_every=1e3, verbose=True):
    model.to(device)
    model.train()
    start = time.time()
    tot_time = 0
    total_loss = 0
    loss_list = []
    
    
    num_sentences = len(dataloader)
    # loop over epochs
    for epoch in range(epochs):
        print("\nEPOCH " +str(epoch+1)+" of "+str(epochs)+"\n")
        # loop over all sentences
        for i, (src, target) in enumerate(dataloader):        
            print(src.size())    
            # unsqueeze to avoid dim mismatch between embedder and pe
            # src = src.unsqueeze(0).to(device)
            # trg = src.unsqueeze(0).to(device)
            # # target input, remove last word
            # trg_input = trg[:, :-1]
            
            # # get targets
            # y = trg[:, 1:].contiguous().view(-1)
            
            # src_mask = get_source_mask(src.size(1), src).to(device)
            # trg_mask = get_target_mask(trg_input.size(1), trg_input).to(device)
            
            # preds = model.forward(src, trg_input, src_mask, trg_mask)
            # optim.zero_grad()    
            # loss = F.cross_entropy(preds.view(-1, preds.size(-1)), y)
            # loss.backward()
            # optim.step()
            # total_loss += loss.item()
            if verbose and i % print_every == 0:
                print("sentence:\t",i+1,"\ntime per batch:\t",np.round(time.time()-start, 2)/(i+1), "\nloss:\t", np.round(loss.item(),2), "\naverage loss:\t", np.round(total_loss,2)/(i+1)) 
        loss_list.append(total_loss/num_sentences)
        scheduler.step()
        end = time.time()
        elapsed = end-start
        tot_time += elapsed
        avg_time = tot_time/(epoch+1)
        est_remain = epochs*avg_time - tot_time
        print(f'Train Loss: {loss_list[epoch]:.4f}')
        print(f'Epoch took {elapsed:.1f}s')
        print(f'Estimated {(est_remain/60):.1f}m left')
    

In [20]:
### Define arguments ### (same as in "Attention is all you need")
d_model = 1024 # Dimension of embeddings
n_heads = 8 # Number of heads for MHA
d_k = d_model/n_heads # dimension of keys (d_model / n_heads)
d_ff = d_model*4 # DON'T CHANGE!!! (be careful)

n_layers = 6 # Number of model layers
epochs = 3

src_vocab = set()
trg_vocab = set()
df['en'].str.split().apply(src_vocab.update)
df['fr'].str.split().apply(trg_vocab.update)


src_vocab_size = len(src_vocab)
trg_vocab_size = len(trg_vocab)
vocab_size = src_vocab_size + trg_vocab_size

vocab_size = vocab_size # Number of (unique) words in dataset
print("!!!REMEMBER TO CHANGE EPOCHS!!!")


!!!REMEMBER TO CHANGE EPOCHS!!!


# Define model and optimizer

In [21]:
pre_vocab_size = 64139
model = Transformer(pre_vocab_size, pre_vocab_size, d_model, d_ff, d_k, n_layers, n_heads)

lr = 0.00001 # 0.0001 default in "AIAYN"
optim = torch.optim.Adam(model.parameters(), lr=lr, betas=(0.9, 0.98), eps=1e-9)
scheduler = torch.optim.lr_scheduler.StepLR(optim, step_size=10, gamma=0.1)

# Get pre-trained model weights

In [22]:
#print("before weight initialization:")
#print(model.state_dict().values())

In [23]:
pre_model_weights = pre_model.state_dict()
sd = model.state_dict().copy()
model_weights = model.state_dict()

count = 0

with torch.no_grad():
    for j in ['e.e_layers', 'd.d_layers']:
        for i in range(n_layers):
            # Embedding and Positional Encoding
            sd['e.embedder.embed.weight'] = pre_model_weights['transformer.embeddings.weight']
            sd['e.pe.pe'] = pre_model_weights['transformer.position_embeddings.weight']

            # Attention layers
            sd[f'{j}.{i}.attention.linearQ.weight']  = pre_model_weights[f'transformer.attentions.{i}.q_lin.weight']
            sd[f'{j}.{i}.attention.linearQ.bias']    = pre_model_weights[f'transformer.attentions.{i}.q_lin.bias']
            sd[f'{j}.{i}.attention.linearK.weight']  = pre_model_weights[f'transformer.attentions.{i}.k_lin.weight']
            sd[f'{j}.{i}.attention.linearK.bias']    = pre_model_weights[f'transformer.attentions.{i}.k_lin.bias']
            sd[f'{j}.{i}.attention.linearV.weight']  = pre_model_weights[f'transformer.attentions.{i}.v_lin.weight']
            sd[f'{j}.{i}.attention.linearV.bias']    = pre_model_weights[f'transformer.attentions.{i}.v_lin.bias']
            sd[f'{j}.{i}.attention.out.weight']   = pre_model_weights[f'transformer.attentions.{i}.out_lin.weight']
            sd[f'{j}.{i}.attention.out.bias']     = pre_model_weights[f'transformer.attentions.{i}.out_lin.bias']

            # Feed forwards
            sd[f'{j}.{i}.ffns.linear1.weight'] = pre_model_weights[f'transformer.ffns.{i}.lin1.weight']
            sd[f'{j}.{i}.ffns.linear1.bias'] = pre_model_weights[f'transformer.ffns.{i}.lin1.bias']
            sd[f'{j}.{i}.ffns.linear2.weight'] = pre_model_weights[f'transformer.ffns.{i}.lin2.weight']
            sd[f'{j}.{i}.ffns.linear2.bias'] = pre_model_weights[f'transformer.ffns.{i}.lin2.bias']

            # Layer_norm1 = attention
            sd[f'{j}.{i}.layer_norm1.weight'] = pre_model_weights[f'transformer.layer_norm1.{i}.weight']
            sd[f'{j}.{i}.layer_norm1.bias'] = pre_model_weights[f'transformer.layer_norm1.{i}.bias']

            #Layer_norm2 = FFN
            sd[f'{j}.{i}.layer_norm2.weight'] = pre_model_weights[f'transformer.layer_norm2.{i}.weight']
            sd[f'{j}.{i}.layer_norm2.bias'] = pre_model_weights[f'transformer.layer_norm2.{i}.bias']

            # prediction layer
            sd['linear_f.weight'] = pre_model_weights['pred_layer.proj.weight']
            sd['linear_f.bias'] = pre_model_weights['pred_layer.proj.bias']
            
            
            # fix
            count += 1
            if count >= len(list(model.state_dict().keys())):
                break
        



In [24]:
model.load_state_dict(sd)

<All keys matched successfully>

# Train the model

In [48]:
dataset = EnFrDataset(df)
dataloader = Dataloader(dataset, batch_size=64, shuffle=True)


100001

In [58]:
train_model(model, dataloader, epochs, verbose=True)


EPOCH 1 of 3

sentence:	 1 
time per batch:	 0.15 
loss:	 8.67 
average loss:	 8.67
sentence:	 1001 
time per batch:	 0.13882117882117884 
loss:	 2.12 
average loss:	 3.5376823176823176
sentence:	 2001 
time per batch:	 0.13877561219390305 
loss:	 2.32 
average loss:	 2.826416791604198
sentence:	 3001 
time per batch:	 0.138757080973009 
loss:	 0.92 
average loss:	 2.5186604465178273
sentence:	 4001 
time per batch:	 0.13875281179705073 
loss:	 2.54 
average loss:	 2.319162709322669


KeyboardInterrupt: ignored

In [None]:
filename = 'drive/MyDrive/Colab Notebooks/mt_hw2/EN-FR_translation_weights.pt'
torch.save(model.state_dict(), filename)