# Multilingual Neural Machine Translation

1. Load pre-trained model & tokenizer
2. Load custom dataset
3. Convert dataset into inputs
4. Fine-tune/train model on custom dataset
5. Evaluate and test

In [1]:
# %pip install transformers sentencepiece datasets
# %pip install --upgrade jupyter
# %pip install --upgrade ipywidgets

In [2]:
from datasets import load_dataset
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import tqdm
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AdamW, get_linear_schedule_with_warmup, get_linear_schedule_with_warmup

sns.set()

In [3]:
# Load Model Checkpoint
# modelCheckpoint = 'google/mt5-small'
modelCheckpoint = 'google/mt5-base'

# Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(modelCheckpoint)

# Load Model
model = AutoModelForSeq2SeqLM.from_pretrained(modelCheckpoint)
model = model.cuda()

# Set max sequence length
max_seq_len = model.config.max_length

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [4]:
# Make sure model inference is working

input = "I been DOING this for most my LIFE with no ADVICE"

# Tokenize
tokenIDs = tokenizer.encode(input, return_tensors="pt").cuda()
tokenIDs

modelOutput = model.generate(tokenIDs)
print(modelOutput)



tensor([[     0, 250099,    332,    259,    262,    259,   4944,      1]],
       device='cuda:0')


In [5]:
outputText = tokenizer.convert_tokens_to_string(
    tokenizer.convert_ids_to_tokens(modelOutput[0])
)
outputText

'<pad> <extra_id_0> for a while</s>'

In [6]:
# sorted(tokenizer.vocab.items(), key=lambda x : x[1])

In [7]:
# Load Dataset
dataset = load_dataset("alt")

In [8]:
trainDataset = dataset["train"]
testDataset = dataset["test"]

trainDataset[0]

{'SNT.URLID': '80188',
 'SNT.URLID.SNTID': '1',
 'url': 'http://en.wikinews.org/wiki/2007_Rugby_World_Cup:_Italy_31_-_5_Portugal',
 'translation': {'bg': 'ফ্রান্সের প্যারিসের পার্ক দি প্রিন্সেস-এ হওয়া ২০০৭-এর রাগবি বিশ্বকাপের পুল সি-তে ইটালি পর্তুগালকে ৩১-৫ গোলে হারিয়েছে।',
  'en': 'Italy have defeated Portugal 31-5 in Pool C of the 2007 Rugby World Cup at Parc des Princes, Paris, France.',
  'en_tok': 'Italy have defeated Portugal 31-5 in Pool C of the 2007 Rugby World Cup at Parc des Princes , Paris , France .',
  'fil': 'Natalo ng Italya ang Portugal sa puntos na 31-5 sa Grupong C noong 2007 sa Pandaigdigang laro ng Ragbi sa Parc des Princes, Paris, France.',
  'hi': '2007 में फ़्रांस, पेरिस के पार्क डेस प्रिंसेस में हुए रग्बी विश्व कप के पूल C में इटली ने पुर्तगाल को 31-5 से हराया।',
  'id': 'Italia berhasil mengalahkan Portugal 31-5 di grup C dalam Piala Dunia Rugby 2007 di Parc des Princes, Paris, Perancis.',
  'ja': 'フランスのパリ、パルク・デ・プランスで行われた2007年ラグビーワールドカップのプールCで、イタリアは31対5でポルトガルを

In [9]:
# Mapping of language tokens we want

languageTokenMapping = {
    'en': "<en>",
    'fil': "<fil>",
    'vi': "<viet>",
    'zh': "<zh>"
}

# Adding the special tokens to tokenizer
specialTokens = {'additional_special_tokens': list(languageTokenMapping.values())}
tokenizer.add_special_tokens(specialTokens)
model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=32)

Embedding(250112, 768)

In [10]:
# Tokenize
tokenIDs = tokenizer.encode(
    input, return_tensors="pt", padding='max_length', 
    truncation=True, max_length=max_seq_len).cuda()

print(tokenIDs)

tensor([[   336,   2101,   9498,   9491,    714,    332,   2250,   1037,  84145,
            514,    375, 104429,  45533,      1,      0,      0,      0,      0,
              0,      0]], device='cuda:0')


Setup functions for training

In [11]:
# Take data from data set, transform it and train model

def encodeInput(text, targetLang, tokenizer, sequenceLength, languageTokenMapping = languageTokenMapping):
    
    targetLanguageToken = languageTokenMapping[targetLang]

    # Tokenize the input text
    inputIDs = tokenizer.encode(
        text = targetLanguageToken + text,
        return_tensors = "pt",
        padding = "max_length",
        truncation = True,
        max_length = sequenceLength
        )
    
    return inputIDs

# Merge these two function later
# Merge these two function later

def encodeTarget(text, tokenizer, sequenceLength):

    tokenIDs = tokenizer.encode(
        text = text,
        return_tensors = "pt",
        padding = "max_length",
        truncation = True,
        max_length = sequenceLength)
    
    return tokenIDs

def formatTranslationData(translations, languageTokenMapping, tokenizer, sequenceLength=256):

    # Choose 2 languages at random
    languages = list(languageTokenMapping.keys())
    inputLanguage, targetLanguage = np.random.choice(languages, 2, False)

    # Translate batch
    inputText = translations[inputLanguage]
    targetText = translations[targetLanguage]

    if not inputText or not targetText:
        return None
    
    inputTokenIDs = encodeInput(inputText, targetLanguage, tokenizer, sequenceLength, languageTokenMapping)
    targetTokenIDs = encodeTarget(targetText, tokenizer, sequenceLength)

    return inputTokenIDs, targetTokenIDs

# Set up for batch training

def transformBatch(batch, languageTokenMapping, tokenizer):
    
    inputs, targets = [],[]

    for tranlationPair in batch['translation']:
        formattedData = formatTranslationData(tranlationPair, languageTokenMapping, tokenizer, max_seq_len)

        if formattedData is None:
            continue

        inputIDs, targetIDs = formattedData
        inputs.append(inputIDs.unsqueeze(0))
        targets.append(targetIDs.unsqueeze(0))
    
    batchInputIDs = torch.cat(inputs).cuda()
    batchTargetIDs = torch.cat(targets).cuda()

    return batchInputIDs, batchTargetIDs

def getDataGenerator(dataset, languageTokenMapping, tokenizer, batchSize=32):
    
    dataset = dataset.shuffle()

    for i in range(0, len(dataset), batchSize):
        batchRaw = dataset[i:i+batchSize]
        yield transformBatch(batchRaw, languageTokenMapping, tokenizer)

In [12]:
inIDs, outIDs = formatTranslationData(
    trainDataset[1]['translation'], languageTokenMapping, tokenizer
)

print(' '.join(tokenizer.convert_ids_to_tokens(inIDs.squeeze())))
print(' '.join(tokenizer.convert_ids_to_tokens(outIDs.squeeze())))

dataGen = getDataGenerator(trainDataset, languageTokenMapping, tokenizer, 32)
dataBatch = next(dataGen)
print(f'Input Shape: {dataBatch[0].shape}')
print(f'Output Shape: {dataBatch[1].shape}')

<viet> ▁Si ▁Andrea ▁Masi ▁ang ▁nag simula ▁na ▁maka punt os ▁sa ▁Italy a ▁sa ▁ ika - apat ▁na ▁minuto ▁ng ▁la ro . </s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad

## Training

In [13]:
nEpochs = 50
batchSize = 8
printFrequency = 50
learningRate = 5e-5
nBatches = int(np.ceil(len(trainDataset) / batchSize))
totalSteps = nEpochs * nBatches
nWarmupSteps = int(totalSteps * 0.01)

In [14]:
# Set up optimizer and learning scheduler

optimizer = AdamW(model.parameters(), lr=learningRate)
scheduler = get_linear_schedule_with_warmup(optimizer, nWarmupSteps, totalSteps)
losses = []



In [15]:
for epochIdx in range(nEpochs):
    # Randomize order of data
    dataGenerator = getDataGenerator(trainDataset, languageTokenMapping, tokenizer, batchSize)

    for batchIdx, (inputBatch, labelBatch) in tqdm.tqdm(enumerate(dataGenerator), total=nBatches):

        optimizer.zero_grad()

        # Forward Pass
        # Loss is calculated in the model
        print(inputBatch.squeeze().shape, labelBatch.shape)
        modelOut = model.forward(input_ids=inputBatch.squeeze(), labels=labelBatch.squeeze())

        loss = modelOut.loss
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Print Training Info
        if (batchIdx +1) % printFrequency == 0:
            averageLoss = np.mean(losses[-printFrequency:])
            print(f'Epoch: {epochIdx + 1} | Step: {batchIdx + 1} | Average Loss: {averageLoss:.3f} | Learning Rate: {scheduler.get_last_lr()[0]}')

  0%|          | 0/2261 [00:00<?, ?it/s]

torch.Size([8, 20]) torch.Size([8, 1, 20])


  0%|          | 0/2261 [00:04<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 734.00 MiB (GPU 0; 10.00 GiB total capacity; 8.88 GiB already allocated; 0 bytes free; 9.14 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF