In [None]:
! pip install -q sacrebleu sentencepiece transformers datasets

In [None]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, AdamW

from datasets import load_dataset, load_metric, Dataset, DatasetDict
import sentencepiece as spm

import torch
from torch.utils.data import DataLoader

import numpy as np
import pandas as pd

import os
import time
from itertools import permutations

# Create or Load new tokenizer

In [None]:
def create_tokenizer(sentencepiece_path):
    """Build a T5 tokenizer from a SentencePiece model and register the language tokens.

    Args:
        sentencepiece_path: Path handed to ``T5Tokenizer.from_pretrained``.

    Returns:
        The tokenizer, loaded with ``extra_ids=0`` and extended with the
        ``__xx__`` language codes as additional special tokens, ``<s>`` as the
        BOS token and ``</s>`` as the SEP token.
    """
    language_tokens = [
        '__af__','__am__','__ar__','__as__','__ast__','__ay__','__az__','__ba__','__be__','__bg__',
        '__bn__','__br__','__bs__','__ca__','__ceb__','__cjk__','__cs__','__cy__','__da__','__de__',
        '__dyu__','__el__','__en__','__es__','__et__','__fa__','__ff__','__fi__','__fr__','__fy__',
        '__ga__','__gd__','__gl__','__gu__','__ha__','__he__','__hi__','__hr__','__ht__','__hu__',
        '__hy__','__id__','__ig__','__ilo__','__is__','__it__','__ja__','__jv__','__ka__','__kac__',
        '__kam__','__kea__','__kg__','__kk__','__km__','__kmb__','__kmr__','__kn__','__ko__','__ku__',
        '__ky__','__lb__','__lg__','__ln__','__lo__','__lt__','__luo__','__lv__','__mg__','__mi__',
        '__mk__','__ml__','__mn__','__mr__','__ms__','__mt__','__my__','__ne__','__nl__','__no__',
        '__ns__','__ny__','__oc__','__om__','__or__','__pa__','__pl__','__ps__','__pt__','__qu__',
        '__ro__','__ru__','__sd__','__shn__','__si__','__sk__','__sl__','__sn__','__so__','__sq__',
        '__sr__','__ss__','__su__','__sv__','__sw__','__ta__','__te__','__tg__','__th__','__ti__',
        '__tl__','__tn__','__tr__','__uk__','__umb__','__ur__','__uz__','__vi__','__wo__','__xh__',
        '__yi__','__yo__','__zh__','__zu__',
    ]
    special_tokens = {
        "additional_special_tokens": language_tokens,
        'bos_token': '<s>',
        'sep_token': '</s>',
    }

    tokenizer = T5Tokenizer.from_pretrained(sentencepiece_path, extra_ids=0)
    tokenizer.add_special_tokens(special_tokens)
    return tokenizer

In [None]:
# Reuse the tokenizer saved under "mt3_tokenizer" when it can be loaded; otherwise
# build it from the SentencePiece model and save it for later runs.
try:
    new_tokenizer_t5 = T5Tokenizer.from_pretrained("mt3_tokenizer")
except Exception:
    # `Exception` instead of a bare `except:` so that KeyboardInterrupt / SystemExit
    # are no longer swallowed while the tokenizer is loading.
    new_tokenizer_t5 = create_tokenizer('r/flores101_mm100_175M/sentencepiece.bpe.model')
    new_tokenizer_t5.save_pretrained('mt3_tokenizer')

# Process the Data

## Preparing Training Data

In [None]:
# Raw training corpus; indexed below by language-pair keys such as "en-mk".
training_data = DatasetDict.load_from_disk("dataset_train")
# Number of sentence pairs kept per language pair (before both directions are generated).
size_single_direction = 20000
# Passed to Dataset.filter as its batch_size and reused later as the train/eval batch size.
batch_size = 32

### Filtering training dataset before tokenization

In [None]:
def preprocess_filter(example):
    """Tell ``Dataset.filter`` whether a translation pair should be kept.

    The two sentences are picked out of ``example["translation"]`` using the
    module-level ``source_lang`` / ``target_lang``, which therefore have to be
    set before this function is used.

    A pair is rejected only when one of its sentences is shorter than three
    characters *and* the character lengths differ by more than the threshold
    configured for that language pair.

    Returns:
        ``True`` to keep the example, ``False`` to drop it.
    """
    # maximum tolerated length difference per language pair
    max_length_gap = {'en-mk': 5, 'en-et': 6, 'et-mk': 5}

    pair = example["translation"]
    source_sentence = pair[source_lang]
    target_sentence = pair[target_lang]

    # the table is keyed in one direction only, so look the pair up both ways
    forward_gap = max_length_gap.get(f'{source_lang}-{target_lang}', 0)
    backward_gap = max_length_gap.get(f'{target_lang}-{source_lang}', 0)
    allowed_gap = max(forward_gap, backward_gap)

    has_tiny_side = len(source_sentence) < 3 or len(target_sentence) < 3
    gap_too_large = np.abs(len(source_sentence) - len(target_sentence)) > allowed_gap
    return not (has_tiny_side and gap_too_large)


def create_filtered_dataset(source_lang, target_lang, dataset, preprocessing_func, batch_size, size=None):
    """Filter one language pair of ``dataset``, optionally on a random subsample.

    Args:
        source_lang: First language code of the pair key.
        target_lang: Second language code of the pair key.
        dataset: Mapping from ``"<source>-<target>"`` to a dataset that
            supports ``len``, ``select`` and ``filter``.
        preprocessing_func: Predicate handed to ``filter``.
        batch_size: Forwarded to ``filter`` as ``batch_size``.
        size: Number of rows wanted. ``None`` filters the whole pair dataset.

    Returns:
        The filtered dataset. With ``size`` set it holds at most ``size`` rows;
        a warning is issued (instead of the former ``IndexError``) when fewer
        rows survive the filter.
    """
    import warnings  # local import: only needed for the shortfall warning

    # Fixed seed so the same subsample is drawn on every call.
    np.random.seed(42)
    language_pair = f'{source_lang}-{target_lang}'
    pair_dataset = dataset[language_pair]

    if size is None:
        return pair_dataset.filter(preprocessing_func, batch_size=batch_size)

    # Oversample by 20% so that `size` rows are usually left after filtering.
    # randint draws with replacement, so the subsample may contain repeated rows.
    sampled_indices = np.random.randint(0, len(pair_dataset), size=int(size * 1.2))
    filtered = pair_dataset.select(sampled_indices).filter(preprocessing_func, batch_size=batch_size)

    available = len(filtered)
    if available < size:
        warnings.warn(
            f"only {available} of the requested {size} rows are left for "
            f"'{language_pair}' after filtering"
        )
    return filtered.select(range(min(size, available)))

#### Pair 1: English and Macedonian

In [None]:
# preprocess_filter reads the module-level source_lang / target_lang,
# so both must be set before the filter runs.
source_lang = 'en'
target_lang = 'mk'

en_mk_filtered = create_filtered_dataset(source_lang=source_lang, 
                                         target_lang=target_lang, 
                                         dataset=training_data, 
                                         preprocessing_func=preprocess_filter, 
                                         batch_size=batch_size,
                                         size=size_single_direction)

#### Pair 2: English and Estonian

In [None]:
# preprocess_filter reads the module-level source_lang / target_lang,
# so both must be set before the filter runs.
source_lang = 'en'
target_lang = 'et'

en_et_filtered = create_filtered_dataset(source_lang=source_lang, 
                                         target_lang=target_lang, 
                                         dataset=training_data, 
                                         preprocessing_func=preprocess_filter, 
                                         batch_size=batch_size,
                                         size=size_single_direction)

#### Pair 3: Estonian and Macedonian

In [None]:
# preprocess_filter reads the module-level source_lang / target_lang,
# so both must be set before the filter runs.
source_lang = 'et'
target_lang = 'mk'

et_mk_filtered = create_filtered_dataset(source_lang=source_lang, 
                                         target_lang=target_lang, 
                                         dataset=training_data, 
                                         preprocessing_func=preprocess_filter, 
                                         batch_size=batch_size,
                                         size=size_single_direction)

#### Combining filtered datasets and creating both-ways translation pairs for training

In [None]:
def create_two_way_dataset(dataset):
    """Expand every bilingual record into one example per translation direction.

    Args:
        dataset: Object whose ``'translation'`` entry iterates over dicts that
            map exactly two language codes to their sentences.

    Returns:
        A list with two dicts per record: ``{'source_<a>': ..., 'target_<b>': ...}``
        followed by the reverse direction ``{'source_<b>': ..., 'target_<a>': ...}``.
    """
    examples = []
    for record in dataset['translation']:
        # unpacking fails if a record does not hold exactly two languages
        (first_lang, first_text), (second_lang, second_text) = record.items()
        examples.append({'source_' + first_lang: first_text,
                         'target_' + second_lang: second_text})
        examples.append({'source_' + second_lang: second_text,
                         'target_' + first_lang: first_text})
    return examples

In [None]:
# Turn each filtered pair dataset into a list of single-direction examples
# (two per original record).
et_mk_filtered_2way = create_two_way_dataset(et_mk_filtered)
en_et_filtered_2way = create_two_way_dataset(en_et_filtered)
en_mk_filtered_2way = create_two_way_dataset(en_mk_filtered)

# One training set holding all directions under a single 'translation' column.
combined_training = Dataset.from_dict({'translation': et_mk_filtered_2way + en_et_filtered_2way + en_mk_filtered_2way})

## Preparing validation Data

In [None]:
# Evaluation data; the 'dev' and 'devtest' splits are used further down.
# NOTE(review): hard-coded Google Drive path — adjust when running outside that environment.
test_data = DatasetDict.load_from_disk("/content/drive/MyDrive/W266/dataset_test")

In [None]:
def create_two_way_dataset_val(dataset):
    """Expand every trilingual record into all six translation directions.

    Args:
        dataset: Object whose ``'translation'`` entry iterates over dicts that
            map exactly three language codes to their sentences.

    Returns:
        A list with six ``{'source_<x>': ..., 'target_<y>': ...}`` dicts per
        record, emitted in the order 1->2, 2->1, 2->3, 3->2, 1->3, 3->1
        (numbers are the positions of the languages inside the record).
    """
    # (source position, target position) in emission order
    directions = ((0, 1), (1, 0), (1, 2), (2, 1), (0, 2), (2, 0))

    examples = []
    for record in dataset['translation']:
        # unpacking fails if a record does not hold exactly three languages
        first, second, third = record.items()
        sides = (first, second, third)
        for source_pos, target_pos in directions:
            source_code, source_text = sides[source_pos]
            target_code, target_text = sides[target_pos]
            examples.append({'source_' + source_code: source_text,
                             'target_' + target_code: target_text})
    return examples

In [None]:
# Validation examples: every direction of every record in the 'dev' split,
# wrapped in the same single-column layout as the training set.
val_2way = create_two_way_dataset_val(test_data['dev'])
combined_validation = Dataset.from_dict({'translation': val_2way})

## Preprocessing: adding appropriate prefix 

In [None]:
def get_prefix(source_lang, target_lang):
    """Return the task prefix that is prepended to the source sentence.

    Args:
        source_lang: Language code of the input sentence ('en', 'et' or 'mk').
        target_lang: Language code of the wanted translation.

    Returns:
        ``'translate <Source> to <Target>: '``. Any pair that is not listed
        explicitly falls back to the Estonian-to-English prefix.
    """
    prefixes = {
        ('en', 'mk'): 'translate English to Macedonian: ',
        ('mk', 'en'): 'translate Macedonian to English: ',
        ('et', 'mk'): 'translate Estonian to Macedonian: ',
        ('mk', 'et'): 'translate Macedonian to Estonian: ',
        ('en', 'et'): 'translate English to Estonian: ',
    }
    return prefixes.get((source_lang, target_lang), 'translate Estonian to English: ')

def get_prefix_input_output(dict_obj):
    """Extract the task prefix, source sentence and target sentence from one example.

    Args:
        dict_obj: Dict with ``'source_<lang>'`` / ``'target_<lang>'`` keys.
            Entries whose value is ``None`` are ignored.

    Returns:
        ``(prefix, source_sentence, target_sentence)``.

    Raises:
        IndexError: If fewer than two non-``None`` entries are present.
    """
    present = [(key, value) for key, value in dict_obj.items() if value is not None]

    # Identify the roles from the key prefix rather than from iteration order.
    # NOTE(review): records read back from a Dataset can presumably list the
    # target_* column before the source_* column, which would silently flip the
    # translation direction if position alone were used — verify against the data.
    sources = [item for item in present if item[0].startswith('source_')]
    targets = [item for item in present if item[0].startswith('target_')]

    if len(sources) == 1 and len(targets) == 1:
        source_item, target_item = sources[0], targets[0]
    else:
        # Keys without the expected prefixes: keep the positional behaviour
        # (first non-None entry is the source, second one the target).
        source_item, target_item = present[0], present[1]

    source_key, source_sentence = source_item
    target_key, target_sentence = target_item
    prefix = get_prefix(source_lang=source_key.split('_')[-1],
                        target_lang=target_key.split('_')[-1])
    return prefix, source_sentence, target_sentence

def preprocess_function(examples, max_input_length=50, max_target_length=50):
    """Tokenize a batch of single-direction translation examples.

    Args:
        examples: Batch with a ``'translation'`` list of source/target dicts.
        max_input_length: Length the prefixed inputs are truncated/padded to.
        max_target_length: Length the targets are truncated/padded to.

    Returns:
        The tokenizer output for the prefixed inputs, with the target token ids
        stored under ``'labels'``.
    """
    # (prefix, source sentence, target sentence) per example
    triples = [get_prefix_input_output(record) for record in examples['translation']]
    inputs = [prefix + source_sent for prefix, source_sent, _ in triples]
    targets = [target_sent for _, _, target_sent in triples]

    # tokenized input
    model_inputs = new_tokenizer_t5(
        inputs,
        max_length=max_input_length,
        truncation=True,
        padding='max_length',
    )

    # tokenized target
    # NOTE(review): labels are padded with the tokenizer's regular padding here and
    # are not masked afterwards — confirm that padding is meant to count in the loss.
    with new_tokenizer_t5.as_target_tokenizer():
        labels = new_tokenizer_t5(
            targets,
            max_length=max_target_length,
            truncation=True,
            padding='max_length',
        )

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

### Applying preprocessing and tokenization on training and validation data

In [None]:
# Add the task prefix and tokenize both sets batch-wise.
tokenized_trainingset = combined_training.map(preprocess_function, batched=True)
tokenized_devset = combined_validation.map(preprocess_function, batched=True)

### Save Processed Dataset to Disk

In [None]:
# The directory names encode the per-pair sample size.
tokenized_trainingset.save_to_disk(f"trainingset_s_{size_single_direction}")
tokenized_devset.save_to_disk(f"devset_s_{size_single_direction}")

# Training Model

In [None]:
# Load Tokenizer
new_tokenizer_t5 = T5Tokenizer.from_pretrained("mt3_tokenizer")

# Load Dataset
size_single_direction = 20000
tokenized_trainingset = Dataset.load_from_disk(f"trainingset_s_{size}")
tokenized_devset = Dataset.load_from_disk(f"tokenized_devset")

In [None]:
def postprocess_text(preds, labels):
    """Strip surrounding whitespace and shape the references for the metric.

    Args:
        preds: Decoded prediction strings.
        labels: Decoded reference strings.

    Returns:
        ``(preds, labels)`` where ``preds`` is a list of stripped strings and
        ``labels`` is a list of one-element lists holding the stripped reference.
    """
    cleaned_preds = []
    for prediction in preds:
        cleaned_preds.append(prediction.strip())

    wrapped_labels = []
    for reference in labels:
        wrapped_labels.append([reference.strip()])

    return cleaned_preds, wrapped_labels

def compute_metrics(eval_preds):
    """Compute BLEU and average lengths for one evaluation pass.

    Uses the module-level ``new_tokenizer_t5`` for decoding and the module-level
    ``metric`` for scoring.

    Args:
        eval_preds: ``(predictions, labels)`` of token ids; ``predictions`` may
            be a tuple whose first element holds the token ids.

    Returns:
        Dict with ``bleu``, ``gen_len`` (``sys_len`` per prediction) and
        ``ref_len`` (``ref_len`` per prediction), each rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_token_id = new_tokenizer_t5.pad_token_id

    # Replace -100 (the "ignore" marker) by the pad id as we can't decode it.
    # This is a no-op when no -100 is present.
    preds = np.where(np.asarray(preds) != -100, preds, pad_token_id)
    labels = np.where(np.asarray(labels) != -100, labels, pad_token_id)

    decoded_preds = new_tokenizer_t5.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = new_tokenizer_t5.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"], "gen_len": result['sys_len']/len(decoded_preds),"ref_len": result['ref_len']/len(decoded_preds)}

    result = {k: round(v, 4) for k, v in result.items()}
    return result

## Training the model - overall set-up

### Create new model

In [None]:
def create_model(pt_embedding_model_path, randomize_embedding=True, randomize_weights=True, encoder_decoder='t5-small'):
    """Create a T5 model whose embedding table matches ``new_tokenizer_t5``.

    Args:
        pt_embedding_model_path: Path of the M2M-100 checkpoint (``.pt``). It is
            only read when ``randomize_embedding`` is ``False``.
        randomize_embedding: ``True`` keeps the freshly initialised embedding
            table; ``False`` copies the rows of the checkpoint's
            ``encoder.embed_tokens.weight`` into it.
        randomize_weights: ``True`` re-instantiates the model from its config
            instead of keeping the pretrained weights.
        encoder_decoder: Checkpoint name/path of the base T5 model.

    Returns:
        The configured ``T5ForConditionalGeneration`` model.

    Fixes over the previous version: the checkpoint path and the base model
    name are now taken from the arguments (an undefined ``embedding_model_folder``
    and a hard-coded ``'t5-small'`` were used before), and ``vocab_size`` no
    longer relies on a variable that only existed when embeddings were copied.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Loading the base model
    model_t5 = AutoModelForSeq2SeqLM.from_pretrained(encoder_decoder)

    # Randomize T5 Weights
    if randomize_weights:
        model_t5 = T5ForConditionalGeneration(model_t5.config)

    # Update model embedding size
    new_embedding_size = new_tokenizer_t5.vocab_size + len(new_tokenizer_t5.additional_special_tokens) + 1 # padding token
    model_t5.resize_token_embeddings(new_embedding_size)

    # New embedding table; the width follows the base model (512 for t5-small).
    new_t5_embedding = torch.nn.Embedding(new_embedding_size, model_t5.config.d_model)

    vocab_size = new_embedding_size

    # Optional: Extract the embeddings from m2m 100
    if not randomize_embedding:
        # NOTE: torch.load unpickles the file — only load checkpoints from a trusted source.
        model_m2m = torch.load(pt_embedding_model_path, map_location=device)
        m2m_embeddings = model_m2m['model']['encoder.embed_tokens.weight']
        m2m_size = m2m_embeddings.shape[0]
        new_t5_embedding.weight.data[:m2m_size] = m2m_embeddings
        # Same value the config received on this path before.
        vocab_size = m2m_size

    # Update the t5 model layers
    # NOTE(review): only `shared.weight` is replaced here — confirm that the
    # output projection ends up with the intended weights.
    model_t5.shared.weight = new_t5_embedding.weight

    # Update model config
    model_t5.config.__dict__['vocab_size'] = vocab_size
    model_t5.config.__dict__['_name_or_path'] = 'mt3_pt'

    model_t5.config.__dict__['decoder_start_token_id'] = new_tokenizer_t5.bos_token_id
    model_t5.config.__dict__['pad_token_id'] = new_tokenizer_t5.pad_token_id
    model_t5.config.__dict__['bos_token_id'] = new_tokenizer_t5.bos_token_id
    model_t5.config.__dict__['eos_token_id'] = new_tokenizer_t5.eos_token_id

    # One generation preset per translation direction, in the order
    # en->et, en->mk, et->en, et->mk, mk->en, mk->et.
    language_names = {'en': 'English', 'et': 'Estonian', 'mk': 'Macedonian'}
    model_t5.config.__dict__['task_specific_params'] = {
        f'translation_{src}_to_{tgt}': {
            'early_stopping': True,
            'max_length': 300,
            'num_beams': 4,
            'prefix': f'translate {language_names[src]} to {language_names[tgt]}: ',
        }
        for src, tgt in permutations(language_names, 2)
    }

    return model_t5

### overall set-up

In [None]:
# Fresh model: random embeddings and random encoder/decoder weights.
model_t5 = create_model("r/flores101_mm100_175M/model.pt", randomize_embedding=True, randomize_weights=True)

In [None]:
# Hyper-parameters passed to Seq2SeqTrainingArguments below.
batch_size = 32
learning_rate = 3e-3

In [None]:
# Module-level metric object used by compute_metrics.
metric = load_metric("sacrebleu")

In [None]:
# Evaluate, log and save once per epoch; the best checkpoint (by 'bleu') is
# reloaded when training ends.
# NOTE(review): output_dir is an absolute path at the filesystem root, while the
# checkpoint loaded later in this notebook uses the relative path
# 'test-translation/...' — confirm both point to the same directory.
args = Seq2SeqTrainingArguments(
    output_dir="/test-translation",
    evaluation_strategy = "epoch",
    learning_rate=learning_rate,
    logging_strategy = 'epoch',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    save_total_limit=3,
    num_train_epochs=30,
    load_best_model_at_end = True,
    metric_for_best_model = 'bleu',
    predict_with_generate=True,
    save_strategy= "epoch",
)

In [None]:
# Batch collator built from the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(new_tokenizer_t5, model=model_t5)

In [None]:
# Trainer wiring: model, arguments, tokenized datasets, collator and BLEU callback.
trainer = Seq2SeqTrainer(
    model_t5,
    args,
    train_dataset=tokenized_trainingset,
    eval_dataset=tokenized_devset,
    data_collator=data_collator,
    tokenizer=new_tokenizer_t5,
    compute_metrics=compute_metrics,
)

In [None]:
# continuing from cp
# Set train_cp to a checkpoint directory to resume from it; leave it empty to
# start training from the current model state.
train_cp = ""
if train_cp:
    trainer.train(train_cp)    
else:
    trainer.train()

In [None]:
# saving the model
# Persist the trained model under "mode2_20k".
model_t5.save_pretrained("mode2_20k")

# Validation

In [None]:
def extract_translation(examples):
    """Collect the reference sentences for the current target language.

    Reads the module-level ``target_lang`` and ``new_tokenizer_t5``.

    Args:
        examples: Object whose ``"translation"`` entry iterates over dicts
            keyed by language code.

    Returns:
        ``(references, references_50)``: two lists of one-element lists. The
        first holds the raw reference sentences, the second the same sentences
        after tokenizing with ``max_length=50`` truncation and decoding again.
    """
    references, references_50 = [], []
    for record in examples["translation"]:
        sentence = record[target_lang]
        references.append([sentence])

        # round-trip through the tokenizer to cut the reference at 50 tokens
        clipped_ids = new_tokenizer_t5(sentence, max_length=50, truncation=True).input_ids
        references_50.append([new_tokenizer_t5.decode(clipped_ids, skip_special_tokens=True)])
    return references, references_50

def score_translations(generalized_file_path, test_dataset_path):
    """Score prediction files for all six translation directions with sacreBLEU.

    Args:
        generalized_file_path: Format string with two placeholders that are
            filled with the source and target language codes.
        test_dataset_path: Directory of the test ``DatasetDict``; its
            ``'devtest'`` split supplies the references.

    Returns:
        A DataFrame indexed by ``"<src>_<tgt>"`` and ``"<src>_<tgt>-50"`` (the
        latter scored against references truncated to 50 tokens).

    Note:
        The loop assigns the module-level ``source_lang`` / ``target_lang``
        because ``extract_translation`` reads ``target_lang`` from there.
    """
    global source_lang, target_lang
    raw_test = DatasetDict.load_from_disk(test_dataset_path)
    metric = load_metric("sacrebleu")
    bleu_scores = {}

    for source_lang, target_lang in permutations(['en','et','mk'], 2):
        print(source_lang, "-->" ,target_lang)

        # predictions: one per line, blank lines are skipped
        prediction_file = generalized_file_path.format(source_lang, target_lang)
        with open(prediction_file, encoding="utf-8") as handle:
            pred = [line.replace('\n','') for line in handle.readlines() if line != '\n']

        # references, raw and truncated
        ref, ref_50 = extract_translation(raw_test['devtest'])

        # scores
        bleu = metric.compute(predictions=pred, references=ref)
        bleu_50 = metric.compute(predictions=pred, references=ref_50)
        bleu_scores["{}_{}".format(source_lang, target_lang)] = bleu
        bleu_scores["{}_{}-50".format(source_lang, target_lang)] = bleu_50

    # round the n-gram precisions for display
    for scores in bleu_scores.values():
        scores['precisions'] = [np.round(x,5) for x in scores['precisions']]

    return pd.DataFrame.from_dict(bleu_scores, orient='index')

def score_translations_from_model(model, tokenizer, dataloader_kwargs=None):
    """Translate the 'devtest' split in all six directions and score it with sacreBLEU.

    Predictions are cached in ``generate.<src>.<tgt>`` and the tokenized inputs
    in ``tokenized_testset.<src>.<tgt>``; existing files are reused.

    Relies on the module-level ``test_data``, ``batch_size``,
    ``preprocess_function_test`` and ``extract_translation``. The loop assigns
    the module-level ``source_lang`` / ``target_lang`` because those helpers
    read them.

    Args:
        model: Seq2seq model providing ``generate``.
        tokenizer: Tokenizer used to decode the generated ids.
        dataloader_kwargs: Optional extra keyword arguments for the
            ``DataLoader`` (replaces the previously undefined ``kwargs``).

    Returns:
        A DataFrame indexed by ``"<src>_<tgt>"`` and ``"<src>_<tgt>-50"``.
    """
    global source_lang, target_lang
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    metric = load_metric("sacrebleu")
    bleu_scores = {}
    loader_options = dict(dataloader_kwargs or {})

    for source_lang, target_lang in permutations(['en','et','mk'], 2):
        print(source_lang, "-->" ,target_lang)

        # get predictions
        predictions_path = f'generate.{source_lang}.{target_lang}'
        if os.path.exists(predictions_path):
            with open(predictions_path, encoding="utf-8") as f:
                predictions = [l.replace('\n','') for l in f.readlines() if l != '\n']
        else:
            cache_dir = f"tokenized_testset.{source_lang}.{target_lang}"
            try:
                # The cache is written by Dataset.save_to_disk below, so it has to be
                # read back as a Dataset (reading it as a DatasetDict never succeeded).
                tokenized_test = Dataset.load_from_disk(cache_dir)
            except Exception:
                # Missing or unreadable cache: tokenize again and (re)write it.
                tokenized_test = test_data['devtest'].map(preprocess_function_test, batched=True)
                tokenized_test.save_to_disk(cache_dir)

            tokenized_test.set_format(type='torch', columns=['input_ids','attention_mask'])
            test_loader = DataLoader(tokenized_test, batch_size=batch_size, shuffle=False, **loader_options)

            predictions = []
            start = time.time()
            for step, batch in enumerate(test_loader):
                print(step,end='\r')
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                # NOTE(review): no explicit length limit is passed, so the model's
                # configured default applies — confirm it fits full sentences.
                generated = model.generate(input_ids, attention_mask=attention_mask)
                predictions += tokenizer.batch_decode(generated, skip_special_tokens=True)
            end = time.time()
            print("time taken: ",end-start)

            with open(predictions_path,'w', encoding='utf-8') as out_file:
                out_file.write("\n".join(predictions))

        labels, labels_50 = extract_translation(test_data['devtest'])

        bleu = metric.compute(predictions=predictions, references=labels)
        bleu_50 = metric.compute(predictions=predictions, references=labels_50)

        bleu_scores["{}_{}".format(source_lang, target_lang)] = bleu
        bleu_scores["{}_{}-50".format(source_lang, target_lang)] = bleu_50
        print({k:v['score'] for k,v in bleu_scores.items()})

    # round the n-gram precisions for display
    for dire in bleu_scores.keys():
        prec = [np.round(x,5) for x in bleu_scores[dire]['precisions']]
        bleu_scores[dire]['precisions'] = prec
    return pd.DataFrame.from_dict(bleu_scores,'index')

def get_prefix(source_lang, target_lang):
    """Return the task prefix that is prepended to the source sentence.

    Args:
        source_lang: Language code of the input sentence ('en', 'et' or 'mk').
        target_lang: Language code of the wanted translation.

    Returns:
        ``'translate <Source> to <Target>: '``. Any pair that is not listed
        explicitly falls back to the Estonian-to-English prefix.
    """
    prefixes = {
        ('en', 'mk'): 'translate English to Macedonian: ',
        ('mk', 'en'): 'translate Macedonian to English: ',
        ('et', 'mk'): 'translate Estonian to Macedonian: ',
        ('mk', 'et'): 'translate Macedonian to Estonian: ',
        ('en', 'et'): 'translate English to Estonian: ',
    }
    return prefixes.get((source_lang, target_lang), 'translate Estonian to English: ')

def preprocess_function_test(examples):
    """Tokenize the source side of a test batch for the current direction.

    Reads the module-level ``source_lang`` / ``target_lang`` and
    ``new_tokenizer_t5``.

    Args:
        examples: Batch with a ``'translation'`` list of dicts keyed by
            language code.

    Returns:
        The tokenizer output for the prefixed source sentences, truncated and
        padded to 300 tokens.
    """
    max_input_length = 300

    # the direction is fixed for the whole batch, so the prefix is too
    prefix = get_prefix(source_lang, target_lang)
    inputs = [prefix + element[source_lang] for element in examples['translation']]

    return new_tokenizer_t5(
        inputs,
        max_length=max_input_length,
        truncation=True,
        padding='max_length',
    )

In [None]:
# New Tokenizer
# Reload the saved tokenizer so the validation section can be run on its own.
new_tokenizer_t5 = T5Tokenizer.from_pretrained("mt3_tokenizer")

In [None]:
# load test set
# The 'devtest' split of this dataset is what the scoring functions evaluate on.
# NOTE(review): hard-coded Google Drive path — adjust when running outside that environment.
test_data = DatasetDict.load_from_disk("/content/drive/MyDrive/W266/dataset_test")

## Optional: Loading Model

In [None]:
# Load a specific training checkpoint instead of the model currently in memory.
# NOTE(review): this is a relative path, whereas the training arguments above write
# to the absolute "/test-translation" — confirm the checkpoint location.
cp_path = r'test-translation/checkpoint-112500'
model_t5 = AutoModelForSeq2SeqLM.from_pretrained(cp_path)

## Get Score from Model

In [None]:
# Displays the BLEU table (the returned DataFrame) as the cell output.
score_translations_from_model(model_t5, new_tokenizer_t5)