## Preparation

In [None]:
%pip install sentencepiece
%pip install datasets

from transformers import MarianMTModel, MarianTokenizer, MarianConfig
import nltk
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu

# Pretrained German->English Marian checkpoint; tokenizer and model are loaded
# from the same checkpoint name so they stay in sync.
teacher_checkpoint = "Helsinki-NLP/opus-mt-de-en"
tokenizer_de_en = MarianTokenizer.from_pretrained(teacher_checkpoint)
model_de_en = MarianMTModel.from_pretrained(teacher_checkpoint)

import torch

# Prefer the first CUDA device when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'
# Kept as the last expression so the cell still displays the model after the move.
model_de_en.to(device)

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m


In [None]:
from datasets import load_dataset
import pandas as pd
# Corpus label reused in output file names (the alternative label is 'ccmatrix').
dataset = 'multi30k'

# Other corpora, kept here as commented-out alternatives:
#   data = load_dataset("wmt14", 'de-en')
#   data = load_dataset("thesistranslation/distilled-ccmatrix-de-en", split='train[:50%]')  # 30M
#   data = load_dataset("yhavinga/ccmatrix", "de-en", streaming=True)
#   (wmt-19 36M, wmt-18 42M)
data = load_dataset("bentrevett/multi30k")

# Pull out the three splits by name.
train_data = data['train']
valid_data = data['validation']
test_data = data['test']

from torch.utils.data import TensorDataset, DataLoader

# Fix the PyTorch RNG seed; as the last expression it also displays the generator.
torch.manual_seed(1)

Downloading readme:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.60M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/164k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/156k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

<torch._C.Generator at 0x7d7258714510>

## Pipeline to create: Teacher model outputs (references for student model)

In [None]:
def output_translation_texts(data, model=model_de_en, tokenizer=tokenizer_de_en, set_name='train', src="de", trg="en"):
    """Translate every source sentence in ``data`` and return the decoded texts.

    Only the source side is tokenized and fed to ``model.generate``. The
    reference column is never read or passed as ``labels``: generation does not
    need it, and the produced translations must not depend on the references.
    Sentences are tokenized and moved to ``device`` one batch at a time, so the
    whole data set is never padded to a single length or held on the device.

    Args:
        data: Mapping-like object where ``data[src]`` yields the source sentences.
        model: Model exposing ``eval()`` and ``generate()``; it is expected to
            already live on the module-level ``device``.
        tokenizer: Tokenizer paired with ``model``; it must accept a list of
            strings and provide ``batch_decode``.
        set_name: Split label, used only in the completion message.
        src: Key of the source-language column in ``data``.
        trg: Target-language code, used only in the completion message.

    Returns:
        list[str]: One decoded translation per source sentence, in input order.
        An empty input yields an empty list.
    """
    source_texts = list(data[src])
    batch_size = 64
    # Ceiling division: the last batch may be smaller than batch_size.
    total_batches = (len(source_texts) + batch_size - 1) // batch_size

    model.eval()
    output_sentences = []
    for batch_no in range(1, total_batches + 1):
        if batch_no % 10 == 0:
            print(batch_no, '/', total_batches)
        start = (batch_no - 1) * batch_size
        batch_texts = source_texts[start:start + batch_size]
        # Pad only to the longest sentence of this batch and move just this batch.
        encoded = tokenizer(
            batch_texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            add_special_tokens=True,
        ).to(device)
        output_ids = model.generate(
            input_ids=encoded.input_ids,
            attention_mask=encoded.attention_mask)
        output_sentences.extend(tokenizer.batch_decode(output_ids, skip_special_tokens=True))
    print("Successfully generate translations for {} data, src: {}, trg: {}. ".format(set_name, src, trg))
    return output_sentences

## Pipeline to create: QE metrics for teacher model outputs

In [None]:
import spacy
spacy_en = spacy.load('en_core_web_sm')


def tokenize_en(text):
    """Split ``text`` with the tokenizer of the loaded spaCy pipeline.

    Returns the list of token strings, in order.
    """
    token_texts = []
    for token in spacy_en.tokenizer(text):
        token_texts.append(token.text)
    return token_texts

### BLEU scores

In [None]:
def map_bleu_scores_to_tokens(bleu_scores, category_num = 5):
    """Map each BLEU score to the label of the bucket it falls into.

    With ``category_num == 5`` the buckets are 20 points wide, with upper bounds
    20, 40, ..., 100 and labels ``bleu_0_20`` ... ``bleu_80_100``. Any other
    ``category_num`` selects the 10-point-wide buckets with upper bounds
    5, 15, ..., 105, labelled by their centre: ``bleu_0``, ``bleu_10``, ...,
    ``bleu_100``. Upper bounds are inclusive, and every score at or below the
    first bound (including negative ones) lands in the first bucket.

    Args:
        bleu_scores: Iterable of numeric scores on the 0-100 scale.
        category_num: 5 for the 20-point buckets, anything else for the 10-point ones.

    Returns:
        list[str]: Exactly one label per input score, in input order.

    Raises:
        ValueError: If a score lies above the last bucket bound or is NaN. Such
            a score used to be skipped silently, which left the result shorter
            than the input and misaligned with the scored sentences.
    """
    five_way = category_num == 5
    if five_way:
        upper_bounds = range(20, 110, 20)  # 20, 40, 60, 80, 100
    else:
        upper_bounds = range(5, 110, 10)  # 5, 15, ..., 95, 105

    BLEU_tokens = []
    for position, bleu_score in enumerate(bleu_scores):
        # First bound that is >= the score; None when no bucket can hold it.
        upper_bound = next((bound for bound in upper_bounds if bleu_score <= bound), None)
        if upper_bound is None:
            raise ValueError(
                f"BLEU score {bleu_score!r} at position {position} does not fit any bucket "
                f"(largest upper bound is {upper_bounds[-1]})"
            )
        if five_way:
            BLEU_tokens.append(f'bleu_{upper_bound-20}_{upper_bound}')
        else:
            BLEU_tokens.append(f'bleu_{upper_bound-5}')

    return BLEU_tokens



In [None]:
def BLEU_for_teacher_translations(translations, data, model=model_de_en, tokenizer=tokenize_en, set_name='train', src="de", trg="en", load_bleu=False):
    """Score translations against the references with sentence-level BLEU.

    Each translation is compared with the reference at the same position in
    ``data[trg]``. Scores are scaled to 0-100 and rounded to four decimals, then
    mapped to the 10-point bucket labels of ``map_bleu_scores_to_tokens``.

    Args:
        translations: Translated sentences (strings), aligned with ``data[trg]``.
        data: Mapping-like object where ``data[trg]`` yields the reference sentences.
        model: Unused; kept so existing callers keep working.
        tokenizer: Callable that turns a sentence into a list of tokens.
        set_name: Split label, used to build the file name when ``load_bleu`` is set.
        src: Source-language code (unused by the scoring itself).
        trg: Key of the reference column in ``data``.
        load_bleu: When true, load previously saved scores instead of recomputing.

    Returns:
        tuple: ``(bleu_scores, bleu_tokens_c10)``, two parallel lists.

    Raises:
        ValueError: If ``translations`` and the references differ in length.
    """
    if load_bleu:
        # NOTE(review): this name has no src/trg suffix or extension, unlike the
        # score file written by the notebook's save cell - confirm which file is meant.
        bleu_scores = torch.load('bleu_scores_r4_{}'.format(set_name))
    else:
        # Read the reference column selected by `trg` rather than a fixed 'en' key.
        references = data[trg]
        if len(translations) != len(references):
            raise ValueError(
                f"got {len(translations)} translations but {len(references)} references "
                f"for the {set_name} set"
            )

        bleu_scores = []
        for translation, reference in zip(translations, references):
            # No smoothing function is applied, so a sentence without any matching
            # n-gram of some order scores 0 (NLTK warns about this).
            bleu_score = sentence_bleu(references=[tokenizer(reference)], hypothesis=tokenizer(translation))
            bleu_scores.append(round(bleu_score * 100, 4))

    # Built after either branch so the loaded-scores path returns tokens too.
    bleu_tokens_c10 = map_bleu_scores_to_tokens(bleu_scores, 10)

    return bleu_scores, bleu_tokens_c10

In [None]:
# Generating teacher translations is expensive, so reuse the translations file
# written by this notebook's save cell when it exists. Otherwise generate them
# here, so the cell also runs on a fresh kernel instead of relying on variables
# left over from an earlier session.
# NOTE: delete the file to force regeneration after changing the model or data.
import os

teacher_cache_path = f'Teacher_Translations_{dataset}_de_en.pth'
if os.path.exists(teacher_cache_path):
    cached_outputs = torch.load(teacher_cache_path)
    train_outputs = cached_outputs['train']
    test_outputs = cached_outputs['test']
    valid_outputs = cached_outputs['validation']
else:
    train_outputs = output_translation_texts(train_data)
    test_outputs = output_translation_texts(test_data, set_name="test")
    valid_outputs = output_translation_texts(valid_data, set_name="validation")

# Each result is a (scores, bucket tokens) pair for one split.
train_bleu = BLEU_for_teacher_translations(train_outputs, train_data)
test_bleu = BLEU_for_teacher_translations(test_outputs, test_data)
valid_bleu = BLEU_for_teacher_translations(valid_outputs, valid_data)

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [None]:
import numpy as np
# Order the training-set sentence scores and cut them into ten near-equal slices.
sorted_bleu_TR = sorted(train_bleu[0])
sub_lists = np.array_split(sorted_bleu_TR, 10)

# Per slice: the rounded mean score and the largest score (the slice's upper bound).
avg_list, bounds = [], []
for i in range(len(sub_lists)):
    sublist = sub_lists[i]
    avg = round(np.mean(sublist))
    avg_list += [avg]
    bounds += [sublist[-1]]
    print(f"Avg value list {i + 1}: {avg}, bound: {sublist[-1]}")

Avg value list 1: 0, bound: 0.0
Avg value list 2: 0, bound: 0.0
Avg value list 3: 0, bound: 0.0
Avg value list 4: 14, bound: 25.3365
Avg value list 5: 29, bound: 32.4668
Avg value list 6: 36, bound: 39.1654
Avg value list 7: 43, bound: 46.8257
Avg value list 8: 51, bound: 54.6276
Avg value list 9: 61, bound: 66.7508
Avg value list 10: 81, bound: 100.0


In [None]:
src = 'de'
trg = 'en'

# Each artifact is one dict keyed by split name, written to its own file.
teacher_translations = {'train': train_outputs, 'test': test_outputs, 'validation': valid_outputs}
torch.save(teacher_translations, f'Teacher_Translations_{dataset}_{src}_{trg}.pth')

# Index 0 of each *_bleu pair holds the sentence scores.
sentence_bleu_scores = {'train': train_bleu[0], 'test': test_bleu[0], 'validation': valid_bleu[0]}
torch.save(sentence_bleu_scores, f'bleu_scores_r4_{dataset}_{src}_{trg}.pth')

# Index 1 of each *_bleu pair holds the bucket tokens.
bleu_bucket_tokens = {'train': train_bleu[1], 'test': test_bleu[1], 'validation': valid_bleu[1]}
torch.save(bleu_bucket_tokens, f'bleu_tokens_c10_{dataset}_{src}_{trg}.pth')
# torch.save({'train':train_bleu[2], 'test': test_bleu[2], 'validation': valid_bleu[2]}, f'bleu_tokens_c10_{dataset}_{src}_{trg}.pth')