# Setup

In [None]:
! pip install datasets transformers sacrebleu torch sentencepiece transformers[sentencepiece] nltk 'gem-metrics @ git+https://github.com/GEM-benchmark/GEM-metrics.git' bert_score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gem-metrics@ git+https://github.com/GEM-benchmark/GEM-metrics.git
  Cloning https://github.com/GEM-benchmark/GEM-metrics.git to /tmp/pip-install-6vs59dfu/gem-metrics_d4818c0369c94bd2977d410065b073e9
  Running command git clone -q https://github.com/GEM-benchmark/GEM-metrics.git /tmp/pip-install-6vs59dfu/gem-metrics_d4818c0369c94bd2977d410065b073e9
Collecting datasets
  Downloading datasets-2.7.1-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 13.2 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 55.2 MB/s 
[?25hCollecting sacrebleu
  Downloading sacrebleu-2.3.1-py3-none-any.whl (118 kB)
[K     |████████████████████████████████| 118 kB 40.5 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86

In [None]:
import numpy as np
import os
from tqdm.auto import tqdm
from typing import List
import pandas as pd
import torch
import torch.nn as nn
import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Text2TextGenerationPipeline, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import MarianTokenizer, MarianModel, MarianMTModel
import datasets 
from datasets import Dataset, load_metric, concatenate_datasets
from transformers.pipelines.pt_utils import KeyDataset
import gem_metrics
import nltk

# Fetch the NLTK 'punkt' package.
nltk.download('punkt')

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'device: {device}')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


device: cuda


In [None]:
# Mount Google Drive and make the project folder the working directory;
# later cells build their data and model paths from os.getcwd().
from google.colab import drive
drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/MT_final_project')

Mounted at /content/drive


In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login (presumably needed for the later
# trainer.push_to_hub() call — confirm).
notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.huggingface/token
Login successful


# Load Models

In [None]:
# Paths for all models to be used
model_names = [
    'Helsinki-NLP/opus-mt-da-en',
    'MT_da_en_LargeMixTrained3Epochs',
    'MT_da_en_3e_CL50_1epoch',
    'MT_da_en_CL1_1epoch',
]

ensemble1 = []

# Local checkpoints live under <cwd>/trained_models; names containing
# 'Helsinki' are passed through unchanged.
models_directory = os.path.join(os.getcwd(), 'trained_models')
model_paths = [
    name if 'Helsinki' in name else os.path.join(models_directory, name)
    for name in model_names
]

In [None]:
# Load all pretrained models and tokenizers
# One record per checkpoint: its path, the seq2seq model and the matching tokenizer.
models = [
    dict(
        name=checkpoint,
        model=AutoModelForSeq2SeqLM.from_pretrained(checkpoint),
        tokenizer=AutoTokenizer.from_pretrained(checkpoint),
    )
    for checkpoint in model_paths
]

Downloading:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/300M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/820k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/788k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

In [None]:
def update_model_tok(model, tokenizer):
    """Register the '[CONV]' special token and resize the model embeddings.

    Calls ``tokenizer.add_special_tokens`` with '[CONV]' as an additional
    special token, then ``model.resize_token_embeddings`` with the tokenizer's
    resulting length. Returns None.
    """
    tokenizer.add_special_tokens({'additional_special_tokens': ['[CONV]']})
    # The embedding size has to follow the tokenizer length after the addition.
    model.resize_token_embeddings(len(tokenizer))

In [None]:
# Update tokenizer/models embeddings for special token
for model_dict in models:
    if len(model_dict['tokenizer']) == 58931:
        # Presumably 58931 is the vocabulary size once '[CONV]' is present — confirm.
        continue
    update_model_tok(model_dict['model'], model_dict['tokenizer'])

# Load Dataset

In [None]:
# Build the paths component-wise (os.path.join with a single pre-concatenated
# string does nothing) and read TED_prepended from disk once for both splits.
_data_dir = os.path.join(os.getcwd(), 'data')
_ted_prepended = datasets.load_from_disk(os.path.join(_data_dir, 'TED_prepended'))

data = _ted_prepended['dev']  # source split translated by synthesize_data
TEDDev = datasets.load_from_disk(os.path.join(_data_dir, 'TED2020'))['dev']
TEDTest = _ted_prepended['test']  # split used in the Validate section

# Generate Synthetic Dataset

In [None]:
def removeNewLine(examples):
  """Return a list holding each string of `examples` with all '\\n' characters removed."""
  return [text.replace('\n', '') for text in examples]

In [None]:
def preprocess(batch, tokenizer, max_length=180):
  """Tokenize a batch of Danish/English pairs into fixed-length training features.

  Args:
    batch: mapping with a 'da' list (source sentences) and an 'en' list
      (target sentences).
    tokenizer: tokenizer called on both sides; must expose `pad_token_id`.
    max_length: length every sequence is padded/truncated to (default 180).

  Returns:
    The source encoding (input_ids, attention_mask) with an added 'labels'
    entry holding the target ids, where padding positions are set to -100.
  """
  src = tokenizer(batch['da'],
                  padding='max_length',
                  return_tensors='pt',
                  truncation=True,
                  max_length=max_length,
                  return_attention_mask=True)
  # text_target= makes the tokenizer encode in target-language mode; Marian
  # tokenizers use a different SentencePiece model for the target side.
  trg = tokenizer(text_target=batch['en'],
                  padding='max_length',
                  return_tensors='pt',
                  truncation=True,
                  max_length=max_length)

  target_ids = trg['input_ids']
  # -100 is the index ignored by the loss, so padding does not count as a
  # token the model has to predict.
  src['labels'] = target_ids.masked_fill(target_ids == tokenizer.pad_token_id, -100)
  return src

In [None]:
def generate(pipeline, data):
  """Run `pipeline` over the 'da' column of `data` and collect its outputs.

  Returns a `(preds, refs)` pair: `preds` is the list of items yielded by the
  pipeline (in order), `refs` is the 'en' column of `data` as a list.
  """
  refs = list(data['en'])
  # get predictions
  preds = list(tqdm(pipeline(KeyDataset(data, 'da'), batch_size = 32)))
  return preds, refs

def validate(preds , refs):
  """Compute GEM BLEU for raw pipeline outputs against reference strings.

  `preds` holds pipeline results whose first element carries a
  'generated_text' entry; `refs` is passed to `gem_metrics.texts.References`.
  Returns whatever `gem_metrics.compute` returns for the 'bleu' metric.
  """
  generated = [p[0]['generated_text'] for p in preds]
  # wrap both sides in the gem_metrics containers, then compute BLEU score
  return gem_metrics.compute(
      gem_metrics.texts.Predictions(generated),
      gem_metrics.texts.References(refs),
      metrics_list=['bleu'])

In [None]:
# Run inference on all models
def synthesize_data(model_list, dataset_name, source_data=None):
    """Translate a source split with every model and save the combined result.

    For each entry of `model_list` the Danish sentences are translated, the
    (Danish, generated English) pairs are tokenized with that model's
    tokenizer via `preprocess`, and all per-model datasets are concatenated
    and written to ``<cwd>/data/ensemble_generated/<dataset_name>``.

    Args:
        model_list: iterable of dicts with 'name', 'model' and 'tokenizer' keys.
        dataset_name: name of the output directory under data/ensemble_generated.
        source_data: dataset with 'da' and 'en' columns to translate. When
            omitted, the notebook-level `data` is used (the previous behaviour).
    """
    if source_data is None:
        source_data = data

    # First GPU when available, otherwise CPU (-1), instead of a hard-coded GPU index.
    pipeline_device = 0 if torch.cuda.is_available() else -1

    # Empty seed dataset that every model's output is appended to.
    generated_data = pd.DataFrame(columns=['da', 'en', 'model_name', 'attention_mask', 'input_ids', 'labels'])
    dataset = Dataset.from_pandas(generated_data)

    # Remove the '[CONV]' tag as a substring. str.strip('[CONV]') treats its
    # argument as a set of characters and would also eat leading/trailing
    # 'C', 'O', 'N', 'V' and brackets that belong to the sentence itself.
    da_sentences = [text.replace('[CONV]', ' ').strip() for text in source_data['da']]

    for model_dict in model_list:
        pipe = Text2TextGenerationPipeline(model=model_dict['model'],
                tokenizer=model_dict['tokenizer'],
                device=pipeline_device,
                batch_size=32,
                repetition_penalty=3.,
                num_beams=2,
                length_penalty=1.0,
                early_stopping=True)

        # Only the predictions are needed here; the references are not used.
        preds, _ = generate(pipe, source_data)

        df_preds = pd.DataFrame({
            'da': da_sentences,
            'en': [p[0]['generated_text'].replace('▁', ' ').replace('[CONV]', ' ').strip() for p in preds],
            'model_name': [model_dict['name'].split('/')[-1]] * len(source_data)})

        dataset_model = Dataset.from_pandas(df_preds).with_format('torch')
        # map() runs immediately, so the lambda sees the current model_dict.
        dataset_model = dataset_model.map(lambda x: preprocess(x, model_dict['tokenizer']), batched=True)
        dataset = concatenate_datasets([dataset, dataset_model])

    dataset_path = os.path.join(os.getcwd(), 'data/ensemble_generated/' + dataset_name)
    dataset.save_to_disk(dataset_path)

In [None]:
# Translate `data` with every loaded model and save the combined dataset under
# data/ensemble_generated/BASE_LARGE3E_CL50_CL1.
synthesize_data(model_list = models, dataset_name = 'BASE_LARGE3E_CL50_CL1')

  0%|          | 0/113 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/113 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/113 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/113 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

# Finetune on Synthetic Data

In [None]:
def compute_metrics(eval_preds):
    """Compute sacreBLEU and the mean generated length for an evaluation run.

    Args:
        eval_preds: `(predictions, labels)` pair of token-id arrays. When
            `predictions` is a tuple only its first element is used.

    Returns:
        dict with 'bleu' (sacreBLEU score) and 'gen_len' (mean number of
        non-pad tokens per prediction), both rounded to 4 decimals.

    Decoding relies on the notebook-level `tokenizer`.
    """
    metric = load_metric("sacrebleu")
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Some simple post-processing: trim whitespace and give sacreBLEU what it
    # expects — a *list* of references for every prediction, not a bare string.
    decoded_preds = [pred.strip() for pred in decoded_preds]
    references = [[label.strip()] for label in decoded_labels]
    result = metric.compute(predictions=decoded_preds, references=references)
    result = {"bleu": result["score"]}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [None]:
# Model and dataset setup
best_model_path = '/content/drive/MyDrive/MT_final_project/trained_models/MT_da_en_ensemble4E'
dataset_path = os.path.join(os.getcwd(), 'data/ensemble_generated/BASE_LARGE3E_CL50_CL1')

# Synthetic training set written by synthesize_data.
dataset = datasets.load_from_disk(dataset_path)

# Checkpoint to fine-tune, together with its tokenizer.
tokenizer = AutoTokenizer.from_pretrained(best_model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(best_model_path)

# Add the '[CONV]' token unless the tokenizer already has the expected length.
if len(tokenizer) != 58931:
    update_model_tok(model, tokenizer)

loading configuration file /content/drive/MyDrive/MT_final_project/trained_models/MT_da_en_ensemble4E/config.json
Model config MarianConfig {
  "_name_or_path": "/content/drive/MyDrive/MT_final_project/trained_models/MT_da_en_ensemble4E",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "swish",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "MarianMTModel"
  ],
  "attention_dropout": 0.0,
  "bad_words_ids": [
    [
      58929
    ]
  ],
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 512,
  "decoder_attention_heads": 8,
  "decoder_ffn_dim": 2048,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 58929,
  "decoder_vocab_size": 58931,
  "dropout": 0.1,
  "encoder_attention_heads": 8,
  "encoder_ffn_dim": 2048,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "id2label": {
    "0": "LABEL_0",
    "1": "

In [None]:
# Fine-tuning configuration: batch size for training and evaluation, and the
# output directory name handed to Seq2SeqTrainingArguments.
batch_size = 32
model_name = 'MT_da_en_ensemble4E'

# Collator that batches the tokenized examples for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Evaluate once per epoch, never write checkpoints (save_strategy='no'),
# generate during evaluation so compute_metrics receives predictions,
# and train in mixed precision (fp16).
args = Seq2SeqTrainingArguments(
    f"{model_name}",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=0,
    save_strategy = 'no',
    num_train_epochs=4,
    predict_with_generate=True,
    fp16 = True,
    push_to_hub = False)

# NOTE(review): TEDDev is passed without being tokenized (it only has 'da'/'en'
# columns). The recorded run logs "Num examples = 0" for every evaluation and
# shows no validation loss, so no validation metrics were actually computed.
# Tokenize the split (e.g. with preprocess) before relying on eval results.
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=dataset,
    eval_dataset=TEDDev,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend


In [None]:
# Fine-tune `model` on the synthetic dataset for the configured number of epochs.
trainer.train()

The following columns in the training set don't have a corresponding argument in `MarianMTModel.forward` and have been ignored: da, en, model_name. If da, en, model_name are not expected by `MarianMTModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 14424
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1804
  Number of trainable parameters = 74311168


Epoch,Training Loss,Validation Loss
1,No log,No log
2,0.039600,No log
3,0.030400,No log
4,0.026900,No log


The following columns in the evaluation set don't have a corresponding argument in `MarianMTModel.forward` and have been ignored: da, en. If da, en are not expected by `MarianMTModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 0
  Batch size = 32
The following columns in the evaluation set don't have a corresponding argument in `MarianMTModel.forward` and have been ignored: da, en. If da, en are not expected by `MarianMTModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 0
  Batch size = 32
The following columns in the evaluation set don't have a corresponding argument in `MarianMTModel.forward` and have been ignored: da, en. If da, en are not expected by `MarianMTModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 0
  Batch size = 32
The following columns in the evaluation set don't have a corresponding argument in `MarianMTModel.forward`

TrainOutput(global_step=1804, training_loss=0.03113527044224369, metrics={'train_runtime': 703.7989, 'train_samples_per_second': 81.978, 'train_steps_per_second': 2.563, 'total_flos': 2750343838433280.0, 'train_loss': 0.03113527044224369, 'epoch': 4.0})

In [None]:
# Upload the fine-tuned model to the Hugging Face Hub.
# NOTE(review): the recorded run of this cell ended with a KeyboardInterrupt,
# so the upload may not have completed.
trainer.push_to_hub()

Cloning https://huggingface.co/gur509/MT_da_en_ensemble4E into local empty directory.
Saving model checkpoint to MT_da_en_ensemble4E
Configuration saved in MT_da_en_ensemble4E/config.json
Model weights saved in MT_da_en_ensemble4E/pytorch_model.bin
tokenizer config file saved in MT_da_en_ensemble4E/tokenizer_config.json
Special tokens file saved in MT_da_en_ensemble4E/special_tokens_map.json
added tokens file saved in MT_da_en_ensemble4E/added_tokens.json
Adding files tracked by Git LFS: ['source.spm', 'target.spm']. This may take a bit of time if the files are large.


KeyboardInterrupt: ignored

# Validate

In [None]:
# Generation pipeline for the fine-tuned model. Uses GPU 0 when one is
# available and falls back to CPU (-1) instead of failing on a CPU-only runtime.
pipe = Text2TextGenerationPipeline(
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
    batch_size=32,
    repetition_penalty=3.,
    num_beams=2,
    length_penalty=1.0,
    early_stopping=True)

In [None]:
# Translate the TED test split: `preds` holds the raw pipeline outputs,
# `refs` the split's 'en' column.
preds, refs = generate(pipe, TEDTest)

  0%|          | 0/113 [00:00<?, ?it/s]

In [None]:
# Extract the generated text and drop '▁' / '[CONV]' markers on both sides.
# This rebinds `preds` from pipeline outputs to plain strings.
preds = [
    pred[0]['generated_text'].replace('▁', ' ').replace('[CONV]', ' ').strip()
    for pred in preds
]
refs = [
    ref.replace('▁', ' ').replace('[CONV]', ' ').strip()
    for ref in refs
]

In [None]:
def validate_list(preds , refs):
  """Compute GEM BLEU for prediction strings against reference strings.

  Both arguments are wrapped in the gem_metrics text containers; the result
  of `gem_metrics.compute` for the 'bleu' metric is returned.
  """
  return gem_metrics.compute(
      gem_metrics.texts.Predictions(preds),
      gem_metrics.texts.References(refs),
      metrics_list=['bleu'])

In [None]:
# BLEU of the cleaned test-set predictions against the cleaned references.
result = validate_list(preds, refs)
print(result)

[I 221207 01:30:47 texts:55] Loading predictions for None
[I 221207 01:30:47 texts:55] Loading references for None
[I 221207 01:30:47 __init__:174] Computing BLEU for None...


{'predictions_file': None, 'N': 3606, 'references_file': None, 'bleu': 48.40998}
