In [40]:
!pip install datasets transformers sentencepiece sacrebleu==1.5.1

Collecting sacrebleu==1.5.1
  Downloading sacrebleu-1.5.1-py3-none-any.whl (54 kB)
[K     |████████████████████████████████| 54 kB 1.8 MB/s 
[?25hCollecting portalocker==2.0.0
  Downloading portalocker-2.0.0-py2.py3-none-any.whl (11 kB)
Installing collected packages: portalocker, sacrebleu
  Attempting uninstall: portalocker
    Found existing installation: portalocker 2.3.2
    Uninstalling portalocker-2.3.2:
      Successfully uninstalled portalocker-2.3.2
  Attempting uninstall: sacrebleu
    Found existing installation: sacrebleu 2.0.0
    Uninstalling sacrebleu-2.0.0:
      Successfully uninstalled sacrebleu-2.0.0
Successfully installed portalocker-2.0.0 sacrebleu-1.5.1


In [1]:
from datasets import load_dataset, load_metric

In [2]:
en_sr_train = load_dataset("setimes", "en-sr", split='train[0:10000]')
en_sr_val = load_dataset("setimes", "en-sr", split='train[10000:12500]')
en_sr_test = load_dataset("setimes", "en-sr", split='train[12500:15000]')

Reusing dataset setimes (/root/.cache/huggingface/datasets/setimes/en-sr/1.0.0/5b0222bb707caa9d423c61813ef94861e1ccdf82fa4b0bdf4a98de3c9fd33d0d)
Reusing dataset setimes (/root/.cache/huggingface/datasets/setimes/en-sr/1.0.0/5b0222bb707caa9d423c61813ef94861e1ccdf82fa4b0bdf4a98de3c9fd33d0d)
Reusing dataset setimes (/root/.cache/huggingface/datasets/setimes/en-sr/1.0.0/5b0222bb707caa9d423c61813ef94861e1ccdf82fa4b0bdf4a98de3c9fd33d0d)


In [31]:
en_sr_train

Dataset({
    features: ['id', 'translation'],
    num_rows: 10000
})

In [32]:
en_sr_val

Dataset({
    features: ['id', 'translation'],
    num_rows: 2500
})

In [33]:
en_sr_test

Dataset({
    features: ['id', 'translation'],
    num_rows: 2500
})

In [34]:
en_sr_train["translation"][:2]

[{'en': "Kosovo's privatisation process is under scrutiny",
  'sr': 'Proces privatizacije na Kosovu pod lupom'},
 {'en': 'Kosovo is taking a hard look at its privatisation process in light of recurring complaints.',
  'sr': 'Kosovo ozbiljno analizira svoje procese privatizacije u svetlu čestih pritužbi.'}]

In [3]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-uk-en")

In [4]:
prefix = ""
max_input_length = 128
max_target_length = 128
source_lang = "sr"
target_lang = "en"

def preprocess_function(sentences):
    inputs = [prefix + sentence[source_lang] for sentence in sentences["translation"]]
    targets = [sentence[target_lang] for sentence in sentences["translation"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [35]:
preprocess_function(en_sr_train[:2])

{'input_ids': [[7447, 2699, 11, 1984, 2061, 1528, 13865, 7597, 3395, 4037, 341, 42, 2962, 6187, 4928, 1528, 1211, 8524, 174, 4173, 2474, 5087, 0], [6187, 4928, 6432, 2596, 1372, 371, 2189, 4037, 2182, 96, 612, 8592, 395, 2058, 701, 6432, 4037, 341, 6596, 2699, 3119, 1984, 2061, 1528, 13865, 7597, 3395, 4037, 341, 10617, 701, 281, 38, 6531, 42, 50665, 2762, 395, 1374, 1984, 21261, 1211, 60730, 371, 395, 3, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[56759, 7, 11, 36994, 6417, 38, 39035, 2354, 23, 529, 42359, 0], [56759, 23, 1180, 13, 875, 434, 84, 324, 36994, 6417, 38, 39035, 2354, 14, 1025, 8, 41418, 29918, 3, 0]]}

In [5]:
tokenized_train = en_sr_train.map(preprocess_function, batched=True)
tokenized_val = en_sr_val.map(preprocess_function, batched=True)
tokenized_test = en_sr_test.map(preprocess_function, batched=True)

Loading cached processed dataset at /root/.cache/huggingface/datasets/setimes/en-sr/1.0.0/5b0222bb707caa9d423c61813ef94861e1ccdf82fa4b0bdf4a98de3c9fd33d0d/cache-082ae2b030b7f85d.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/setimes/en-sr/1.0.0/5b0222bb707caa9d423c61813ef94861e1ccdf82fa4b0bdf4a98de3c9fd33d0d/cache-0d043166226cb442.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/setimes/en-sr/1.0.0/5b0222bb707caa9d423c61813ef94861e1ccdf82fa4b0bdf4a98de3c9fd33d0d/cache-bee70e3a3f0c098a.arrow


In [6]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-uk-en")

In [7]:
batch_size = 8
model_name = "opus-mt-uk-en"

args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-{source_lang}-to-{target_lang}",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True    
)

metric = load_metric("sacrebleu")
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [8]:
import numpy as np


def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {
        "bleu": result["score"] 
    }

    prediction_lens = [
        np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds
    ]

    result["gen_len"] = np.mean(prediction_lens)
    result = {
        k: round(v, 4) for k, v in result.items()
    }
    return result

In [9]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [10]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `MarianMTModel.forward` and have been ignored: translation, id.
***** Running training *****
  Num examples = 10000
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 12500


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,3.2613,2.725676,16.3911,30.8872
2,2.4053,2.365355,22.2352,30.018
3,2.0708,2.207114,25.2965,29.5444
4,1.7543,2.118623,27.2393,29.0004
5,1.5938,2.067059,28.4796,29.0072
6,1.4551,2.036848,29.3827,28.8744
7,1.3499,2.023442,29.8416,29.0016
8,1.2621,2.014832,30.1606,28.7732
9,1.2378,2.008014,30.2094,28.7872
10,1.1978,2.006981,30.35,28.7576


Saving model checkpoint to opus-mt-uk-en-finetuned-sr-to-en/checkpoint-500
Configuration saved in opus-mt-uk-en-finetuned-sr-to-en/checkpoint-500/config.json
Model weights saved in opus-mt-uk-en-finetuned-sr-to-en/checkpoint-500/pytorch_model.bin
tokenizer config file saved in opus-mt-uk-en-finetuned-sr-to-en/checkpoint-500/tokenizer_config.json
Special tokens file saved in opus-mt-uk-en-finetuned-sr-to-en/checkpoint-500/special_tokens_map.json
Saving model checkpoint to opus-mt-uk-en-finetuned-sr-to-en/checkpoint-1000
Configuration saved in opus-mt-uk-en-finetuned-sr-to-en/checkpoint-1000/config.json
Model weights saved in opus-mt-uk-en-finetuned-sr-to-en/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in opus-mt-uk-en-finetuned-sr-to-en/checkpoint-1000/tokenizer_config.json
Special tokens file saved in opus-mt-uk-en-finetuned-sr-to-en/checkpoint-1000/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `MarianMTMode

TrainOutput(global_step=12500, training_loss=1.7997833544921875, metrics={'train_runtime': 8893.704, 'train_samples_per_second': 11.244, 'train_steps_per_second': 1.405, 'total_flos': 3162658125643776.0, 'train_loss': 1.7997833544921875, 'epoch': 10.0})

In [47]:
from transformers import MarianMTModel, MarianTokenizer

# src_text = ['Ovaj korak jedna je od mera koje Turska planira da sprovede nakon nedavnog porasta napada koje su izveli teroristi povezani sa zabranjenom Kurdistanskom radničkom partijom.']
src_text = ['Turska priprema amandmane na antiteroristički zakon']
model_name = 'opus-mt-uk-en-finetuned-sr-to-en/checkpoint-1000'

tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
translated = model.generate(**tokenizer(src_text, return_tensors="pt", padding=True))
[tokenizer.decode(t, skip_special_tokens=True) for t in translated]

Didn't find file opus-mt-uk-en-finetuned-sr-to-en/checkpoint-1000/added_tokens.json. We won't load it.
Didn't find file opus-mt-uk-en-finetuned-sr-to-en/checkpoint-1000/tokenizer.json. We won't load it.
loading file opus-mt-uk-en-finetuned-sr-to-en/checkpoint-1000/source.spm
loading file opus-mt-uk-en-finetuned-sr-to-en/checkpoint-1000/target.spm
loading file opus-mt-uk-en-finetuned-sr-to-en/checkpoint-1000/vocab.json
loading file opus-mt-uk-en-finetuned-sr-to-en/checkpoint-1000/tokenizer_config.json
loading file None
loading file opus-mt-uk-en-finetuned-sr-to-en/checkpoint-1000/special_tokens_map.json
loading file None
loading configuration file opus-mt-uk-en-finetuned-sr-to-en/checkpoint-1000/config.json
Model config MarianConfig {
  "_name_or_path": "Helsinki-NLP/opus-mt-uk-en",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "swish",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "MarianMTModel"
  ],
  "attention_drop

["Turkey's Prime Minister at Anterrorist Zone"]

In [45]:
en_sr_test["translation"][:5]

[{'en': 'Turkey Preparing Amendments to Anti-Terror Law',
  'sr': 'Turska priprema amandmane na antiteroristički zakon'},
 {'en': "Turkey will submit amendments to its anti-terrorism law to parliament this autumn, the country's justice minister said this week.",
  'sr': 'Turska vlada će ove jeseni podneti parlamentu amandmane na antiteroristički zakon zemlje, saopštio je ove nedelje turski ministar pravosuđa.'},
 {'en': "The move is one of the measures Turkey plans following a recent surge in incidents by terrorists affiliated with the banned Kurdistan Workers' Party.",
  'sr': 'Ovaj korak jedna je od mera koje Turska planira da sprovede nakon nedavnog porasta napada koje su izveli teroristi povezani sa zabranjenom Kurdistanskom radničkom partijom.'},
 {'en': '(The New York Times, Zaman, Xinhua, Journal of Turkish Weekly - 22/07/05; AFP, Reuters, AP, Turkish Press, UPI, Journal of Turkish Weekly - 21/07/05)',
  'sr': '(Njujork Tajms, Zaman, Sinhua, Turski nedeljni žurnal - 22/07/05; AF

In [11]:
# /content/opus-mt-uk-en-finetuned-sr-to-en
!zip -r opus-mt-uk-en-finetuned-sr-to-en.zip opus-mt-uk-en-finetuned-sr-to-en/

  adding: opus-mt-uk-en-finetuned-sr-to-en/ (stored 0%)
  adding: opus-mt-uk-en-finetuned-sr-to-en/checkpoint-12000/ (stored 0%)
  adding: opus-mt-uk-en-finetuned-sr-to-en/checkpoint-12000/tokenizer_config.json (deflated 40%)
  adding: opus-mt-uk-en-finetuned-sr-to-en/checkpoint-12000/vocab.json (deflated 78%)
  adding: opus-mt-uk-en-finetuned-sr-to-en/checkpoint-12000/pytorch_model.bin (deflated 7%)
  adding: opus-mt-uk-en-finetuned-sr-to-en/checkpoint-12000/training_args.bin (deflated 49%)
  adding: opus-mt-uk-en-finetuned-sr-to-en/checkpoint-12000/optimizer.pt (deflated 8%)
  adding: opus-mt-uk-en-finetuned-sr-to-en/checkpoint-12000/config.json (deflated 60%)
  adding: opus-mt-uk-en-finetuned-sr-to-en/checkpoint-12000/trainer_state.json (deflated 81%)
  adding: opus-mt-uk-en-finetuned-sr-to-en/checkpoint-12000/special_tokens_map.json (deflated 34%)
  adding: opus-mt-uk-en-finetuned-sr-to-en/checkpoint-12000/target.spm (deflated 51%)
  adding: opus-mt-uk-en-finetuned-sr-to-en/checkpo

In [12]:
!zip -r opus-mt-uk-en-finetuned-sr-to-en_checkpoint-12500.zip opus-mt-uk-en-finetuned-sr-to-en/checkpoint-12500/

  adding: opus-mt-uk-en-finetuned-sr-to-en/checkpoint-12500/ (stored 0%)
  adding: opus-mt-uk-en-finetuned-sr-to-en/checkpoint-12500/tokenizer_config.json (deflated 40%)
  adding: opus-mt-uk-en-finetuned-sr-to-en/checkpoint-12500/vocab.json (deflated 78%)
  adding: opus-mt-uk-en-finetuned-sr-to-en/checkpoint-12500/pytorch_model.bin (deflated 7%)
  adding: opus-mt-uk-en-finetuned-sr-to-en/checkpoint-12500/training_args.bin (deflated 49%)
  adding: opus-mt-uk-en-finetuned-sr-to-en/checkpoint-12500/optimizer.pt (deflated 8%)
  adding: opus-mt-uk-en-finetuned-sr-to-en/checkpoint-12500/config.json (deflated 60%)
  adding: opus-mt-uk-en-finetuned-sr-to-en/checkpoint-12500/trainer_state.json (deflated 81%)
  adding: opus-mt-uk-en-finetuned-sr-to-en/checkpoint-12500/special_tokens_map.json (deflated 34%)
  adding: opus-mt-uk-en-finetuned-sr-to-en/checkpoint-12500/target.spm (deflated 51%)
  adding: opus-mt-uk-en-finetuned-sr-to-en/checkpoint-12500/source.spm (deflated 57%)
  adding: opus-mt-uk