In [1]:
%%capture
# !pip install transformers
# !pip install --upgrade accelerate
!pip install datasets evaluate sacrebleu wandb

In [2]:
from google.colab import userdata
hf_token = userdata.get("WRITE")
wb_token = userdata.get("WB_TOKEN")

In [3]:
import wandb

wandb.login(key=wb_token)
wandb.init(
    project="Training Sentence Translation",
    job_type="training",
    anonymous="allow"
)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mneha-shrestha8172[0m ([33mdarvilab[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
from huggingface_hub import login
login(hf_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [5]:
MODEL = "facebook/mbart-large-50-many-to-many-mmt"
DATASET = "Helsinki-NLP/opus-100"
SUBSET = "en-ne"

In [6]:
from datasets import load_dataset
dataset = load_dataset(DATASET, SUBSET)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/65.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/93.5k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/23.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/101k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/406381 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [7]:
dataset

DatasetDict({
    test: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
    train: Dataset({
        features: ['translation'],
        num_rows: 406381
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
})

In [8]:
dataset["train"][500]

{'translation': {'en': 'Could not save "%s" document to "%s".',
  'ne': 'कागजात "%s" लाई "%s" मा बचत गर्न सकेन ।'}}

In [9]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
from transformers import DataCollatorForSeq2Seq

model = MBartForConditionalGeneration.from_pretrained(MODEL)
tokenizer = MBart50TokenizerFast.from_pretrained(MODEL)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

In [10]:
sentence = "कागजात लाई मा बचत गर्न सकेन"

# Tokenize the sentence
inputs = tokenizer(sentence, return_tensors="pt")

# Print the tokenized input IDs
print("Tokenized input IDs:")
print(inputs["input_ids"].flatten())

# Detokenize the input IDs
detokenized_sentence = tokenizer.decode(inputs["input_ids"].flatten(), skip_special_tokens=True)

# Print the detokenized sentence
print("Detokenized sentence:")
print(detokenized_sentence)

Tokenized input IDs:
tensor([250004, 163689,   2535,      6,   1385,   4592, 169392,   3504, 170647,
             2])
Detokenized sentence:
कागजात लाई मा बचत गर्न सकेन


In [11]:
def tokenize_function(examples):
    inputs = [ex["en"] for ex in examples["translation"]]
    targets = [ex["ne"] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True)
    labels = tokenizer(targets, max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [12]:
dataset["train"][500:501]

{'translation': [{'en': 'Could not save "%s" document to "%s".',
   'ne': 'कागजात "%s" लाई "%s" मा बचत गर्न सकेन ।'}]}

In [13]:
tokenize_function(dataset["train"][500:501])

{'input_ids': [[250004, 191147, 959, 30098, 44, 3949, 7, 58, 12937, 47, 44, 3949, 7, 740, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[250004, 163689, 2535, 44, 3949, 7, 58, 6, 1385, 44, 3949, 7, 58, 4592, 169392, 3504, 170647, 207, 2]]}

In [14]:
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/406381 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [15]:
small_train_dataset = tokenized_dataset["train"].shuffle(seed=42).select(range(10000))
small_eval_dataset = tokenized_dataset["test"].shuffle(seed=42).select(range(1000))

In [16]:
small_train_dataset, small_eval_dataset

(Dataset({
     features: ['translation', 'input_ids', 'attention_mask', 'labels'],
     num_rows: 10000
 }),
 Dataset({
     features: ['translation', 'input_ids', 'attention_mask', 'labels'],
     num_rows: 1000
 }))

In [17]:
small_train_dataset[2000], small_eval_dataset[500]

({'translation': {'en': 'System Settings menu', 'ne': 'प्रणाली सेटिङ मेनु'},
  'input_ids': [250004, 12353, 6, 105548, 8026, 2],
  'attention_mask': [1, 1, 1, 1, 1, 1],
  'labels': [250004, 73529, 646, 150054, 9163, 18565, 2]},
 {'translation': {'en': 'Overwrite local note',
   'ne': 'स्थानीय द्रष्टव्य अधिलेखन गर्नुहोस्'},
  'input_ids': [250004, 9578, 434, 18781, 4000, 20537, 2],
  'attention_mask': [1, 1, 1, 1, 1, 1, 1],
  'labels': [250004,
   17631,
   6,
   36754,
   17912,
   42320,
   111033,
   535,
   108523,
   25801,
   2]})

In [18]:
import evaluate

metric = evaluate.load("sacrebleu")

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [19]:
import numpy as np

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [20]:
from accelerate import Accelerator
accelerator = Accelerator()

In [21]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

training_args = Seq2SeqTrainingArguments(
    output_dir="training_run",
    eval_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=2e-05,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    lr_scheduler_type="linear",
    num_train_epochs=5,
    report_to="wandb",
    push_to_hub=True,
    hub_model_id="NehaShrestha/mbart-en-np-seqtoseq-sentence-translation"
)

In [22]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [23]:
small_train_dataset, small_eval_dataset, trainer = accelerator.prepare(small_train_dataset, small_eval_dataset, trainer)

In [24]:
trainer.train()



Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,1.0147,0.987627,40.1501,9.885
2,0.6038,1.012212,40.728,10.113
3,0.3557,1.080923,35.9297,10.844
4,0.2071,1.150153,40.4318,10.28
5,0.1241,1.189626,40.4595,10.288


Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameter

TrainOutput(global_step=6250, training_loss=0.46106000732421876, metrics={'train_runtime': 5020.8851, 'train_samples_per_second': 9.958, 'train_steps_per_second': 1.245, 'total_flos': 2205545529212928.0, 'train_loss': 0.46106000732421876, 'epoch': 5.0})

In [25]:
trainer.push_to_hub("training end")

Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}


CommitInfo(commit_url='https://huggingface.co/NehaShrestha/mbart-en-np-seqtoseq-sentence-translation/commit/b80a1320b580e8dfe6f19a07540df8813ba33f19', commit_message='training end', commit_description='', oid='b80a1320b580e8dfe6f19a07540df8813ba33f19', pr_url=None, pr_revision=None, pr_num=None)

In [26]:
# checkpoint = "/content/test_run/checkpoint-140"
checkpoint = "NehaShrestha/mbart-en-np-seqtoseq-sentence-translation"
model = MBartForConditionalGeneration.from_pretrained(checkpoint)
tokenizer = MBart50TokenizerFast.from_pretrained(checkpoint)

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/226 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/10.9k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/992 [00:00<?, ?B/s]

In [27]:
def translator(eng_sentence):
    tokenizer.src_lang = "en_XX"
    encoded_en = tokenizer(eng_sentence, return_tensors="pt")
    generated_tokens = model.generate(**encoded_en, forced_bos_token_id=tokenizer.lang_code_to_id["ne_NP"])
    decoded_np = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    return decoded_np

In [28]:
print(translator("plug-in can only handle grayscale or indexed images"))
print(translator("Short Cut:"))
print(translator("Show Week Numbers"))
print(translator("Entry types and field names configuration"))
print(translator("Open _Location"))

['प्लगइनले ग्रेस्केल वा अनुक्रमण गरिएका छवि मात्र ह्यान्डल गर्न सक्दछ']
['सटकर्ट:']
['सप्ताहका संख्याहरु देखाऊ']
['प्रविष्टि प्रकार र फाँट नाम कन्फिगरेसन']
['स्थान खोल्नुहोस्']


In [32]:
print(translator("What is your name?"))

['तपाईँको नाम?']


In [35]:
print(translator("I don't speak Nepali well."))

['म राम्रोसँग नेपाली बोल्न सक्तिनँ।']
