In [None]:
%%capture
# !pip install transformers
# !pip install --upgrade accelerate
!pip install datasets evaluate sacrebleu

In [None]:
# MODEL = "google/flan-t5-small"
# MODEL = "facebook/bart-base"
MODEL = "facebook/bart-large"
DATASET = "Helsinki-NLP/opus-100"
SUBSET = "en-ne"

In [None]:
from datasets import load_dataset
dataset = load_dataset(DATASET, SUBSET)

In [None]:
dataset

DatasetDict({
    test: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
    train: Dataset({
        features: ['translation'],
        num_rows: 406381
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
})

In [None]:
dataset["train"][500]

{'translation': {'en': 'Could not save "%s" document to "%s".',
  'ne': 'कागजात "%s" लाई "%s" मा बचत गर्न सकेन ।'}}

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

In [None]:
sentence = "कागजात लाई मा बचत गर्न सकेन"

# Tokenize the sentence
inputs = tokenizer(sentence, return_tensors="pt")

# Print the tokenized input IDs
print("Tokenized input IDs:")
print(inputs["input_ids"].flatten())

# Detokenize the input IDs
detokenized_sentence = tokenizer.decode(inputs["input_ids"].flatten(), skip_special_tokens=True)

# Print the detokenized sentence
print("Detokenized sentence:")
print(detokenized_sentence)

Tokenized input IDs:
tensor([    0, 14238, 15722, 35636, 14238,  6800, 14238,    48, 35636, 14238,
        10470, 22214, 14292, 35636, 14238, 23133, 22214,  2840, 35636, 22214,
        11582, 14238, 15113, 14238, 10470, 22214,  6800, 14238,  7487, 22919,
         8384, 14238, 11423, 22214, 18537, 14238, 15722, 22919,  6382, 14238,
        11423,     2])
Detokenized sentence:
कागजात लाई मा बचत गर्न सकेन


In [None]:
# instruction = "Translate English to Nepali: "

def tokenize_function(examples):
    inputs = [example["en"] for example in examples["translation"]]
    targets = [example["ne"] for example in examples["translation"]]
    return tokenizer(inputs, text_target=targets, max_length=1024, truncation=True)

In [None]:
dataset["train"][500:501]

{'translation': [{'en': 'Could not save "%s" document to "%s".',
   'ne': 'कागजात "%s" लाई "%s" मा बचत गर्न सकेन ।'}]}

In [None]:
tokenize_function(dataset["train"][500:501])

{'input_ids': [[0, 35299, 45, 1871, 49608, 29, 113, 3780, 7, 49608, 29, 845, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[0, 14238, 15722, 35636, 14238, 6800, 14238, 48, 35636, 14238, 10470, 49608, 29, 113, 22214, 14292, 35636, 14238, 23133, 49608, 29, 113, 22214, 2840, 35636, 22214, 11582, 14238, 15113, 14238, 10470, 22214, 6800, 14238, 7487, 22919, 8384, 14238, 11423, 22214, 18537, 14238, 15722, 22919, 6382, 14238, 11423, 1437, 22919, 10470, 2]]}

In [None]:
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/406381 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
small_train_dataset = tokenized_dataset["train"].shuffle(seed=42).select(range(100))
small_eval_dataset = tokenized_dataset["test"].shuffle(seed=42).select(range(10))

In [None]:
small_train_dataset, small_eval_dataset

(Dataset({
     features: ['translation', 'input_ids', 'attention_mask', 'labels'],
     num_rows: 100
 }),
 Dataset({
     features: ['translation', 'input_ids', 'attention_mask', 'labels'],
     num_rows: 10
 }))

In [None]:
small_train_dataset[10], small_eval_dataset[9]

({'translation': {'en': 'Show Week Numbers', 'ne': 'सप्ताहका संख्याहरु देखाऊ'},
  'input_ids': [0, 27477, 2852, 31415, 2],
  'attention_mask': [1, 1, 1, 1, 1],
  'labels': [0,
   14238,
   18537,
   14238,
   10278,
   22919,
   8384,
   14238,
   10470,
   35636,
   14238,
   9253,
   14238,
   15722,
   35636,
   22214,
   18537,
   14238,
   9264,
   14238,
   25448,
   22919,
   8384,
   14238,
   10965,
   35636,
   14238,
   9253,
   14238,
   7487,
   22919,
   10172,
   22214,
   18164,
   22919,
   6382,
   14238,
   25448,
   35636,
   14238,
   27969,
   2]},
 {'translation': {'en': 'Open _Location', 'ne': 'स्थान खोल्नुहोस्'},
  'input_ids': [0, 25266, 18134, 46571, 2],
  'attention_mask': [1, 1, 1, 1, 1],
  'labels': [0,
   14238,
   18537,
   22919,
   8384,
   14238,
   8210,
   35636,
   14238,
   11423,
   22214,
   25448,
   22919,
   13859,
   14238,
   14292,
   22919,
   8384,
   14238,
   11423,
   22919,
   10172,
   14238,
   9253,
   22919,
   13859,
   14238,
 

In [None]:
import evaluate

metric = evaluate.load("sacrebleu")

In [None]:
import numpy as np

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [None]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

training_args=Seq2SeqTrainingArguments(
    output_dir="test_run",
    eval_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=2e-05,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    save_total_limit=3,
    predict_with_generate=True,
    lr_scheduler_type="linear",
    num_train_epochs=100,
    #push_to_hub=True
)

In [None]:
trainer=Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,2.3455,1.908928,0.0,20.0
2,2.0227,1.639604,0.0,20.0
3,1.7849,1.540386,0.0,20.0
4,1.7429,1.536173,0.0,20.0
5,1.6634,1.486303,0.0,19.6
6,1.5942,1.439471,0.0,20.0
7,1.4228,1.471516,0.0,20.0
8,1.4317,1.436419,0.0,20.0
9,1.4758,1.422416,0.0,20.0
10,1.3494,1.430479,0.0,20.0


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


TrainOutput(global_step=700, training_loss=0.48115638962813784, metrics={'train_runtime': 697.2879, 'train_samples_per_second': 14.341, 'train_steps_per_second': 1.004, 'total_flos': 364547627286528.0, 'train_loss': 0.48115638962813784, 'epoch': 100.0})

In [None]:
from transformers import pipeline
checkpoint = "/content/test_run/checkpoint-700"
translator = pipeline("translation", model=checkpoint, device="cuda")

In [None]:
print(translator("plug-in can only handle grayscale or indexed images"))
print(translator("Short Cut:"))
print(translator("Show Week Numbers"))
print(translator("Entry types and field names configuration"))
print(translator("Open _Location"))

[{'translation_text': 'plug-in can only handle grayscale or indexed images स्�'}]
[{'translation_text': 'सटकर्ट:'}]
[{'translation_text': 'सप्ताहका स�'}]
[{'translation_text': 'संस्करण न�'}]
[{'translation_text': 'समावट फोल्'}]


In [12]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

article_en = "My name is Norden Ghising Tamang"

model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

tokenizer.src_lang = "en_XX"
encoded_en = tokenizer(article_en, return_tensors="pt")
generated_tokens = model.generate(**encoded_en, forced_bos_token_id=tokenizer.lang_code_to_id["ne_NP"])
tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

['मेरो नाम नोर्डेन गिसिंग तामानग हो']