In [1]:
import torch
from torch.utils.data import Dataset
from transformers import T5TokenizerFast, T5ForConditionalGeneration, DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
import datasets

  from .autonotebook import tqdm as notebook_tqdm
2023-11-07 02:55:52.041775: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-07 02:55:52.041817: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-07 02:55:52.045705: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [3]:
model_downloaded = True
model_name = "t5-small"
cache_dir = "./cache/" + model_name
model_save_dir = "./models/" + model_name

if model_downloaded:
    model_name = "./models/" + model_name

In [4]:
model = T5ForConditionalGeneration.from_pretrained(model_name, cache_dir=cache_dir).to(device)
tokenizer = T5TokenizerFast.from_pretrained(model_name, cache_dir=cache_dir)

In [5]:
dataset_downloaded = False
dataset_name = "opus100"
cache_dir = "./cache/" + dataset_name

if dataset_downloaded:
    dataset_name = "./datasets/" + dataset_name
    dataset = datasets.load_from_disk(dataset_name)
else:
    dataset = datasets.load_dataset(dataset_name, "en-ur", cache_dir=cache_dir)

In [6]:
train_val_dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)
train_val_dataset["validation"] = train_val_dataset.pop("test")

In [7]:
del dataset

In [8]:
ur_sentences = [obj['ur'] for obj in train_val_dataset["train"]["translation"]]
en_sentences = [obj['en'] for obj in train_val_dataset["train"]["translation"]]

In [9]:
all_sentences = ur_sentences + en_sentences

In [10]:
def get_training_corpus():
    return (
        all_sentences[i : i + 1000]
        for i in range(0, len(all_sentences), 1000)
    )

In [11]:
training_corpus = get_training_corpus()

In [12]:
old_tokenizer.vocab_size

32100

In [13]:
tokenizer = old_tokenizer.train_new_from_iterator(training_corpus, vocab_size=old_tokenizer.vocab_size)





In [14]:
del all_sentences, ur_sentences, en_sentences, training_corpus

In [8]:
def preprocess(data, is_target=False):
    if is_target:
        data = [x['ur'] for x in data]
        with tokenizer.as_target_tokenizer():
            return tokenizer(data, padding=True, truncation=True, return_tensors="pt")
    else:
        data = [x['en'] for x in data]
        return tokenizer(data, padding=True, truncation=True, return_tensors="pt")

In [9]:
train_target_tokens = preprocess(train_val_dataset["train"]["translation"], is_target=True)
train_input_tokens = preprocess(train_val_dataset["train"]["translation"])
val_target_tokens = preprocess(train_val_dataset["validation"]["translation"], is_target=True)
val_input_tokens = preprocess(train_val_dataset["validation"]["translation"])



In [10]:
del train_val_dataset

In [11]:
train_dataset = {
    "input_ids": train_input_tokens["input_ids"],
    "attention_mask": train_input_tokens["attention_mask"],
    "decoder_input_ids": train_target_tokens["input_ids"],
    "decoder_attention_mask": train_target_tokens["attention_mask"],
    "labels": train_target_tokens["input_ids"],
}

val_dataset = {
    "input_ids": val_input_tokens["input_ids"],
    "attention_mask": val_input_tokens["attention_mask"],
    "decoder_input_ids": val_target_tokens["input_ids"],
    "decoder_attention_mask": val_target_tokens["attention_mask"],
    "labels": val_target_tokens["input_ids"],
}

In [12]:
del train_input_tokens, train_target_tokens, val_input_tokens, val_target_tokens

In [13]:
class TranslationDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings['input_ids'])


train_dataset = TranslationDataset(train_dataset)
val_dataset = TranslationDataset(val_dataset)

In [14]:
data_collector = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=tokenizer.pad_token_id)

In [15]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=0,
    weight_decay=0.001,
    learning_rate=3e-6,
    logging_dir="./logs",
    logging_steps=50,
    evaluation_strategy='steps',
    eval_steps=5000,
    save_total_limit=3,
    save_steps=500,
    predict_with_generate=True,
    fp16=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collector,
)

In [16]:
trainer.train("./results/checkpoint-127000")

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


TrainOutput(global_step=127224, training_loss=9.57378402323286e-05, metrics={'train_runtime': 50.8676, 'train_samples_per_second': 40016.873, 'train_steps_per_second': 2501.08, 'total_flos': 2.1630800566163866e+17, 'train_loss': 9.57378402323286e-05, 'epoch': 3.0})

In [17]:
model.save_pretrained("./models/t5-small/")
tokenizer.save_pretrained("./models/t5-small/")

('./models/t5-small/tokenizer_config.json',
 './models/t5-small/special_tokens_map.json',
 './models/t5-small/tokenizer.json')

In [18]:
def translate(text):
    input_ids = tokenizer.encode(text, return_tensors="pt").to(device)
    outputs = model.generate(input_ids)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


translate("Allah is watching everything.")



'اور اللہ سب کچھ دیکھ رہا ہے'

In [20]:
import huggingface_hub

huggingface_hub.publish_model(
    model_id="t5-small-english-to-urdu",
    model=model,
    tokenizer=tokenizer,
    use_auth_token="",
    repo_url="https://huggingface.co/umair/t5-small-urdu-to-english",
    commit_message="Initial commit",
    private=False,
    push_to_hub=True,
    use_auth_token="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",
)

SyntaxError: keyword argument repeated: use_auth_token (358149734.py, line 12)