<a href="https://colab.research.google.com/github/MishraShardendu22/Transformers/blob/main/Translate_Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip uninstall torch -y
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

Found existing installation: torch 2.5.1+cu121
Uninstalling torch-2.5.1+cu121:
  Successfully uninstalled torch-2.5.1+cu121
Looking in indexes: https://download.pytorch.org/whl/cu121
Collecting torch
  Using cached https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp312-cp312-linux_x86_64.whl (780.4 MB)
Installing collected packages: torch


In [None]:
import torch

# Report whether PyTorch can see a CUDA device.
cuda_available = torch.cuda.is_available()
print(cuda_available)

# get_device_name(0) raises when no CUDA device is present, so the name is
# only queried once the availability check has passed.
if cuda_available:
    print(torch.cuda.get_device_name(0))
else:
    print("No CUDA device detected; select a GPU runtime before training.")

In [None]:
!pip install transformers datasets sentencepiece accelerate evaluate

# Explanation (AI-assisted)

```python
dataset = dataset.train_test_split(test_size=0.1)
```

### What it does

It splits your dataset into two parts:

* **90% → training set**
* **10% → validation (test) set**

Since the notebook selects **1,000,000 samples**:

* 900,000 → used to train the model
* 100,000 → used to evaluate model performance

---

### Why this is required

During training:

* The model learns on the **train set**
* At regular intervals (every `eval_steps` training steps in this notebook), performance is checked on the **validation set**
* Comparing training and validation loss helps you detect overfitting
* A held-out set lets you measure metrics such as BLEU score on sentences the model has not seen

---

### What `print(dataset)` shows

You will see something like:

```
DatasetDict({
    train: Dataset({
        features: ...
        num_rows: 900000
    })
    test: Dataset({
        features: ...
        num_rows: 100000
    })
})
```

In [None]:
from datasets import load_dataset

# One seed for both the shuffle and the train/validation split, so a fresh
# kernel reproduces exactly the same subsets.
SEED = 42
# Upper bound on the number of sentence pairs kept from the training split.
NUM_SAMPLES = 1_000_000

# Load the `cfilt/iitb-english-hindi` dataset.
raw_dataset = load_dataset("cfilt/iitb-english-hindi")

# Shuffle the training split and keep at most NUM_SAMPLES rows; the min()
# guard avoids an out-of-range selection if the split has fewer rows.
train_shuffled = raw_dataset["train"].shuffle(seed=SEED)
train_subset = train_shuffled.select(range(min(NUM_SAMPLES, len(train_shuffled))))

# 90/10 train-validation split. Without an explicit seed the split differs
# between runs even though the shuffle above is seeded.
dataset = train_subset.train_test_split(test_size=0.1, seed=SEED)

print(dataset)

In [None]:
from transformers import (
    AutoTokenizer,
    EncoderDecoderConfig,
    EncoderDecoderModel,
    BertConfig
)

# Only the tokenizer (vocabulary) of multilingual BERT is loaded here.
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

# Hyperparameters shared by the encoder and the decoder, kept in one place
# so the two stacks cannot drift apart.
shared_bert_kwargs = dict(
    vocab_size=tokenizer.vocab_size,
    hidden_size=512,
    num_hidden_layers=6,
    num_attention_heads=8,
    intermediate_size=2048,
    max_position_embeddings=512,
)

encoder_config = BertConfig(**shared_bert_kwargs)

# The decoder additionally enables decoder mode and cross-attention.
decoder_config = BertConfig(
    **shared_bert_kwargs,
    is_decoder=True,
    add_cross_attention=True,
)

config = EncoderDecoderConfig.from_encoder_decoder_configs(
    encoder_config,
    decoder_config
)

# Instantiated from the config alone: no pretrained weights are loaded.
model = EncoderDecoderModel(config=config)

# Special-token ids are set before training so that every checkpoint saved
# during training carries a complete config, including the EOS/BOS ids that
# generation needs.
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.bos_token_id = tokenizer.cls_token_id
model.config.eos_token_id = tokenizer.sep_token_id
model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of Hindi -> English pairs for seq2seq training.

    Args:
        examples: Batch dict whose "translation" entry is a list of dicts
            with "hi" (source) and "en" (target) strings.

    Returns:
        The tokenizer output for the Hindi sentences, with an added
        "labels" entry holding the English token ids. Padding positions in
        the labels are set to -100.
    """
    inputs = [x["hi"] for x in examples["translation"]]
    targets = [x["en"] for x in examples["translation"]]

    model_inputs = tokenizer(
        inputs,
        max_length=128,
        padding="max_length",
        truncation=True,
    )

    labels = tokenizer(
        targets,
        max_length=128,
        padding="max_length",
        truncation=True,
    )

    # Labels are padded to a fixed length; replace the pad id with -100 (the
    # index the cross-entropy loss ignores) so padding positions do not
    # contribute to the training loss.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs


# Tokenize every split in batches with two worker processes. The original
# columns are dropped so only what preprocess_function returns is kept.
tokenized_dataset = dataset.map(
    function=preprocess_function,
    remove_columns=dataset["train"].column_names,
    batched=True,
    num_proc=2,
)

print(tokenized_dataset)

In [None]:
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments

# Collator that assembles batches; it is given both the model and the tokenizer.
data_collator = DataCollatorForSeq2Seq(model=model, tokenizer=tokenizer)

training_args = Seq2SeqTrainingArguments(
    # Checkpointing: write to output_dir every save_steps; save_total_limit
    # caps how many checkpoints are kept.
    output_dir="./scratch-hi-en",
    save_strategy="steps",
    save_steps=5000,
    save_total_limit=2,
    load_best_model_at_end=True,
    # Evaluation and logging cadence (the argument was called
    # `evaluation_strategy` before it was renamed to `eval_strategy`).
    eval_strategy="steps",
    eval_steps=5000,
    logging_steps=1000,
    predict_with_generate=True,
    # Optimisation settings.
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    fp16=True,
)

# Wire model, arguments, data splits and collator together.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

print("Trainer ready")

In [None]:
# Run training with the `trainer` built in the previous cell. As the last
# expression of the cell, the value returned by train() is shown as output.
trainer.train()

In [None]:
# Special-token ids used at generation time, taken from the tokenizer.
# [CLS] serves as both the BOS token and the decoder start token.
model.config.bos_token_id = tokenizer.cls_token_id
model.config.decoder_start_token_id = tokenizer.cls_token_id
# [SEP] marks the end of a sequence.
model.config.eos_token_id = tokenizer.sep_token_id
# Padding id.
model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
text = "हैलो आप कैसे हैं"

# After training the module may still be in training mode, in which dropout
# is active. Switch to evaluation mode so generation is not perturbed by it.
model.eval()

# Tokenize the source sentence and move the tensors to the model's device.
inputs = tokenizer(
    text,
    return_tensors="pt",
    max_length=128,
    truncation=True
).to(model.device)

# Special-token ids are passed explicitly so generation does not depend on
# what is stored in the model config.
outputs = model.generate(
    **inputs,
    max_length=128,
    decoder_start_token_id=tokenizer.cls_token_id,
    bos_token_id=tokenizer.cls_token_id,
    eos_token_id=tokenizer.sep_token_id,
    pad_token_id=tokenizer.pad_token_id,
)

# Decode the first (only) generated sequence, dropping special tokens.
translation = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Translation:", translation)