In [1]:
pip install transformers torch torchvision torchaudio accelerate datasets sentencepiece sacrebleu sentencepiece huggingface_hub protobuf google


Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="1"
import torch
print(torch.cuda.device_count())

1


In [3]:

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from datasets import Dataset
import numpy as np

# Load model and tokenizer
model_name = "facebook/seamless-m4t-v2-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from datasets import Dataset
import numpy as np


# Freeze all parameters
for param in model.parameters():
    param.requires_grad = False

# Unfreeze the last decoder layer and the language modeling head

for param in model.text_decoder.layers[-1].parameters():
    param.requires_grad = True
for param in model.lm_head.parameters():
    param.requires_grad = True

# Verify trainable parameters
print("Trainable parameters:")
for name, param in model.named_parameters():
    if param.requires_grad:
        print(f"- {name}")

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Load small dataset
limit = 1000000
hindi_sentences = [line.strip() for line in open("hindi.hi-filtered.hi", "r", encoding="utf-8").readlines()[:limit]]
tamil_sentences = [line.strip() for line in open("tamil.ta-filtered.ta", "r", encoding="utf-8").readlines()[:limit]]

# Create dataset
dataset = Dataset.from_dict({
    "translation": [{"src": hin, "tgt": tam} for hin, tam in zip(hindi_sentences, tamil_sentences)]
})

# Preprocess function for tokenization
def preprocess_function(examples):
    inputs = [ex["src"] for ex in examples["translation"]]
    targets = [ex["tgt"] for ex in examples["translation"]]
    
    # Tokenize inputs (Hindi)
    model_inputs = tokenizer(
        inputs,
        max_length=128,
        truncation=True,
        padding="longest",
        return_tensors="pt",
        src_lang="hin"  # Language code for Hindi
    )
    
    # Tokenize targets (Tamil)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=128,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            tgt_lang="tam"  # Language code for Tamil
        )
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Process dataset
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=["translation"]
)

# Data collator with dynamic padding
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding="longest"
)

# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    save_strategy="no",
    per_device_train_batch_size=16,     
    gradient_accumulation_steps=16,       
    num_train_epochs=1,                  # Small number of epochs for testing
    logging_steps=500,                    
    prediction_loss_only=True,           # Only compute loss during training
    fp16=True,                           # Use mixed precision for efficiency
    report_to="none",                    # No external reporting (e.g., WandB)
    optim="adamw_torch_fused",           # Optimized AdamW
    learning_rate=5e-5,                  # Standard learning rate
    warmup_steps=100                     # Warmup for stable training
)

# Initialize trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer
)

# Train the model and display progress
print("Starting training...")
trainer.train()

# Save the fine-tuned model and tokenizer
trainer.save_model("./fine_tuned_seamless")
tokenizer.save_pretrained("./fine_tuned_seamless")

Trainable parameters:
- shared.weight
- text_decoder.layers.23.self_attn.k_proj.weight
- text_decoder.layers.23.self_attn.k_proj.bias
- text_decoder.layers.23.self_attn.v_proj.weight
- text_decoder.layers.23.self_attn.v_proj.bias
- text_decoder.layers.23.self_attn.q_proj.weight
- text_decoder.layers.23.self_attn.q_proj.bias
- text_decoder.layers.23.self_attn.out_proj.weight
- text_decoder.layers.23.self_attn.out_proj.bias
- text_decoder.layers.23.self_attn_layer_norm.weight
- text_decoder.layers.23.self_attn_layer_norm.bias
- text_decoder.layers.23.cross_attention.k_proj.weight
- text_decoder.layers.23.cross_attention.k_proj.bias
- text_decoder.layers.23.cross_attention.v_proj.weight
- text_decoder.layers.23.cross_attention.v_proj.bias
- text_decoder.layers.23.cross_attention.q_proj.weight
- text_decoder.layers.23.cross_attention.q_proj.bias
- text_decoder.layers.23.cross_attention.out_proj.weight
- text_decoder.layers.23.cross_attention.out_proj.bias
- text_decoder.layers.23.cross_att

Map:   0%|          | 0/1000000 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(


Starting training...


Step,Training Loss
500,84.9237
1000,14.048
1500,11.1443
2000,9.9818
2500,9.0753
3000,8.724
3500,8.2529


('./fine_tuned_seamless/tokenizer_config.json',
 './fine_tuned_seamless/special_tokens_map.json',
 './fine_tuned_seamless/sentencepiece.bpe.model',
 './fine_tuned_seamless/added_tokens.json',
 './fine_tuned_seamless/tokenizer.json')