In [1]:
!pip install transformers datasets sentencepiece --quiet


In [1]:
# input_file = "eng-tel.txt"
# output_file = "eng-tel.jsonl"

# with open(input_file, "r", encoding="utf-8") as f:
#     lines = [line.strip() for line in f if line.strip()]  # skip blank lines

# json_data = []
# for i in range(0, len(lines) - 1, 2):
#     if lines[i].startswith("en:") and lines[i+1].startswith("te:"):
#         en = lines[i].replace("en:", "").strip()
#         te = lines[i+1].replace("te:", "").strip()
#         json_data.append({"translation": {"en": en, "te": te}})
#     else:
#         print(f"⚠️ Skipping malformed pair at lines {i} and {i+1}")

# # Save to JSONL
# with open(output_file, "w", encoding="utf-8") as f:
#     for item in json_data:
#         f.write(f"{item}\n")

# print(f"✅ Successfully converted {len(json_data)} sentence pairs to {output_file}")

import json

input_file = "eng-tel.txt"
output_file = "eng-tel.jsonl"

with open(input_file, "r", encoding="utf-8") as f:
    lines = [line.strip() for line in f if line.strip()]

json_data = []
for i in range(0, len(lines) - 1, 2):
    if lines[i].startswith("en:") and lines[i+1].startswith("te:"):
        en = lines[i].replace("en:", "").strip()
        te = lines[i+1].replace("te:", "").strip()
        json_obj = {"translation": {"en": en, "te": te}}
        json_data.append(json_obj)
    else:
        print(f"⚠️ Skipped malformed pair at lines {i} and {i+1}")

with open(output_file, "w", encoding="utf-8") as f:
    for obj in json_data:
        json.dump(obj, f, ensure_ascii=False)
        f.write("\n")

print(f"✅ Fixed and wrote {len(json_data)} valid JSONL lines to {output_file}")


✅ Fixed and wrote 30 valid JSONL lines to eng-tel.jsonl


In [2]:
from datasets import load_dataset
from transformers import MBart50TokenizerFast

model_name = "facebook/mbart-large-50-many-to-many-mmt"

tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
tokenizer.src_lang = "en_XX"
tokenizer.tgt_lang = "te_IN"

dataset = load_dataset("json", data_files="eng-tel.jsonl", split="train")

def preprocess_function(examples):
    inputs = [ex["en"] for ex in examples["translation"]]
    targets = [ex["te"] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True, padding="max_length")
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True)


  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 30 examples [00:00, 2961.59 examples/s]
Map: 100%|██████████| 30/30 [00:00<00:00, 1315.14 examples/s]


In [3]:
from transformers import (
    MBartForConditionalGeneration,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)
import torch

# Load pretrained model
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./mbart-en-te-checkpoints",
    per_device_train_batch_size=2,
    learning_rate=3e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    fp16=torch.cuda.is_available(),  # Only use if you have GPU
    logging_steps=10,
    logging_dir="./logs"
)

# Data collator handles dynamic padding
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Train model
trainer.train()


  trainer = Seq2SeqTrainer(


Step,Training Loss
10,10.3023
20,8.7441
30,7.581
40,6.913




TrainOutput(global_step=45, training_loss=8.192110866970486, metrics={'train_runtime': 318.1875, 'train_samples_per_second': 0.283, 'train_steps_per_second': 0.141, 'total_flos': 24380209889280.0, 'train_loss': 8.192110866970486, 'epoch': 3.0})

In [4]:
# Save the fine-tuned model and tokenizer
model.save_pretrained("./mbart-en-te-model")
tokenizer.save_pretrained("./mbart-en-te-model")


('./mbart-en-te-model\\tokenizer_config.json',
 './mbart-en-te-model\\special_tokens_map.json',
 './mbart-en-te-model\\sentencepiece.bpe.model',
 './mbart-en-te-model\\added_tokens.json',
 './mbart-en-te-model\\tokenizer.json')

In [5]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

# Load fine-tuned model
model = MBartForConditionalGeneration.from_pretrained("./mbart-en-te-model")
tokenizer = MBart50TokenizerFast.from_pretrained("./mbart-en-te-model")

# Set source and target language codes
tokenizer.src_lang = "en_XX"
tgt_lang = "te_IN"


In [3]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

# Load the fine-tuned model
model_path = "./mbart-en-te-model"  # Change if your path is different
model = MBartForConditionalGeneration.from_pretrained(model_path)
tokenizer = MBart50TokenizerFast.from_pretrained(model_path)

# Set language codes
tokenizer.src_lang = "en_XX"
target_lang = "te_IN"

def translate_to_telugu(prompt: str) -> str:
    # Tokenize the input English sentence
    inputs = tokenizer(prompt, return_tensors="pt")

    # Generate output in Telugu
    translated_tokens = model.generate(
        **inputs,
        forced_bos_token_id=tokenizer.lang_code_to_id[target_lang],
        max_length=100
    )

    # Decode the output tokens to Telugu text
    telugu_output = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
    return telugu_output

# Example usage
eng_input = "I want to become a great software engineer."
telugu_output = translate_to_telugu(eng_input)
print("Telugu Translation:", telugu_output)


  from .autonotebook import tqdm as notebook_tqdm


Telugu Translation: నాకు ఒక గొప్ప సామగ్రి ఇంజనీర్ మారింది కోరుకుంటున్నారు.


In [None]:
# English sentence
english_sentence = ""

# Tokenize
inputs = tokenizer(english_sentence, return_tensors="pt")

# Set decoder start token to Telugu
generated_tokens = model.generate(
    **inputs,
    forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang],
    max_length=50
)

# Decode and print
telugu_translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
print("Translated Telugu:", telugu_translation)


Translated Telugu: మైక్రోస్ ఏమిటి?
