In [None]:
!pip install datasets
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import Dataset, DatasetDict
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split



In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
english_sentences = []
telugu_sentences = []
with open("/content/drive/MyDrive/english_telugu_data.txt", mode='rt', encoding='utf-8') as fp:
    for line in fp.readlines():
        eng_tel = line.split("++++$++++")
        english_sentences.append(eng_tel[0])
        telugu_sentences.append(eng_tel[1].strip())

In [None]:
data = pd.DataFrame({"english_sentences": english_sentences, "telugu_sentences": telugu_sentences})

In [None]:
data = data.iloc[:70000, :]

In [None]:
import re
def clean_eng(text):
    # Replace contractions, remove punctuation, and lowercase
    contraction_mapping = {
        "ain't": "is not", "aren't": "are not", "can't": "cannot", "'cause": "because", "could've": "could have",
        "couldn't": "could not", "didn't": "did not", "doesn't": "does not", "don't": "do not",
        "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would",
        "he'll": "he will", "he's": "he is", "how'd": "how did", "how'll": "how will", "how's": "how is",
        "I'd": "I would", "I'll": "I will", "I'm": "I am", "I've": "I have", "isn't": "is not",
        "it's": "it is", "let's": "let us", "ma'am": "madam", "might've": "might have",
        "mightn't": "might not", "must've": "must have", "mustn't": "must not",
        "needn't": "need not", "shan't": "shall not", "she'd": "she would", "she'll": "she will",
        "she's": "she is", "should've": "should have", "shouldn't": "should not",
        "that's": "that is", "there's": "there is", "they'd": "they would", "they'll": "they will",
        "they're": "they are", "they've": "they have", "we'd": "we would", "we'll": "we will",
        "we're": "we are", "we've": "we have", "weren't": "were not", "what's": "what is",
        "where's": "where is", "who's": "who is", "won't": "will not", "would've": "would have",
        "wouldn't": "would not", "you'd": "you would", "you'll": "you will", "you're": "you are",
        "you've": "you have"
    }
    text = text.lower()
    text = ' '.join([contraction_mapping.get(w, w) for w in text.split()])
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    text = re.sub(r"\d+", "", text)  # Remove digits
    return text.strip()

def clean_tel(text):
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    text = re.sub(r"\d+", "", text)  # Remove digits
    text = re.sub(r"[౦-౯]", "", text)  # Remove Telugu digits
    return text.strip()

data["english_sentences"] = data["english_sentences"].apply(clean_eng)
data["telugu_sentences"] = data["telugu_sentences"].apply(clean_tel)


In [None]:
train_data, test_data = train_test_split(data, test_size=0.2, random_state=12)

In [None]:
train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)
datasets = DatasetDict({"train": train_dataset, "test": test_dataset})

In [None]:
model_name = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [None]:
def preprocess_function(examples):
    inputs = tokenizer(examples["english_sentences"], max_length=64, truncation=True, padding="max_length")
    targets = tokenizer(examples["telugu_sentences"], max_length=64, truncation=True, padding="max_length")
    inputs["labels"] = targets["input_ids"]
    return inputs

In [None]:
tokenized_datasets = datasets.map(preprocess_function, batched=True)

Map:   0%|          | 0/56000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

Map:   0%|          | 0/14000 [00:00<?, ? examples/s]

In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir="/content/drive/MyDrive/english_to_telugu_nllb",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    save_total_limit=2,
    predict_with_generate=True,
    fp16=True,
    save_strategy="epoch",
    logging_dir="./logs"
)



In [None]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
)


  trainer = Seq2SeqTrainer(


In [None]:
import os
os.environ["WANDB_MODE"] = "disabled"
trainer.train()



Epoch,Training Loss,Validation Loss
1,0.1451,0.116573




TrainOutput(global_step=3500, training_loss=0.521919675554548, metrics={'train_runtime': 2348.7724, 'train_samples_per_second': 23.842, 'train_steps_per_second': 1.49, 'total_flos': 7584866107392000.0, 'train_loss': 0.521919675554548, 'epoch': 1.0})

In [None]:
model.save_pretrained("/content/drive/MyDrive/english_to_telugu_nllb")
tokenizer.save_pretrained("/content/drive/MyDrive/english_to_telugu_nllb")

('/content/drive/MyDrive/english_to_telugu_nllb/tokenizer_config.json',
 '/content/drive/MyDrive/english_to_telugu_nllb/special_tokens_map.json',
 '/content/drive/MyDrive/english_to_telugu_nllb/sentencepiece.bpe.model',
 '/content/drive/MyDrive/english_to_telugu_nllb/added_tokens.json',
 '/content/drive/MyDrive/english_to_telugu_nllb/tokenizer.json')

In [None]:
import torch

test_sentences = test_data["english_sentences"].tolist()[:5]
inputs = tokenizer(test_sentences, return_tensors="pt", padding=True, truncation=True, max_length=64)

# Move the input tensors to the same device as the model
for key in inputs:
    inputs[key] = inputs[key].to(model.device) # model.device will get the device of the model

outputs = model.generate(**inputs)

In [None]:
for i, sentence in enumerate(test_sentences):
    print(f"English: {sentence}")
    print(f"Translated Telugu: {tokenizer.decode(outputs[i], skip_special_tokens=True)}")
    print()

English: tom installed an alarm system in his house
Translated Telugu: టమ తన ఇటల అలర ససటమన సటప చశడ

English: ive lived here my whole life
Translated Telugu: నన న జవతతత ఇకకడ నవసచన

English: i wanted to move to boston
Translated Telugu: నన బసటనక వళలలలనకననన

English: tom would not go without saying goodbye
Translated Telugu: టమ వడకల చపపకడ వళళడ

English: it was good working with you
Translated Telugu: మత పనచయడ మచద

