In [None]:
import pandas as pd

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Read the train / test / validation splits from the Colab filesystem,
# in that order, and bind each to its own DataFrame.
train_data, test_data, validation_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/Chittagong_Train - Sheet1.csv",
        "/content/chittagong_test - Sheet1.csv",
        "/content/chittagong_validation - Sheet1.csv",
    )
)

In [None]:
train_data.head(5)

In [None]:
# Map the CSV's column headers onto the names the dataset class reads:
# 'input_text' (model input) and 'labels' (training target).
train_data.rename(
    mapper={
        'chittagong_bangla_speech': 'input_text',
        'bangla_speech': 'labels',
    },
    axis='columns',
    inplace=True,
)
train_data.head()

In [None]:
test_data.head(5)

In [None]:
# Map the CSV's column headers onto the names the dataset class reads:
# 'input_text' (model input) and 'labels' (training target).
test_data.rename(
    mapper={
        'chittagong_bangla_speech': 'input_text',
        'bangla_speech': 'labels',
    },
    axis='columns',
    inplace=True,
)
test_data.head()

In [None]:
validation_data.head(5)

In [None]:
# Map the CSV's column headers onto the names the dataset class reads:
# 'input_text' (model input) and 'labels' (training target).
validation_data.rename(
    mapper={
        'chittagong_bangla_speech': 'input_text',
        'bangla_speech': 'labels',
    },
    axis='columns',
    inplace=True,
)
validation_data.head()

In [None]:
!pip install transformers torch pandas

In [None]:
!pip install sacrebleu

In [None]:
!pip install rouge_score

In [None]:
!pip install sentencepiece

In [None]:
!pip install transformers[sentencepiece]

In [None]:
!transformers-cli cache clear

In [None]:
!pip install transformers[torch]

In [None]:
!pip install accelerate -U

In [None]:
!pip install git+https://github.com/csebuetnlp/normalizer

In [None]:
!pip install --upgrade pip

In [None]:
# Pins transformers to 4.10.3; earlier cells in this notebook install transformers unpinned.
# NOTE(review): confirm 4.10.3 is the version the rest of the notebook is meant to run against.
!pip install transformers==4.10.3

In [None]:
!pip install torch transformers

In [None]:
# Pins accelerate to 0.20.3; an earlier cell runs `pip install accelerate -U`.
# NOTE(review): confirm which accelerate version is intended alongside the pinned transformers.
!pip install accelerate==0.20.3

In [None]:
!pip install git+https://github.com/csebuetnlp/normalizer

# **Load the model and tokenizer**

In [None]:
import torch
from transformers import MT5ForConditionalGeneration, AutoTokenizer
# mT5 model documentation: https://huggingface.co/docs/transformers/model_doc/mt5
# The variations it has -> mt5-small: 6, mt5-base: 12, mt5-large: 24, mt5-xl: 24, mt5-xxl: 24
model_name = "google/mt5-small"

# Tokenizer and model are both resolved from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = MT5ForConditionalGeneration.from_pretrained(model_name)

# Use the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


In [None]:
import torch
from normalizer import normalize
from transformers import MT5ForConditionalGeneration, AutoTokenizer ,DataCollatorForSeq2Seq, Trainer, TrainingArguments
import os

# Replace `model` / `tokenizer` with the copies stored on the mounted Drive.
# These are the same two paths the save cell later in this notebook writes to.
# NOTE(review): presumably a checkpoint from an earlier run must already exist here — confirm.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="/content/drive/MyDrive/movie/mymensingh_tokenizer_mT5.json"
)
model = MT5ForConditionalGeneration.from_pretrained(
    pretrained_model_name_or_path="/content/drive/MyDrive/movie/mymensingh_translation_mT5.pt"
)

# Use the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
from torch.utils.data import Dataset, DataLoader
class Seq2SeqDataset(Dataset):
    """Map-style dataset that tokenizes (input_text, labels) string pairs on access.

    Both columns are passed through ``normalize`` once at construction time;
    tokenization happens in ``__getitem__`` so only the strings are stored.
    """

    def __init__(self, data, tokenizer, max_length=128, label_pad_token_id=None):
        """
        Args:
            data: A DataFrame containing 'input_text' and 'labels' columns.
            tokenizer: A Hugging Face tokenizer.
            max_length: Maximum sequence length; both sides are truncated and
                padded to exactly this many tokens.
            label_pad_token_id: Optional id written over the padded positions of
                the label sequence (positions whose attention mask is 0).
                Passing -100 makes PyTorch's cross-entropy loss skip those
                positions. The default (None) leaves the tokenizer's own pad
                ids in the labels, so the returned labels stay decodable as-is.
        """
        self.input_text = data['input_text'].apply(normalize).tolist()
        self.labels = data['labels'].apply(normalize).tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.label_pad_token_id = label_pad_token_id

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.input_text)

    def _encode(self, text):
        """Tokenize one string, truncated/padded to ``max_length``, as PyTorch tensors."""
        return self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

    def __getitem__(self, idx):
        """Return one example as a dict with 'input_ids', 'attention_mask' and 'labels'."""
        source = self._encode(self.input_text[idx])
        target = self._encode(self.labels[idx])

        # squeeze(0) removes only the leading batch axis of the single-example
        # encoding. A bare squeeze() would also collapse the sequence axis when
        # max_length == 1 and hand the collator 0-dim tensors.
        label_ids = target['input_ids'].squeeze(0)
        if self.label_pad_token_id is not None:
            # Use the attention mask rather than comparing against the pad id so
            # only genuine padding positions are overwritten.
            padding_positions = target['attention_mask'].squeeze(0) == 0
            label_ids = label_ids.masked_fill(padding_positions, self.label_pad_token_id)

        return {
            'input_ids': source['input_ids'].squeeze(0),
            'attention_mask': source['attention_mask'].squeeze(0),
            'labels': label_ids,
        }


In [None]:
# Wrap each split in a Seq2SeqDataset (default max_length).
train_dataset = Seq2SeqDataset(data=train_data, tokenizer=tokenizer)
test_dataset = Seq2SeqDataset(data=test_data, tokenizer=tokenizer)
validation_dataset = Seq2SeqDataset(data=validation_data, tokenizer=tokenizer)

# Batch loaders over the three splits; only the training loader shuffles.
# (The author's note lists batch_size=32 as an alternative setting.)
train_dataloader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)
validation_dataloader = DataLoader(dataset=validation_dataset, batch_size=16, shuffle=False)


In [None]:
# Move the model to the device (CPU or GPU)
model.to(device)

In [None]:
from transformers import TrainingArguments, Trainer
from transformers import AdamW
from torch.optim import AdamW

# Optimizer that is handed to the Trainer instead of letting it build its own.
# `AdamW` is whichever class the most recent import bound to that name.
custom_optimizer = AdamW(
    params=model.parameters(),
    weight_decay=0.01,  # weight-decay coefficient
    eps=1e-8,  # small constant added for numerical stability
    lr=1e-3,  # learning rate
)

# Fine-tuning configuration. Checkpoints and logs share one Drive folder.
training_args = TrainingArguments(
    # --- output locations / reporting ---
    output_dir='/content/drive/MyDrive/movie/mymensingh_translation_mT5/model_fine_tuned',
    logging_dir='/content/drive/MyDrive/movie/mymensingh_translation_mT5/model_fine_tuned',
    report_to="none",
    push_to_hub=False,
    # --- run length and batch shape ---
    do_train=True,
    do_eval=True,
    num_train_epochs=20,
    per_device_train_batch_size=6,
    gradient_accumulation_steps=8,
    # --- optimisation schedule ---
    learning_rate=1e-3,
    weight_decay=0.01,
    warmup_steps=100,
    lr_scheduler_type="cosine_with_restarts",
    # --- evaluation, logging and checkpoint cadence (all counted in steps) ---
    evaluation_strategy="steps",
    eval_steps=100,
    logging_steps=100,
    save_steps=15000,
    save_total_limit=2,
    load_best_model_at_end=True,
    # --- dataset columns are passed through untouched ---
    remove_unused_columns=False,
)


In [None]:
from transformers import DataCollatorForSeq2Seq

# Batch collator for the sequence-to-sequence model.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    max_length=128,
    padding=True,
    # Label padding uses the tokenizer's own pad id.
    label_pad_token_id=tokenizer.pad_token_id,
)

In [None]:
# Assemble the Trainer from the model, arguments, datasets, collator and optimizer
# that this notebook has already created.
trainer = Trainer(
    model,
    training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,  # the validation split is used for evaluation
    data_collator=data_collator,
    # (optimizer, lr_scheduler) pair; the scheduler slot is left as None.
    optimizers=(custom_optimizer, None),
)


In [None]:
# Fine-tune the model
trainer.train()

# **Save the model**

In [None]:
from transformers import AutoModelForSeq2SeqLM

# Persist the fine-tuned model to Drive.
model.save_pretrained(
    save_directory="/content/drive/MyDrive/movie/mymensingh_translation_mT5.pt"
)

# Persist the tokenizer to Drive.
tokenizer.save_pretrained(
    save_directory="/content/drive/MyDrive/movie/mymensingh_tokenizer_mT5.json"
)



# **Load the model again**

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Read the tokenizer and model back from the Drive paths used by the save cell.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="/content/drive/MyDrive/movie/mymensingh_tokenizer_mT5.json"
)
model = AutoModelForSeq2SeqLM.from_pretrained(
    pretrained_model_name_or_path="/content/drive/MyDrive/movie/mymensingh_translation_mT5.pt"
)


In [None]:
!pip install datasets

In [None]:
!pip install python-Levenshtein

In [None]:
!pip install jiwer

In [None]:
# Move the model to the device (CPU or GPU)
model.to(device)

In [None]:
!pip install rouge-score
#https://github.com/google-research/google-research/tree/master/rouge
#https://huggingface.co/spaces/evaluate-metric/rouge [Different types of ROUGE scores]

In [None]:
!pip install evaluate

In [None]:
import torch
import Levenshtein
from evaluate import load
# Define the move_to_device function
def move_to_device(batch, device):
    """Recursively move every tensor found inside ``batch`` onto ``device``.

    Args:
        batch: A tensor, or a (possibly nested) dict / list / tuple whose leaves
            may be tensors.
        device: Target device; anything ``Tensor.to`` accepts.

    Returns:
        A structure of the same shape with every tensor moved. Dicts, lists and
        tuples are rebuilt as new containers; any other leaf is returned as is.
    """
    if isinstance(batch, torch.Tensor):
        return batch.to(device)
    if isinstance(batch, dict):
        return {key: move_to_device(value, device) for key, value in batch.items()}
    if isinstance(batch, list):
        return [move_to_device(item, device) for item in batch]
    if isinstance(batch, tuple):
        # Tuples used to fall through to the "leave as is" branch, so tensors
        # nested in them silently stayed on their original device.
        moved = [move_to_device(item, device) for item in batch]
        if hasattr(batch, '_fields'):
            # namedtuple: rebuild through its own constructor to keep the type.
            return type(batch)(*moved)
        return tuple(moved)
    return batch  # Not a tensor or a supported container: leave it untouched

# Load every evaluation metric in one pass, in this order:
# Character Error Rate, Word Error Rate, METEOR, Exact Match, BLEU, ROUGE.
(
    cer_metric,
    wer_metric,
    meteor,
    exact_match_metric,
    bleu_metric,
    rouge_metric,
) = (
    load(metric_name)
    for metric_name in ("cer", "wer", "meteor", "exact_match", "bleu", "rouge")
)

# Collect the decoded model outputs and the decoded reference sentences.
generated_translations = []
references = []

# Run inference on whichever device the model currently lives on, instead of
# a hard-coded 'cuda', so this cell also works on a CPU-only runtime.
generation_device = model.device

# Inference only: switch off dropout and gradient tracking.
model.eval()
with torch.no_grad():
    for batch in test_dataloader:
        batch = move_to_device(batch, generation_device)

        # Every input is padded to a fixed length, so the attention mask must be
        # passed along; without it the encoder attends to the padding tokens.
        translation_ids = model.generate(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
            max_length=128,
            num_beams=4,
            length_penalty=2.0,
            early_stopping=True,
        )

        # Decode on the CPU.
        generated_translations.extend(
            tokenizer.batch_decode(translation_ids.cpu(), skip_special_tokens=True)
        )

        # Negative ids (such as a -100 ignore index) are not valid vocabulary
        # entries; map them to the pad id so decoding cannot fail on them.
        label_ids = batch['labels']
        label_ids = label_ids.masked_fill(label_ids < 0, tokenizer.pad_token_id)
        references.extend(
            tokenizer.batch_decode(label_ids.cpu(), skip_special_tokens=True)
        )



In [None]:
print("Number of generated translations:", len(generated_translations))
print("Number of references:", len(references))

In [None]:
print(generated_translations)

In [None]:
print(references)

In [None]:
# Every metric below is scored on the same predictions / references pair.

# Character Error Rate (CER) and Word Error Rate (WER)
results_CER = cer_metric.compute(
    references=references, predictions=generated_translations
)
results_WER = wer_metric.compute(
    references=references, predictions=generated_translations
)

# Exact Match (EM) and METEOR
results_em = exact_match_metric.compute(
    references=references, predictions=generated_translations
)
results_met = meteor.compute(
    references=references, predictions=generated_translations
)

# Bilingual Evaluation Understudy (BLEU) and
# Recall-Oriented Understudy for Gisting Evaluation (ROUGE)
results_bleu = bleu_metric.compute(
    references=references, predictions=generated_translations
)
results_rouge = rouge_metric.compute(
    references=references, predictions=generated_translations
)

# Levenshtein distance of each generated sentence to its reference, pairwise.
levenshtein_distances = list(
    map(Levenshtein.distance, generated_translations, references)
)


In [None]:
# Print each result on its own line: CER, WER, EM, METEOR, BLEU, ROUGE,
# then the per-sentence Levenshtein distances.
print(
    results_CER,
    results_WER,
    results_em,
    results_met,
    results_bleu,
    results_rouge,
    levenshtein_distances,
    sep="\n",
)

In [None]:
# Threshold accuracy: the share of test sentences whose normalised edit
# similarity to the reference, 1 - distance / max(len), is at least 0.7.
total_correct = 0
total_samples = len(references)

for generated, reference in zip(generated_translations, references):
    max_length = max(len(generated), len(reference))
    if max_length == 0:
        # Both strings are empty, so they are identical; this also avoids 0 / 0.
        similarity = 1.0
    else:
        levenshtein_distance = Levenshtein.distance(generated, reference)
        similarity = 1 - (levenshtein_distance / max_length)
    if similarity >= 0.7:  # Adjust the threshold as needed
        total_correct += 1

# With no samples there is nothing to score; report 0.0 instead of dividing by zero.
accuracy = total_correct / total_samples if total_samples else 0.0
print("Accuracy:", accuracy)


# **Save translation results to an Excel file**

In [None]:
import pandas as pd

# One row per test example: the input text, the decoded reference and the
# model's translation side by side.
translation_df = pd.DataFrame(
    data=dict(
        input_text=test_data['input_text'],
        labels=references,
        translations=generated_translations,
    )
)

# Write the table to an Excel workbook, without the index column.
translation_df.to_excel(
    excel_writer='/content/sample_data/chittagong_translations_mT5.xlsx',
    index=False,
)
