# 1. Dataset Preparation

In [None]:
%pip install datasets

In [None]:
# Loading and Exploring the Dataset

from datasets import load_dataset

# Load the Sunbird SALT dataset
dataset = load_dataset('Sunbird/salt-dataset')

# Explore the dataset's structure
print(dataset)
print(dataset['train'].features)  # Display dataset features
print(len(dataset['train']))      # Display number of examples in the training set


In [None]:
# Preprocessing Text Data for the Entire Dataset

import json
import re
import string

def preprocess_text(text):
    # Remove unwanted characters, such as punctuation and special symbols
    text = re.sub(r'[^\w\s]', '', text)

    # Convert text to lowercase
    text = text.lower()

    # Split text into word-like units (tokens)
    tokens = text.split()

    return tokens

# Initialize lists to store preprocessed texts
clean_english_texts = []
clean_luganda_texts = []

# Loop through all examples in the training set
for example in dataset['train']:
    # Parse the JSON string in the 'text' field
    text_data = json.loads(example['text'])

    # Check for 'English' and 'Luganda' keys in the parsed text data
    if 'English' in text_data and 'Luganda' in text_data:
        english_text = text_data['English']
        luganda_text = text_data['Luganda']

        # Preprocess the English and Luganda texts
        clean_english_text = preprocess_text(english_text)
        clean_luganda_text = preprocess_text(luganda_text)

        # Append preprocessed texts to the respective lists
        clean_english_texts.append(clean_english_text)
        clean_luganda_texts.append(clean_luganda_text)

# Display the preprocessed texts for a few examples
for i in range(5):
    print("Example", i+1)
    print("Clean English Text:", clean_english_texts[i])
    print("Clean Luganda Text:", clean_luganda_texts[i])
    print()


# 2. Model Architecture

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "michaeltendo/luganda"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Example translation
english_text = "I am a boy"
input_ids = tokenizer(luganda_text, return_tensors="pt")["input_ids"]
outputs = model.generate(input_ids)
luganda_translation = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Luganda Text:", english_text)
print("English Translation:", luganda_translation)


# 3. Model Training

In [3]:
# Dataset Preparation

from sklearn.model_selection import train_test_split

# Luganda-English pairs stored in lists `clean_luganda_texts` and `clean_english_texts`
luganda_train, luganda_val, english_train, english_val = train_test_split(
    clean_luganda_texts, clean_english_texts, test_size=0.1, random_state=42
)

# Further split validation set into validation and test sets
luganda_val, luganda_test, english_val, english_test = train_test_split(
    luganda_val, english_val, test_size=0.5, random_state=42
)


In [20]:
# Print a few samples from each dataset
def print_samples(dataset_name, luganda_texts, english_texts, num_samples=1):
    print(f"Sample from {dataset_name} dataset:")
    for i in range(num_samples):
        print(f"Luganda Text: {luganda_texts[i]}")
        print(f"English Translation: {english_texts[i]}")
        print("")

# Print samples from training dataset
print_samples("Training", luganda_train, english_train)

# Print samples from validation dataset
print_samples("Validation", luganda_val, english_val)

# Print samples from test dataset
print_samples("Test", luganda_test, english_test)


Sample from Training dataset:
Luganda Text: Ellipsis
English Translation: Ellipsis

Sample from Validation dataset:
Luganda Text: Ellipsis
English Translation: Ellipsis

Sample from Test dataset:
Luganda Text: ['ku', 'mwoleso', 'gwengoye', 'abantu', 'boolesa', 'emisono', 'gyabwe', 'emipya']
English Translation: ['at', 'the', 'fashion', 'show', 'people', 'design', 'clothes', 'and', 'showcase', 'their', 'new', 'designs']



In [None]:
%pip install sacremoses

In [6]:
# Define Training Hyperparameters

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from torch.utils.data import DataLoader
from torch.optim import Adam

# Hyperparameters
learning_rate = 0.001
batch_size = 32
num_epochs = 10

# Initialize model and tokenizer
model_name = "michaeltendo/luganda"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Define optimizer
optimizer = Adam(model.parameters(), lr=learning_rate)




In [None]:
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Assuming you have defined a custom dataset class, let's call it TranslationDataset
class TranslationDataset(torch.utils.data.Dataset):
    def __init__(self, english_texts, luganda_texts, tokenizer, max_length=128):
        self.english_texts = english_texts
        self.luganda_texts = luganda_texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.english_texts)

    def __getitem__(self, idx):
        english_text = self.english_texts[idx]
        luganda_text = self.luganda_texts[idx]

        # Tokenize inputs and labels for the model
        inputs = self.tokenizer(
            english_text,
            text_pair=luganda_text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
            return_attention_mask=True,
            add_special_tokens=True
        )

        return {
            "input_ids": inputs["input_ids"].squeeze(),
            "attention_mask": inputs["attention_mask"].squeeze(),
            "labels": inputs["input_ids"].squeeze(),  # Labels are the same as inputs for seq2seq
            "labels_attention_mask": inputs["attention_mask"].squeeze()
        }


# Prepare datasets with aligned texts
luganda_train = [...]  # List of Luganda texts
english_train = [...]  # List of corresponding English translations

# Create tokenizer and model
model_name = "michaeltendo/luganda"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Create training dataset
train_dataset = TranslationDataset(english_train, luganda_train, tokenizer)

# DataLoader parameters
batch_size = 8
num_epochs = 5
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)

# Training loop
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}"):
        # Move batch to device
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        labels_attention_mask = batch["labels_attention_mask"].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels, decoder_attention_mask=labels_attention_mask)
        loss = outputs.loss

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Calculate average loss for the epoch
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}, Avg. Loss: {avg_loss:.4f}")

    # Validation
    model.eval()
    # Evaluate on validation set (similar to training loop)
    # Monitor metrics and prevent overfitting
