<a href="https://colab.research.google.com/github/MeAllan123/Dynamic-programing/blob/main/lusogatue_09.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
from datasets import load_dataset

# Path to your cleaned CSV
csv_path = '/content/drive/MyDrive/ML_projects/model1/soga_english.csv'

# Load the CSV
raw = load_dataset("csv", data_files=csv_path)

# Rename columns for clarity
raw = raw["train"].rename_columns({"source":"en","target":"soga"})

# Split into train/validation
raw = raw.train_test_split(test_size=0.1, seed=42)

# Check the splits
print(raw)
print(raw["train"][0])


DatasetDict({
    train: Dataset({
        features: ['en', 'soga'],
        num_rows: 43893
    })
    test: Dataset({
        features: ['en', 'soga'],
        num_rows: 4877
    })
})
{'en': 'Alcohol possesses a threat to the health of others.', 'soga': "Omwenge gwa bulabe eri obulamu bw'abandi."}


In [3]:
from transformers import MarianMTModel, MarianTokenizer

# Model name for English → Swahili
model_name = "Helsinki-NLP/opus-mt-en-swc"

# Load tokenizer and model
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Quick check
print("English→Swahili MarianMT loaded successfully!")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


English→Swahili MarianMT loaded successfully!


In [4]:
max_length = 128  # maximum sequence length

def preprocess(batch):
    # Ensure all entries are strings
    inputs = [str(x) for x in batch["en"]]
    targets = [str(x) for x in batch["soga"]]

    # Tokenize inputs and targets
    model_inputs = tokenizer(
        inputs,
        max_length=max_length,
        padding="max_length",
        truncation=True
    )

    # Tokenize targets separately
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=max_length,
            padding="max_length",
            truncation=True
        )

    # Add labels to model inputs
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply preprocessing to train and validation sets
tokenized_train = raw["train"].map(preprocess, batched=True)
tokenized_val   = raw["test"].map(preprocess, batched=True)

# Quick check
print(tokenized_train[0])


{'en': 'Alcohol possesses a threat to the health of others.', 'soga': "Omwenge gwa bulabe eri obulamu bw'abandi.", 'input_ids': [14741, 19169, 13, 5392, 8, 5, 1013, 9, 274, 3, 0, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904, 58904], '

In [5]:
!pip install sacrebleu



In [6]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
import evaluate
import numpy as np

# Data collator for dynamic padding
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Load BLEU metric for evaluation
bleu = evaluate.load("sacrebleu")

def compute_metrics(eval_pred):
    preds, labels = eval_pred
    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in labels as padding token
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Compute BLEU score
    result = bleu.compute(predictions=decoded_preds, references=[[l] for l in decoded_labels])
    return {"bleu": result["score"]}

# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./marian-soga-out",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    eval_accumulation_steps=2,
    predict_with_generate=True,
    logging_steps=100,
    eval_strategy="steps",  # Changed from evaluation_strategy
    eval_steps=500,
    save_steps=500,
    save_total_limit=2,
    num_train_epochs=5,
    learning_rate=5e-5,
    logging_dir="./logs",
    fp16=True  # mixed precision for faster training
)

# Initialize the Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

print("Trainer is ready!")

Trainer is ready!


  trainer = Seq2SeqTrainer(


In [None]:
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.
[34m[1mwandb[0m: Currently logged in as: [33mmeregulwaallan7[0m ([33mmeregulwaallan7-soroti-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss,Validation Loss
