<a href="https://colab.research.google.com/github/KitsoHub/NLPBase/blob/main/Untitled9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# =============================
# 🧰 STEP 1: INSTALL LIBRARIES
# =============================
!pip install -q transformers datasets torch sentencepiece accelerate gradio

# =============================
# 📦 STEP 2: IMPORT EVERYTHING
# =============================
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
    pipeline
)
from datasets import load_dataset
import gradio as gr

print("✅ Libraries installed and imported!")

# =============================
# 📥 STEP 3: LOAD DATASET
# =============================
dataset = load_dataset("ogaufi/NLP_setswana")
print(f"✅ Dataset loaded. Train size: {len(dataset['train'])}")
print("Sample:", dataset["train"][0])

# =============================
# 🤖 STEP 4: LOAD MODEL & TOKENIZER
# =============================
model_name = "Helsinki-NLP/opus-mt-en-af"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

print("✅ Model and tokenizer loaded!")

# =============================
# 🧹 STEP 5: PREPROCESS FUNCTION
# =============================
def preprocess_function(examples):
    # Tokenize English (source)
    inputs = tokenizer(
        examples["en"],
        max_length=128,
        truncation=True,
        padding="max_length"
    )
    # Tokenize Setswana (target/labels)
    labels = tokenizer(
        examples["st"],
        max_length=128,
        truncation=True,
        padding="max_length"
    )
    return {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "labels": labels["input_ids"],  # required for training
    }

# Apply preprocessing
tokenized_dataset = dataset.map(preprocess_function, batched=True, batch_size=16)
print("✅ Dataset tokenized!")

# =============================
# ⚙️ STEP 6: TRAINING SETUP
# =============================
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

training_args = Seq2SeqTrainingArguments(
    output_dir="./marianmt-en-tn-finetuned",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=10,
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),  # auto-enable if GPU
    logging_steps=50,
    report_to="none",
    optim="adamw_torch",
    save_strategy="epoch",
    load_best_model_at_end=False,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["train"].select(range(10)) if len(tokenized_dataset["train"]) > 10 else tokenized_dataset["train"],  # dummy eval
    tokenizer=tokenizer,
    data_collator=data_collator,
)

print("✅ Trainer configured!")

# =============================
# 🏁 STEP 7: START TRAINING
# =============================
print("🚀 Starting training...")
trainer.train()
print("🎉 Training completed!")

# =============================
# 💾 STEP 8: SAVE MODEL LOCALLY
# =============================
trainer.save_model("./final-model")
tokenizer.save_pretrained("./final-model")
print("✅ Model saved locally!")

# =============================
# 🔍 STEP 9: TEST TRANSLATION
# =============================
# Load the trained model + tokenizer for inference
trained_model = AutoModelForSeq2SeqLM.from_pretrained("./final-model")
trained_tokenizer = AutoTokenizer.from_pretrained("./final-model")

# Create translation pipeline
translator = pipeline(
    "translation",
    model=trained_model,
    tokenizer=trained_tokenizer,
    device=0 if torch.cuda.is_available() else -1
)

# Test
test_sentences = [
    "Good morning sir",
    "How are you?",
    "Thank you",
    "I am fine"
]

print("\n🧪 TRANSLATION TESTS:")
for sent in test_sentences:
    result = translator(sent, max_length=128)
    print(f"'{sent}' → '{result[0]['translation_text']}'")

# =============================
# 🌐 STEP 10: CREATE GRADIO DEMO (OPTIONAL)
# =============================
def translate_text(text):
    if not text.strip():
        return ""
    result = translator(text, max_length=128)
    return result[0]["translation_text"]

demo = gr.Interface(
    fn=translate_text,
    inputs=gr.Textbox(lines=3, placeholder="Type English text here..."),
    outputs="text",
    title="🌍 English → Setswana Translator",
    description="Fine-tuned MarianMT model for Setswana. Built with ❤️ for Botswana.",
    examples=[
        ["Good morning madam"],
        ["How are you today?"],
        ["I am well, thank you."],
        ["See you tomorrow!"]
    ],
    theme="soft"
)

print("\n🌐 Launching Gradio demo (public link will appear below)...")
demo.launch(share=True)  # Generates a public link you can share!

✅ Libraries installed and imported!
✅ Dataset loaded. Train size: 419
Sample: {'en': 'Good morning/ day/ evening sir', 'st': 'dumêla rra'}




✅ Model and tokenizer loaded!


Map:   0%|          | 0/419 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


✅ Dataset tokenized!
✅ Trainer configured!
🚀 Starting training...




Epoch,Training Loss,Validation Loss
1,2.0212,0.612529
2,0.3357,0.398462
3,0.2587,0.297208
4,0.2218,0.233633
5,0.1997,0.177522
6,0.1828,0.146284
7,0.1687,0.124159
8,0.1571,0.113387
9,0.156,0.1026
10,0.1517,0.099417




🎉 Training completed!
✅ Model saved locally!


Device set to use cpu



🧪 TRANSLATION TESTS:
'Good morning sir' → 'Dumêla'
'How are you?' → 'Le tsogile jang?'
'Thank you' → 'Re a batla'
'I am fine' → 'ke tsogile'

🌐 Launching Gradio demo (public link will appear below)...
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://a5e9518e6cf28d78b6.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [4]:
from huggingface_hub import notebook_login
notebook_login()  # paste your HF token
trainer.push_to_hub("ogaufi/marianmt-en-tn-start")

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...marianmt-en-tn-finetuned/target.spm: 100%|##########|  826kB /  826kB            

  ...marianmt-en-tn-finetuned/source.spm: 100%|##########|  819kB /  819kB            

  ...t-en-tn-finetuned/model.safetensors:   0%|          |  559kB /  294MB            

  ...t-en-tn-finetuned/training_args.bin:   2%|2         |   125B / 5.91kB            

CommitInfo(commit_url='https://huggingface.co/ogaufi/marianmt-en-tn-finetuned/commit/37f39f66acd32d3a9fb6e2fbbfb0be897efe94c2', commit_message='ogaufi/marianmt-en-tn-start', commit_description='', oid='37f39f66acd32d3a9fb6e2fbbfb0be897efe94c2', pr_url=None, repo_url=RepoUrl('https://huggingface.co/ogaufi/marianmt-en-tn-finetuned', endpoint='https://huggingface.co', repo_type='model', repo_id='ogaufi/marianmt-en-tn-finetuned'), pr_revision=None, pr_num=None)