<a href="https://colab.research.google.com/github/Joytarus/Joy-sMailserver/blob/main/app_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
### Step-by-Step Implementation

#### 1. Set Up the Environment
!pip install datasets
import os
import torch
from datasets import load_dataset, Dataset
from transformers import (
    MBartTokenizer,
    MBartForCausalLM,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer
)

#### 2. Prepare Your Swahili Dataset

# Simulate a small Swahili dataset (replace with your actual data)
swahili_sentences = [
    "Leo ninaenda sokoni.",
    "Jua linawaka sana leo.",
    "Nimepika chakula cha jioni.",
    "Kesho nitasoma vitabu.",
    "Watu wengi walienda shuleni."
]

# Save to a text file for consistency with the document
with open("swahili.txt", "w", encoding="utf-8") as f:
    for sentence in swahili_sentences:
        f.write(sentence + "\n")

# Load the dataset from the text file
def load_swahili_text(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        lines = [line.strip() for line in f if line.strip()]
    return lines

swahili_sentences = load_swahili_text("swahili.txt")

# Convert to a Hugging Face Dataset
dataset = Dataset.from_dict({"text": swahili_sentences})

#### 3. Tokenization
# Load the tokenizer for mBART-50
tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-50")
tokenizer.pad_token = tokenizer.eos_token  # Ensure padding token is set

# Tokenize the text
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=128, padding="max_length")

tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Split into train/test sets
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset["train"]
test_dataset = split_dataset["test"]

#### 4. Data Collator

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # Causal LM for sentence completion
)

#### 5. Load the Model
model = MBartForCausalLM.from_pretrained("facebook/mbart-large-50")
model.resize_token_embeddings(len(tokenizer))  # Resize if new tokens were added

#### 6. Define Training Arguments
training_args = TrainingArguments(
    output_dir="./swahili-mbart-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=1,  # Start with one epoch
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=2,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,  # Reduced for faster feedback on small dataset
    learning_rate=5e-5,
    warmup_steps=10,  # Reduced for small dataset
    weight_decay=0.01,
    push_to_hub=False
)

#### 7. Initialize and Train the Model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator
)

# Train the model
trainer.train()

#### 8. Evaluate and Save the Model

# Evaluate
eval_results = trainer.evaluate()
print(f"Perplexity: {torch.exp(torch.tensor(eval_results['eval_loss']))}")

# Save the model and tokenizer
trainer.save_model("./swahili-mbart-finetuned-final")
tokenizer.save_pretrained("./swahili-mbart-finetuned-final")

#### 9. Test the Fine-Tuned Model (Inference)
from transformers import pipeline

# Load the fine-tuned model and tokenizer
generator = pipeline(
    "text-generation",
    model="./swahili-mbart-finetuned-final",
    tokenizer="./swahili-mbart-finetuned-final"
)

# Test with a prompt
prompt = "Leo ninaenda"
outputs = generator(prompt, max_length=50, num_return_sequences=1)
for out in outputs:
    print(out["generated_text"])



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'MBart50Tokenizer'. 
The class this function is called from is 'MBartTokenizer'.


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mjtarus02[0m ([33mjtarus02-sasa-it[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,No log,20.757423




Perplexity: 1034747648.0


Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Leo ninaenda Поділитися:


In [2]:
pip install streamlit


Collecting streamlit
  Downloading streamlit-1.42.2-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.42.2-py2.py3-none-any.whl (9.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?

In [4]:
import streamlit as st
from transformers import pipeline

# Load the fine-tuned model
generator = pipeline(
    "text-generation",
    model="./swahili-mbart-finetuned-final",
    tokenizer="./swahili-mbart-finetuned-final"
)

# UI with Streamlit
st.title("Swahili Sentence Completion")
st.write("Enter a Swahili phrase, and the AI will complete it.")

# User input
prompt = st.text_input("Type your Swahili sentence:")

if prompt:
    outputs = generator(prompt, max_length=50, num_return_sequences=1)
    st.write("Generated Sentence:", outputs[0]["generated_text"])


Device set to use cpu
