# Assignment 3: Adapting Language Models to New Languages with Fine-Tuning

This assignment guides students through the process of adapting existing language models to a low-resource language, providing hands-on experience with modern neural machine translation techniques and transfer learning strategies.

In this notebook, we focus on Yoruba–English translation, using the MENYO-20k dataset described below.

## MENYO-20k Dataset Summary

The MENYO-20k dataset is a multi-domain parallel corpus designed for Yoruba-English neural machine translation (NMT). This dataset addresses the challenge of evaluating MT models on low-resource language pairs by providing a standardized evaluation set with clean orthography and diacritics. It includes texts from diverse domains such as news articles, TED talks, movie transcripts, and more. The dataset is publicly available under the CC BY-NC 4.0 license and is structured into training, development, and test splits for benchmarking.

### Dataset Collection Sources

| Source                         | Language Pair | No. Sentences |
|-------------------------------|---------------|---------------|
| Jehovah Witness News          | en-yo         | 3,508         |
| Voice of Nigeria News         | en-yo         | 3,048         |
| TED talks                     | en            | 2,945         |
| Global Voices News            | en-yo         | 2,932         |
| Yoruba Proverbs               | yo-en         | 2,700         |
| Out of His Mind Book          | en            | 2,014         |
| Software localization         | en            | 941           |
| Movie Transcript ("Unsane")   | yo-en         | 774           |
| Short texts                   | en            | 687           |
| Radio Broadcast Transcript    | en            | 258           |
| Creative Commons License      | en            | 193           |
| UDHR Translation              | en-yo         | 100           |
| **Total**                     |               | **20,100**    |

### Domains and Train-Test Splits

| Domain          | Training Set | Dev Set | Test Set |
|-----------------|--------------|---------|----------|
| News            | 4,995        | 1,391   | 3,102    |
| TED Talks       | 507          | 438     | 2,000    |
| Book            | -            | 1,006   | 1,008    |
| IT              | 356          | 312     | 273      |
| Yoruba Proverbs | 2,200        | 250     | 250      |
| Others          | 2,012        | -       | -        |


## Dataset Preprocessing

In [None]:
%%capture
# Install Unsloth (with its Colab extra) directly from GitHub; the %%capture
# cell magic above suppresses the pip output.
!pip install "unsloth[colab] @ git+https://github.com/unslothai/unsloth.git"

In [None]:
from jinja2 import Template
from random import randint

# Jinja2 prompt templates. Each is rendered with `s_lang` / `t_lang` (language
# names) and `s_text` (the sentence to translate).
BASIC = "Translate the following text from {{ s_lang }} to {{ t_lang }}: {{ s_text }}"
DESCRIPTIVE = "Translate the following text from {{ s_lang }} to {{ t_lang }}. \n\n{{ s_lang }}: {{ s_text }} \n\n{{ t_lang }}:"
xP3 = "{{ s_text }} the previous text is in {{ s_lang }}. Here is a translation to {{ t_lang }} "

# Pool of compiled templates that get_prompt() samples from. Only DESCRIPTIVE
# and xP3 are in the pool; BASIC is defined above but not included here.
TEMPLATES = [{"template": Template(DESCRIPTIVE), "name": "descriptive"}, {"template": Template(xP3), "name": "xP3"}]

# Three-letter language code -> `<language>_<Script>` identifier
# (presumably the FLORES naming scheme, going by the variable name).
# Has "ara" and "sot", which LANG_NTREX below lacks.
LANG_FLORES = {
    "afr": "afr_Latn",
    "amh": "amh_Ethi",
    "ara": "arz_Arab",  # NOTE(review): "arz" rather than "ara" -- confirm this mapping is intended
    "eng": "eng_Latn",
    "fra": "fra_Latn",
    "hau": "hau_Latn",
    "ibo": "ibo_Latn",
    "kin": "kin_Latn",
    "nya": "nya_Latn",
    "por": "por_Latn",
    "som": "som_Latn",
    "sna": "sna_Latn",
    "sot": "sot_Latn",
    "swa": "swh_Latn",
    "tir": "tir_Ethi",
    "xho": "xho_Latn",
    "yor": "yor_Latn",
    "zul": "zul_Latn",
}

# Three-letter language code -> English display name of the language.
# Has "mlg" and "orm", which LANG_FLORES above lacks.
LANG_NTREX = {
    "afr": "Afrikaans",
    "amh": "Amharic",
    "eng": "English",
    "fra": "French",
    "hau": "Hausa",
    "ibo": "Igbo",
    "kin": "Kinyarwanda",
    "mlg": "Malagasy",
    "nya": "Chichewa",
    "orm": "Afaan Oromoo",
    "por": "Portuguese",
    "som": "Somali",
    "sna": "Shona",
    "swa": "Swahili",
    "tir": "Tigrinya",
    "xho": "Xhosa",
    "yor": "Yoruba",
    "zul": "Zulu",
}

def get_prompt(s_lang, t_lang, s_text, t_text, s_code, t_code, source, split="train"):
    """Build one instruction/output record for a sentence pair.

    A template is drawn at random from TEMPLATES and, with probability 1/2,
    the translation direction is flipped (language names, texts and codes
    are all swapped together).

    Args:
        s_lang: Display name of the source language.
        t_lang: Display name of the target language.
        s_text: Source-side sentence.
        t_text: Target-side sentence.
        s_code: Source language code, used in the "lang" field.
        t_code: Target language code, used in the "lang" field.
        source: Name of the originating dataset.
        split: Dataset split the pair belongs to.

    Returns:
        A dict with the keys "instruction", "output", "lang", "split",
        "source" and "task".
    """
    # Draw order matters for reproducibility under a fixed seed:
    # template first, direction second.
    chosen = TEMPLATES[randint(0, len(TEMPLATES) - 1)]
    flip_direction = randint(0, 1)

    if flip_direction:
        s_lang, t_lang = t_lang, s_lang
        s_text, t_text = t_text, s_text
        s_code, t_code = t_code, s_code

    instruction = chosen["template"].render(s_lang=s_lang, t_lang=t_lang, s_text=s_text)
    lang_pair = f"{s_code}-{t_code}"

    return {
        "instruction": instruction,
        "output": t_text,
        "lang": lang_pair,
        "split": split,
        "source": source,
        "task": "translation",
    }

In [None]:
from datasets import load_dataset
import json
import os

def get_dataset():
    """Convert every MENYO-20k split into prompt records.

    Loads the "menyo20k_mt" dataset and turns each English/Yoruba pair of
    the "train", "validation" and "test" splits into a record via
    get_prompt().

    Returns:
        A single flat list of prompt dicts. Rows from all three splits are
        mixed in this list and are distinguishable only by their "split"
        field, so callers must filter on it before using the data for
        training.
    """
    dataset = load_dataset("menyo20k_mt")
    result = []

    for split in ["train", "validation", "test"]:
        # Iterate rows directly rather than indexing data[i] twice per row.
        for row in dataset[split]:
            pair = row["translation"]
            result.append(
                get_prompt("English", "Yoruba", pair["en"], pair["yo"], "eng", "yor", "MENYO", split)
            )

    return result



def get__test_dataset(split):
    """Convert a single MENYO-20k split into prompt records.

    Args:
        split: Name of the split to load from "menyo20k_mt" (the notebook
            calls this with "test").

    Returns:
        A list of prompt dicts produced by get_prompt(), one per sentence
        pair, each tagged with `split`.
    """
    dataset = load_dataset("menyo20k_mt", split=split)

    result = []
    # Iterate rows directly rather than indexing dataset[i] twice per row.
    for row in dataset:
        pair = row["translation"]
        result.append(
            get_prompt("English", "Yoruba", pair["en"], pair["yo"], "eng", "yor", "MENYO", split)
        )

    return result

# Process the test split only
test_data = get__test_dataset("test")

file_path = "/content/data-test/MENYO_test_dataset.json"
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Save the test dataset to a JSON file. ensure_ascii=False writes the Yoruba
# diacritics verbatim, so the encoding is pinned to UTF-8 instead of relying
# on the platform's locale default.
with open(file_path, "w", encoding="utf-8") as f:
    json.dump(test_data, f, ensure_ascii=False)

# Combined file holding the train, validation and test rows. get_prompt() is
# randomised, so the test prompts in this file are sampled independently of
# those written to the test-only file above.
file_path = "/content/data-train/MENYO_dataset.json"
os.makedirs(os.path.dirname(file_path), exist_ok=True)

with open(file_path, "w", encoding="utf-8") as f:
    json.dump(get_dataset(), f, ensure_ascii=False)

from datasets import Dataset

# Reload the combined file as a Hugging Face Dataset for the later cells.
dataset = Dataset.from_json("/content/data-train/MENYO_dataset.json")

In [None]:
# Upload the combined dataset. Replace the placeholder repo id and supply an
# access token before running (the empty string is a placeholder); avoid
# committing a real token into a shared notebook.
dataset.push_to_hub("YourAccountName/DatasetName", token='')

## Training

In [None]:
# Base checkpoint handed to FastLanguageModel.from_pretrained in the loading cell.
model_name = "llama-lang-adapt/pretrain-wura"

# Sequence length and optimisation settings for the training cells.
max_seq_length = 4096
learning_rate = 1e-5
weight_decay = 0.01
max_steps = 500
warmup_steps = 50
# One optimizer step covers batch_size * gradient_accumulation_steps = 8
# sequences per device.
batch_size = 2
gradient_accumulation_steps = 4
lr_scheduler_type = "linear"
optimizer = "adamw_8bit"
use_gradient_checkpointing = True
random_state = 3407

In [None]:
# Authenticate this session with the Hugging Face Hub; the later push_to_hub
# calls pass use_auth_token=True and rely on the stored credentials.
!huggingface-cli login

In [None]:
from unsloth import FastLanguageModel
import torch
# Re-declared here with the same value as in the hyperparameter cell.
max_seq_length = 4096
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
# Used by the trainer cell to choose between fp16 and bf16 mixed precision.
HAS_BFLOAT16 = torch.cuda.is_bf16_supported()

# Load the base checkpoint and its tokenizer through Unsloth.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

In [None]:
# Wrap the base model with LoRA adapters on the attention and MLP projections.
# Gradient checkpointing and the seed come from the hyperparameter cell so the
# two places cannot silently drift apart.
model = FastLanguageModel.get_peft_model(
    model,
    r = 32,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Currently only supports dropout = 0
    bias = "none",    # Currently only supports bias = "none"
    use_gradient_checkpointing = use_gradient_checkpointing,
    random_state = random_state,
    max_seq_length = max_seq_length,
)

In [None]:
# @title Alpaca dataset preparation
# Alpaca-style prompt skeleton. The three `{}` slots are filled positionally
# with (instruction, input, response) by str.format in the formatting cell.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# System prompt placed in the Instruction slot of every training example; the
# per-example translation prompt goes into the Input slot.
system_prompt = "You are very proficient in Yoruba, and you are very good at responding in Yoruba."

In [None]:
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN

def formatting_prompts_func2(examples):
    """Render a batch of records into full Alpaca-format training strings.

    Args:
        examples: A batched mapping (as passed by Dataset.map with
            batched=True) with parallel "instruction" and "output" lists.

    Returns:
        {"text": [...]} with one string per example: `system_prompt` in the
        Instruction slot, the example's "instruction" in the Input slot, its
        "output" in the Response slot, followed by EOS_TOKEN.
    """
    texts = [
        alpaca_prompt.format(system_prompt, prompt, answer) + EOS_TOKEN
        for prompt, answer in zip(examples["instruction"], examples["output"])
    ]
    return {"text": texts}

# The combined JSON file also contains the test split. Drop those rows before
# building the training text, otherwise the model is fine-tuned on the same
# sentence pairs it is later evaluated on. Validation rows are kept.
dataset = dataset.filter(lambda row: row["split"] != "test")
dataset = dataset.map(formatting_prompts_func2, batched = True,)

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from transformers.utils import logging
# `logging` here is transformers.utils.logging; raise verbosity to INFO.
logging.set_verbosity_info()

# Supervised fine-tuning on the "text" column produced by the formatting cell.
# NOTE(review): passing dataset_text_field / max_seq_length / tokenizer directly
# to SFTTrainer depends on the installed TRL version -- confirm against the
# pinned release.
trainer = SFTTrainer(
    model = model,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    tokenizer = tokenizer,
    args = TrainingArguments(
        per_device_train_batch_size = batch_size,
        gradient_accumulation_steps = gradient_accumulation_steps,
        warmup_steps = warmup_steps,
        max_steps = max_steps,
        learning_rate = learning_rate,
        # Exactly one of fp16 / bf16 is enabled, depending on GPU support.
        fp16 = not HAS_BFLOAT16,
        bf16 = HAS_BFLOAT16,
        logging_steps = 1,
        output_dir = "outputs",
        optim = optimizer,
        weight_decay = weight_decay,
        lr_scheduler_type = lr_scheduler_type,
        seed = random_state,
    ),
)

In [None]:
# Report the GPU in slot 0 and the peak memory reserved so far, as a baseline
# before training. Byte counts are converted to GiB and rounded to 3 decimals.
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
# Run fine-tuning; the returned object's .metrics dict is read in the next cell.
trainer_stats = trainer.train()

In [None]:
# Report wall-clock training time (seconds, then minutes) and show GPU status.
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
!nvidia-smi

In [None]:
# Disabled by default: flip to True to reload saved LoRA weights from
# "lora_model" instead of using the in-memory model from the training cells.
if False:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Quick smoke test: generate from a prompt with an empty Input slot.
# NOTE(review): for_inference() is only called inside the disabled branch above,
# so the in-memory model is used as left by training -- confirm that is intended.
# NOTE(review): this instruction text differs from `system_prompt`, which was
# used to format the training examples.
inputs = tokenizer(
[
    alpaca_prompt.format(
        "You are very proficient in African languages, and you are very good at responding in those languages.", # instruction
        "", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 128, use_cache = True)
tokenizer.batch_decode(outputs)

In [None]:
# Upload the fine-tuned model and tokenizer. The repo id is a placeholder
# (it reads "DatasetName" although a model is being pushed) -- replace it.
# NOTE(review): use_auth_token may be deprecated in favour of `token` in newer
# library versions -- confirm against the installed release.
model.push_to_hub("YourAccountName/DatasetName", use_auth_token=True) # Online saving
tokenizer.push_to_hub("YourAccountName/DatasetName", use_auth_token=True) # Online saving

## Evaluation

In [None]:
from unsloth import FastLanguageModel
import json
import torch
import os

# Loading settings re-declared for this cell. No model is loaded in this cell;
# the functions below use the `model` and `tokenizer` already in the session.
max_seq_length = 4096
dtype = None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.
HAS_BFLOAT16 = torch.cuda.is_bf16_supported()

# Instruction-slot text used at inference time. It must be the exact string the
# model was fine-tuned with (`system_prompt` in the training cells); the
# previous "African languages" wording did not match the training prompt.
prefix = "You are very proficient in Yoruba, and you are very good at responding in Yoruba."

# Same Alpaca skeleton as in the training cells, re-declared for this cell.
# The three `{}` slots are filled positionally with (instruction, input, response).
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""


def save_first_50_rows(file_path, output_file_path):
    """Copy the first 50 records of a JSON list file into a new JSON file.

    Args:
        file_path: Path to a JSON file whose top-level value is a list.
        output_file_path: Destination path. Missing parent directories are
            created; a bare file name (no directory part) is also accepted.

    Returns:
        `output_file_path`, unchanged.
    """
    # The data holds Yoruba text written with ensure_ascii=False, so read and
    # write explicitly as UTF-8 rather than the platform default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Extract the first 50 rows (fewer if the file is shorter)
    first_50_rows = data[:50]

    # os.makedirs('') raises, so only create a directory when there is one.
    parent_dir = os.path.dirname(output_file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_file_path, 'w', encoding='utf-8') as f:
        json.dump(first_50_rows, f, indent=4, ensure_ascii=False)

    print(f"First 50 rows saved to {output_file_path}")
    return output_file_path


def generate_responses_with_unsloth(file_path, output_dir, max_pairs=50):
    """Generate model outputs for the prompts in a JSON file and save them.

    Relies on the notebook-level `model`, `tokenizer`, `alpaca_prompt` and
    `prefix` already being defined in the session.

    Args:
        file_path: JSON file holding a list of records with an "instruction"
            field (and optionally "lang", "split", "source", "task").
        output_dir: Directory for the results; created if missing. The output
            file takes the same base name as `file_path`.
        max_pairs: Maximum number of leading records to process.
    """
    # Load prompts from a JSON file (UTF-8: the records contain Yoruba text).
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    results = []
    for entry in data[:max_pairs]:  # Process only the first max_pairs entries
        prompt = entry['instruction']

        inputs = tokenizer(
            [
                alpaca_prompt.format(
                    prefix,  # instruction
                    prompt,  # input
                    "",  # output - leave this blank for generation!
                )
            ],
            return_tensors="pt",
        ).to("cuda")

        outputs = model.generate(**inputs, max_new_tokens=128, use_cache=True)
        decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
        # Keep only the text after the last "### Response:" marker, then strip
        # any literal "</s>" left in the decoded string.
        clean_output = decoded.split('### Response:')[-1].replace("</s>", "")

        results.append({
            "instruction": prompt,
            "output": clean_output,
            "lang": entry.get('lang', 'N/A'),  # Handle optional fields
            "split": entry.get('split', 'N/A'),
            "source": entry.get('source', 'N/A'),
            "task": entry.get('task', 'N/A')
        })

    # Save results to a new JSON file named after the input file
    os.makedirs(output_dir, exist_ok=True)
    output_filename = os.path.join(output_dir, os.path.basename(file_path))
    with open(output_filename, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=4, ensure_ascii=False)

    print(f"Inference complete and results saved for {file_path}")

# Example usage
# Example usage. `model` and `tokenizer` must already be loaded in the session.
file_path = "/content/data-test/MENYO_test_dataset.json"
output_dir = "/content/inference"
first_50_file_path = "/content/data-eval/MENYO_test_first_50.json"

# Write the 50-row reference subset; the scoring cell reads it from data-eval.
first_50_file_path = save_first_50_rows(file_path, first_50_file_path)

# Generate responses using the first 50 rows. The predictions file gets the same
# base name as the reference file, under output_dir.
generate_responses_with_unsloth(first_50_file_path, output_dir, max_pairs=50)


## Evaluation: chrF Scoring

In [None]:
import json
from datasets import load_metric
import os

def load_benchmark(filepath):
    """Load reference records from a JSON file.

    Args:
        filepath: Path to the benchmark JSON file.

    Returns:
        The parsed JSON content.
    """
    # Read as UTF-8 explicitly: the records contain non-ASCII Yoruba text and
    # the platform default encoding is not guaranteed to be UTF-8.
    with open(filepath, 'r', encoding='utf-8') as file:
        return json.load(file)

def load_predictions(filepath):
    """Load model prediction records from a JSON file.

    Args:
        filepath: Path to the predictions JSON file.

    Returns:
        The parsed JSON content.
    """
    # Read as UTF-8 explicitly: the records contain non-ASCII Yoruba text and
    # the platform default encoding is not guaranteed to be UTF-8.
    with open(filepath, 'r', encoding='utf-8') as file:
        return json.load(file)

def evaluate_translation(benchmarks, predictions):
    """Score predictions against references with sentence-level chrF.

    Records are paired positionally. A pair is scored only when the benchmark
    record's "task" is "translation".

    Args:
        benchmarks: Sequence of reference records with "task" and "output".
        predictions: Sequence of prediction records with "output".

    Returns:
        A list with one chrF score per scored pair, in input order.
    """
    metric = load_metric('chrf', trust_remote_code=True)

    return [
        metric.compute(
            predictions=[predicted['output']],
            references=[[reference['output']]],
        )['score']
        for reference, predicted in zip(benchmarks, predictions)
        if reference['task'] == 'translation'
    ]

def run_translation_evaluation(benchmark_dir, prediction_dir, description):
    """Print per-file and overall average chrF scores for two directories.

    The `.json` files of each directory are sorted by path and paired
    positionally (first with first, and so on).

    Args:
        benchmark_dir: Directory containing the reference JSON files.
        prediction_dir: Directory containing the prediction JSON files.
        description: Label used in the printed report.
    """
    def _json_paths(folder):
        # Sorted full paths of the .json files directly inside `folder`.
        return sorted(
            os.path.join(folder, name)
            for name in os.listdir(folder)
            if name.endswith('.json')
        )

    print(f"Translation Evaluation Results for {description}:")

    file_pairs = zip(_json_paths(benchmark_dir), _json_paths(prediction_dir))

    per_file_averages = []
    for benchmark_file, prediction_file in file_pairs:
        scores = evaluate_translation(
            load_benchmark(benchmark_file),
            load_predictions(prediction_file),
        )

        # A file with no scored pairs counts as 0.
        if scores:
            average_score = sum(scores) / len(scores)
        else:
            average_score = 0
        per_file_averages.append(average_score)

        print(f"{os.path.basename(benchmark_file)} - Average chrF Score: {average_score:.2f}")

    if per_file_averages:
        overall_average_score = sum(per_file_averages) / len(per_file_averages)
    else:
        overall_average_score = 0
    print(f"Overall Average chrF Score for {description}: {overall_average_score:.2f}\n")

# Example usage:
# References (the 50-row subset) and the model outputs written by the inference
# cell; files are paired by sorted order inside run_translation_evaluation.
benchmark_dir = "/content/data-eval/"
prediction_dir = "/content/inference/"

run_translation_evaluation(benchmark_dir, prediction_dir, "MenYo Fine-tune")
