In [None]:
# Colab-only: mount Google Drive under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
pip install transformers torch datasets



In [None]:
pip install sacrebleu



### **Paper Findings**

> We demonstrate that baselines using current LLMs are promising but fall short of human performance, achieving 44.7 chrF on Kalamang to English translation and 45.8 chrF on English to Kalamang translation, compared to 51.6 and 57.0 chrF by a human who learned Kalamang from the same reference materials.

### **Modular Version**

In [None]:
import pandas as pd
from torch.utils.data import Dataset
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments
from sacrebleu.metrics import CHRF
import torch

In [None]:
class TranslationDataset(Dataset):
    """Parallel-sentence dataset that tokenizes each source/target pair on access.

    Every item is a dict with ``input_ids`` and ``attention_mask`` for the
    source sentence and ``labels`` for the target sentence. Each tensor has
    shape ``(max_len,)``.
    """

    def __init__(self, source_texts, target_texts, tokenizer, max_len=128):
        """Keep references to the texts and the tokenizer settings.

        Args:
            source_texts: Indexable collection of source sentences.
            target_texts: Indexable collection of target sentences, aligned
                index-by-index with ``source_texts``.
            tokenizer: Callable used to encode a sentence; it must expose
                ``pad_token_id`` and is expected to return batch-of-one
                ``input_ids`` / ``attention_mask`` tensors.
            max_len: Length every sequence is padded or truncated to.
        """
        self.source_texts = source_texts
        self.target_texts = target_texts
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of sentence pairs (driven by the source side)."""
        return len(self.source_texts)

    def _encode(self, text):
        """Tokenize one sentence to a fixed-length, batch-of-one encoding."""
        return self.tokenizer(
            text, max_length=self.max_len, padding="max_length", truncation=True, return_tensors="pt"
        )

    def __getitem__(self, idx):
        """Return the model inputs and loss labels for pair ``idx``."""
        source_enc = self._encode(self.source_texts[idx])
        target_enc = self._encode(self.target_texts[idx])

        # Drop only the leading batch dimension. A bare ``squeeze()`` would also
        # collapse the sequence dimension when ``max_len == 1``.
        target_ids = target_enc.input_ids.squeeze(0)
        # Padding positions get -100 so the loss ignores them. ``masked_fill``
        # returns a new tensor, leaving the tokenizer output untouched.
        labels = target_ids.masked_fill(target_ids == self.tokenizer.pad_token_id, -100)

        return {
            "input_ids": source_enc.input_ids.squeeze(0),
            "attention_mask": source_enc.attention_mask.squeeze(0),
            "labels": labels,
        }

In [None]:
def train_and_evaluate_translation(
    train_data_path, test_data_path, model_name, source_col, target_col, direction, max_len=128, num_epochs=5
):
    """Fine-tune a BART model on a parallel corpus and print its chrF on the test set.

    Args:
        train_data_path: CSV file holding the training sentence pairs.
        test_data_path: CSV file holding the evaluation sentence pairs.
        model_name: Checkpoint name or path handed to ``from_pretrained``.
        source_col: CSV column with the source-language sentences.
        target_col: CSV column with the target-language sentences.
        direction: Human-readable label such as "Kalamang to English". It is
            used in the printed messages and, with spaces replaced by
            underscores, in the output directory name.
        max_len: Token length used for padding/truncation and as the
            generation ``max_length``.
        num_epochs: Number of training epochs.

    Returns:
        None. The corpus chrF score and the first five prediction/reference
        pairs are printed; the model and tokenizer are saved to
        ``./<direction>_model``.

    Raises:
        ValueError: If a CSV has no row with both source and target present.
    """
    import inspect  # function-scope import: only the keyword shim below needs it

    def _read_pairs(csv_path):
        """Return (sources, targets) as lists of str, skipping rows with a missing side."""
        frame = pd.read_csv(csv_path).dropna(subset=[source_col, target_col])
        if frame.empty:
            raise ValueError(f"No usable sentence pairs found in {csv_path!r}")
        # astype(str) keeps numeric-looking cells from reaching the tokenizer as non-strings.
        return frame[source_col].astype(str).tolist(), frame[target_col].astype(str).tolist()

    def _supported(factory, preferred, legacy):
        """Pick ``preferred`` when ``factory`` accepts that keyword, otherwise ``legacy``."""
        return preferred if preferred in inspect.signature(factory).parameters else legacy

    output_dir = f"./{direction.replace(' ', '_')}_model"

    # Load datasets
    train_sources, train_targets = _read_pairs(train_data_path)
    test_sources, test_targets = _read_pairs(test_data_path)

    # Tokenizer and model initialization
    tokenizer = BartTokenizer.from_pretrained(model_name)
    model = BartForConditionalGeneration.from_pretrained(model_name)

    # Prepare datasets
    train_dataset = TranslationDataset(train_sources, train_targets, tokenizer, max_len)
    test_dataset = TranslationDataset(test_sources, test_targets, tokenizer, max_len)

    # Training arguments.
    # Newer transformers releases rename `evaluation_strategy` to `eval_strategy`;
    # the installed version is not pinned, so use whichever keyword it accepts.
    eval_kwarg = _supported(TrainingArguments, "eval_strategy", "evaluation_strategy")
    training_args = TrainingArguments(
        output_dir=output_dir,
        learning_rate=5e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=num_epochs,
        weight_decay=0.01,
        save_total_limit=2,
        save_strategy="epoch",
        logging_dir="./logs",
        logging_steps=10,
        # Mixed precision needs a CUDA device; fall back to full precision without one.
        fp16=torch.cuda.is_available(),
        **{eval_kwarg: "epoch"},
    )

    # Trainer setup (same keyword shim: `tokenizer` -> `processing_class` in newer releases)
    tokenizer_kwarg = _supported(Trainer, "processing_class", "tokenizer")
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        **{tokenizer_kwarg: tokenizer},
    )

    # Train the model
    print(f"\nTraining {direction}...")
    trainer.train()

    # Save the model
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    def generate_predictions(dataset):
        """Translate every item of ``dataset`` one at a time and return the decoded strings."""
        decoded = []
        model.eval()
        with torch.no_grad():
            for i in range(len(dataset)):
                item = dataset[i]
                input_ids = item["input_ids"].unsqueeze(0).to(model.device)
                attention_mask = item["attention_mask"].unsqueeze(0).to(model.device)
                outputs = model.generate(input_ids, attention_mask=attention_mask, max_length=max_len)
                decoded.append(tokenizer.decode(outputs[0], skip_special_tokens=True))
        return decoded

    # Evaluate the model. References are the raw target sentences, not a
    # tokenizer round-trip of the (possibly truncated) label ids.
    predictions = generate_predictions(test_dataset)
    references = test_targets

    # Compute ChrF score. sacrebleu takes a list of reference *streams*, each
    # aligned with the hypotheses, so one reference per sentence is `[references]`.
    chrf = CHRF()
    chrf_score = chrf.corpus_score(predictions, [references])

    # Display results
    print(f"\nEvaluation for {direction}:")
    print(f"ChrF Score: {chrf_score.score:.2f}")
    for pred, ref in zip(predictions[:5], references[:5]):
        print(f"Prediction: {pred}")
        print(f"Reference: {ref}")
        print("-" * 50)


## facebook/bart-base

In [None]:
# Kalamang -> English: fine-tune facebook/bart-base and print chrF on the test CSV.
train_and_evaluate_translation(
    "train_set.csv",
    "test_set.csv",
    "facebook/bart-base",
    source_col="Kalamang_Sentence",
    target_col="English_Translation",
    direction="Kalamang to English",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)



Training Kalamang to English...


[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,2.8317,2.500745
2,2.0653,2.309027
3,1.7929,2.255368
4,1.6261,2.224027
5,1.2937,2.240428


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams


Evaluation for Kalamang to English:
ChrF Score: 35.46
Prediction: Aisa is climbing a tree over there.
Reference: Aisa orderd snails from me.
--------------------------------------------------
Prediction: Binkur's father bought a canoe, he wants to make a canoe with that.
Reference: Binkur's father can carve a canoe, and planks too.
--------------------------------------------------
Prediction: Manadu's gierun has a wound on his eye.
Reference: Manadu has a toothache, his cheek is swollen.
--------------------------------------------------
Prediction: Suci has a lot of fish.
Reference: Suci is preparing fish.
--------------------------------------------------
Prediction: You are a stranger!
Reference: Expel the chicken first!
--------------------------------------------------


In [None]:
# English -> Kalamang: same data and checkpoint with the source/target columns swapped.
train_and_evaluate_translation(
    "train_set.csv",
    "test_set.csv",
    "facebook/bart-base",
    source_col="English_Translation",
    target_col="Kalamang_Sentence",
    direction="English to Kalamang",
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)



Training English to Kalamang...


Epoch,Training Loss,Validation Loss
1,3.5083,3.109132
2,2.9184,2.785367
3,2.3283,2.662562
4,2.2103,2.605206
5,2.0249,2.589307


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams


Evaluation for English to Kalamang:
ChrF Score: 52.75
Prediction: Aisa orderd mat paruo reba.
Reference: Aisa ma yuotpanoi anggonggon.
--------------------------------------------------
Prediction: Binkur esun etkon komainda etkies, etkoni etkies.
Reference: Binkur esun bisa erat kies, bisa pawan weinun.
--------------------------------------------------
Prediction: Manadu kanggirun ning, ning.
Reference: Manadu gierun ningda koliep rua.
--------------------------------------------------
Prediction: Suci sorat paruotkin.
Reference: Suci sor paruo reba.
--------------------------------------------------
Prediction: Kokok kokokat paruotkin!
Reference: Ka tok kokoat arteǃ
--------------------------------------------------
