In [None]:
# Mount the user's Google Drive at /content/drive (Colab prompts for authorisation).
# NOTE(review): the training calls visible in this notebook read train_set.csv /
# test_set.csv by relative path, so it is unclear whether the mount is needed — confirm.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
pip install transformers torch datasets



In [None]:
pip install sacrebleu



In [None]:
pip install accelerate



### **Paper Findings**

The paper reports (quoted from its abstract):

> "We demonstrate that baselines using current LLMs are promising but fall short of human performance, achieving 44.7 chrF on Kalamang to English translation and 45.8 chrF on English to Kalamang translation, compared to 51.6 and 57.0 chrF by a human who learned Kalamang from the same reference materials."

### **Modular Version**

In [None]:
import pandas as pd
from torch.utils.data import Dataset
from sacrebleu.metrics import CHRF
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import MarianTokenizer, MarianMTModel
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import LlamaForCausalLM
from transformers import LlamaTokenizer
from accelerate import Accelerator
import torch

In [None]:
class TranslationDataset(Dataset):
    """Parallel sentence pairs tokenised for supervised fine-tuning.

    Each item is a dict with ``input_ids`` and ``attention_mask`` for the source
    sentence and ``labels`` for the target sentence, all padded/truncated to
    ``max_len``. Padded label positions are set to -100 so the loss ignores them.

    Args:
        source_texts: Sequence of source-language strings.
        target_texts: Sequence of target-language strings, aligned with ``source_texts``.
        tokenizer: Callable tokenizer returning ``input_ids`` and ``attention_mask``
            tensors of shape ``(1, max_len)`` when called with ``return_tensors="pt"``.
        max_len: Fixed sequence length used for both source and target.
    """

    def __init__(self, source_texts, target_texts, tokenizer, max_len=128):
        self.source_texts = source_texts
        self.target_texts = target_texts
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.source_texts)

    def _encode(self, text):
        """Tokenise one string to exactly ``max_len`` positions."""
        return self.tokenizer(
            text, max_length=self.max_len, padding="max_length", truncation=True, return_tensors="pt"
        )

    def __getitem__(self, idx):
        source_enc = self._encode(self.source_texts[idx])
        target_enc = self._encode(self.target_texts[idx])

        # squeeze(0) drops only the batch axis; a bare squeeze() would also
        # collapse the sequence axis when max_len == 1.
        labels = target_enc.input_ids.squeeze(0).clone()
        # Mask by attention mask rather than by pad id: when the pad token is
        # aliased to EOS, comparing against pad_token_id would also erase
        # genuine EOS tokens from the training signal.
        labels[target_enc.attention_mask.squeeze(0) == 0] = -100

        return {
            "input_ids": source_enc.input_ids.squeeze(0),
            "attention_mask": source_enc.attention_mask.squeeze(0),
            "labels": labels,
        }

In [None]:
class _PromptedTranslationDataset(Dataset):
    """Sentence pairs for decoder-only models: ``<source> =><space><target><eos>``.

    A decoder-only model predicts token t+1 from tokens <= t of the *same*
    sequence, so source and target must be concatenated; the loss is restricted
    to the target (and EOS) positions by setting every other label to -100.
    """

    SEPARATOR = " =>"

    def __init__(self, source_texts, target_texts, tokenizer, max_len=128):
        self.source_texts = source_texts
        self.target_texts = target_texts
        self.tokenizer = tokenizer
        self.max_len = max_len
        self._separator_ids = list(tokenizer(self.SEPARATOR, add_special_tokens=False).input_ids)

    def __len__(self):
        return len(self.source_texts)

    def prompt_ids(self, idx):
        """Return the token ids of source + separator, i.e. what generation continues from."""
        # The source may use at most half the window so the target always has room.
        budget = max(1, self.max_len // 2 - len(self._separator_ids))
        encoded = self.tokenizer(str(self.source_texts[idx]), truncation=True, max_length=budget)
        return list(encoded.input_ids) + self._separator_ids

    def __getitem__(self, idx):
        prompt = self.prompt_ids(idx)
        room = max(0, self.max_len - len(prompt) - 1)  # keep one slot for EOS
        target = list(self.tokenizer(f" {self.target_texts[idx]}", add_special_tokens=False).input_ids)[:room]
        target.append(self.tokenizer.eos_token_id)
        n_pad = max(0, self.max_len - len(prompt) - len(target))
        return {
            "input_ids": torch.tensor(prompt + target + [self.tokenizer.pad_token_id] * n_pad),
            "attention_mask": torch.tensor([1] * (len(prompt) + len(target)) + [0] * n_pad),
            "labels": torch.tensor([-100] * len(prompt) + target + [-100] * n_pad),
        }


def train_and_evaluate_translation(
    train_data_path, test_data_path, model_name, source_col, target_col, direction, max_len=128, num_epochs=5
):
    """Fine-tune a pretrained model on a parallel CSV corpus and print corpus-level chrF.

    The model family is detected from the checkpoint's config: encoder-decoder
    checkpoints (e.g. T5, Marian) are trained on separate source/target
    encodings, decoder-only checkpoints (e.g. GPT-2, LLaMA) on concatenated
    prompt+target sequences.

    Args:
        train_data_path: CSV file with the training pairs.
        test_data_path: CSV file with the evaluation pairs.
        model_name: Hugging Face checkpoint name or local path.
        source_col: Column holding the source sentences.
        target_col: Column holding the target sentences.
        direction: Human-readable label; also used to name the save directory.
        max_len: Fixed token length of each training example and the generation cap.
        num_epochs: Number of training epochs.

    Side effects:
        Writes Trainer checkpoints to ``./improved_model``, the final model and
        tokenizer to ``./<direction with underscores>_model``, and prints the
        chrF score plus the first five prediction/reference pairs.
    """
    # Local import: these names are not part of the notebook's import cell.
    from transformers import AutoConfig, AutoModelForSeq2SeqLM

    train_data = pd.read_csv(train_data_path)
    test_data = pd.read_csv(test_data_path)

    # Load the model exactly once, with the head that matches its architecture.
    # Gated checkpoints should be authenticated via `huggingface_hub.login()`;
    # never paste an access token into the notebook.
    is_seq2seq = bool(AutoConfig.from_pretrained(model_name).is_encoder_decoder)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model_class = AutoModelForSeq2SeqLM if is_seq2seq else AutoModelForCausalLM
    model = model_class.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        # GPT-2 / LLaMA tokenizers ship without a pad token.
        tokenizer.pad_token = tokenizer.eos_token

    dataset_class = TranslationDataset if is_seq2seq else _PromptedTranslationDataset
    train_dataset = dataset_class(
        train_data[source_col].tolist(), train_data[target_col].tolist(), tokenizer, max_len
    )
    test_dataset = dataset_class(
        test_data[source_col].tolist(), test_data[target_col].tolist(), tokenizer, max_len
    )

    # NOTE(review): 64 x 16 accumulation steps is ~1024 examples per optimizer
    # step; on a small corpus that yields very few updates per epoch — confirm.
    training_args = TrainingArguments(
        output_dir="./improved_model",
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        gradient_accumulation_steps=16,
        num_train_epochs=num_epochs,  # previously hard-coded, ignoring the argument
        weight_decay=0.01,
        save_total_limit=2,
        save_strategy="epoch",
        logging_dir="./logs",
        logging_steps=10,
        fp16=torch.cuda.is_available(),  # fp16 mixed precision needs a CUDA device
    )

    # Trainer performs device placement and mixed precision itself, so the
    # model and datasets are handed over without a separate Accelerator.prepare().
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=tokenizer,
    )

    print(f"\nTraining {direction}...")
    trainer.train()

    save_dir = f"./{direction.replace(' ', '_')}_model"
    model.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)

    def evaluate_model(test_dataset, model, tokenizer):
        """Generate one translation per test pair; return (predictions, references)."""
        predictions = []
        references = []

        model.eval()
        for i in range(len(test_dataset)):
            if is_seq2seq:
                item = test_dataset[i]
                input_ids = item["input_ids"].unsqueeze(0).to(model.device)
                attention_mask = item["attention_mask"].unsqueeze(0).to(model.device)
                prompt_len = 0
            else:
                # Unpadded prompt: a decoder-only model must continue directly
                # after the separator, not after a run of pad tokens.
                input_ids = torch.tensor([test_dataset.prompt_ids(i)], device=model.device)
                attention_mask = torch.ones_like(input_ids)
                prompt_len = input_ids.shape[1]

            with torch.no_grad():
                outputs = model.generate(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    max_new_tokens=max_len,
                    pad_token_id=tokenizer.pad_token_id,
                )

            # Decoder-only output echoes the prompt; score only the continuation.
            prediction = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()
            predictions.append(prediction)
            # Use the raw reference text: decoding the label ids can drop
            # characters that the tokenizer's vocabulary does not cover.
            references.append(str(test_dataset.target_texts[i]))

        return predictions, references

    predictions, references = evaluate_model(test_dataset, model, tokenizer)

    # sacreBLEU expects one list per reference *set*, each aligned with the
    # hypotheses — i.e. [references], not one single-item list per sentence.
    chrf = CHRF()
    chrf_score = chrf.corpus_score(predictions, [references])

    print(f"\nEvaluation for {direction}:")
    print(f"ChrF Score: {chrf_score.score:.2f}")
    for pred, ref in zip(predictions[:5], references[:5]):
        print(f"Prediction: {pred}")
        print(f"Reference: {ref}")
        print("-" * 50)

### Model 1

## T5-small

In [None]:
# Fine-tune and score the t5-small checkpoint in the Kalamang -> English direction,
# reading both columns from train_set.csv / test_set.csv in the working directory.
train_and_evaluate_translation(
    train_data_path="train_set.csv",
    test_data_path="test_set.csv",
    model_name="t5-small",
    source_col="Kalamang_Sentence",
    target_col="English_Translation",
    direction="Kalamang to English",
)

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

  trainer = Trainer(



Training Kalamang to English...


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,4.5896,4.022567
2,4.006,3.777533
3,3.8263,3.609691
4,3.7537,3.494801
5,3.6422,3.41583
6,3.5426,3.354906
7,3.4749,3.304664
8,3.4519,3.262745
9,3.3854,3.227698
10,3.3129,3.19848



Evaluation for Kalamang to English:
ChrF Score: 42.96
Prediction: Aisa's mother is a tad.
Reference: Aisa orderd snails from me.
--------------------------------------------------
Prediction: Binkur's father and family are sat there, they are sat there.
Reference: Binkur's father can carve a canoe, and planks too.
--------------------------------------------------
Prediction: Manadu's mother is a samaritan.
Reference: Manadu has a toothache, his cheek is swollen.
--------------------------------------------------
Prediction: Suci's father is a sacramento.
Reference: Suci is preparing fish.
--------------------------------------------------
Prediction: The sand is a bit thick.
Reference: Expel the chicken first!
--------------------------------------------------


In [None]:
# Zip the checkpoint directory and trigger a browser download of the archive.
from google.colab import files
!zip -r /content/improved_model.zip /content/improved_model
files.download('/content/improved_model.zip')

  adding: content/improved_model/ (stored 0%)
  adding: content/improved_model/checkpoint-800/ (stored 0%)
  adding: content/improved_model/checkpoint-800/tokenizer_config.json (deflated 94%)
  adding: content/improved_model/checkpoint-800/spiece.model (deflated 48%)
  adding: content/improved_model/checkpoint-800/rng_state.pth (deflated 25%)
  adding: content/improved_model/checkpoint-800/optimizer.pt (deflated 7%)
  adding: content/improved_model/checkpoint-800/model.safetensors (deflated 10%)
  adding: content/improved_model/checkpoint-800/trainer_state.json (deflated 82%)
  adding: content/improved_model/checkpoint-800/training_args.bin (deflated 51%)
  adding: content/improved_model/checkpoint-800/special_tokens_map.json (deflated 85%)
  adding: content/improved_model/checkpoint-800/config.json (deflated 62%)
  adding: content/improved_model/checkpoint-800/generation_config.json (deflated 29%)
  adding: content/improved_model/checkpoint-800/scheduler.pt (deflated 55%)
  adding: co

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Same t5-small run with the columns swapped: English -> Kalamang.
train_and_evaluate_translation(
    train_data_path="train_set.csv",
    test_data_path="test_set.csv",
    model_name="t5-small",
    source_col="English_Translation",
    target_col="Kalamang_Sentence",
    direction="English to Kalamang",
)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

  trainer = Trainer(



Training English to Kalamang...


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
0,No log,5.273307
1,No log,5.24443
2,No log,5.24443
3,No log,5.224661
4,No log,5.191185
5,No log,5.191185
6,No log,5.164754
7,No log,5.098314
8,5.475600,5.076543
9,5.475600,5.057826



Evaluation for English to Kalamang:
ChrF Score: 15.69
Prediction: Aisa a commandé des schnecken von mir.
Reference: Aisa ma yuotpanoi anggonggon.
--------------------------------------------------
Prediction: Binkur's father can carve canoe, and planks too.
Reference: Binkur esun bisa erat kies, bisa pawan weinun.
--------------------------------------------------
Prediction: Manadu hat eine Zahnache, seine Wange ist swollen.
Reference: Manadu gierun ningda koliep rua.
--------------------------------------------------
Prediction: Suci bereitet Fisch.
Reference: Suci sor paruo reba.
--------------------------------------------------
Prediction: Expel the chicken first!
Reference: Ka tok kokoat arte
--------------------------------------------------


In [None]:
# Download the saved English -> Kalamang model directory and the checkpoint directory.
from google.colab import files
!zip -r /content/English_to_Kalamang_model.zip /content/English_to_Kalamang_model
files.download('/content/English_to_Kalamang_model.zip')
!zip -r /content/improved_model.zip /content/improved_model
files.download('/content/improved_model.zip')

  adding: content/English_to_Kalamang_model/ (stored 0%)
  adding: content/English_to_Kalamang_model/tokenizer_config.json (deflated 94%)
  adding: content/English_to_Kalamang_model/spiece.model (deflated 48%)
  adding: content/English_to_Kalamang_model/model.safetensors (deflated 13%)
  adding: content/English_to_Kalamang_model/special_tokens_map.json (deflated 85%)
  adding: content/English_to_Kalamang_model/config.json (deflated 62%)
  adding: content/English_to_Kalamang_model/generation_config.json (deflated 29%)
  adding: content/English_to_Kalamang_model/added_tokens.json (deflated 83%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

  adding: content/improved_model/ (stored 0%)
  adding: content/improved_model/checkpoint-32/ (stored 0%)
  adding: content/improved_model/checkpoint-32/tokenizer_config.json (deflated 94%)
  adding: content/improved_model/checkpoint-32/spiece.model (deflated 48%)
  adding: content/improved_model/checkpoint-32/rng_state.pth (deflated 25%)
  adding: content/improved_model/checkpoint-32/optimizer.pt (deflated 6%)
  adding: content/improved_model/checkpoint-32/model.safetensors (deflated 13%)
  adding: content/improved_model/checkpoint-32/trainer_state.json (deflated 79%)
  adding: content/improved_model/checkpoint-32/training_args.bin (deflated 51%)
  adding: content/improved_model/checkpoint-32/special_tokens_map.json (deflated 85%)
  adding: content/improved_model/checkpoint-32/config.json (deflated 62%)
  adding: content/improved_model/checkpoint-32/generation_config.json (deflated 29%)
  adding: content/improved_model/checkpoint-32/scheduler.pt (deflated 56%)
  adding: content/improv

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Model 2

## LLaMA-30B (`huggyllama/llama-30b`)

In [None]:
# Interactive Hugging Face Hub login (renders a token-entry widget in the notebook).
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Kalamang -> English run with the huggyllama/llama-30b checkpoint.
# NOTE(review): the recorded output for this cell ends at "Loading checkpoint shards"
# after listing seven weight shards of roughly 5.7-10 GB each, so the run presumably
# never reached training — confirm the runtime can hold this model or pick a smaller one.
train_and_evaluate_translation(
    train_data_path="train_set.csv",
    test_data_path="test_set.csv",
    model_name="huggyllama/llama-30b",
    source_col="Kalamang_Sentence",
    target_col="English_Translation",
    direction="Kalamang to English",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/700 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.


config.json:   0%|          | 0.00/503 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/50.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/7 [00:00<?, ?it/s]

model-00001-of-00007.safetensors:   0%|          | 0.00/9.82G [00:00<?, ?B/s]

model-00002-of-00007.safetensors:   0%|          | 0.00/9.96G [00:00<?, ?B/s]

model-00003-of-00007.safetensors:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

model-00004-of-00007.safetensors:   0%|          | 0.00/9.87G [00:00<?, ?B/s]

model-00005-of-00007.safetensors:   0%|          | 0.00/9.87G [00:00<?, ?B/s]

model-00006-of-00007.safetensors:   0%|          | 0.00/9.96G [00:00<?, ?B/s]

model-00007-of-00007.safetensors:   0%|          | 0.00/5.69G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

In [None]:
from google.colab import files
!zip -r /content/results_English_Translation_to_Kalamang_Sentence.zip /content/results_English_Translation_to_Kalamang_Sentence
files.download('/content/results_English_Translation_to_Kalamang_Sentence.zip')
!zip -r /content/improved_model.zip /content/improved_model
files.download('/content/improved_model.zip')

In [None]:
# English -> Kalamang run.
# NOTE(review): this call is identical to the earlier t5-small English -> Kalamang
# call even though it sits in the "Model 2" section — confirm whether a different
# model_name was intended here.
train_and_evaluate_translation(
    train_data_path="train_set.csv",
    test_data_path="test_set.csv",
    model_name="t5-small",
    source_col="English_Translation",
    target_col="Kalamang_Sentence",
    direction="English to Kalamang",
)

In [None]:
from google.colab import files
!zip -r /content/results_English_Translation_to_Kalamang_Sentence.zip /content/results_English_Translation_to_Kalamang_Sentence
files.download('/content/results_English_Translation_to_Kalamang_Sentence.zip')
!zip -r /content/improved_model.zip /content/improved_model
files.download('/content/improved_model.zip')

### Model 3

## GPT2

In [None]:
import pandas as pd
from torch.utils.data import Dataset
from sacrebleu.metrics import CHRF
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
import torch


class TranslationDataset(Dataset):
    """Parallel sentence pairs encoded for a decoder-only (GPT-2 style) model.

    Each example is one sequence ``<source> =><space><target><eos>`` padded to
    ``max_len``. A causal LM predicts token t+1 from tokens <= t of the same
    sequence, so feeding the source as ``input_ids`` with unrelated target ids
    as ``labels`` cannot teach translation. Here the labels are a copy of the
    sequence with prompt and padding positions set to -100, so the loss covers
    exactly the target tokens and the closing EOS.

    Args:
        source_texts: Sequence of source-language strings.
        target_texts: Sequence of target-language strings, aligned with ``source_texts``.
        tokenizer: Tokenizer exposing ``pad_token_id`` / ``eos_token_id`` and
            returning ``input_ids`` as a list of ints for a single string.
        max_len: Fixed length of every encoded example.
    """

    SEPARATOR = " =>"

    def __init__(self, source_texts, target_texts, tokenizer, max_len=128):
        self.source_texts = source_texts
        self.target_texts = target_texts
        self.tokenizer = tokenizer
        self.max_len = max_len
        self._separator_ids = list(tokenizer(self.SEPARATOR, add_special_tokens=False).input_ids)

    def __len__(self):
        return len(self.source_texts)

    def prompt_ids(self, idx):
        """Return the token ids of source + separator, i.e. what generation continues from."""
        # The source may use at most half the window so the target always has room.
        budget = max(1, self.max_len // 2 - len(self._separator_ids))
        encoded = self.tokenizer(str(self.source_texts[idx]), truncation=True, max_length=budget)
        return list(encoded.input_ids) + self._separator_ids

    def __getitem__(self, idx):
        prompt = self.prompt_ids(idx)
        room = max(0, self.max_len - len(prompt) - 1)  # keep one slot for EOS
        target = list(self.tokenizer(f" {self.target_texts[idx]}", add_special_tokens=False).input_ids)[:room]
        target.append(self.tokenizer.eos_token_id)
        n_pad = max(0, self.max_len - len(prompt) - len(target))

        # Padding is identified by position, not by token id: pad is aliased to
        # EOS for GPT-2, and the real EOS must stay in the labels.
        return {
            "input_ids": torch.tensor(prompt + target + [self.tokenizer.pad_token_id] * n_pad),
            "attention_mask": torch.tensor([1] * (len(prompt) + len(target)) + [0] * n_pad),
            "labels": torch.tensor([-100] * len(prompt) + target + [-100] * n_pad),
        }


def train_and_evaluate_translation(
    train_data_path, test_data_path, model_name, source_col, target_col, direction, max_len=128, num_epochs=5
):
    """Fine-tune a GPT-2 checkpoint on a parallel CSV corpus and print corpus-level chrF.

    Args:
        train_data_path: CSV file with the training pairs.
        test_data_path: CSV file with the evaluation pairs.
        model_name: GPT-2 checkpoint name or local path.
        source_col: Column holding the source sentences.
        target_col: Column holding the target sentences.
        direction: Human-readable label; also names the output directory.
        max_len: Fixed token length of each training example.
        num_epochs: Number of training epochs.

    Side effects:
        Writes checkpoints, the final model and the tokenizer to
        ``./<direction with underscores>_model`` and prints the chrF score plus
        the first five prediction/reference pairs.
    """
    train_data = pd.read_csv(train_data_path)
    test_data = pd.read_csv(test_data_path)

    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name)

    # GPT-2 has no pad token; reuse EOS.
    tokenizer.pad_token = tokenizer.eos_token

    train_dataset = TranslationDataset(
        train_data[source_col].tolist(), train_data[target_col].tolist(), tokenizer, max_len
    )
    test_dataset = TranslationDataset(
        test_data[source_col].tolist(), test_data[target_col].tolist(), tokenizer, max_len
    )

    model_dir = f"./{direction.replace(' ', '_')}_model"
    training_args = TrainingArguments(
        output_dir=model_dir,
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=2,  # small batches for memory efficiency
        per_device_eval_batch_size=2,
        gradient_accumulation_steps=8,  # effective batch of 16 per device
        num_train_epochs=num_epochs,
        weight_decay=0.01,
        save_total_limit=2,
        save_strategy="epoch",
        logging_dir="./logs",
        logging_steps=10,
        fp16=torch.cuda.is_available(),  # fp16 mixed precision needs a CUDA device
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=tokenizer,
    )

    print(f"\nTraining {direction}...")
    trainer.train()

    model.save_pretrained(model_dir)
    tokenizer.save_pretrained(model_dir)

    def evaluate_model(test_dataset, model, tokenizer):
        """Generate one translation per test pair; return (predictions, references)."""
        predictions = []
        references = []

        model.eval()
        for i in range(len(test_dataset)):
            # Unpadded prompt: generation must continue right after the
            # separator, not after a run of pad tokens.
            input_ids = torch.tensor([test_dataset.prompt_ids(i)], device=model.device)

            with torch.no_grad():
                outputs = model.generate(
                    input_ids=input_ids,
                    attention_mask=torch.ones_like(input_ids),
                    max_new_tokens=50,  # limit the number of new tokens generated
                    pad_token_id=tokenizer.pad_token_id,
                )

            # generate() returns prompt + continuation; score only the continuation.
            continuation = outputs[0][input_ids.shape[1]:]
            predictions.append(tokenizer.decode(continuation, skip_special_tokens=True).strip())
            references.append(str(test_dataset.target_texts[i]))

        return predictions, references

    predictions, references = evaluate_model(test_dataset, model, tokenizer)

    # sacreBLEU expects one list per reference *set*, each aligned with the
    # hypotheses — i.e. [references], not one single-item list per sentence.
    chrf = CHRF()
    chrf_score = chrf.corpus_score(predictions, [references])

    print(f"\nEvaluation for {direction}:")
    print(f"ChrF Score: {chrf_score.score:.2f}")
    for pred, ref in zip(predictions[:5], references[:5]):
        print(f"Prediction: {pred}")
        print(f"Reference: {ref}")
        print("-" * 50)


In [None]:
# Fine-tune and score the gpt2 checkpoint in the Kalamang -> English direction.
train_and_evaluate_translation(
    train_data_path="train_set.csv",
    test_data_path="test_set.csv",
    model_name="gpt2",
    source_col="Kalamang_Sentence",
    target_col="English_Translation",
    direction="Kalamang to English",
)

  trainer = Trainer(



Training Kalamang to English...


Epoch,Training Loss,Validation Loss
0,6.3987,6.362756
1,6.0638,6.163359
2,5.7609,6.077036
3,5.7355,6.040586
4,5.5264,6.031734



Evaluation for Kalamang to English:
ChrF Score: 12.06
Prediction: Aisa ma yuotpanoi anggonggon...................................................
Reference: Aisa orderd snails from me.
--------------------------------------------------
Prediction: Binkur esun bisa erat kies, bisa pawan weinun...................................................
Reference: Binkur's father can carve a canoe, and planks too.
--------------------------------------------------
Prediction: Manadu gierun ningda koliep rua...................................................
Reference: Manadu has a toothache, his cheek is swollen.
--------------------------------------------------
Prediction: Suci sor paruo reba...................................................
Reference: Suci is preparing fish.
--------------------------------------------------
Prediction: Ka tok kokoat arteǃ..................................................
Reference: Expel the chicken first!
--------------------------------------------------


In [None]:
# Download the GPT-2 Kalamang -> English output directory.
# NOTE(review): the GPT-2 training function in this section writes to the
# direction-named directory only, so /content/improved_model presumably comes from
# an earlier run — confirm it is still meant to be downloaded here.
from google.colab import files
!zip -r /content/Kalamang_to_English_model.zip /content/Kalamang_to_English_model
files.download('/content/Kalamang_to_English_model.zip')
!zip -r /content/improved_model.zip /content/improved_model
files.download('/content/improved_model.zip')

  adding: content/Kalamang_to_English_model/ (stored 0%)
  adding: content/Kalamang_to_English_model/checkpoint-388/ (stored 0%)
  adding: content/Kalamang_to_English_model/checkpoint-388/tokenizer_config.json (deflated 55%)
  adding: content/Kalamang_to_English_model/checkpoint-388/vocab.json (deflated 68%)
  adding: content/Kalamang_to_English_model/checkpoint-388/rng_state.pth (deflated 25%)
  adding: content/Kalamang_to_English_model/checkpoint-388/optimizer.pt (deflated 8%)
  adding: content/Kalamang_to_English_model/checkpoint-388/model.safetensors (deflated 7%)
  adding: content/Kalamang_to_English_model/checkpoint-388/trainer_state.json (deflated 77%)
  adding: content/Kalamang_to_English_model/checkpoint-388/training_args.bin (deflated 51%)
  adding: content/Kalamang_to_English_model/checkpoint-388/special_tokens_map.json (deflated 74%)
  adding: content/Kalamang_to_English_model/checkpoint-388/config.json (deflated 52%)
  adding: content/Kalamang_to_English_model/checkpoint-3

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

  adding: content/improved_model/ (stored 0%)
  adding: content/improved_model/checkpoint-32/ (stored 0%)
  adding: content/improved_model/checkpoint-32/tokenizer_config.json (deflated 55%)
  adding: content/improved_model/checkpoint-32/vocab.json (deflated 68%)
  adding: content/improved_model/checkpoint-32/rng_state.pth (deflated 25%)
  adding: content/improved_model/checkpoint-32/optimizer.pt (deflated 7%)
  adding: content/improved_model/checkpoint-32/model.safetensors (deflated 7%)
  adding: content/improved_model/checkpoint-32/trainer_state.json (deflated 79%)
  adding: content/improved_model/checkpoint-32/training_args.bin (deflated 51%)
  adding: content/improved_model/checkpoint-32/special_tokens_map.json (deflated 74%)
  adding: content/improved_model/checkpoint-32/config.json (deflated 52%)
  adding: content/improved_model/checkpoint-32/generation_config.json (deflated 24%)
  adding: content/improved_model/checkpoint-32/merges.txt (deflated 53%)
  adding: content/improved_mo

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Same gpt2 run with the columns swapped: English -> Kalamang.
train_and_evaluate_translation(
    train_data_path="train_set.csv",
    test_data_path="test_set.csv",
    model_name="gpt2",
    source_col="English_Translation",
    target_col="Kalamang_Sentence",
    direction="English to Kalamang",
)

  trainer = Trainer(



Training English to Kalamang...


Epoch,Training Loss,Validation Loss
0,5.9289,5.835569
1,5.5517,5.577048
2,5.4011,5.480633
3,5.4052,5.437302
4,5.2555,5.422255



Evaluation for English to Kalamang:
ChrF Score: 10.23
Prediction: Aisa orderd snails from me...................................................
Reference: Aisa ma yuotpanoi anggonggon.
--------------------------------------------------
Prediction: Binkur's father can carve a canoe, and planks too...................................................
Reference: Binkur esun bisa erat kies, bisa pawan weinun.
--------------------------------------------------
Prediction: Manadu has a toothache, his cheek is swollen...................................................
Reference: Manadu gierun ningda koliep rua.
--------------------------------------------------
Prediction: Suci is preparing fish...................................................
Reference: Suci sor paruo reba.
--------------------------------------------------
Prediction: Expel the chicken first!..................................................
Reference: Ka tok kokoat arteǃ
--------------------------------------------------


In [None]:
# Download the GPT-2 English -> Kalamang output directory.
# NOTE(review): the GPT-2 training function in this section writes to the
# direction-named directory only, so /content/improved_model presumably comes from
# an earlier run — confirm it is still meant to be downloaded here.
from google.colab import files
!zip -r /content/English_to_Kalamang_model.zip /content/English_to_Kalamang_model
files.download('/content/English_to_Kalamang_model.zip')
!zip -r /content/improved_model.zip /content/improved_model
files.download('/content/improved_model.zip')

  adding: content/English_to_Kalamang_model/ (stored 0%)
  adding: content/English_to_Kalamang_model/checkpoint-388/ (stored 0%)
  adding: content/English_to_Kalamang_model/checkpoint-388/tokenizer_config.json (deflated 55%)
  adding: content/English_to_Kalamang_model/checkpoint-388/vocab.json (deflated 68%)
  adding: content/English_to_Kalamang_model/checkpoint-388/rng_state.pth (deflated 25%)
  adding: content/English_to_Kalamang_model/checkpoint-388/optimizer.pt (deflated 8%)
  adding: content/English_to_Kalamang_model/checkpoint-388/model.safetensors (deflated 7%)
  adding: content/English_to_Kalamang_model/checkpoint-388/trainer_state.json (deflated 76%)
  adding: content/English_to_Kalamang_model/checkpoint-388/training_args.bin (deflated 51%)
  adding: content/English_to_Kalamang_model/checkpoint-388/special_tokens_map.json (deflated 74%)
  adding: content/English_to_Kalamang_model/checkpoint-388/config.json (deflated 52%)
  adding: content/English_to_Kalamang_model/checkpoint-3

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

updating: content/improved_model/ (stored 0%)
updating: content/improved_model/checkpoint-32/ (stored 0%)
updating: content/improved_model/checkpoint-32/tokenizer_config.json (deflated 55%)
updating: content/improved_model/checkpoint-32/vocab.json (deflated 68%)
updating: content/improved_model/checkpoint-32/rng_state.pth (deflated 25%)
updating: content/improved_model/checkpoint-32/optimizer.pt (deflated 7%)
updating: content/improved_model/checkpoint-32/model.safetensors (deflated 7%)
updating: content/improved_model/checkpoint-32/trainer_state.json (deflated 79%)
updating: content/improved_model/checkpoint-32/training_args.bin (deflated 51%)
updating: content/improved_model/checkpoint-32/special_tokens_map.json (deflated 74%)
updating: content/improved_model/checkpoint-32/config.json (deflated 52%)
updating: content/improved_model/checkpoint-32/generation_config.json (deflated 24%)
updating: content/improved_model/checkpoint-32/merges.txt (deflated 53%)
updating: content/improved_mo

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>