## Finetune Pretrained Models to Translate Low Resource Language - Kalamang 

In [1]:
!pip install datasets



In [3]:
!pip install evaluate



In [4]:
!pip install sacrebleu rouge_score

Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.0.0-py3-none-any.whl.metadata (8.5 kB)
Downloading sacrebleu-2.4.3-py3-none-any.whl (103 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.0/104.0 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-3.0.0-py3-none-any.whl (19 kB)
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=9ef5550efcec858491da23877e8d46678ac9ded970345706e9bb3016533fd00a
  Stored in directory: /root/.cache/pip/wheels/

In [5]:
!pip install wandb



In [5]:
import os



# Disable Weights & Biases logging

os.environ["WANDB_DISABLED"] = "true"

## Helsinki-NLP/opus-mt-en-mul Model

In [6]:
from datasets import load_dataset

from transformers import (

    AutoTokenizer,

    AutoModelForSeq2SeqLM,

    Seq2SeqTrainer,

    Seq2SeqTrainingArguments,

    GenerationConfig,

)

from evaluate import load

from accelerate import Accelerator

### Fine Tuning model on a dataset of Kalamang sentences paired with their English translations 

In [None]:
# Function to process and fine-tune the model for translation

def train_and_evaluate_translation(model_name, train_file, test_file, source_lang, target_lang):

    """

    Trains and evaluates a translation model.



    Args:

        model_name (str): Hugging Face model name or path.

        train_file (str): Path to the training dataset.

        test_file (str): Path to the test dataset.

        source_lang (str): Source language column name in the dataset.

        target_lang (str): Target language column name in the dataset.

    Returns:

        dict: Evaluation results including BLEU and ROUGE scores.

    """

    # Load dataset

    data_files = {"train": train_file, "test": test_file}

    dataset = load_dataset("csv", data_files=data_files, column_names=[source_lang, target_lang])



    # Load tokenizer and model

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)



    # Configure model

    model.config.bos_token_id = 1

    model.config.eos_token_id = 0

    model.config.decoder_start_token_id = model.config.bos_token_id



    # Set generation configuration

    generation_config = GenerationConfig(

        bos_token_id=model.config.bos_token_id,

        eos_token_id=model.config.eos_token_id,

        decoder_start_token_id=model.config.decoder_start_token_id,

        max_length=128,

        num_beams=5,

    )

    model.generation_config = generation_config



    # Ensure compatibility between tokenizer and model

    model.resize_token_embeddings(len(tokenizer))



    # Preprocessing function

    def preprocess_function(examples):

        inputs = examples[source_lang]

        targets = examples[target_lang]

        model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")

        with tokenizer.as_target_tokenizer():

            labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")

        model_inputs["labels"] = labels["input_ids"]

        return model_inputs



    # Preprocess datasets

    tokenized_datasets = dataset.map(preprocess_function, batched=True)



    # Define training arguments

    training_args = Seq2SeqTrainingArguments(

        output_dir=f"./results_{source_lang}_to_{target_lang}",

        eval_strategy="epoch",

        learning_rate=3e-5,

        per_device_train_batch_size=8,

        per_device_eval_batch_size=8,

        weight_decay=0.01,

        save_total_limit=3,

        num_train_epochs=7,

        predict_with_generate=True,

        fp16=True,

        report_to="none",

        save_strategy="epoch",

        load_best_model_at_end=True,

        metric_for_best_model="eval_loss",

        greater_is_better=False,

    )



    # Load metrics

    chrf = load("chrf")

    rouge = load("rouge")



    # Compute metrics

    def compute_metrics(eval_preds):

        preds, labels = eval_preds

        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        chrf_score = chrf.compute(predictions=decoded_preds, references=decoded_labels)

        rouge_score = rouge.compute(predictions=decoded_preds, references=decoded_labels)

        return {

            "chrf": chrf_score["score"],

            "rouge1": rouge_score["rouge1"],

            "rouge2": rouge_score["rouge2"],

            "rougeL": rouge_score["rougeL"],

            "rougeLsum": rouge_score["rougeLsum"],

        }



    # Initialize trainer

    trainer = Seq2SeqTrainer(

        model=model,

        args=training_args,

        train_dataset=tokenized_datasets["train"],

        eval_dataset=tokenized_datasets["test"],

        tokenizer=tokenizer,

        compute_metrics=compute_metrics,

    )



    # Train the model

    trainer.train()



    # Evaluate the model

    eval_results = trainer.evaluate()

    print(f"Evaluation Results for {source_lang} to {target_lang}:", eval_results)



    return eval_results


In [None]:
# Train and evaluate for both directions

if __name__ == "__main__":

    model_name = "Helsinki-NLP/opus-mt-en-mul"

    train_file = "/content/train_set.csv"  # Replace with your training dataset

    test_file = "/content/test_set.csv"    # Replace with your test dataset



    # Kalamang to English

    print("\nTraining Kalamang to English...")

    kalamang_to_english_results = train_and_evaluate_translation(

        model_name=model_name,

        train_file=train_file,

        test_file=test_file,

        source_lang="Kalamang_Sentence",

        target_lang="English_Translation",

    )



    # English to Kalamang

    print("\nTraining English to Kalamang...")

    english_to_kalamang_results = train_and_evaluate_translation(

        model_name=model_name,

        train_file=train_file,

        test_file=test_file,

        source_lang="English_Translation",

        target_lang="Kalamang_Sentence",

    )


Training Kalamang to English...


Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/790k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/707k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.42M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/310M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Map:   0%|          | 0/1555 [00:00<?, ? examples/s]



model.safetensors:   0%|          | 0.00/310M [00:00<?, ?B/s]

Map:   0%|          | 0/101 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/9.01k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss,Chrf,Rouge1,Rouge2,Rougel,Rougelsum
1,No log,0.418383,24.612567,0.206977,0.09987,0.199943,0.199919
2,No log,0.375901,27.314634,0.237758,0.111469,0.229961,0.229274
3,0.642000,0.357299,27.972696,0.249792,0.126537,0.244996,0.244753
4,0.642000,0.347178,28.503615,0.265129,0.136779,0.255735,0.254716
5,0.642000,0.33902,30.446632,0.30972,0.167584,0.291415,0.291545
6,0.303500,0.339351,31.034464,0.313883,0.164129,0.302885,0.300711
7,0.303500,0.339217,31.100724,0.310037,0.162159,0.294224,0.293928


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.token

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead

Evaluation Results for Kalamang_Sentence to English_Translation: {'eval_loss': 0.3390199542045593, 'eval_chrf': 30.44663203675016, 'eval_rouge1': 0.3097198927159652, 'eval_rouge2': 0.16758408308213973, 'eval_rougeL': 0.2914148735745238, 'eval_rougeLsum': 0.29154470037156915, 'eval_runtime': 8.3749, 'eval_samples_per_second': 12.06, 'eval_steps_per_second': 1.552, 'epoch': 7.0}

Training English to Kalamang...


Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]



Map:   0%|          | 0/1555 [00:00<?, ? examples/s]



Map:   0%|          | 0/101 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss,Chrf,Rouge1,Rouge2,Rougel,Rougelsum
1,No log,0.418383,24.612567,0.206615,0.099927,0.198901,0.198899
2,No log,0.375901,27.314634,0.237954,0.112002,0.229225,0.22998
3,0.642000,0.357299,27.972696,0.249987,0.125493,0.244509,0.244656
4,0.642000,0.347178,28.503615,0.264968,0.136214,0.254537,0.255961
5,0.642000,0.33902,30.446632,0.309342,0.168065,0.290349,0.290619
6,0.303500,0.339351,31.034464,0.313014,0.165133,0.301519,0.301354
7,0.303500,0.339217,31.100724,0.310434,0.163453,0.294229,0.293497


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.token

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead

Evaluation Results for English_Translation to Kalamang_Sentence: {'eval_loss': 0.3390199542045593, 'eval_chrf': 30.44663203675016, 'eval_rouge1': 0.3093420060703155, 'eval_rouge2': 0.16806450843859133, 'eval_rougeL': 0.2903490728225169, 'eval_rougeLsum': 0.2906190908507966, 'eval_runtime': 8.451, 'eval_samples_per_second': 11.951, 'eval_steps_per_second': 1.538, 'epoch': 7.0}


## google-t5/t5-base Model

In [None]:
# Train and evaluate for both directions

if __name__ == "__main__":

    model_name = "google-t5/t5-base"

    train_file = "/content/train_set.csv"  # Replace with your training dataset

    test_file = "/content/test_set.csv"    # Replace with your test dataset



    # Kalamang to English

    print("\nTraining Kalamang to English...")

    kalamang_to_english_results = train_and_evaluate_translation(

        model_name=model_name,

        train_file=train_file,

        test_file=test_file,

        source_lang="Kalamang_Sentence",

        target_lang="English_Translation",

    )



    # English to Kalamang

    print("\nTraining English to Kalamang...")

    english_to_kalamang_results = train_and_evaluate_translation(

        model_name=model_name,

        train_file=train_file,

        test_file=test_file,

        source_lang="English_Translation",

        target_lang="Kalamang_Sentence",

    )


Training Kalamang to English...


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/1555 [00:00<?, ? examples/s]



Map:   0%|          | 0/101 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss,Chrf,Rouge1,Rouge2,Rougel,Rougelsum
1,No log,0.363448,12.067063,0.146763,0.052901,0.144302,0.143451
2,No log,0.321196,8.48851,0.060546,0.019822,0.060967,0.060768
3,1.001800,0.305968,17.735179,0.150183,0.056808,0.1434,0.144238
4,1.001800,0.297659,22.551086,0.178133,0.074867,0.173924,0.174569
5,1.001800,0.292621,22.665518,0.195835,0.071102,0.188331,0.189037
6,0.312500,0.290358,22.647675,0.186626,0.074859,0.181178,0.181819
7,0.312500,0.289458,22.494824,0.18907,0.076983,0.182584,0.183742


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_toke

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.token

Evaluation Results for Kalamang_Sentence to English_Translation: {'eval_loss': 0.2894580662250519, 'eval_chrf': 22.49482387148101, 'eval_rouge1': 0.18907023666141748, 'eval_rouge2': 0.07698317360151036, 'eval_rougeL': 0.18258350579225507, 'eval_rougeLsum': 0.18374196987050517, 'eval_runtime': 27.1385, 'eval_samples_per_second': 3.722, 'eval_steps_per_second': 0.479, 'epoch': 7.0}

Training English to Kalamang...


Map:   0%|          | 0/1555 [00:00<?, ? examples/s]



Map:   0%|          | 0/101 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss,Chrf,Rouge1,Rouge2,Rougel,Rougelsum
1,No log,0.363448,12.067063,0.146763,0.052901,0.144302,0.143451
2,No log,0.321196,8.48851,0.060546,0.019822,0.060967,0.060768
3,1.001800,0.305968,17.735179,0.150183,0.056808,0.1434,0.144238
4,1.001800,0.297659,22.551086,0.178133,0.074867,0.173924,0.174569
5,1.001800,0.292621,22.665518,0.195835,0.071102,0.188331,0.189037
6,0.312500,0.290358,22.647675,0.186626,0.074859,0.181178,0.181819
7,0.312500,0.289458,22.494824,0.18907,0.076983,0.182584,0.183742


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_toke

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.token

Evaluation Results for English_Translation to Kalamang_Sentence: {'eval_loss': 0.2894580662250519, 'eval_chrf': 22.49482387148101, 'eval_rouge1': 0.18907023666141748, 'eval_rouge2': 0.07698317360151036, 'eval_rougeL': 0.18258350579225507, 'eval_rougeLsum': 0.18374196987050517, 'eval_runtime': 26.7013, 'eval_samples_per_second': 3.783, 'eval_steps_per_second': 0.487, 'epoch': 7.0}


In [7]:
from google.colab import files

ModuleNotFoundError: No module named 'google.colab'

In [None]:
!zip -r /content/results_English_Translation_to_Kalamang_Sentence.zip /content/results_English_Translation_to_Kalamang_Sentence

files.download('/content/results_English_Translation_to_Kalamang_Sentence.zip')

  adding: content/results_English_Translation_to_Kalamang_Sentence/ (stored 0%)
  adding: content/results_English_Translation_to_Kalamang_Sentence/checkpoint-1170/ (stored 0%)
  adding: content/results_English_Translation_to_Kalamang_Sentence/checkpoint-1170/tokenizer_config.json (deflated 95%)
  adding: content/results_English_Translation_to_Kalamang_Sentence/checkpoint-1170/spiece.model (deflated 48%)
  adding: content/results_English_Translation_to_Kalamang_Sentence/checkpoint-1170/vocab.json (deflated 70%)
  adding: content/results_English_Translation_to_Kalamang_Sentence/checkpoint-1170/rng_state.pth (deflated 25%)
  adding: content/results_English_Translation_to_Kalamang_Sentence/checkpoint-1170/optimizer.pt (deflated 8%)
  adding: content/results_English_Translation_to_Kalamang_Sentence/checkpoint-1170/model.safetensors (deflated 8%)
  adding: content/results_English_Translation_to_Kalamang_Sentence/checkpoint-1170/trainer_state.json (deflated 68%)
  adding: content/results_Engl

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
!zip -r /content/results_Kalamang_Sentence_to_English_Translation.zip /content/results_Kalamang_Sentence_to_English_Translation

files.download('/content/results_Kalamang_Sentence_to_English_Translation.zip')

  adding: content/results_English_Translation_to_Kalamang_Sentence.zip (stored 0%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Finetuning Model on Grammar Text and Word Dictionary

In [8]:
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling,
    AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
)
from datasets import Dataset
import json
import pandas as pd
import torch

In [9]:
from evaluate import load

from accelerate import Accelerator

### Fine Tuning

In [10]:
def fine_tune_on_text(text_file, model_name, output_dir, max_len=128, num_epochs=3, batch_size=4, lr=5e-5):

    """

    Fine-tunes a model on a grammar text file.

    """

    # Load tokenizer and model

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)



    # Add a padding token if missing

    if tokenizer.pad_token is None:

        tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

        model.resize_token_embeddings(len(tokenizer))



    # Load text file

    with open(text_file, "r", encoding="utf-8") as f:

        lines = f.readlines()



    # # Prepare dataset

    # dataset = Dataset.from_dict({"text": lines})

    # Add prefixes for grammar fine-tuning

    dataset = Dataset.from_dict({"text": [f"grammar: {line.strip()}" for line in lines]})



    # def tokenize_function(examples):

    #     return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=max_len)



    def tokenize_function(examples):

        inputs = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=max_len)

        labels = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=max_len)

        inputs["labels"] = labels["input_ids"]

        return inputs



    tokenized_dataset = dataset.map(tokenize_function, batched=True)



    # # Data collator

    # data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    # Data collator

    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)



    # Training arguments

    training_args = TrainingArguments(

        output_dir=output_dir,

        overwrite_output_dir=True,

        num_train_epochs=num_epochs,

        per_device_train_batch_size=batch_size,

        save_steps=500,

        save_total_limit=2,

        evaluation_strategy="no",

        logging_steps=100,

        learning_rate=lr,

        weight_decay=0.01,

        lr_scheduler_type="linear",

        warmup_steps=500,

        fp16=True,

        logging_dir=f"{output_dir}/logs",

    )



    # Trainer

    trainer = Trainer(

        model=model,

        args=training_args,

        train_dataset=tokenized_dataset,

        data_collator=data_collator,

    )



    # Train the model

    print("Fine-tuning on grammar text...")

    trainer.train()



    # Save the model

    model.save_pretrained(output_dir)

    tokenizer.save_pretrained(output_dir)



    print(f"Model fine-tuned on grammar text and saved to {output_dir}")

    return tokenizer, model

In [11]:
def merge_json_files(json_file_1, json_file_2):

    """

    Merges two JSON files of different formats into a single dataset.

    """

    combined_translations = []



    # Process JSON file 1 (type 1)

    with open(json_file_1, "r", encoding="utf-8") as f:

        data_1 = json.load(f)

    key_values_1 = data_1.get("ke", {})

    for key, values in key_values_1.items():

        if isinstance(values, list) and len(values) > 1:

            target = ", ".join(values[1:])

            combined_translations.append({"source": key, "target": target})

            combined_translations.append({"source": target, "target": key})  # Reverse mapping for augmentation



    # Process JSON file 2 (type 2)

    with open(json_file_2, "r", encoding="utf-8") as f:

        data_2 = json.load(f)

    key_values_2 = data_2.get("ek", {})

    for key, value in key_values_2.items():

        combined_translations.append({"source": key, "target": value})

        combined_translations.append({"source": value, "target": key})  # Reverse mapping for augmentation



    return Dataset.from_pandas(pd.DataFrame(combined_translations))

In [17]:
def fine_tune_on_json(json_file_1, json_file_2, tokenizer, model, output_dir, max_len=128, num_epochs=2, batch_size=4, lr=5e-5):

    """

    Fine-tunes the model on a combined dataset created by merging two JSON files.

    """

    # Merge JSON files

    dataset = merge_json_files(json_file_1, json_file_2)



    # Add prefixes for translation

    dataset = dataset.map(lambda x: {"source": f"translate: {x['source']}"}, batched=False)



    # Tokenize the dataset

    def preprocess_function(examples):

        model_inputs = tokenizer(examples["source"], max_length=max_len, truncation=True, padding="max_length")

        labels = tokenizer(examples["target"], max_length=max_len, truncation=True, padding="max_length")

        model_inputs["labels"] = labels["input_ids"]

        return model_inputs



    tokenized_dataset = dataset.map(preprocess_function, batched=True)



    # Data collator

    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)



    # Training arguments

    training_args = TrainingArguments(

        output_dir=output_dir,

        overwrite_output_dir=True,

        num_train_epochs=num_epochs,

        per_device_train_batch_size=batch_size,

        save_steps=500,

        save_total_limit=2,

        evaluation_strategy="no",

        logging_steps=100,

        learning_rate=lr,

        weight_decay=0.01,

        lr_scheduler_type="cosine",

        warmup_steps=500,

        fp16=True,

        logging_dir=f"{output_dir}/logs",

    )



    # Trainer

    trainer = Trainer(

        model=model,

        args=training_args,

        train_dataset=tokenized_dataset,

        data_collator=data_collator,

    )



    # Train the model

    print("Fine-tuning on merged JSON dataset...")

    trainer.train()



    # Save the model

    model.save_pretrained(output_dir)

    tokenizer.save_pretrained(output_dir)



    print(f"Model further fine-tuned on JSON data and saved to {output_dir}")

    return tokenizer, model

### Evaluation

In [13]:
def evaluate_model(test_file, tokenizer, model, source_lang, target_lang, max_len=128):

    """

    Evaluates the fine-tuned model on English-to-Kalamang and Kalamang-to-English sentences using CHRF.

    """



    # Load test dataset

    test_data = pd.read_csv(test_file)



    # Prepare dataset

    dataset = Dataset.from_pandas(test_data)



    def preprocess_function(examples):

        inputs = tokenizer(

            examples[source_lang], truncation=True, padding="max_length", max_length=max_len      #, return_tensors="pt"

        )

        labels = tokenizer(

            examples[target_lang], truncation=True, padding="max_length", max_length=max_len      #, return_tensors="pt"

        )

        return {"input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"], "labels": labels["input_ids"]}



    tokenized_dataset = dataset.map(preprocess_function, batched=True)



    # # Generate predictions

    # predictions, references = [], []

    # for example in tokenized_dataset:

    #     # Convert input_ids and attention_mask to tensors

    #     input_ids = torch.tensor(example["input_ids"]).to(model.device)

    #     attention_mask = torch.tensor(example["attention_mask"]).to(model.device)



    #     with torch.no_grad():

    #         # Generate predictions

    #         outputs = model.generate(input_ids=input_ids.unsqueeze(0), attention_mask=attention_mask.unsqueeze(0), max_length=max_len)

    #     prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)



    #     # Decode references

    #     labels = torch.tensor(example["labels"]).tolist()

    #     reference = tokenizer.decode([label for label in labels if label != -100], skip_special_tokens=True)



    #     predictions.append(prediction)

    #     references.append(reference)



    # Generate predictions

    predictions, references = [], []

    for example in tokenized_dataset:

        input_ids = torch.tensor(example["input_ids"]).unsqueeze(0).to(model.device)

        with torch.no_grad():

            outputs = model.generate(input_ids, max_length=max_len)

        prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)

        reference = tokenizer.decode(example["labels"], skip_special_tokens=True)



        predictions.append(prediction)

        references.append(reference)



    # Evaluate using CHRF

    chrf = load("chrf")

    chrf_score = chrf.compute(predictions=predictions, references=references)

    print(f"CHRF Score: {chrf_score['score']:.2f}")

## Helsinki-NLP/opus-mt-en-mul

In [11]:
# File paths

grammar_file = "/content/grammar_book.txt"

json_file_type_1 = "/content/wordlist.json"

json_file_type_2 = "/content/wordlist2.json"

test_file = "/content/test_set.csv"

model_name = "Helsinki-NLP/opus-mt-en-mul"

grammar_model_output = "/content/grammar_model"

json_model_output = "/content/json_model"

In [17]:
# Step 1: Fine-tune on grammar text

tokenizer, model = fine_tune_on_text(

    text_file=grammar_file,

    model_name=model_name,

    output_dir=grammar_model_output,

)



Map:   0%|          | 0/6766 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Fine-tuning on grammar text...


Step,Training Loss
100,3.558
200,0.5999
300,0.366
400,0.2366
500,0.1666
600,0.1488
700,0.1186
800,0.1004
900,0.0925
1000,0.0758




Model fine-tuned on grammar text and saved to /content/grammar_model


In [18]:
# Step 2: Further fine-tune on merged JSON data

tokenizer, model = fine_tune_on_json(

    json_file_1=json_file_type_1,

    json_file_2=json_file_type_2,

    tokenizer=tokenizer,

    model=model,

    output_dir=json_model_output,

)

Map:   0%|          | 0/8926 [00:00<?, ? examples/s]

Map:   0%|          | 0/8926 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Fine-tuning on merged JSON dataset...


Step,Training Loss
100,0.2768
200,0.1823
300,0.1664
400,0.1554
500,0.1582
600,0.1483
700,0.1463
800,0.1442
900,0.1485
1000,0.1355


Model further fine-tuned on JSON data and saved to /content/json_model


In [21]:
# Step 3: Evaluate the model

evaluate_model(

    test_file=test_file,

    tokenizer=tokenizer,

    model=model,

    source_lang="Kalamang_Sentence",

    target_lang="English_Translation"

)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/9.01k [00:00<?, ?B/s]

CHRF Score: 8.37


In [22]:
# Step 3: Evaluate the model

evaluate_model(

    test_file=test_file,

    tokenizer=tokenizer,

    model=model,

    source_lang="English_Translation",

    target_lang="Kalamang_Sentence"

)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

CHRF Score: 9.77


In [23]:
# Step 3: Evaluate the model

evaluate_model(

    test_file="/content/train_set.csv",

    tokenizer=tokenizer,

    model=model,

    source_lang="English_Translation",

    target_lang="Kalamang_Sentence"

)

Map:   0%|          | 0/1554 [00:00<?, ? examples/s]

CHRF Score: 8.76


In [24]:
# Step 3: Evaluate the model

evaluate_model(

    test_file="/content/train_set.csv",

    tokenizer=tokenizer,

    model=model,

    source_lang="Kalamang_Sentence",

    target_lang="English_Translation"

)

Map:   0%|          | 0/1554 [00:00<?, ? examples/s]

CHRF Score: 7.39


In [32]:
!zip -r /content/grammar_model.zip /content/grammar_model

files.download('/content/grammar_model.zip')

updating: content/grammar_model/ (stored 0%)
updating: content/grammar_model/logs/ (stored 0%)
updating: content/grammar_model/logs/events.out.tfevents.1731860795.0f3cf0dac446.575.0 (deflated 65%)
updating: content/grammar_model/tokenizer_config.json (deflated 68%)
updating: content/grammar_model/checkpoint-5076/ (stored 0%)
updating: content/grammar_model/checkpoint-5076/rng_state.pth (deflated 25%)
updating: content/grammar_model/checkpoint-5076/optimizer.pt (deflated 8%)
updating: content/grammar_model/checkpoint-5076/model.safetensors (deflated 8%)
updating: content/grammar_model/checkpoint-5076/trainer_state.json (deflated 75%)
updating: content/grammar_model/checkpoint-5076/training_args.bin (deflated 51%)
updating: content/grammar_model/checkpoint-5076/config.json (deflated 61%)
updating: content/grammar_model/checkpoint-5076/generation_config.json (deflated 43%)
updating: content/grammar_model/checkpoint-5076/scheduler.pt (deflated 55%)
updating: content/grammar_model/vocab.jso

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [33]:
!zip -r /content/json_model.zip /content/json_model

files.download('/content/json_model.zip')

  adding: content/json_model/ (stored 0%)
  adding: content/json_model/logs/ (stored 0%)
  adding: content/json_model/logs/events.out.tfevents.1731863533.0f3cf0dac446.575.1 (deflated 64%)
  adding: content/json_model/tokenizer_config.json (deflated 68%)
  adding: content/json_model/vocab.json (deflated 70%)
  adding: content/json_model/model.safetensors (deflated 8%)
  adding: content/json_model/target.spm (deflated 47%)
  adding: content/json_model/checkpoint-3348/ (stored 0%)
  adding: content/json_model/checkpoint-3348/rng_state.pth (deflated 25%)
  adding: content/json_model/checkpoint-3348/optimizer.pt (deflated 8%)
  adding: content/json_model/checkpoint-3348/model.safetensors (deflated 8%)
  adding: content/json_model/checkpoint-3348/trainer_state.json (deflated 74%)
  adding: content/json_model/checkpoint-3348/training_args.bin (deflated 51%)
  adding: content/json_model/checkpoint-3348/config.json (deflated 61%)
  adding: content/json_model/checkpoint-3348/generation_config.js

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## facebook/mbart-large-50-many-to-one-mmt

In [14]:
# File paths

grammar_file = "/kaggle/input/kalamang/grammar_book.txt"

json_file_type_1 = "/kaggle/input/kalamang/wordlist.json"

json_file_type_2 = "/kaggle/input/kalamang/wordlist2.json"

test_file = "/kaggle/input/kalamang/test_set.csv"

# model_name = "Helsinki-NLP/opus-mt-en-mul"

grammar_model_output = "/content/grammar_model"

json_model_output = "/content/json_model"

In [15]:
model_name = "facebook/mbart-large-50-many-to-one-mmt"

In [16]:
# Step 1: Fine-tune on grammar text

tokenizer, model = fine_tune_on_text(

    text_file=grammar_file,

    model_name=model_name,

    output_dir=grammar_model_output,

)

tokenizer_config.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.51k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/268 [00:00<?, ?B/s]

Map:   0%|          | 0/6766 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Fine-tuning on grammar text...


[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01111293853333311, max=1.0)…

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Step,Training Loss
100,7.7636
200,2.7632
300,0.01
400,0.0049
500,0.0086
600,0.0103
700,0.007
800,0.01
900,0.0071
1000,0.0048


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Model fine-tuned on grammar text and saved to /content/grammar_model


In [18]:
# Step 2: Further fine-tune on merged JSON data

tokenizer, model = fine_tune_on_json(

    json_file_1=json_file_type_1,

    json_file_2=json_file_type_2,

    tokenizer=tokenizer,

    model=model,

    output_dir=json_model_output,

)

Map:   0%|          | 0/8926 [00:00<?, ? examples/s]

Map:   0%|          | 0/8926 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Fine-tuning on merged JSON dataset...


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Step,Training Loss
100,0.2182
200,0.1437
300,0.1323
400,0.1254
500,0.1242
600,0.1198
700,0.1274
800,0.1128
900,0.1132
1000,0.1018


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Model further fine-tuned on JSON data and saved to /content/json_model


In [19]:
# Step 3: Evaluate the model

evaluate_model(

    test_file=test_file,

    tokenizer=tokenizer,

    model=model,

    source_lang="Kalamang_Sentence",

    target_lang="English_Translation"

)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/9.01k [00:00<?, ?B/s]

CHRF Score: 11.57


In [20]:
# Step 3: Evaluate the model

evaluate_model(

    test_file=test_file,

    tokenizer=tokenizer,

    model=model,

    source_lang="English_Translation",

    target_lang="Kalamang_Sentence"

)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

CHRF Score: 12.18


In [21]:
# Step 3: Evaluate the model

evaluate_model(

    test_file="/kaggle/input/kalamang/train_set.csv",

    tokenizer=tokenizer,

    model=model,

    source_lang="Kalamang_Sentence",

    target_lang="English_Translation"

)

Map:   0%|          | 0/1554 [00:00<?, ? examples/s]

CHRF Score: 10.81


In [22]:
# Step 3: Evaluate the model

evaluate_model(

    test_file="/kaggle/input/kalamang/train_set.csv",

    tokenizer=tokenizer,

    model=model,

    source_lang="English_Translation",

    target_lang="Kalamang_Sentence"

)

Map:   0%|          | 0/1554 [00:00<?, ? examples/s]

CHRF Score: 11.85
