In [None]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
!pip install transformers datasets torch sentencepiece google-cloud-storage accelerate



In [None]:
from google.cloud import storage

# Initialize GCS client
client = storage.Client()

bucket_name = 'translation-datasets'
train_file_path = f'gs://{bucket_name}/train.jsonl'
validation_file_path = f'gs://{bucket_name}/validation.jsonl'
test_file_path = f'gs://{bucket_name}/test.jsonl'

In [None]:
from datasets import load_dataset

# Load datasets from GCS bucket
train_data = load_dataset('json', data_files={'train': train_file_path})['train']
validation_data = load_dataset('json', data_files={'validation': validation_file_path})['validation']
test_data = load_dataset('json', data_files={'test': test_file_path})['test']

# Verify loaded data
print(train_data[0])


{'source_lang': 'en', 'target_lang': 'fr', 'source_text': '"Wait: I have your name here.', 'target_text': "—Attendez, j'ai la votre nom."}


In [None]:
from transformers import AutoTokenizer

# Load the NLLB tokenizer
model_name = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Language code mapping for NLLB
lang_code_map = {
    "fr": "fra_Latn",
    "en": "eng_Latn",
    "es": "spa_Latn",
    "de": "deu_Latn",
    "it": "ita_Latn",
    "pt": "por_Latn",
    "zh": "zho_Hans",
    "ja": "jpn_Jpan",
    "ar": "arb_Arab",
    "hi": "hin_Deva",
    "ru": "rus_Cyrl",
    "ko": "kor_Hang",
    "tr": "tur_Latn",
    "nl": "nld_Latn",
    "sv": "swe_Latn",
    "pl": "pol_Latn",
} 




In [None]:
# Check the column names of the dataset
print(train_data.column_names)
print(train_data[0])

['source_lang', 'target_lang', 'source_text', 'target_text']
{'source_lang': 'en', 'target_lang': 'fr', 'source_text': '"Wait: I have your name here.', 'target_text': "—Attendez, j'ai la votre nom."}


In [None]:
def preprocess_function(examples):

    tokenizer.src_lang = examples['source_lang'][0]
    tokenizer.tgt_lang = examples['target_lang'][0]
    
    # Extract source and target sentences
    inputs = examples['source_text']
    targets = examples['target_text']
    
    # Tokenize the source and target texts with padding and truncation
    model_inputs = tokenizer(
        inputs,
        text_target=targets,
        max_length=128,
        padding=True,
        truncation=True
    )
    return model_inputs


In [7]:
# Apply the preprocessing to the datasets
tokenized_train = train_data.map(preprocess_function, batched=True)
tokenized_validation = validation_data.map(preprocess_function, batched=True)
tokenized_test = test_data.map(preprocess_function, batched=True)


In [None]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from transformers import Seq2SeqTrainingArguments

# Define training arguments with a matching save and evaluation strategy
training_args = Seq2SeqTrainingArguments(
    output_dir="./nllb-finetuned",
    eval_strategy="steps",  # Evaluate at the end of each epoch
    save_strategy="steps",  # Save based on steps
    save_steps=3600,  # Save a checkpoint every 3600 steps (approx 1 hour)
    eval_steps=3600,  # Evaluate every 3600 steps (matches save_steps)
    per_device_train_batch_size=4,  # Reduce batch size
    per_device_eval_batch_size=4,   # Reduce evaluation batch size
    gradient_accumulation_steps=2,  # Accumulate gradients over 2 steps to simulate batch size of 16
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True,
    fp16=True,  # Enable mixed precision for faster training on supported GPUs
    logging_dir='./logs',
    load_best_model_at_end=True  # Ensure best model is loaded at the end
)




In [None]:
from transformers import TrainerCallback
import os
import subprocess

class UploadCheckpointToGCS(TrainerCallback):
    def on_save(self, args, state, control, **kwargs):
        # Path to the local checkpoint directory
        checkpoint_dir = f"{args.output_dir}/checkpoint-{state.global_step}"
        
        # Upload to GCS bucket
        bucket_name = "translation-datasets"
        destination = f"gs://{bucket_name}/nllb-finetuned/checkpoint-{state.global_step}"
        
        # Run the gsutil command to upload the checkpoint to GCS
        subprocess.run(f"gsutil cp -r {checkpoint_dir} {destination}", shell=True)
        print(f"Uploaded checkpoint {state.global_step} to {destination}")


In [None]:
from transformers import Seq2SeqTrainer, AutoModelForSeq2SeqLM, EarlyStoppingCallback, DataCollatorWithPadding

# Define the data collator to handle padding dynamically
data_collator = DataCollatorWithPadding(tokenizer)


# Load the NLLB model
model_name = "facebook/nllb-200-distilled-600M"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Define the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_validation,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2), UploadCheckpointToGCS()]
)



  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [None]:
import torch
torch.cuda.empty_cache()

trainer.train()




Step,Training Loss,Validation Loss
3600,0.4632,0.442232


Copying file://./nllb-finetuned/checkpoint-3600/optimizer.pt [Content-Type=application/vnd.snesdev-page-table]...
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

Copying file://./nllb-finetuned/checkpoint-3600/model.safetensors [Content-Type=application/octet-stream]...
Copying file://./nllb-finetuned/checkpoint-3600/trainer_state.json [Content-Type=applicatio

Uploaded checkpoint 3600 to gs://vosyncore-translation-datasets/nllb-finetuned/checkpoint-3600




In [None]:
# Upload to GCS
!gsutil cp -r ./nllb-finetuned gs://{bucket_name}/nllb-finetuned

# push to Hugging Face Hub
#model.push_to_hub("my-finetuned-nllb-model")
#tokenizer.push_to_hub("my-finetuned-nllb-model")


In [None]:
# Evaluate the model on the test dataset
eval_results = trainer.evaluate(eval_dataset=tokenized_test)

print(eval_results)




{'eval_loss': 11.062021255493164, 'eval_runtime': 1882.0286, 'eval_samples_per_second': 17.923, 'eval_steps_per_second': 2.241}
