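# Albanian Translator Training (Colab)

This notebook runs the existing project training pipeline on a Colab GPU. By default everything is stored locally under `/content/Translator`; set `USE_DRIVE = True` in the first code cell to keep the project and its checkpoints in Google Drive instead.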

In [11]:
import os
from pathlib import Path

# --- Storage configuration -------------------------------------------------
# USE_DRIVE toggles between ephemeral Colab storage and a Google Drive folder.
USE_DRIVE = False
DATA_DIR = "data/alb_en"  # local dataset path inside project
PROJECT_DIR = "/content/Translator"

if USE_DRIVE:
    try:
        from google.colab import drive
        drive.mount("/content/drive")
    except Exception as error:
        # Best-effort: a failed mount keeps the local default PROJECT_DIR.
        print("Drive mount failed, staying on local /content storage.")
        print(error)
    else:
        # Only switch to the Drive location once the mount has succeeded.
        PROJECT_DIR = "/content/drive/MyDrive/Translator"

# Make sure the project directory exists before later cells use it.
os.makedirs(PROJECT_DIR, exist_ok=True)

for label, value in (("PROJECT_DIR:", PROJECT_DIR), ("DATA_DIR:", DATA_DIR)):
    print(label, value)

PROJECT_DIR: /content/Translator
DATA_DIR: data/alb_en


### Storage mode

- Default is **local Colab storage** (`/content/Translator`) for simpler startup.
- Set `USE_DRIVE = True` in the first code cell only if you want the project and its checkpoints to persist in Google Drive.
- `DATA_DIR` controls where training reads data from (default: `data/alb_en`).

## Clone or update the project

Set `REPO_URL` below to your repository, then run the cell.

In [12]:
import os
import subprocess

# Clone the project into PROJECT_DIR, or update it if a checkout already exists,
# then verify that the training script is present.
REPO_URL = "https://github.com/GjergjBrestovci/Translator.git"  # set this first

os.makedirs(PROJECT_DIR, exist_ok=True)

if os.path.exists(os.path.join(PROJECT_DIR, ".git")):
    print("Repo exists, pulling latest...")
    # The pull stays best-effort (an existing checkout is still usable), but a
    # failure is reported so a stale checkout does not go unnoticed.
    pull_result = subprocess.run(["git", "-C", PROJECT_DIR, "pull"], check=False)
    if pull_result.returncode != 0:
        print(
            f"WARNING: git pull failed (exit code {pull_result.returncode}); "
            "continuing with the existing checkout, which may be out of date."
        )
else:
    # Refuse to clone while REPO_URL still contains template placeholders.
    if "<your-user>" in REPO_URL or "<your-repo>" in REPO_URL:
        raise ValueError("Set REPO_URL to your actual GitHub repository before continuing.")
    print("Cloning repo...")
    subprocess.run(["git", "clone", REPO_URL, PROJECT_DIR], check=True)

# Fail early if the checkout does not look like the expected project.
script_path = os.path.join(PROJECT_DIR, "scripts", "train_albanian_to_english.py")
if not os.path.exists(script_path):
    raise FileNotFoundError(
        f"Missing {script_path}. Confirm REPO_URL points to this Translator project."
    )

print("Project ready at:", PROJECT_DIR)
print("Found training script:", script_path)

Cloning repo...
Project ready at: /content/Translator
Found training script: /content/Translator/scripts/train_albanian_to_english.py


In [13]:
%cd $PROJECT_DIR
!test -f scripts/train_albanian_to_english.py || (echo "Training script missing. Run the repo setup cell first." && exit 1)
!python -m pip install -U pip
!pip install -r requirements.txt

/content/Translator
Collecting evaluate>=0.4.0 (from -r requirements.txt (line 3))
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting sacrebleu>=2.4.0 (from -r requirements.txt (line 4))
  Downloading sacrebleu-2.6.0-py3-none-any.whl.metadata (39 kB)
Collecting portalocker (from sacrebleu>=2.4.0->-r requirements.txt (line 4))
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu>=2.4.0->-r requirements.txt (line 4))
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
Downloading sacrebleu-2.6.0-py3-none-any.whl (100 kB)
Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, colorama, sacrebleu, evaluate
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4/4[0m [evaluate]3/4[0m [evaluate]
[1A[2KSuccessfully installed colorama-0.4.6 eval

## Optional: rebuild/expand dataset (rows-api stable mode)

In [14]:
# Optional step: run scripts/prepare_dataset.py for the aln_Latn and als_Latn
# subsets and write the result to DATA_DIR ($DATA_DIR is expanded from the
# Python variable set in the configuration cell).
# This is slow: in the recorded run the als_Latn subset alone took ~17 minutes.
# NOTE(review): the --rows-api-* values presumably throttle and retry requests
# to the rows API, and --drop-early-stop is script-specific; confirm their
# exact meaning against scripts/prepare_dataset.py.
%cd $PROJECT_DIR
!PYTHONUNBUFFERED=1 python scripts/prepare_dataset.py \
  --subsets aln_Latn als_Latn \
  --output-dir $DATA_DIR \
  --data-backend rows-api \
  --max-samples-per-subset 50000 \
  --rows-api-page-size 100 \
  --rows-api-retries 8 \
  --rows-api-retry-wait-seconds 2.0 \
  --rows-api-request-interval-seconds 0.15 \
  --drop-early-stop

/content/Translator
Streaming subset: aln_Latn
aln_Latn: 563it [00:05, 102.27it/s]
Streaming subset: als_Latn
als_Latn: 50000it [17:27, 47.74it/s] 
Dataset prepared:
{
  "subsets": [
    "aln_Latn",
    "als_Latn"
  ],
  "num_total": 50557,
  "num_train": 49545,
  "num_validation": 505,
  "num_test": 507,
  "data_backend": "rows-api",
  "max_samples_per_subset": 50000,
  "min_source_chars": 20,
  "drop_early_stop": true,
  "seed": 42
}


## Train (cool/stable defaults)

In [None]:
%cd $PROJECT_DIR

!PYTHONUNBUFFERED=1 OMP_NUM_THREADS=2 TOKENIZERS_PARALLELISM=false python scripts/train_albanian_to_english.py \

  --data-dir $DATA_DIR \

  --model-name Helsinki-NLP/opus-mt-sq-en \

  --output-dir outputs/opusmt-alb-en-colab \

  --num-train-epochs 1.0 \

  --per-device-train-batch-size 4 \

  --per-device-eval-batch-size 4 \

  --gradient-accumulation-steps 4 \

  --eval-steps 1500 \

  --save-steps 1500 \

  --logging-steps 50 \

  --fp16 \

  --generation-max-length 192 \

  --generation-num-beams 1 \

  --dataloader-num-workers 0 \

  --no-filter-noisy-pairs


/content/Translator
Generating train split: 49545 examples [00:01, 31998.68 examples/s]
Generating validation split: 505 examples [00:00, 37695.74 examples/s]
Generating test split: 507 examples [00:00, 49386.01 examples/s]
Filtering noisy pairs: 100% 49545/49545 [00:00<00:00, 71161.71 examples/s]
Filtering noisy pairs: 100% 505/505 [00:00<00:00, 55343.95 examples/s]
Filtering noisy pairs: 100% 507/507 [00:00<00:00, 58748.30 examples/s]
Dataset filtering:
{'before': {'train': 49545, 'validation': 505, 'test': 507}, 'after': {'train': 24876, 'validation': 259, 'test': 264}}
config.json: 1.38kB [00:00, 4.68MB/s]
tokenizer_config.json: 100% 42.0/42.0 [00:00<00:00, 214kB/s]
source.spm: 100% 822k/822k [00:00<00:00, 33.1MB/s]
target.spm: 100% 805k/805k [00:00<00:00, 46.1MB/s]
vocab.json: 1.38MB [00:00, 85.1MB/s]
pytorch_model.bin: 100% 300M/300M [00:02<00:00, 134MB/s]  
Loading weights:  94% 242/258 [00:00<00:00, 874.10it/s, Materializing param=model.encoder.layers.5.fc1.bias]               

## Resume after disconnect

In [None]:
import os

import subprocess



# Re-launch training after a Colab disconnect, resuming from the most recent
# checkpoint when the training script supports it.
os.chdir(PROJECT_DIR)

script = os.path.join(PROJECT_DIR, "scripts", "train_albanian_to_english.py")
OUTPUT_DIR = "outputs/opusmt-alb-en-colab"


def _latest_checkpoint(output_dir):
    """Return the highest-numbered ``checkpoint-<step>`` directory, or None.

    Assumes checkpoints are saved as ``<output_dir>/checkpoint-<step>``, the
    naming the previously hard-coded ``checkpoint-1500`` path relied on.
    """
    if not os.path.isdir(output_dir):
        return None
    best_step, best_path = -1, None
    for name in os.listdir(output_dir):
        prefix, _, step = name.rpartition("-")
        candidate = os.path.join(output_dir, name)
        if prefix == "checkpoint" and step.isdecimal() and os.path.isdir(candidate):
            if int(step) > best_step:
                best_step, best_path = int(step), candidate
    return best_path


# Pick the newest checkpoint instead of a fixed step so that progress made
# after the first save is not thrown away.
checkpoint_path = _latest_checkpoint(OUTPUT_DIR)

# Same arguments as the training cell; keep the two in sync.
base_cmd = [
    "python",
    script,
    "--data-dir", DATA_DIR,
    "--model-name", "Helsinki-NLP/opus-mt-sq-en",
    "--output-dir", OUTPUT_DIR,
    "--num-train-epochs", "1.0",
    "--per-device-train-batch-size", "4",
    "--per-device-eval-batch-size", "4",
    "--gradient-accumulation-steps", "4",
    "--eval-steps", "1500",
    "--save-steps", "1500",
    "--logging-steps", "50",
    "--fp16",
    "--generation-max-length", "192",
    "--generation-num-beams", "1",
    "--dataloader-num-workers", "0",
    "--no-filter-noisy-pairs",
]

env = os.environ.copy()
env["PYTHONUNBUFFERED"] = "1"
env["OMP_NUM_THREADS"] = "2"
env["TOKENIZERS_PARALLELISM"] = "false"

# Probe the script's --help text to see whether this version of the script
# accepts the resume flag before passing it.
help_output = subprocess.run(
    ["python", script, "--help"],
    env=env,
    capture_output=True,
    text=True,
)
resume_supported = "--resume-from-checkpoint" in help_output.stdout

cmd = list(base_cmd)
if resume_supported and checkpoint_path is not None:
    cmd.extend(["--resume-from-checkpoint", checkpoint_path])
    print(f"Resuming from: {checkpoint_path}")
elif not resume_supported:
    print("Resume flag not supported by current script version; running without resume.")
else:
    print(f"No checkpoint found under {OUTPUT_DIR}; running without resume.")

print("Running:", " ".join(cmd))
subprocess.run(cmd, env=env, check=True)


/content/Translator
usage: train_albanian_to_english.py [-h] [--data-dir DATA_DIR]
                                    [--model-name MODEL_NAME]
                                    [--output-dir OUTPUT_DIR]
                                    [--max-source-length MAX_SOURCE_LENGTH]
                                    [--max-target-length MAX_TARGET_LENGTH]
                                    [--per-device-train-batch-size PER_DEVICE_TRAIN_BATCH_SIZE]
                                    [--per-device-eval-batch-size PER_DEVICE_EVAL_BATCH_SIZE]
                                    [--learning-rate LEARNING_RATE]
                                    [--num-train-epochs NUM_TRAIN_EPOCHS]
                                    [--weight-decay WEIGHT_DECAY]
                                    [--logging-steps LOGGING_STEPS]
                                    [--eval-steps EVAL_STEPS]
                                    [--save-steps SAVE_STEPS] [--seed SEED]
                                    [