# Albanian Translator Training (Colab)

This notebook runs the existing project training pipeline on Colab GPU and saves everything to Google Drive.

In [1]:
import os
from pathlib import Path

# --- Storage configuration ---------------------------------------------------
# USE_DRIVE selects persistent Google Drive storage; when it is off, or the
# mount fails, the project lives under the local /content directory instead.
USE_DRIVE = True
PROJECT_DIR = "/content/Translator"
# Relative path: later cells change into PROJECT_DIR before passing it on.
DATA_DIR = "data/alb_en"

if USE_DRIVE:
    try:
        from google.colab import drive
        drive.mount('/content/drive')
    except Exception as mount_error:
        # Best-effort: report the failure and keep the local default location.
        print("Drive mount failed, falling back to local /content storage.")
        print(mount_error)
    else:
        # Only switch to the Drive location once the mount has succeeded.
        PROJECT_DIR = "/content/drive/MyDrive/Translator"

Path(PROJECT_DIR).mkdir(parents=True, exist_ok=True)

persistence_mode = "Drive" if PROJECT_DIR.startswith('/content/drive') else "Ephemeral /content"
print("PROJECT_DIR:", PROJECT_DIR)
print("DATA_DIR:", DATA_DIR)
print("Persistence:", persistence_mode)

Drive mount failed, falling back to local /content storage.
Failed to issue request POST https://colab.research.google.com/tun/m/credentials-propagation/m-s-28it4rnpehd4u?authtype=dfs_ephemeral&version=2&dryrun=false&propagate=true&record=false&authuser=0&authuser=0: Bad Request
Response body: 
<!DOCTYPE html>
<html lang=en>
  <meta charset=utf-8>
  <meta name=viewport content="initial-scale=1, minimum-scale=1, width=device-width">
  <title>Error 400 (Bad Request)!!1</title>
  <style>
    *{margin:0;padding:0}html,code{font:15px/22px arial,sans-serif}html{background:#fff;color:#222;padding:15px}body{margin:7% auto 0;max-width:390px;min-height:180px;padding:30px 0 15px}* > body{background:url(//www.google.com/images/errors/robot.png) 100% 5px no-repeat;padding-right:205px}p{margin:11px 0 22px;overflow:hidden}ins{color:#777;text-decoration:none}a img{border:0}@media screen and (max-width:772px){body{background:none;margin-top:0;max-width:none;padding-right:0}}#logo{background:url(//www.g

### Storage mode

- Notebook is now set to **Google Drive by default** (`USE_DRIVE = True`) so checkpoints persist.
- If Drive mount fails, it falls back to local `/content/Translator` (temporary storage).
- Keep `DATA_DIR = "data/alb_en"` unless you intentionally change dataset location.

## Clone or update project in Drive
Set your repo URL below, then run.

In [2]:
import os
import subprocess

REPO_URL = "https://github.com/GjergjBrestovci/Translator.git"  # set this first

# Make sure the target directory exists; exist_ok makes a prior existence
# check unnecessary.
os.makedirs(PROJECT_DIR, exist_ok=True)

if os.path.exists(os.path.join(PROJECT_DIR, ".git")):
    print("Repo exists, pulling latest...")
    # Best-effort update: a failed pull should not block work on the existing
    # checkout, but it must not pass silently either.
    pull_result = subprocess.run(["git", "-C", PROJECT_DIR, "pull"], check=False)
    if pull_result.returncode != 0:
        print(
            f"Warning: git pull exited with code {pull_result.returncode}; "
            "continuing with the existing checkout."
        )
else:
    # Refuse to clone from an unedited template URL.
    if "<your-user>" in REPO_URL or "<your-repo>" in REPO_URL:
        raise ValueError("Set REPO_URL to your actual GitHub repository before continuing.")
    print("Cloning repo...")
    subprocess.run(["git", "clone", REPO_URL, PROJECT_DIR], check=True)

# Sanity check: the checkout must contain the training entry point used by
# the later cells.
script_path = os.path.join(PROJECT_DIR, "scripts", "train_albanian_to_english.py")
if not os.path.exists(script_path):
    raise FileNotFoundError(
        f"Missing {script_path}. Confirm REPO_URL points to this Translator project."
    )

print("Project ready at:", PROJECT_DIR)
print("Found training script:", script_path)

Cloning repo...
Project ready at: /content/Translator
Found training script: /content/Translator/scripts/train_albanian_to_english.py


: 

In [3]:
%cd $PROJECT_DIR
!test -f scripts/train_albanian_to_english.py || (echo "Training script missing. Run the repo setup cell first." && exit 1)
%pip install -U pip
%pip install -r requirements.txt

/content/Translator
Collecting pip
  Downloading pip-26.0.1-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-26.0.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m54.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-26.0.1
Collecting evaluate>=0.4.0 (from -r requirements.txt (line 3))
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting sacrebleu>=2.4.0 (from -r requirements.txt (line 4))
  Downloading sacrebleu-2.6.0-py3-none-any.whl.metadata (39 kB)
Collecting portalocker (from sacrebleu>=2.4.0->-r requirements.txt (line 4))
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu>=2.4.0->-r requirements.txt (line 4))
  Downloading colorama-0.4.6-py2.py3-none-any.wh

: 

## Optional: rebuild/expand dataset (rows-api stable mode)

In [4]:
# Optional step: runs scripts/prepare_dataset.py from the project root for the
# aln_Latn and als_Latn subsets, writing its output to DATA_DIR.
# IPython expands $PROJECT_DIR and $DATA_DIR from the notebook namespace, so the
# configuration cell must have been run first.
# The recorded run below took roughly 15 minutes for the als_Latn subset.
# Keep the backslash-continued lines contiguous: a blank line between them ends
# the shell command early.
%cd $PROJECT_DIR
!PYTHONUNBUFFERED=1 python scripts/prepare_dataset.py \
  --subsets aln_Latn als_Latn \
  --output-dir $DATA_DIR \
  --data-backend rows-api \
  --max-samples-per-subset 50000 \
  --rows-api-page-size 100 \
  --rows-api-retries 8 \
  --rows-api-retry-wait-seconds 2.0 \
  --rows-api-request-interval-seconds 0.15 \
  --drop-early-stop

/content/Translator
Streaming subset: aln_Latn
aln_Latn: 563it [00:02, 199.08it/s]
Streaming subset: als_Latn
als_Latn: 50000it [15:14, 54.65it/s] 
Dataset prepared:
{
  "subsets": [
    "aln_Latn",
    "als_Latn"
  ],
  "num_total": 50557,
  "num_train": 49545,
  "num_validation": 505,
  "num_test": 507,
  "data_backend": "rows-api",
  "max_samples_per_subset": 50000,
  "min_source_chars": 20,
  "drop_early_stop": true,
  "seed": 42
}


: 

## Train (cool/stable defaults)

In [5]:
%cd $PROJECT_DIR

!PYTHONUNBUFFERED=1 OMP_NUM_THREADS=2 TOKENIZERS_PARALLELISM=false python scripts/train_albanian_to_english.py \

  --data-dir $DATA_DIR \

  --model-name Helsinki-NLP/opus-mt-sq-en \

  --output-dir outputs/opusmt-alb-en-colab \

  --num-train-epochs 1.0 \

  --per-device-train-batch-size 4 \

  --per-device-eval-batch-size 4 \

  --gradient-accumulation-steps 4 \

  --eval-steps 1500 \

  --save-steps 1500 \

  --logging-steps 50 \

  --fp16 \

  --generation-max-length 192 \

  --generation-num-beams 1 \

  --dataloader-num-workers 0 \

  --no-filter-noisy-pairs


IndentationError: unexpected indent (ipython-input-2412385832.py, line 4)

: 

## Resume after disconnect

In [None]:
import os
import subprocess

# Run from the project root so the relative data and output paths resolve.
os.chdir(PROJECT_DIR)

script = os.path.join(PROJECT_DIR, "scripts", "train_albanian_to_english.py")
output_dir = "outputs/opusmt-alb-en-colab"


def _latest_checkpoint(run_dir):
    """Return the highest-numbered ``checkpoint-<step>`` directory in run_dir.

    Args:
        run_dir: Directory that is searched for ``checkpoint-<step>`` folders.

    Returns:
        Path of the checkpoint directory with the largest step number, or
        None when run_dir does not exist or holds no such directory.
    """
    if not os.path.isdir(run_dir):
        return None
    prefix = "checkpoint-"
    candidates = []
    for entry in os.listdir(run_dir):
        step_text = entry[len(prefix):]
        if (
            entry.startswith(prefix)
            and step_text.isdecimal()
            and os.path.isdir(os.path.join(run_dir, entry))
        ):
            candidates.append((int(step_text), entry))
    if not candidates:
        return None
    # Compare numerically so that e.g. checkpoint-10500 beats checkpoint-9000.
    _, newest = max(candidates)
    return os.path.join(run_dir, newest)


# Pick the newest checkpoint instead of a fixed step number, so the cell keeps
# working after several disconnects.
checkpoint_path = _latest_checkpoint(output_dir)

# Same arguments as the training cell, as an argv list for subprocess.
base_cmd = [
    "python",
    script,
    "--data-dir", DATA_DIR,
    "--model-name", "Helsinki-NLP/opus-mt-sq-en",
    "--output-dir", output_dir,
    "--num-train-epochs", "1.0",
    "--per-device-train-batch-size", "4",
    "--per-device-eval-batch-size", "4",
    "--gradient-accumulation-steps", "4",
    "--eval-steps", "1500",
    "--save-steps", "1500",
    "--logging-steps", "50",
    "--fp16",
    "--generation-max-length", "192",
    "--generation-num-beams", "1",
    "--dataloader-num-workers", "0",
    "--no-filter-noisy-pairs",
]

env = os.environ.copy()
env["PYTHONUNBUFFERED"] = "1"
env["OMP_NUM_THREADS"] = "2"
env["TOKENIZERS_PARALLELISM"] = "false"

# Probe the script's --help text to find out whether this version of the
# script accepts the resume flag before passing it.
help_output = subprocess.run(
    ["python", script, "--help"],
    env=env,
    capture_output=True,
    text=True,
)
resume_supported = "--resume-from-checkpoint" in help_output.stdout

cmd = list(base_cmd)
if resume_supported and checkpoint_path is not None:
    cmd.extend(["--resume-from-checkpoint", checkpoint_path])
    print(f"Resuming from: {checkpoint_path}")
elif not resume_supported:
    print("Resume flag not supported by current script version; running without resume.")
else:
    print(f"No checkpoint found under {output_dir}; running without resume.")

print("Running:", " ".join(cmd))
subprocess.run(cmd, env=env, check=True)


/content/Translator
usage: train_albanian_to_english.py [-h] [--data-dir DATA_DIR]
                                    [--model-name MODEL_NAME]
                                    [--output-dir OUTPUT_DIR]
                                    [--max-source-length MAX_SOURCE_LENGTH]
                                    [--max-target-length MAX_TARGET_LENGTH]
                                    [--per-device-train-batch-size PER_DEVICE_TRAIN_BATCH_SIZE]
                                    [--per-device-eval-batch-size PER_DEVICE_EVAL_BATCH_SIZE]
                                    [--learning-rate LEARNING_RATE]
                                    [--num-train-epochs NUM_TRAIN_EPOCHS]
                                    [--weight-decay WEIGHT_DECAY]
                                    [--logging-steps LOGGING_STEPS]
                                    [--eval-steps EVAL_STEPS]
                                    [--save-steps SAVE_STEPS] [--seed SEED]
                                    [

: 

: 