In [28]:
import json

import evaluate
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

In [29]:
# === CONFIGURATION ===
# Everything a reader might tune lives here; later cells read these names directly.

# Checkpoint identifier passed to AutoTokenizer / AutoModelForSeq2SeqLM.from_pretrained.
model_name = "facebook/nllb-200-distilled-600M"
# Language codes: given to the tokenizer as src_lang / tgt_lang, and the target
# code is also converted to a token id to force the first generated token.
source_lang = "eng_Latn"
target_lang = "bul_Cyrl"
# Parallel subtitle files; cues are paired by position after parsing.
source_srt_path = "data/source.srt"
target_srt_path = "data/target.srt"
# Intermediate dataset file: one {"translation": {"src", "tgt"}} JSON object per line.
json_output = "subtitles_dataset.json"
# Used both as the trainer's output_dir and as the final save location.
model_output_dir = "output/nllb_finetuned_subtitles"
# Passed as num_train_epochs.
epochs = 5
# Passed as both the per-device train and eval batch size.
batch_size = 4

In [30]:
# === STEP 1: Parse SRT files ===
def parse_srt(filepath):
    """Extract the text of every cue from an SRT subtitle file.

    Index lines and timing lines are discarded; the remaining lines of each
    cue are joined with single spaces. Cues that carry no text are skipped.

    Args:
        filepath: Path to a UTF-8 encoded ``.srt`` file (with or without BOM).

    Returns:
        A list of strings, one per non-empty cue, in file order.
    """
    # 'utf-8-sig' strips a leading BOM; with plain 'utf-8' the first index
    # line reads as '\ufeff1', fails isdigit() and leaks into the first cue.
    with open(filepath, 'r', encoding='utf-8-sig') as f:
        lines = [raw.strip() for raw in f.read().splitlines()]

    blocks = []
    parts = []  # text lines collected for the cue currently being read

    for idx, line in enumerate(lines):
        # A digit-only line is a cue index only when the timing line follows
        # it; otherwise it is subtitle text that happens to be a number.
        followed_by_timing = idx + 1 < len(lines) and '-->' in lines[idx + 1]
        if line.isdigit() and followed_by_timing:
            if parts:
                blocks.append(' '.join(parts))
                parts = []
        elif '-->' in line:
            continue
        elif line:
            parts.append(line)

    # Flush the last cue, which has no following index line to trigger it.
    if parts:
        blocks.append(' '.join(parts))

    return blocks

print("📥 Parsing subtitles...")
src_blocks = parse_srt(source_srt_path)
tgt_blocks = parse_srt(target_srt_path)
print("Done parsing the subtitle.")

# Cues are paired purely by position, and zip() stops at the shorter list.
# A count mismatch therefore means dropped cues and, most likely, misaligned
# pairs from the point where the two files diverge, so make it visible.
if len(src_blocks) != len(tgt_blocks):
    print(
        f"⚠️ Cue count mismatch: {len(src_blocks)} source vs "
        f"{len(tgt_blocks)} target cues; only the first "
        f"{min(len(src_blocks), len(tgt_blocks))} pairs are written and they may be misaligned."
    )

# Create JSONL dataset: one {"translation": {"src": ..., "tgt": ...}} object per line.
with open(json_output, "w", encoding="utf-8") as f:
    for src, tgt in zip(src_blocks, tgt_blocks):
        # ensure_ascii=False keeps Cyrillic text readable in the file.
        json.dump({"translation": {"src": src, "tgt": tgt}}, f, ensure_ascii=False)
        f.write("\n")

📥 Parsing subtitles...
Done parsing the subtitle.


In [31]:
# === STEP 2: Load and split dataset ===
print("📊 Loading dataset...")

# Read the line-delimited JSON file as a single "train" split.
dataset = load_dataset("json", data_files=json_output, split="train")

# Hold out 20% of the pairs for evaluation; the fixed seed keeps the split
# identical across runs.
dataset_split = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset, test_dataset = dataset_split["train"], dataset_split["test"]

print("Training dataset: ", train_dataset)
print("Test dataset: ", test_dataset)

📊 Loading dataset...


Generating train split: 0 examples [00:00, ? examples/s]

Training dataset:  Dataset({
    features: ['translation'],
    num_rows: 5
})
Test dataset:  Dataset({
    features: ['translation'],
    num_rows: 2
})


In [44]:
# === STEP 3: Tokenization ===
print("🧠 Tokenizing...")
# Load the tokenizer for `model_name`, configured with the source and target
# language codes from the configuration cell.
tokenizer = AutoTokenizer.from_pretrained(model_name, src_lang=source_lang, tgt_lang=target_lang)

def tokenize_fn(batch):
    """Tokenize a batch of translation pairs for seq2seq training.

    Args:
        batch: A batched mapping whose 'translation' entry is a list of
            dicts with 'src' and 'tgt' strings.

    Returns:
        The tokenizer output for the source texts with an added 'labels'
        entry holding the target token ids, where every padding position is
        replaced by -100.
    """
    src_texts = [item['src'] for item in batch['translation']]
    tgt_texts = [item['tgt'] for item in batch['translation']]

    # `text_target` tokenizes the targets in the same call and stores their
    # ids under "labels"; it replaces the deprecated as_target_tokenizer()
    # context manager.
    model_inputs = tokenizer(
        src_texts,
        text_target=tgt_texts,
        max_length=128,
        padding="max_length",
        truncation=True,
    )

    # Mask padding in the labels with -100, the value this pipeline uses to
    # mark positions that must not contribute to the training loss.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token if token != pad_id else -100 for token in label_seq]
        for label_seq in model_inputs["labels"]
    ]
    return model_inputs


# Tokenize both splits; any failure is reported and then re-raised.

print(train_dataset)
print(test_dataset)


def _tokenize_split(split):
    """Run `tokenize_fn` over `split` in batches, dropping its original columns."""
    return split.map(
        tokenize_fn,
        batched=True,
        remove_columns=split.column_names,
    )


try:
    train_dataset = _tokenize_split(train_dataset)
    test_dataset = _tokenize_split(test_dataset)
except Exception as e:
    print(f"Error during tokenization: {str(e)}")
    raise

🧠 Tokenizing...
Dataset({
    features: ['translation'],
    num_rows: 5
})
Dataset({
    features: ['translation'],
    num_rows: 2
})


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

In [47]:
# === STEP 4: Training Setup ===
print("🏋️ Starting training setup...")
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

training_args = Seq2SeqTrainingArguments(
    output_dir=model_output_dir,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    # Write a checkpoint at the end of every epoch.
    save_strategy="epoch",
    logging_dir="./logs",
    # Evaluate by generating text, so the metric sees real translations.
    predict_with_generate=True,
    # fp16 mixed precision needs a CUDA device; enable it only when one is
    # present so the notebook also runs on CPU-only machines.
    fp16=torch.cuda.is_available(),
)

# Metric used by compute_metrics to score generated translations.
bleu = evaluate.load("sacrebleu")

def compute_metrics(eval_preds):
    """Score generated translations against the references with sacreBLEU.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id batches.
            ``predictions`` may itself be a tuple, in which case its first
            element is used.

    Returns:
        The dict produced by ``bleu.compute`` for the decoded texts.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id

    def _restore_padding(batch):
        # -100 marks ignored positions and is not a real token id, so it
        # cannot be decoded; swap it back to the pad id, which
        # skip_special_tokens then removes.
        return [[int(tok) if tok != -100 else pad_id for tok in seq] for seq in batch]

    decoded_preds = tokenizer.batch_decode(_restore_padding(preds), skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(_restore_padding(labels), skip_special_tokens=True)
    # sacreBLEU expects a list of reference lists, one list per prediction.
    return bleu.compute(predictions=decoded_preds, references=[[lbl] for lbl in decoded_labels])

# Wire the model, arguments, data and metric together.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # `processing_class` replaces the deprecated `tokenizer` argument, which
    # emits a FutureWarning on current transformers releases.
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

# === STEP 5: Train ===
print("🚀 Training...")
# Run fine-tuning with the trainer configured above.
trainer.train()

# === STEP 6: Save Model ===
print("💾 Saving fine-tuned model...")
# Save the model and the tokenizer into the same directory so both can later
# be loaded back with from_pretrained(model_output_dir).
trainer.save_model(model_output_dir)
tokenizer.save_pretrained(model_output_dir)





🏋️ Starting training setup...


  trainer = Seq2SeqTrainer(


🚀 Training...


Step,Training Loss




KeyboardInterrupt: 

In [50]:
print("🧪 Running a test translation...")
test_text = "Where are you going?"

# Tell the tokenizer which language the input text is in
tokenizer.src_lang = source_lang

# Tokenize the input and move it to the model's device: after training the
# model may sit on an accelerator, and generate() fails when the inputs and
# the model are on different devices.
inputs = tokenizer(test_text, return_tensors="pt").to(model.device)

# Get the target language token ID
forced_bos_token_id = tokenizer.convert_tokens_to_ids(target_lang)

# Generate translation, forcing the target-language token as the first output token
generated = model.generate(**inputs, forced_bos_token_id=forced_bos_token_id)

# Decode and print
print("🔤 Translation:", tokenizer.decode(generated[0], skip_special_tokens=True))


🧪 Running a test translation...
🔤 Translation: Къде отиваш? - Да.


In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# --- Inference settings ---
# Directory holding the saved fine-tuned model and tokenizer
model_path = "output/nllb_finetuned_subtitles"
# Language codes for the translation direction
source_lang = "eng_Latn"
target_lang = "bul_Cyrl"
# Sentence to translate
text = "Where are you going?"

# --- Restore tokenizer and model from disk ---
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

# --- Encode the sentence as source-language input ---
tokenizer.src_lang = source_lang
inputs = tokenizer(text, return_tensors="pt")

# --- Generate, forcing the target-language token as the first output token ---
forced_bos_token_id = tokenizer.convert_tokens_to_ids(target_lang)
generated_tokens = model.generate(**inputs, forced_bos_token_id=forced_bos_token_id)

# --- Decode the first returned sequence and show it ---
translation = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
print("Translation:", translation)

The directory nllb_finetuned_subtitles/ must contain:

    model.safetensors (or pytorch_model.bin when saved by an older transformers release)

    config.json

    tokenizer.json

    tokenizer_config.json

    special_tokens_map.json