In [None]:
import torch
from datasets import Dataset
from transformers import (
    BertConfig, GPT2Config, EncoderDecoderConfig,
    EncoderDecoderModel, AutoTokenizer, Trainer, TrainingArguments
)

In [None]:
# 1. Dataset (slightly more examples so the model can pick up the pattern)
# Each entry is one (English, Persian) pair; the two columns are derived from it
# so a source sentence can never get out of step with its translation.
parallel_pairs = [
    ("hello", "سلام"),
    ("bye", "خداحافظ"),
    ("good morning", "صبح بخیر"),
    ("good night", "شب بخیر"),
    ("welcome", "خوش آمدید"),
    ("how are you", "چطوری"),
]
data = {
    "en_text": [english for english, _persian in parallel_pairs],
    "fa_text": [persian for _english, persian in parallel_pairs],
}
dataset = Dataset.from_dict(data)

In [None]:
# 2. Tokenizer (multilingual checkpoint)
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
# Keep the checkpoint's own [PAD] token. Aliasing pad_token to sep_token gives
# padding and end-of-sequence the same id, so any step that masks labels by
# pad_token_id also masks the real [SEP] target and the decoder never learns
# when to stop generating.
assert tokenizer.pad_token_id is not None, "tokenizer has no pad token"
assert tokenizer.pad_token_id != tokenizer.sep_token_id, "pad and sep ids must differ"

def preprocess_function(examples, max_length=12):
    """Tokenize a batch of sentence pairs into encoder inputs and decoder labels.

    Args:
        examples: batch mapping with an ``"en_text"`` list (source sentences)
            and an ``"fa_text"`` list (target sentences).
        max_length: length every sequence is padded/truncated to.

    Returns:
        The tokenizer output for the source sentences with an added
        ``"labels"`` entry: the target token ids, with padding positions
        replaced by -100 so the loss ignores them.
    """
    inputs = tokenizer(examples["en_text"], padding="max_length", truncation=True, max_length=max_length)
    targets = tokenizer(examples["fa_text"], padding="max_length", truncation=True, max_length=max_length)
    # Mask by attention_mask rather than by comparing ids with pad_token_id:
    # when the pad token shares its id with a real token (e.g. pad == sep),
    # an id comparison would also blank out the genuine end-of-sequence label.
    inputs["labels"] = [
        [token_id if is_real else -100 for token_id, is_real in zip(ids, mask)]
        for ids, mask in zip(targets["input_ids"], targets["attention_mask"])
    ]
    return inputs

# Tokenize the whole dataset in batches; each row gains the encoder inputs
# and the `labels` column produced by preprocess_function.
tokenized_datasets = dataset.map(function=preprocess_function, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]



tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

In [None]:
# 3. Custom architecture (kept in sync with the tokenizer)
# Seed torch before building the model: the weights are randomly initialised
# right here, and the Trainer is only created later, so any seeding it does
# comes too late to make this initialisation reproducible.
torch.manual_seed(42)

# Shared sizes: encoder and decoder use the same width, depth and head count.
HIDDEN_SIZE = 256
NUM_LAYERS = 4
NUM_HEADS = 4

# Both halves of the model share the tokenizer's vocabulary.
v_size = len(tokenizer)

# Encoder settings
config_encoder = BertConfig(
    vocab_size=v_size,
    num_hidden_layers=NUM_LAYERS,
    num_attention_heads=NUM_HEADS,
    hidden_size=HIDDEN_SIZE,
    intermediate_size=512,
)

# Decoder settings (cross-attention lets the decoder attend to encoder states)
config_decoder = GPT2Config(
    vocab_size=v_size,
    n_layer=NUM_LAYERS,
    n_head=NUM_HEADS,
    n_embd=HIDDEN_SIZE,
    add_cross_attention=True,
    is_decoder=True,
)

# Combine both configs into one encoder-decoder model with fresh weights
config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
model = EncoderDecoderModel(config=config)

In [None]:
# 4. Special-token ids, required to avoid empty generation output
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.eos_token_id = tokenizer.sep_token_id

# generate() takes its defaults from model.generation_config, which was built
# when the model was constructed. Mirror the ids there as well; otherwise
# generation reports a missing pad id and falls back to using eos as padding.
model.generation_config.decoder_start_token_id = tokenizer.cls_token_id
model.generation_config.pad_token_id = tokenizer.pad_token_id
model.generation_config.eos_token_id = tokenizer.sep_token_id

In [None]:
# 5. Training setup (many epochs, because the model starts from random weights)
training_options = dict(
    output_dir="./translator_final",
    report_to="none",        # no external experiment tracker
    save_total_limit=1,      # keep only the most recent checkpoint
    logging_steps=100,
    num_train_epochs=300,
    per_device_train_batch_size=4,
    learning_rate=5e-5,
    weight_decay=0.01,
)
training_args = TrainingArguments(**training_options)

trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=tokenized_datasets,
)

print("شروع آموزش مدل کاستوم...")
# Left as the last expression so the notebook displays the returned TrainOutput.
trainer.train()

شروع آموزش مدل کاستوم...




Step,Training Loss
100,8.440669
200,6.116086
300,4.790572
400,3.829462
500,3.164478
600,2.839893


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]



Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

TrainOutput(global_step=600, training_loss=4.863526611328125, metrics={'train_runtime': 52.4475, 'train_samples_per_second': 34.32, 'train_steps_per_second': 11.44, 'total_flos': 828013363200.0, 'train_loss': 4.863526611328125, 'epoch': 300.0})

In [None]:
# 6. Translation function (inference)
def translate(text, max_new_tokens=10, num_beams=5, no_repeat_ngram_size=1):
    """Translate one English string with the trained encoder-decoder model.

    Args:
        text: source sentence to translate.
        max_new_tokens: upper bound on the number of generated tokens.
        num_beams: beam width used for beam search.
        no_repeat_ngram_size: n-gram size that may not repeat in the output.
            The default of 1 forbids any token from appearing twice, which
            also blocks legitimate repetitions; pass 0 to disable.

    Returns:
        The decoded first sequence returned by ``generate``, with special
        tokens removed.
    """
    model.eval()
    # Tokenize the input and move the tensors to the model's device
    inputs = tokenizer(text, return_tensors="pt", padding=True).to(model.device)

    with torch.no_grad():
        generated_ids = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            decoder_start_token_id=model.config.decoder_start_token_id,
            eos_token_id=model.config.eos_token_id,
            # Pass the pad id explicitly so generate() does not warn on every
            # call and silently substitute the eos id for padding.
            pad_token_id=model.config.pad_token_id,
            max_new_tokens=max_new_tokens,
            num_beams=num_beams,
            early_stopping=True,
            no_repeat_ngram_size=no_repeat_ngram_size,
        )

    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)

In [None]:
# 7. Final smoke test: translate a few phrases taken from the training set
test_words = ["hello", "good morning", "welcome"]

print("\n" + "=" * 30)
for english in test_words:
    persian = translate(english)
    print(f"English: {english} ---> Persian: {persian}")

Setting `pad_token_id` to `eos_token_id`:102 for open-end generation.





Setting `pad_token_id` to `eos_token_id`:102 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:102 for open-end generation.


English: hello ---> Persian: ##وش سلام آمدید بخیر صبح خدا
English: good morning ---> Persian: ##ید صبح بخیر چطوری سلام
English: welcome ---> Persian: ب خوش آمدیدخیر سلامبح خدا
