<a href="https://colab.research.google.com/github/Nemat22534/project_english_to_russian_translation/blob/main/Nemat_Dadashov_210208749_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets sacrebleu sentencepiece

from datasets import load_dataset

# Load the English-Russian pairs of opus_books and carve out small subsets.
# Two details matter here:
#   * the configuration name is hyphenated ("en-ru"), not "en_ru";
#   * opus_books is published with a single "train" split, so validation rows
#     are held out from "train" whenever no "validation" split is present.
try:
    dataset = load_dataset("opus_books", "en-ru")
except Exception as e:
    # Best-effort: report the failure instead of aborting the notebook cell.
    print(f"Error loading dataset: {e}")
else:
    print("Dataset loaded successfully!")

    full_train = dataset["train"]

    # Up to 1000 samples for training (clamped so a short split cannot raise).
    train_size = min(1000, len(full_train))
    train_data = full_train.select(range(train_size))

    # Up to 200 samples for validation.
    if "validation" in dataset:
        full_val = dataset["validation"]
        val_data = full_val.select(range(min(200, len(full_val))))
    else:
        # Take the rows that directly follow the training subset so the two
        # subsets never overlap.
        val_end = min(train_size + 200, len(full_train))
        val_data = full_train.select(range(train_size, val_end))

    # Check the structure of the selected subsets
    print(f"Training data sample size: {len(train_data)}")
    print(f"Validation data sample size: {len(val_data)}")


def tokenize(examples):
    """Tokenize a batch of English-Russian pairs for seq2seq fine-tuning.

    Args:
        examples: Batch mapping with a "translation" key. Each entry is a dict
            with "en" and "ru" strings, or a list whose first element is such
            a dict (only the first translation is used).

    Returns:
        The tokenizer's encoding of the English texts (truncated/padded to 128
        tokens, returned as PyTorch tensors) with an extra "labels" entry that
        holds the Russian token ids. Padding positions in "labels" are set to
        -100 so that they are ignored by the training loss.
    """
    # Extract translations (handling potential multiple translations)
    translations = [ex[0] if isinstance(ex, list) else ex for ex in examples["translation"]]

    en_texts = [t["en"] for t in translations]
    ru_texts = [t["ru"] for t in translations]

    # Source side: English, encoded with the tokenizer's source vocabulary.
    inputs = tokenizer(en_texts, truncation=True, padding="max_length", max_length=128, return_tensors="pt")

    # Target side: Marian tokenizers use a separate target-language model, so
    # the Russian text must be passed via `text_target` rather than as input.
    targets = tokenizer(
        text_target=ru_texts, truncation=True, padding="max_length", max_length=128, return_tensors="pt"
    )

    # Replace pad ids with -100 so padded positions do not contribute to the
    # cross-entropy loss.
    labels = targets["input_ids"]
    inputs["labels"] = labels.masked_fill(labels == tokenizer.pad_token_id, -100)

    return inputs

from transformers import MarianMTModel, MarianTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments

# Print the installed transformers version so the run can be reproduced.
import transformers
print(transformers.__version__)


# Hyperparameters and bookkeeping settings for seq2seq fine-tuning.
# NOTE(review): with the 1000-sample training subset selected earlier in this
# file and a batch size of 8, three epochs is roughly 375 optimizer steps on a
# single device, so the 500-step logging and 1000-step checkpoint intervals may
# never trigger — confirm these values are intended.
training_args = Seq2SeqTrainingArguments(
    # --- optimisation ---
    num_train_epochs=3,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # --- outputs and checkpoints ---
    output_dir="./results",
    save_steps=1000,
    load_best_model_at_end=False,  # keep the final weights, not the best checkpoint
    # --- logging ---
    logging_dir="./logs",
    logging_steps=500,
)


from transformers import MarianMTModel, MarianTokenizer, pipeline

# Load the pretrained English->Russian Marian checkpoint and its tokenizer
# from the hub (this is the published checkpoint, not a locally fine-tuned one).
model_name = "Helsinki-NLP/opus-mt-en-ru"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Wrap model and tokenizer in a ready-to-call translation pipeline.
translator = pipeline(
    task="translation_en_to_ru",
    model=model,
    tokenizer=tokenizer,
)

# Sentences to translate.
english_sentences = ["Hi, My name is Nemat Dadashov"]

# Run the pipeline; it returns one result dict per input sentence.
russian_translations = translator(english_sentences)

# Show each source sentence next to its translation.
for en, ru in zip(english_sentences, russian_translations):
    translated_text = ru['translation_text']
    print(f"ENGLISH: {en}")
    print(f"RUSSIAN: {translated_text}\n")
