<a href="https://colab.research.google.com/github/MariamEmad111/graduation-project-fashion-search-ai/blob/main/lora.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets peft accelerate bitsandbytes --quiet

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import pandas as pd
from datasets import Dataset


df = pd.read_csv('/content/drive/MyDrive/clothing_dataset_ar_en_translated.csv')

dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.1)

In [None]:
!pip uninstall -y torch torchvision torchaudio
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Found existing installation: torch 2.6.0+cu124
Uninstalling torch-2.6.0+cu124:
  Successfully uninstalled torch-2.6.0+cu124
Found existing installation: torchvision 0.21.0+cu124
Uninstalling torchvision-0.21.0+cu124:
  Successfully uninstalled torchvision-0.21.0+cu124
Found existing installation: torchaudio 2.6.0+cu124
Uninstalling torchaudio-2.6.0+cu124:
  Successfully uninstalled torchaudio-2.6.0+cu124
Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting torch
  Downloading https://download.pytorch.org/whl/cu118/torch-2.7.1%2Bcu118-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (28 kB)
Collecting torchvision
  Downloading https://download.pytorch.org/whl/cu118/torchvision-0.22.1%2Bcu118-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (6.1 kB)
Collecting torchaudio
  Downloading https://download.pytorch.org/whl/cu118/torchaudio-2.7.1%2Bcu118-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (6.6 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading https://download.pyto

In [None]:
from transformers import MBart50TokenizerFast, MBartForConditionalGeneration

model_path = "/content/drive/MyDrive/mBART/mBART"

tokenizer = MBart50TokenizerFast.from_pretrained(model_path, local_files_only=True)
model = MBartForConditionalGeneration.from_pretrained(model_path, local_files_only=True)

In [None]:
max_len = 64

def preprocess(examples):
    inputs = tokenizer(
        examples['Arabic'],
        max_length=max_len,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )
    targets = tokenizer(
        examples['English'],
        max_length=max_len,
        padding="max_length",
        truncation=True
    )

    inputs["labels"] = targets["input_ids"]
    return inputs

tokenized = dataset.map(preprocess, batched=True, remove_columns=dataset['train'].column_names)

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
import torch

model = prepare_model_for_kbit_training(model)

config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_2_SEQ_LM"
)

model = get_peft_model(model, config)

In [None]:
!pip install -U transformers



In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

training_args = Seq2SeqTrainingArguments(
    output_dir="./mbart50-lora-fashion",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-4,
    num_train_epochs=5,
    #evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    fp16=True,
    save_total_limit=2,
    report_to="none"
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model)
)

trainer.train()

  trainer = Seq2SeqTrainer(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
10,10.5923
20,9.8766
30,9.5058
40,9.1935
50,9.1065
60,9.0013
70,8.9649
80,8.8516
90,8.8108
100,8.7341


TrainOutput(global_step=5625, training_loss=8.629172721354166, metrics={'train_runtime': 1089.3694, 'train_samples_per_second': 41.308, 'train_steps_per_second': 5.164, 'total_flos': 6115436789760000.0, 'train_loss': 8.629172721354166, 'epoch': 5.0})

In [None]:
trainer.save_model("/content/drive/MyDrive/mbart50_finetuned_lora")

In [None]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

model_path = "/content/drive/MyDrive/mbart50_finetuned_lora"


tokenizer = MBart50TokenizerFast.from_pretrained(model_path)
model = MBartForConditionalGeneration.from_pretrained(model_path)


input_text = "فستان أحمر ستان من بريشكا"


inputs = tokenizer(input_text, return_tensors="pt")


generated_tokens = model.generate(
    **inputs,
    forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"],
    max_length=50,
    num_beams=4,
    early_stopping=True
)


output_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)

print("Arabic:", input_text)
print("English:", output_text)

Arabic: فستان أحمر ستان من بريشكا
English: A red dress from Bershka


In [7]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

model_path = "/content/drive/MyDrive/mbart50_finetuned_lora"


tokenizer = MBart50TokenizerFast.from_pretrained(model_path)
model = MBartForConditionalGeneration.from_pretrained(model_path)

In [3]:

!pip install evaluate sacrebleu


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py

In [8]:
from datasets import load_dataset, load_metric
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
import torch
from tqdm import tqdm
import evaluate


tokenizer.src_lang = "ar_AR"

# 4.Batching
def compute_bleu_batch(model, tokenizer, dataset, max_len=64, batch_size=16):
    metric = evaluate.load("sacrebleu")
    predictions = []
    references = []

    for i in tqdm(range(0, len(dataset), batch_size)):
        batch = dataset[i:i+batch_size]
        input_texts = batch['Arabic']
        target_texts = batch['English']

        inputs = tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True, max_length=max_len).to(model.device)
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"],
                max_length=max_len
            )
        decoded_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        predictions.extend(decoded_preds)
        references.extend([[ref] for ref in target_texts])  # BLEU expects list of list

    bleu = metric.compute(predictions=predictions, references=references)
    return bleu

# 5. test
sample_dataset = dataset['test'].select(range(1000))
results = compute_bleu_batch(model, tokenizer, sample_dataset, batch_size=16)
print(f"\n✅ BLEU score: {results['score']:.2f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

100%|██████████| 63/63 [28:43<00:00, 27.36s/it]


✅ BLEU score: 86.08



