In [1]:
!apt-get update
!apt-get install tesseract-ocr -y
!pip install pytesseract opencv-python pillow transformers datasets sentencepiece sacrebleu evaluate

0% [Working]            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
0% [Connecting to archive.ubuntu.com (185.125.190.83)] [Connecting to security.                                                                               Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
0% [Connecting to archive.ubuntu.com (185.125.190.83)] [Connecting to security.0% [Waiting for headers] [Waiting for headers] [Waiting for headers] [Connectin0% [Waiting for headers] [Waiting for headers] [Waiting for headers] [Connected                                                                               Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease

In [2]:
# Cell 2: Crop Images from YOLO Annotations
# ------------------------------
import os
import cv2

image_dir = "/content/images and annotation "
output_dir = "/content/crops"
os.makedirs(output_dir, exist_ok=True)

for filename in os.listdir(image_dir):
    if filename.endswith(('.jpg', '.png')):
        image_path = os.path.join(image_dir, filename)
        annot_path = os.path.splitext(image_path)[0] + '.txt'

        img = cv2.imread(image_path)
        h, w = img.shape[:2]

        if os.path.exists(annot_path):
            with open(annot_path, 'r') as f:
                for i, line in enumerate(f):
                    parts = line.strip().split()
                    if len(parts) == 5:
                        cls, xc, yc, bw, bh = map(float, parts)
                        x1 = int((xc - bw / 2) * w)
                        y1 = int((yc - bh / 2) * h)
                        x2 = int((xc + bw / 2) * w)
                        y2 = int((yc + bh / 2) * h)
                        crop = img[y1:y2, x1:x2]
                        out_path = os.path.join(output_dir, f"{filename[:-4]}_crop_{i}.png")
                        cv2.imwrite(out_path, crop)

In [3]:
#  OCR + Translation Functions
# ------------------------------
import pytesseract
from transformers import MarianMTModel, MarianTokenizer

model_name = "Helsinki-NLP/opus-mt-en-ar"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

def extract_text(image_path):
    img = cv2.imread(image_path)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    th = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
    return pytesseract.image_to_string(th)

def translate(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    output = model.generate(**inputs)
    return tokenizer.decode(output[0], skip_special_tokens=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
#  Process Crops and Translate
# ------------------------------
results = []
for f in os.listdir(output_dir):
    if f.endswith('.png'):
        path = os.path.join(output_dir, f)
        eng = extract_text(path).strip()
        if eng:
            ar = translate(eng)
            results.append((f, eng, ar))

for r in results[2:7]:
    print(f"Image: {r[0]}\nEN: {r[1]}\nAR: {r[2]}\n{'-'*40}")

Image: 3_crop_1.png
EN: ARIS
AR: هذه المؤسسة
----------------------------------------
Image: 77_crop_3.png
EN: AA
AR: A AA
----------------------------------------
Image: 186_crop_8.png
EN: ity
AR: أط
----------------------------------------
Image: 186_crop_3.png
EN: Sine!
AR: (سين) ، (سين) ، (سين) ، (سين) ، (سين) ، (سين) ، (سين)
----------------------------------------
Image: 137_crop_2.png
EN: He)!
AR: هو) !!
----------------------------------------


In [7]:
#  Save Data for Fine-Tuning
# ------------------------------
import pandas as pd

df = pd.DataFrame(results, columns=["image", "en", "ar"])
df.to_csv("/content/en_ar.csv", index=False)


In [8]:
# Prepare Dataset for Fine-Tuning
# ------------------------------
from datasets import Dataset

data = pd.read_csv("/content/en_ar.csv")
data = data[["en", "ar"]].dropna().drop_duplicates()
dataset = Dataset.from_pandas(data)

def preprocess(example):
    inputs = tokenizer(example["en"], truncation=True, padding="max_length", max_length=128)
    targets = tokenizer(example["ar"], truncation=True, padding="max_length", max_length=128)
    inputs["labels"] = targets["input_ids"]
    return inputs

tokenized_dataset = dataset.map(preprocess, batched=True)

Map:   0%|          | 0/134 [00:00<?, ? examples/s]

In [10]:
#  Fine-Tune the Model
# ------------------------------
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

args = Seq2SeqTrainingArguments(
output_dir="/content/opus-finetuned",
per_device_train_batch_size=4,
num_train_epochs=3,
logging_dir="/content/logs",
save_total_limit=1,
save_steps=500, # instead of save_strategy="epoch"
logging_steps=100,
fp16=False,
report_to=[] # disables wandb or hub logging
)

trainer = Seq2SeqTrainer(
model=model,
args=args,
train_dataset=tokenized_dataset,
data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
)

trainer.train()


Step,Training Loss
100,0.5338




TrainOutput(global_step=102, training_loss=0.5275902450084686, metrics={'train_runtime': 17.4427, 'train_samples_per_second': 23.047, 'train_steps_per_second': 5.848, 'total_flos': 13627142701056.0, 'train_loss': 0.5275902450084686, 'epoch': 3.0})

In [13]:
# Save the model after training
model.save_pretrained("/content/opus-finetuned")
tokenizer.save_pretrained("/content/opus-finetuned")

('/content/opus-finetuned/tokenizer_config.json',
 '/content/opus-finetuned/special_tokens_map.json',
 '/content/opus-finetuned/vocab.json',
 '/content/opus-finetuned/source.spm',
 '/content/opus-finetuned/target.spm',
 '/content/opus-finetuned/added_tokens.json')

In [14]:
#  Test Fine-Tuned Model
# ------------------------------
ft_model = MarianMTModel.from_pretrained("/content/opus-finetuned")
ft_tokenizer = MarianTokenizer.from_pretrained("/content/opus-finetuned")

def ft_translate(text):
    inputs = ft_tokenizer(text, return_tensors="pt", truncation=True)
    output = ft_model.generate(**inputs)
    return ft_tokenizer.decode(output[0], skip_special_tokens=True)

# Test
print(ft_translate("Welcome to the shopping center."))




- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

In [15]:
result = ft_translate("Welcome to the shopping center.")
print(repr(result))


'- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

In [20]:


print("\n--- Testing Base Model ---")

test_text = "Welcome to the shopping center."
inputs = tokenizer(test_text, return_tensors="pt", truncation=True)


device = model.device # Get the device of the model
inputs = {k: v.to(device) for k, v in inputs.items()} # Move all input tensors to the model's device

output = model.generate(**inputs)
base_model_result = tokenizer.decode(output[0], skip_special_tokens=True)

print(f"Input English: {test_text}")
print(f"Base Model Translation (Arabic): {base_model_result}")
print(repr(base_model_result))

print("\n--- Testing Fine-Tuned Model (for comparison) ---")




--- Testing Base Model ---
Input English: Welcome to the shopping center.
Base Model Translation (Arabic): - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

In [18]:
import pandas as pd
df = pd.read_csv("/content/en_ar.csv")
print(df.head(9))


            image               en  \
0  123_crop_1.png               t9   
1   56_crop_0.png  Flaca Ae ntl gl   
2    3_crop_1.png             ARIS   
3   77_crop_3.png               AA   
4  186_crop_8.png              ity   
5  186_crop_3.png            Sine!   
6  137_crop_2.png             He)!   
7  139_crop_3.png         Internat   
8   75_crop_4.png               ag   

                                                  ar  
0                                                NaN  
1                                       Ae n t tl gl  
2                                        هذه المؤسسة  
3                                               A AA  
4                                                 أط  
5  (سين) ، (سين) ، (سين) ، (سين) ، (سين) ، (سين) ...  
6                                             هو) !!  
7                                          انت نفسكت  
8                                                NaN  


In [21]:
import os
import pandas as pd
from transformers import MarianMTModel, MarianTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from datasets import Dataset

In [22]:
# ✅ Load image paths (we assume each image has paired text: en, ar)
train_file = "/content/train.txt"
with open(train_file, "r") as f:
    image_paths = [line.strip() for line in f.readlines()]


In [23]:
# ✅ Assume you have matching clean CSV (or we create a synthetic one for now)
data = {
    "en": ["This is example sentence {}.".format(i) for i in range(len(image_paths))],
    "ar": ["هذه جملة مثال {}.".format(i) for i in range(len(image_paths))]
}
df = pd.DataFrame(data)
dataset = Dataset.from_pandas(df)

# ✅ Load Helsinki-NLP English-Arabic model
model_name = "Helsinki-NLP/opus-mt-en-ar"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)




In [24]:
# ✅ Tokenize
def preprocess(examples):
    inputs = examples["en"]
    targets = examples["ar"]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized = dataset.map(preprocess, batched=True)

Map:   0%|          | 0/13712 [00:00<?, ? examples/s]



In [25]:

# ✅ Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="/content/opus-finetuned",
    per_device_train_batch_size=4,
    num_train_epochs=3,
    save_steps=500,
    logging_steps=100,
    save_total_limit=1,
    fp16=False,
    report_to=[],
)

In [26]:

# ✅ Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
)


In [27]:

# ✅ Train and Save
trainer.train()
model.save_pretrained("/content/opus-finetunedd")
tokenizer.save_pretrained("/content/opus-finetunedd")

Step,Training Loss
100,0.0733
200,0.0564
300,0.0762
400,0.0552
500,0.0217
600,0.0409
700,0.0542
800,0.0418
900,0.0362
1000,0.0202




('/content/opus-finetunedd/tokenizer_config.json',
 '/content/opus-finetunedd/special_tokens_map.json',
 '/content/opus-finetunedd/vocab.json',
 '/content/opus-finetunedd/source.spm',
 '/content/opus-finetunedd/target.spm',
 '/content/opus-finetunedd/added_tokens.json')

In [34]:
# ✅ Test translated output
ft_model = MarianMTModel.from_pretrained("/content/opus-finetunedd")
ft_tokenizer = MarianTokenizer.from_pretrained("/content/opus-finetunedd")

def ft_translate(text):
    inputs = ft_tokenizer(text, return_tensors="pt", truncation=True)
    outputs = ft_model.generate(**inputs)
    return ft_tokenizer.decode(outputs[0], skip_special_tokens=True)

print(ft_translate("Welcome to the shopping center]]."))


مرحباً بكِ في مركز التسوق
