In [None]:
import os
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
import evaluate
import warnings

warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
# Mount Google Drive at /content/drive so later cells can read the dataset
# and write the fine-tuned model there. This import only exists on Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Root folder on Drive that holds the parallel English/Arabic Excel sheets.
base_folder = "/content/drive/My Drive/HealthCare-Translation2"

# Collect every sentence pair as {"translation": {"en": ..., "ar": ...}}.
data = []
for folder_name in ["Patient-Educational-Materials", "Patient-Information-Leaflets"]:
    folder_path = os.path.join(base_folder, folder_name)
    # sorted(): os.listdir order is arbitrary; sorting keeps the pair order stable between runs.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith(".xlsx"):
            continue
        file_path = os.path.join(folder_path, file_name)
        # Read the sheet; the first column is English, the second is Arabic.
        df = pd.read_excel(file_path)
        # Drop rows where either side is missing: str(NaN) would otherwise
        # produce the literal text "nan" and end up as a training example.
        pairs = df.iloc[:, :2].dropna()
        for eng, arb in zip(pairs.iloc[:, 0], pairs.iloc[:, 1]):
            eng_text, arb_text = str(eng).strip(), str(arb).strip()
            # Skip cells that contain only whitespace.
            if eng_text and arb_text:
                data.append({"translation": {"en": eng_text, "ar": arb_text}})

print(f"Number of translated pairs: {len(data)}")

Number of translated pairs: 51251


In [None]:
# Build a Hugging Face Dataset from the collected pairs and hold out 10% for evaluation.
dataset = Dataset.from_list(data)
# Fixed seed so the train/test partition (and therefore the reported
# validation loss) is the same on every run of the notebook.
dataset = dataset.train_test_split(test_size=0.1, seed=42)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 46125
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 5126
    })
})


In [None]:
# Hub identifier of the pretrained English-to-Arabic checkpoint that is fine-tuned below.
model_name = "Helsinki-NLP/opus-mt-en-ar"

# Fetch the seq2seq model weights and the matching tokenizer from the Hub.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
def preprocess_function(examples):
    """Tokenize a batch of English/Arabic translation pairs.

    Args:
        examples: A batch from ``Dataset.map(batched=True)``; its
            ``"translation"`` entry is a list of ``{"en": ..., "ar": ...}`` dicts.

    Returns:
        The tokenizer output for the English sources, with the tokenized
        Arabic targets stored under ``labels``. Sequences are truncated to
        128 tokens but NOT padded.
    """
    inputs = [ex["en"] for ex in examples["translation"]]
    targets = [ex["ar"] for ex in examples["translation"]]
    # Do not pad here. Padding to max_length would fill `labels` with the pad
    # token id, and those positions would then be scored by the loss.
    # DataCollatorForSeq2Seq pads each batch dynamically and fills label
    # padding with -100, which the loss ignores.
    model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
    return model_inputs

print(dataset["train"][0])  # show the first raw sample

{'translation': {'ar': 'سوف يخبرك طبيبك بالضبط عن المدة التي ستحتاج أن تستعمل خلالها الأقراص.', 'en': 'Your doctor will tell you exactly for how long you will need to take the tablets.'}}


In [None]:
# Tokenize both splits; batched mode hands preprocess_function lists of examples.
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
)

Map:   0%|          | 0/46125 [00:00<?, ? examples/s]

Map:   0%|          | 0/5126 [00:00<?, ? examples/s]

In [None]:
# Sanity check: print the first tokenized training record (the raw
# translation pair plus the token-id fields added by preprocess_function).
print(tokenized_datasets["train"][0])

{'translation': {'ar': 'سوف يخبرك طبيبك بالضبط عن المدة التي ستحتاج أن تستعمل خلالها الأقراص.', 'en': 'Your doctor will tell you exactly for how long you will need to take the tablets.'}, 'input_ids': [1723, 6418, 104, 843, 30, 4929, 19, 501, 764, 30, 104, 249, 9, 301, 3, 57097, 2, 0, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 62801, 

In [None]:
# Convenience handles for the two tokenized splits.
train_dataset, eval_dataset = (
    tokenized_datasets["train"],
    tokenized_datasets["test"],
)

In [None]:
# Hyper-parameters for fine-tuning the translation model.
# NOTE(review): newer transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy` - confirm against the installed version.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",          # where checkpoints and results are written
    evaluation_strategy="epoch",    # evaluate after every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    save_total_limit=2,
    predict_with_generate=True,
    fp16=True,                      # speed-up using 16-bit precision
)


In [None]:
# Collator that batches tokenized examples for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
# Disabled BLEU metric, kept for reference only.
# NOTE(review): before re-enabling, check the following:
#   * `metric` is not defined anywhere in the visible notebook;
#   * predictions and labels are filtered independently, so dropping an empty
#     string from one list can misalign the two lists;
#   * with predict_with_generate=True the predictions are presumably already
#     token ids, so the argmax is likely wrong - confirm.
#def compute_metrics(pred):
#     labels = pred.label_ids
#     predictions = pred.predictions.argmax(axis=-1)

#     # decode the predictions and the labels
#     predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
#     labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

#     # drop empty samples
#     predictions = [pred for pred in predictions if pred.strip()]
#     labels = [label for label in labels if label.strip()]

#     # make sure some data is left after cleaning
#     if not predictions or not labels:
#         return {"bleu": 0.0}  # return a BLEU of zero when no usable data remains

#     try:
#         result = metric.compute(predictions=predictions, references=[[label] for label in labels])
#     except ZeroDivisionError:
#         print("Warning: ZeroDivisionError encountered in BLEU calculation. Returning 0.0 for BLEU score.")
#         result = {"bleu": 0.0}
#     return {"bleu": result["bleu"]}

# Trainer wired to the tokenized splits; no compute_metrics is passed, so only
# the losses are reported during evaluation.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    # compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the trainer configured in the previous cell.
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mgogomagdi2003[0m ([33mgogomagdi2003-p[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,0.2044,0.179463
2,0.1694,0.167645
3,0.1511,0.162755
4,0.1426,0.16037
5,0.1368,0.159583




TrainOutput(global_step=14415, training_loss=0.17104201328411228, metrics={'train_runtime': 2601.7778, 'train_samples_per_second': 88.641, 'train_steps_per_second': 5.54, 'total_flos': 7817810411520000.0, 'train_loss': 0.17104201328411228, 'epoch': 5.0})

In [None]:
# evaluation_results = trainer.evaluate()
# print("Evaluation Results: ", evaluation_results)

In [None]:
# Destination folder on Google Drive for the fine-tuned checkpoint.
output_dir = "/content/drive/My Drive/medical-translation-model/test1"

# Write the model first, then the tokenizer files, into that same folder.
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"Model and tokenizer saved successfully to: {output_dir}")



Model and tokenizer saved successfully to: /content/drive/My Drive/medical-translation-model/test1


In [None]:
import torch
from transformers import pipeline

# Load the fine-tuned model AND its tokenizer from the saved checkpoint, so
# this cell checks exactly what was written to Drive rather than whatever
# tokenizer happens to be in memory.
saved_model_dir = "/content/drive/My Drive/medical-translation-model/test1"
translator = pipeline(
    "translation_en_to_ar",
    model=saved_model_dir,
    tokenizer=saved_model_dir,
    # Use the GPU when one is present; -1 keeps the pipeline on CPU.
    device=0 if torch.cuda.is_available() else -1,
)

# Smoke-test with a real sentence. Translating an empty string only makes the
# model hallucinate text and says nothing about translation quality.
result = translator("The patient should take the medicine after eating.")
print(result[0]['translation_text'])


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


طریق العین


In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Folder on Drive that holds the saved fine-tuned checkpoint.
model_path = "/content/drive/My Drive/medical-translation-model/test1"

# Rebuild the model and the tokenizer from the files in that folder.
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)




In [None]:
import torch

def translate(text):
    """Translate English text to Arabic with the fine-tuned model.

    Args:
        text: The English sentence to translate.

    Returns:
        The decoded Arabic translation of the (first) input sequence, with
        special tokens removed.
    """
    # Pad only up to the longest sequence in the call (a no-op for a single
    # sentence) rather than always padding to 128 tokens.
    inputs = tokenizer(text, return_tensors="pt", max_length=128, truncation=True, padding=True)
    # Put the inputs on the model's device so generation also works when the
    # model has been moved to a GPU.
    inputs = inputs.to(model.device)
    # Beam-search generation; no gradients are needed at inference time.
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=128, num_beams=5, early_stopping=True)
    # Decode the generated token ids back to text.
    translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return translated_text

# Quick single-sentence check of the reloaded model.
test_sentence = "The patient should take the medicine after eating."
translated_sentence = translate(test_sentence)
print(f"Original Sentence:  {test_sentence}")
print(f"Translated Sentence:  {translated_sentence}")

Original Sentence:  The patient should take the medicine after eating.
Translated Sentence:  يجب أن يتناول المريض الدواء بعد الأكل.


In [None]:
# A handful of typical patient-instruction sentences for a qualitative check.
test_sentences = [
    "Take one tablet twice daily.",
    "This medicine can cause dizziness.",
    "You need to drink more water while taking this medication.",
    "Keep the medicine away from children.",
    "The doctor will check your blood pressure regularly.",
    "Take the medicine two times",
]

# Print each source sentence with its translation, followed by a separator line.
for sentence in test_sentences:
    print(
        f"English: {sentence}",
        f"Arabic: {translate(sentence)}",
        "-" * 30,
        sep="\n",
    )


English: Take one tablet twice daily.
Arabic: تناول قرصًا واحدًا مرتين يوميًا.
------------------------------
English: This medicine can cause dizziness.
Arabic: قد يسبب هذا الدواء دوخة.
------------------------------
English: You need to drink more water while taking this medication.
Arabic: تحتاج إلى شرب المزيد من الماء أثناء تناول هذا الدواء.
------------------------------
English: Keep the medicine away from children.
Arabic: احفظ الدواء بعيدًا عن الأطفال.
------------------------------
English: The doctor will check your blood pressure regularly.
Arabic: سيقوم الطبيب بفحص ضغط الدم بانتظام.
------------------------------
English: Take the medicine two times
Arabic: تناول الدواء مرتين
------------------------------
