<a href="https://colab.research.google.com/github/HatemMoushir/Sentiment/blob/main/sent140_multilingual_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ✅ Install the required libraries (IPython/Colab shell magic; -q keeps pip quiet)
!pip install -q datasets transformers evaluate pandas

import re
import pandas as pd
import numpy as np
from datasets import load_dataset, concatenate_datasets
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import evaluate

# 🧼 Clean English text
def clean_english_text(text):
    """Normalize an English tweet before tokenization.

    Removes URLs, @mentions, punctuation (including the ``#`` symbol) and
    digits, collapses every whitespace run to a single space, then lowercases
    and trims the result.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned, lowercased text with single spaces between words.
    """
    text = re.sub(r"http\S+", "", text)     # remove URLs
    text = re.sub(r"@\w+", "", text)        # remove mentions
    text = re.sub(r"#", "", text)           # drop only the '#' symbol; the tag word is kept
    text = re.sub(r"[^\w\s]", "", text)     # remove punctuation (underscores count as \w and stay)
    text = re.sub(r"\d+", "", text)         # remove numbers
    # The removals above leave gaps behind; collapse them so the output has the
    # same single-space guarantee as clean_arabic_text.
    text = re.sub(r"\s+", " ", text)
    return text.lower().strip()

# 🧼 Normalize and clean Arabic text
def normalize_arabic(text):
    """Fold Arabic letter variants onto a single canonical letter.

    The substitutions are applied in order: alef variants to bare alef,
    alef maksura to yeh, hamza-on-waw and hamza-on-yeh to a standalone hamza,
    teh marbuta to heh, and gaf to kaf. All other characters are untouched.

    Args:
        text: Input string.

    Returns:
        The string with the substitutions above applied.
    """
    # (pattern, replacement) pairs, applied top to bottom.
    substitutions = (
        ("[إأآا]", "ا"),
        ("ى", "ي"),
        ("ؤ", "ء"),
        ("ئ", "ء"),
        ("ة", "ه"),
        ("گ", "ك"),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

def clean_arabic_text(text):
    """Normalize an Arabic tweet and strip everything that is not Arabic.

    Steps, in order: letter normalization via ``normalize_arabic``, URL
    removal, removal of every character outside U+0600-U+06FF that is not
    whitespace, and whitespace collapsing.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned text, trimmed, with single spaces between tokens.
    """
    normalized = normalize_arabic(text)
    without_urls = re.sub(r"http\S+", "", normalized)
    # Keep only the Arabic Unicode block plus whitespace.
    arabic_only = re.sub(r"[^\u0600-\u06FF\s]", "", without_urls)
    single_spaced = re.sub(r"\s+", " ", arabic_only)
    return single_spaced.strip()

# 1️⃣ Load up to 1000 Arabic tweets from ArSenTD-LEV
ds_ar = load_dataset("ramybaly/arsentd_lev", split="train")

# Resolve the class ids from the label names instead of hard-coding them.
# NOTE(review): the Hugging Face `arsentd_lev` dataset card lists the Sentiment
# labels as negative, neutral, positive, very_negative, very_positive (ids 0-4),
# in which case the previously hard-coded ids 1 and 2 selected neutral/positive
# tweets and trained "neutral" as the negative class - confirm against
# `ds_ar.features["Sentiment"]`.
_ar_label_names = getattr(ds_ar.features["Sentiment"], "names", None) or []
if "negative" in _ar_label_names and "positive" in _ar_label_names:
    AR_NEGATIVE_ID = _ar_label_names.index("negative")
    AR_POSITIVE_ID = _ar_label_names.index("positive")
else:
    # The column exposes no usable label names: keep the original ids.
    AR_NEGATIVE_ID, AR_POSITIVE_ID = 1, 2

ds_ar = ds_ar.filter(
    lambda x: x["Sentiment"] in (AR_NEGATIVE_ID, AR_POSITIVE_ID)
).shuffle(seed=42)
# Cap at the number of rows actually available so a small split does not raise.
ds_ar = ds_ar.select(range(min(1000, len(ds_ar))))
# Binary target: 0 = negative, 1 = positive.
ds_ar = ds_ar.map(lambda x: {
    "text": clean_arabic_text(x["Tweet"]),
    "label": 0 if x["Sentiment"] == AR_NEGATIVE_ID else 1
})

# 2️⃣ تحميل 1000 تغريدة إنجليزية من Sentiment140 (Stanford)
!wget -q https://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip -O sentiment140.zip
!unzip -o sentiment140.zip

ds_en = load_dataset("sentiment140", split="train")
ds_en = ds_en.filter(lambda x: x["sentiment"] in [0, 4]).shuffle(seed=43).select(range(1000))
ds_en = ds_en.map(lambda x: {
    "text": clean_english_text(x["text"]),
    "label": 0 if x["sentiment"] == 0 else 1
})

# 3️⃣ Merge both languages into a single Dataset
_language_parts = []
for _lang_ds in (ds_ar, ds_en):
    # Drop every column except the two shared ones so the schemas line up.
    _extra_columns = [col for col in _lang_ds.column_names if col not in ["text", "label"]]
    _language_parts.append(_lang_ds.remove_columns(_extra_columns))

ds = concatenate_datasets(_language_parts).shuffle(seed=44)

# Report the total number of examples (the message itself is in Arabic).
print("✅ إجمالي العبارات:", len(ds))

# 4️⃣ Set up the tokenizer and the model (same checkpoint for both)
_CHECKPOINT = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)
# Two output labels: 0 and 1.
model = AutoModelForSequenceClassification.from_pretrained(_CHECKPOINT, num_labels=2)

# 5️⃣ Encode (tokenize) the texts
def tok(x):
    """Tokenize the ``"text"`` field of ``x`` with the module-level tokenizer.

    The tokenizer is called with ``truncation=True``, ``padding="max_length"``
    and ``max_length=128``.

    Args:
        x: Mapping with a ``"text"`` entry (used with ``Dataset.map``).

    Returns:
        Whatever the tokenizer returns for that text.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    return tokenizer(x["text"], **encode_options)

# Tokenize the merged dataset in batches.
ds = ds.map(tok, batched=True)

# 6️⃣ Split the data: 10% is held out as the test split
split = ds.train_test_split(seed=45, test_size=0.1)
train_ds = split["train"]
test_ds = split["test"]

# 7️⃣ Training setup: accuracy metric
accuracy = evaluate.load("accuracy")


def compute_metrics(p):
    """Score predictions with the module-level ``accuracy`` metric.

    Args:
        p: Object exposing ``predictions`` and ``label_ids``.

    Returns:
        The result of ``accuracy.compute`` for the argmax over the last axis
        of ``p.predictions`` against ``p.label_ids``.
    """
    predicted_classes = np.argmax(p.predictions, axis=-1)
    return accuracy.compute(
        predictions=predicted_classes,
        references=p.label_ids,
    )

import inspect

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases. The notebook installs transformers unpinned, so use whichever
# keyword the installed version accepts.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="sentiment_ar_en_model",
    save_strategy="epoch",          # must match the evaluation strategy for load_best_model_at_end
    num_train_epochs=2,
    per_device_train_batch_size=16,
    logging_steps=50,
    load_best_model_at_end=True,
    # The string "none" is the documented way to disable all reporting
    # integrations; a literal None falls back to the library default.
    report_to="none",
    **{_eval_strategy_key: "epoch"},
)

import inspect

# Newer transformers releases take the tokenizer as `processing_class` and
# deprecate the `tokenizer` keyword. Use whichever the installed Trainer
# accepts so the unpinned install keeps working.
_tokenizer_key = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    compute_metrics=compute_metrics,
    **{_tokenizer_key: tokenizer},
)

# 8️⃣ Start training
trainer.train()