<a href="https://colab.research.google.com/github/HatemMoushir/Sentiment/blob/main/sent140_multilingual_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# ✅ 1. Download the English data (Sentiment140)
# -q keeps wget quiet; unzip -o overwrites existing files without prompting,
# so this cell can be re-run safely.
!wget -q https://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip
!unzip -o trainingandtestdata.zip

# ✅ 2. Install the tools
!pip install -q datasets transformers evaluate pandas

# ✅ 3. Read and filter the Sentiment140 data (1,000 English tweets only)
import pandas as pd
import os
import re

# Passing names= makes pandas treat every line of the CSV as data (no header row).
cols = ['label', 'id', 'date', 'query', 'user', 'text']
df_en = (
    pd.read_csv(
        'training.1600000.processed.noemoticon.csv',
        encoding='latin-1',
        names=cols,
    )
    # Keep only rows whose label is one of the two codes 0 or 4.
    .loc[lambda frame: frame['label'].isin([0, 4])]
    # Re-encode the label as a binary target: 0 stays 0, 4 becomes 1.
    .assign(label=lambda frame: frame['label'].map({0: 0, 4: 1}))
    # Only the tweet text and its label are needed downstream.
    .loc[:, ['text', 'label']]
    # Fixed-seed sample of 1,000 tweets, renumbered from 0.
    .sample(1000, random_state=42)
    .reset_index(drop=True)
)

# ✅ 4. تحميل بيانات ASTD (1000 تغريدة عربية فقط)
!wget -O ASTD_cleaned.csv https://raw.githubusercontent.com/hazem-taha/arabic-sentiment-analysis/master/datasets/ASTD/clean-dataset/ASTD_cleaned.csv

df_ar = pd.read_csv("ASTD_cleaned.csv")
df_ar = df_ar[['text', 'label']].dropna()
df_ar = df_ar[df_ar['label'].isin(['Positive', 'Negative'])]
df_ar['label'] = df_ar['label'].map({'Negative': 0, 'Positive': 1})
df_ar = df_ar.sample(1000, random_state=42).reset_index(drop=True)

# ✅ 5. Clean the texts
def clean_text(text):
    """Return a normalised, lower-cased version of *text*.

    The input is converted with ``str()`` and lower-cased, then the
    substitutions below are applied in order: URLs, @mentions and #hashtags
    are removed, every character that is not a word character, whitespace or
    in the Arabic block U+0600-U+06FF is dropped, and runs of whitespace are
    collapsed to a single space. Leading/trailing whitespace is stripped.
    """
    # (pattern, replacement) pairs; the order matters because URLs, mentions
    # and hashtags must be removed before their punctuation is stripped.
    substitutions = (
        (r"http\S+", ""),
        (r"@\w+", ""),
        (r"#\w+", ""),
        # \w is Unicode-aware for str patterns; the explicit range also keeps
        # the non-letter code points of the Arabic block (marks, punctuation).
        (r"[^\w\s\u0600-\u06FF]", ""),
        (r"\s+", " "),
    )
    cleaned = str(text).lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Run both corpora through the same cleaning routine.
df_en['text'] = df_en['text'].map(clean_text)
df_ar['text'] = df_ar['text'].map(clean_text)

# ✅ 6. Merge the data (1,000 Arabic + 1,000 English)
# Stack the two frames, shuffle all rows with a fixed seed, then renumber them.
df_all = pd.concat([df_en, df_ar])
df_all = df_all.sample(frac=1, random_state=42)
df_all = df_all.reset_index(drop=True)

# ✅ 7. Convert to a Hugging Face Dataset
from datasets import Dataset
dataset = Dataset.from_pandas(df_all)

# ✅ 8. Load the tokenizer & model (Multilingual BERT)
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import evaluate

# Tokenizer matching the checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

def tokenize(example):
    """Tokenize the ``"text"`` field of a dataset batch.

    Args:
        example: Mapping with a ``"text"`` entry; it is called through
            ``dataset.map(..., batched=True)``, so the entry holds a list of
            strings.

    Returns:
        The tokenizer output for those texts, truncated to the model's
        maximum length and left unpadded.
    """
    # No padding="max_length" here: without an explicit max_length that pads
    # every short tweet to the model maximum (512 tokens), multiplying memory
    # and compute. The Trainer is given the tokenizer, so its default collator
    # pads each batch to that batch's longest sequence instead.
    return tokenizer(example["text"], truncation=True)

# batched=True hands tokenize() chunks of rows instead of one row at a time.
dataset = dataset.map(tokenize, batched=True)
# Hold out 10% for evaluation. The seed is fixed so the split is identical on
# every run, in line with random_state=42 used for all earlier sampling.
dataset = dataset.train_test_split(test_size=0.1, seed=42)

# ✅ 9. Load the model
# num_labels=2 sizes the classification head for the binary 0/1 target.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=2)

# ✅ 10. Settings and training
# Ask the Hugging Face integrations to skip Weights & Biases logging.
os.environ["WANDB_DISABLED"] = "true"

# Accuracy metric object; compute_metrics() calls its .compute() method.
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``accuracy.compute`` for the arg-max class of each
        row of logits against the reference labels.
    """
    scores, gold = eval_pred
    return accuracy.compute(
        # The predicted class is the index of the largest logit on the last axis.
        predictions=np.argmax(scores, axis=-1),
        references=gold,
    )

# The per-epoch evaluation switch is called `eval_strategy` in newer
# transformers releases and `evaluation_strategy` in older ones; releases that
# dropped the old name reject it. The notebook installs an unpinned
# transformers, so pick whichever keyword the installed version accepts.
import inspect

_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

args = TrainingArguments(
    output_dir="sent140_ar_model",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    # Evaluation and checkpointing both run once per epoch; the two strategies
    # must match for load_best_model_at_end to work.
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_steps=100,
    # The string "none" disables all reporting integrations. Python None does
    # not: in transformers 4.x it falls back to every installed integration.
    report_to="none",
    **{_eval_strategy_key: "epoch"},
)

# Wire the model, the data splits and the metric function into the Trainer.
trainer = Trainer(
    # What to train and how.
    model=model,
    tokenizer=tokenizer,
    args=args,
    # Training data and the held-out split used for evaluation.
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    # Called with the evaluation predictions to produce the reported metrics.
    compute_metrics=compute_metrics,
)

# ✅ Start training
trainer.train()

# ✅ Save the model
# Model and tokenizer are written to the same directory so that both can be
# loaded back from "sent140_ar_model" for inference.
trainer.save_model("sent140_ar_model")
tokenizer.save_pretrained("sent140_ar_model")

# ✅ Test the model
from transformers import pipeline
# Load the fine-tuned model and its tokenizer back from the saved directory.
classifier = pipeline("sentiment-analysis", model="sent140_ar_model", tokenizer="sent140_ar_model")

# ✅ Try the model on Arabic and English
# The model was fine-tuned on clean_text() output (lower-cased, with URLs,
# mentions, hashtags and punctuation removed), so inference inputs go through
# the same normalisation to avoid a train/inference mismatch.
print(classifier(clean_text("اليوم جميل جدًا!")))
print(classifier(clean_text("This is the worst movie I've ever seen.")))