<a href="https://colab.research.google.com/github/HatemMoushir/Sentiment/blob/main/ArEn-TweetSentiment-BERT-Hatem.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ✅ Install the required libraries (quiet mode to keep the cell output short)
!pip install -q datasets transformers evaluate pandas scikit-learn

import re
import pandas as pd
import numpy as np
from datasets import load_dataset, concatenate_datasets, Dataset

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import evaluate
import os

file_path_ar_dataset = "/content/Arabic Tweets Sentiment Classification 2024/Arabic_tweets_sentiment.csv"
file_path_en_dataset = "/content/training.1600000.processed.noemoticon.csv"

if os.path.exists(file_path_ar_dataset):
     print("✅ الملف موجود.")
else:
     print("❌ الملف غير موجود.")
     !wget -q https://prod-dcd-datasets-cache-zipfiles.s3.eu-west-1.amazonaws.com/m88gg52wp7-1.zip -O arabicdata1.zip
     !unzip -o arabicdata1.zip


# ✅ تحميل بيانات التغريدات العربية من UCI



# Read the CSV with the correct separator and rename the column
#ds_ar = pd.read_csv("/content/Arabic Tweets Sentiment Classification 2024/Arabic_tweets_sentiment.csv", encoding='utf-8', sep='\t')
ds_ar = pd.read_csv(file_path_ar_dataset, encoding='utf-8', sep='\t')

ds_ar.rename(columns={'class': 'class'}, inplace=True)


# فلترة السجلات التي تحتوي فقط على Positive أو Negative
ds_ar = ds_ar[ds_ar["class"].isin(["Positive", "Negative"])]


# 🧼 Clean English tweet text
def clean_english_text(text):
    """Return a lower-cased copy of *text* with tweet noise removed.

    Removes, in order: URLs, @mentions, ``#`` characters, remaining
    punctuation, and digits. Runs of whitespace left behind by those removals
    are then collapsed to a single space, matching what ``clean_arabic_text``
    does for Arabic input.

    Args:
        text: Raw tweet text (a ``str``).

    Returns:
        The cleaned text, lower-cased and stripped of surrounding whitespace.
    """
    text = re.sub(r"http\S+", "", text)     # remove URLs
    text = re.sub(r"@\w+", "", text)        # remove mentions
    text = re.sub(r"#", "", text)           # drop '#', keep the hashtag word
    text = re.sub(r"[^\w\s]", "", text)     # remove punctuation (\w keeps '_')
    text = re.sub(r"\d+", "", text)         # remove numbers
    text = re.sub(r"\s+", " ", text)        # collapse gaps left by the removals
    return text.lower().strip()

# 🧼 Normalize Arabic text
def normalize_arabic(text):
    """Return *text* with common Arabic letter variants unified.

    The substitutions are applied one after another, in the order listed.

    Args:
        text: Input string.

    Returns:
        The string with every listed variant replaced by its base letter.
    """
    substitutions = (
        ("[إأآا]", "ا"),  # alef variants -> bare alef
        ("ى", "ي"),       # alef maqsura -> ya
        ("ؤ", "ء"),       # waw with hamza -> hamza
        ("ئ", "ء"),       # ya with hamza -> hamza
        ("ة", "ه"),       # ta marbuta -> ha
        ("گ", "ك"),       # gaf -> kaf
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

def clean_arabic_text(text):
    """Normalize *text* and keep only Arabic-block characters and whitespace.

    Steps: letter normalization via ``normalize_arabic``, URL removal,
    removal of every character outside U+0600–U+06FF that is not whitespace,
    and whitespace collapsing.

    Args:
        text: Raw tweet text (a ``str``).

    Returns:
        The cleaned text with surrounding whitespace stripped.
    """
    normalized = normalize_arabic(text)
    without_urls = re.sub(r"http\S+", "", normalized)
    # Everything that is neither in the Arabic Unicode block nor whitespace
    # (Latin letters, emoji, ASCII punctuation, ...) is dropped here.
    arabic_only = re.sub(r"[^\u0600-\u06FF\s]", "", without_urls)
    single_spaced = re.sub(r"\s+", " ", arabic_only)
    return single_spaced.strip()

# 🗂️ Prepare the Arabic tweets

# Clean/normalize the text and encode the labels (Negative -> 0, Positive -> 1).
# `assign` builds a new frame instead of writing into a filtered slice, which
# avoids pandas' SettingWithCopyWarning. `map` is used instead of `replace`
# because object-to-int downcasting inside `replace` is deprecated in pandas.
ds_ar = ds_ar.assign(
    text=ds_ar["text"].fillna("").apply(clean_arabic_text),
    label=ds_ar["class"].map({"Negative": 0, "Positive": 1}).astype(int),
)

# Draw a reproducible random sample of 10,000 tweets.
ds_ar = ds_ar.sample(10000, random_state=42).reset_index(drop=True)

# Convert to a `datasets.Dataset`, keeping only the model inputs.
ds_ar = Dataset.from_pandas(ds_ar[["text", "label"]])

# 2️⃣ تحميل 1000 تغريدة إنجليزية من Sentiment140 (Stanford)

if os.path.exists(file_path_en_dataset):
     print("✅ الملف موجود.")
else:
      print("❌ الملف غير موجود.")
      wget -q https://cs.stanford.edu/people/alecmgo/tra.iningandtestdata.zip -O sentiment140.zip
      !unzip -o sentiment140.zip

import pandas as pd
import os

cols = ['label', 'id', 'date', 'query', 'user', 'text']
#ds_en = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding='latin-1', names=cols)

ds_en = pd.read_csv(file_path_en_dataset, encoding='latin-1', names=cols)

# 3. تحويل التصنيفات:
# 0 → سلبي | 4 → إيجابي → نحولها لـ 1
ds_en = ds_en[ds_en['label'].isin([0, 4])]
ds_en['label'] =ds_en['label'].map({0: 0, 4: 1})

# 4. تقليل الحجم لعينة صغيرة للتجربة
ds_en = ds_en.sample(10000, random_state=42).reset_index(drop=True)

# 5. تحويل إلى Dataset
from datasets import Dataset
ds_en = Dataset.from_pandas(ds_en[['text', 'label']])

# 3️⃣ Merge both languages into one Dataset, then shuffle it with a fixed seed
ds = concatenate_datasets([ds_ar, ds_en])
ds = ds.shuffle(seed=44)

print("✅ إجمالي العبارات:", len(ds))

# 4️⃣ Set up the tokenizer and the model from the same pretrained checkpoint;
# the classification head is created with two output labels.
checkpoint = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# 5️⃣ Tokenize the text
def tok(x):
    """Tokenize the ``"text"`` field of *x* with the module-level ``tokenizer``.

    Every sequence is truncated and padded to exactly 128 tokens.

    Args:
        x: Mapping with a ``"text"`` entry (a batch when used with
            ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever the tokenizer call returns for that text.
    """
    tokenizer_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    return tokenizer(x["text"], **tokenizer_options)

# Tokenize the whole dataset, passing examples to `tok` in batches.
ds = ds.map(tok, batched=True)

# 6️⃣ Split the data: 10% is held out for evaluation (fixed seed).
split = ds.train_test_split(test_size=0.1, seed=45)
train_ds = split["train"]
test_ds = split["test"]

# 7️⃣ Training setup
# NOTE(review): `accuracy` is not referenced by the `compute_metrics` defined
# below in this file — confirm whether it is still needed.
accuracy = evaluate.load("accuracy")

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
def compute_metrics(p):
    """Compute accuracy, F1, precision and recall for an evaluation pass.

    Args:
        p: Object exposing ``predictions`` (class scores; the arg-max over the
            last axis is taken as the predicted label) and ``label_ids``.

    Returns:
        Dict with the keys ``"accuracy"``, ``"f1"``, ``"precision"`` and
        ``"recall"``.
    """
    y_true = p.label_ids
    y_pred = np.argmax(p.predictions, axis=-1)
    scorers = {
        "accuracy": accuracy_score,
        "f1": f1_score,
        "precision": precision_score,
        "recall": recall_score,
    }
    return {name: scorer(y_true, y_pred) for name, scorer in scorers.items()}

# Training configuration.
# - Evaluation and checkpointing both run once per epoch; the two strategies
#   are kept identical because `load_best_model_at_end=True` is set.
# - No step-based evaluation interval is passed: `evaluation_steps` is not a
#   TrainingArguments field (it raises TypeError), and a step interval would
#   not apply under an epoch-based strategy anyway.
# - `report_to="none"` disables logging integrations; passing `None` does not.
training_args = TrainingArguments(
    output_dir="sentiment_ar_en_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=2,
    per_device_train_batch_size=16,
    logging_steps=50,
    load_best_model_at_end=True,
    report_to="none",
)

# Wire the model, data, tokenizer and metrics into a Trainer.
# NOTE(review): newer transformers releases deprecate `tokenizer=` in favour
# of `processing_class=` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# 8️⃣ Start training
trainer.train()

# 9️⃣ Save the model and the tokenizer side by side in one directory
final_model_dir = "sentiment_model_final"
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)