# Sentiment Analysis for Arabic Text using BERT

This notebook fine-tunes a BERT model for Arabic sentiment analysis

## Import Dependencies

In [1]:
!pip install -U transformers datasets

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency res

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import re
import numpy as np
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from nltk.corpus import stopwords
import nltk

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

## Text Preprocessing and Cleaning


In [4]:
ar_stopwords = set(stopwords.words('arabic'))

def clean_text(text):
    search = ["أ", "إ", "آ", "ة", "_", "-", "/", ".", "،", " و ", " يا ", '"', "ـ", "'", "ى",
              "\\", '\n', '\t', '"', '?', '؟', '!']
    replace = ["ا", "ا", "ا", "ه", " ", " ", "", "", "", " و", " يا",
               "", "", "", "ي", "", ' ', ' ', ' ', ' ? ', ' ؟ ', ' ! ']

    # remove tashkeel
    tashkeel = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
    text = re.sub(tashkeel, "", text)

    # normalize elongation
    longation = re.compile(r'(.)\1+')
    subst = r"\1\1"
    text = re.sub(longation, subst, text)

    text = re.sub(r"[^\w\s]", '', text)             # remove punctuation
    text = re.sub(r"[a-zA-Z]", '', text)            # remove English letters
    text = re.sub(r"\d+", ' ', text)                # remove digits
    text = re.sub(r"\n+", ' ', text)                # remove newlines
    text = re.sub(r"\t+", ' ', text)                # remove tabs
    text = re.sub(r"\r+", ' ', text)                # remove carriage returns
    text = re.sub(r"\s+", ' ', text)                # normalize spaces

    # remove specific character repetitions
    text = text.replace('وو', 'و')
    text = text.replace('يي', 'ي')
    text = text.replace('اا', 'ا')

    for i in range(len(search)):
        text = text.replace(search[i], replace[i])

    text = text.strip()

    return text

def load_file(path):
    df = pd.read_csv(path, sep="\t", header=None)
    df.columns = ["label", "text"]
    df['label'] = df['label'].map({'pos': 1, 'neg': 0})
    df['text'] = df['text'].astype(str).apply(clean_text)
    return df

In [5]:
# Load and preprocess
train_pos = load_file("/content/drive/MyDrive/we_sentiment/Sentiment Analysis/train_Arabic_tweets_positive_20190413.tsv")
train_neg = load_file("/content/drive/MyDrive/we_sentiment/Sentiment Analysis/train_Arabic_tweets_negative_20190413.tsv")
test_pos = load_file("/content/drive/MyDrive/we_sentiment/Sentiment Analysis/test_Arabic_tweets_positive_20190413.tsv")
test_neg = load_file("/content/drive/MyDrive/we_sentiment/Sentiment Analysis/test_Arabic_tweets_negative_20190413.tsv")

# Combine within each split
train_df = pd.concat([train_pos, train_neg], ignore_index=True).sample(frac=1, random_state=42).reset_index(drop=True)
test_df = pd.concat([test_pos, test_neg], ignore_index=True).sample(frac=1, random_state=42).reset_index(drop=True)

In [6]:
train_df[:5]

Unnamed: 0,label,text
0,1,مقال يدور حول الحمي يوشك ان يقع فيه
1,1,تيشرت القائد
2,0,وكل السعوديه بدو حتي الحاضره منهم من اصل بدوي ...
3,0,المونافري طلع برهوش سدينا
4,0,منو خر بيتهم مثلنا


In [7]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45275 entries, 0 to 45274
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   45275 non-null  int64 
 1   text    45275 non-null  object
dtypes: int64(1), object(1)
memory usage: 707.6+ KB


In [8]:
print(f"Train DataFrame shape: {train_df.shape}")
print(f"Test DataFrame shape: {test_df.shape}")
print(f"Train DataFrame labels: \n{train_df['label'].value_counts()}")
print(f"Test DataFrame labels: \n{test_df['label'].value_counts()}")
print(f"Sample of training data:\n{train_df.head(6)}")

Train DataFrame shape: (45275, 2)
Test DataFrame shape: (11520, 2)
Train DataFrame labels: 
label
1    22761
0    22514
Name: count, dtype: int64
Test DataFrame labels: 
label
0    5768
1    5752
Name: count, dtype: int64
Sample of training data:
   label                                               text
0      1                مقال يدور حول الحمي يوشك ان يقع فيه
1      1                                       تيشرت القائد
2      0  وكل السعوديه بدو حتي الحاضره منهم من اصل بدوي ...
3      0                          المونافري طلع برهوش سدينا
4      0                                 منو خر بيتهم مثلنا
5      1  نقاش حاد محدش يتدخل ياجماعه اللي هيتدخل هيتعور...


In [9]:
train_df, valid_df = train_test_split(train_df, test_size=0.15, random_state=42, stratify=train_df['label'])

## Tokenization

In [None]:
# Initialize Tokenizer
model_name = "asafaya/bert-base-arabic"
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/62.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/491 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/334k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
# Tokenize Data
def tokenize(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=128)

# Convert pandas DataFrames to Hugging Face Dataset objects
train_dataset_hf = Dataset.from_pandas(train_df[["text", "label"]], preserve_index=False)
test_dataset_hf = Dataset.from_pandas(test_df[["text", "label"]], preserve_index=False)
valid_dataset_hf = Dataset.from_pandas(valid_df[["text", "label"]], preserve_index=False)

# Tokenize the datasets
tokenized_train_dataset = train_dataset_hf.map(tokenize, batched=True)
tokenized_test_dataset = test_dataset_hf.map(tokenize, batched=True)
tokenized_valid_dataset = valid_dataset_hf.map(tokenize, batched=True)

Map:   0%|          | 0/38483 [00:00<?, ? examples/s]

Map:   0%|          | 0/11520 [00:00<?, ? examples/s]

Map:   0%|          | 0/6792 [00:00<?, ? examples/s]

In [12]:
tokenized_train_dataset

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 38483
})

In [13]:
# Remove the original text column as it's no longer needed after tokenization
tokenized_train_dataset = tokenized_train_dataset.remove_columns(["text"])
tokenized_test_dataset = tokenized_test_dataset.remove_columns(["text"])
tokenized_valid_dataset = tokenized_valid_dataset.remove_columns(["text"])
tokenized_train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "token_type_ids", "label"])
tokenized_test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "token_type_ids", "label"])
tokenized_valid_dataset.set_format("torch", columns=["input_ids", "attention_mask", "token_type_ids", "label"])

## Training



In [26]:
num_labels = len(train_df['label'].unique())

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at asafaya/bert-base-arabic and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
def compute_metrics(pred):
    labels = np.asarray(pred.label_ids)
    preds = np.asarray(np.argmax(pred.predictions, axis=1))
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [None]:
# Define Training Arguments
training_args = TrainingArguments(
    output_dir="./results_sentiment_finetune",
    num_train_epochs=3,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs_sentiment_finetune',
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_valid_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [None]:
# Train the Model
print("\nStarting training...")
trainer.train()

# Evaluate the Model
print("\nEvaluating the fine-tuned model...")
eval_results = trainer.evaluate(eval_dataset=tokenized_test_dataset)
print(f"Evaluation results: {eval_results}")

# Save the Fine-tuned Model and Tokenizer
output_model_dir = "./fine_tuned_camelbert_sentiment_model"
print(f"\nSaving the fine-tuned model to {output_model_dir}...")
trainer.save_model(output_model_dir)
tokenizer.save_pretrained(output_model_dir)

print("\nFine-tuning complete!")
print(f"Model and tokenizer saved to {output_model_dir}")


Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5208,0.483284,0.758687,0.737466,0.814003,0.674085
2,0.424,0.444431,0.794317,0.784513,0.828879,0.744656
3,0.2564,0.465793,0.802267,0.798015,0.820346,0.776867



Evaluating the fine-tuned model...


Evaluation results: {'eval_loss': 0.47885510325431824, 'eval_accuracy': 0.8010416666666667, 'eval_f1': 0.7957219251336899, 'eval_precision': 0.8163862472567667, 'eval_recall': 0.7760778859527121, 'eval_runtime': 82.3083, 'eval_samples_per_second': 139.962, 'eval_steps_per_second': 1.093, 'epoch': 3.0}

Saving the fine-tuned model to ./fine_tuned_camelbert_sentiment_model...

Fine-tuning complete!
Model and tokenizer saved to ./fine_tuned_camelbert_sentiment_model


## Inference Pipeline

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline
import torch

# Load model & tokenizer
output_model_dir = "./fine_tuned_camelbert_sentiment_model"
tokenizer = AutoTokenizer.from_pretrained(output_model_dir)
model     = AutoModelForSequenceClassification.from_pretrained(output_model_dir)

# Create a pipeline
pipe = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    return_all_scores=False,
    device=0 if torch.cuda.is_available() else -1
)

# Your samples
sample_texts = [
    "هذا المنتج رائع جدا!",           # Positive
    "تجربة مدهشة وسأكررها بالتأكيد.",  # Positive
    "كانت الأجواء مملة بعض الشيء.",    # Negative
    "الفندق نظيف ومريح والخدمة ممتازة.", # Positive
    "الغرفة كانت متسخة ولم يكن هناك أي اهتمام بالنظافة."  # Negative
]

# Run inference
predictions = pipe(sample_texts, truncation=True, max_length=128)

# Map model labels (0→NEG, 1→POS)
label_map = { "LABEL_0": "negative", "LABEL_1": "positive" }
for text, pred in zip(sample_texts, predictions):
    lbl = label_map.get(pred['label'], pred['label'])
    score = pred['score']
    print(f"\"{text}\" → {lbl} ({score:.3f})")

Device set to use cuda:0


"هذا المنتج رائع جدا!" → positive (0.895)
"تجربة مدهشة وسأكررها بالتأكيد." → positive (0.987)
"كانت الأجواء مملة بعض الشيء." → negative (0.745)
"الفندق نظيف ومريح والخدمة ممتازة." → positive (0.728)
"الغرفة كانت متسخة ولم يكن هناك أي اهتمام بالنظافة." → negative (0.710)
