In [18]:
import re
import csv
import string

import numpy as np
import polars as pl

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics

In [19]:
file_paths = {
    "test_arabic_negative_tweets": "../data/test_arabic_negative_tweets.tsv",
    "test_arabic_positive_tweets": "../data/test_arabic_positive_tweets.tsv",
    "train_arabic_negative_tweets": "../data/train_arabic_negative_tweets.tsv",
    "train_arabic_positive_tweets": "../data/train_arabic_positive_tweets.tsv",
}

In [20]:
def load_data(file_path: str, label: int) -> pl.DataFrame:
    """
    Load data from a TSV file and changing it to a CSV file to easily read from.

    Args:
        file_path (str): The path to the TSV file.
        label (int): The label to assign to each tweet (0 for negative, 1 for positive).

    Returns:
        pl.DataFrame: A Polars DataFrame containing tweets and their labels.
    """
    rows = []
    with open(file_path, newline="", encoding="utf-8") as tsvfile:
        reader = csv.reader(tsvfile, delimiter="\t")
        for row in reader:
            rows.append([row[1], label])
    return pl.DataFrame(rows, schema=["tweet", "label"])


def preprocess_text(text: str) -> str:
    """
    Preprocess a tweet by removing mentions, URLs, punctuation, emojis, diacritics,
    and extra spaces, normalizing Arabic text, and removing repeated characters.

    Args:
        text (str): The tweet text to preprocess.

    Returns:
        str: The cleaned and normalized tweet text.
    """
    arabic_diacritics = re.compile(
        """
                             ّ    | # Tashdid
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ    | # Sukun
                             ـ     # Tatwil/Kashida

                         """,
        re.VERBOSE,
    )

    arabic_punctuations = """`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ"""
    english_punctuations = string.punctuation
    punctuations_list = arabic_punctuations + english_punctuations

    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"\s+", " ", text).strip()

    text = re.sub("[إأآا]", "ا", text)
    text = re.sub("ى", "ي", text)
    text = re.sub("ؤ", "ء", text)
    text = re.sub("ئ", "ء", text)
    text = re.sub("ة", "ه", text)
    text = re.sub("گ", "ك", text)

    text = re.sub(arabic_diacritics, "", text)

    translator = str.maketrans("", "", punctuations_list)
    text = text.translate(translator)

    text = re.sub(r"(.)\1+", r"\1", text)

    return text

In [21]:
train_neg = load_data(file_paths["train_arabic_negative_tweets"], 0)
train_pos = load_data(file_paths["train_arabic_positive_tweets"], 1)
test_neg = load_data(file_paths["test_arabic_negative_tweets"], 0)
test_pos = load_data(file_paths["test_arabic_positive_tweets"], 1)

train_df = pl.concat([train_neg, train_pos])
test_df = pl.concat([test_neg, test_pos])
df = pl.concat([train_df, test_df])
df = df.sample(fraction=1, shuffle=True)
df.head()

tweet,label
str,i64
"""💫 ربنا عليك متوكلين، وبك نظن ا…",1
"""هو ال celebrity crush بتاع كل …",0
"""صراحة: والحل 😞""",0
"""ايه يا جماعه فى ايه 🤔""",0
"""شوفو كيف لازم لمن ياخد منها ع …",0


In [22]:
df = df.with_columns(
    [
        pl.col("tweet")
        .map_elements(preprocess_text, return_dtype=str)
        .alias("cleaned_tweet")
    ]
)

In [23]:
X = df["cleaned_tweet"].to_numpy()
y = df["label"].to_numpy()

vectorizer = CountVectorizer()
X_vectorized = vectorizer.fit_transform(X)

In [24]:
vectorizer.vocabulary_

{'ربنا': 30638,
 'عليك': 37044,
 'متوكلين': 47213,
 'وبك': 57268,
 'نظن': 51949,
 'الظن': 10572,
 'الجميل': 8018,
 'الهم': 13992,
 'قدرا': 39975,
 'جميلا': 26933,
 'وخيرا': 58901,
 'يتبعه': 64896,
 'الرض': 9146,
 'هو': 53613,
 'ال': 5676,
 'celebrity': 309,
 'crush': 346,
 'بتاع': 17588,
 'كل': 41273,
 'الاجيال': 5833,
 'واله': 56514,
 'صراحه': 34691,
 'والحل': 55555,
 'ايه': 15927,
 'يا': 63767,
 'جماعه': 26854,
 'في': 39554,
 'شوفو': 34167,
 'كيف': 41663,
 'لازم': 42140,
 'لمن': 44457,
 'ياخد': 63951,
 'منها': 50205,
 'طول': 35694,
 'يطالع': 66846,
 'فيها': 39703,
 'ويقرا': 63476,
 'عشان': 36650,
 'كدا': 40986,
 'انبسط': 14806,
 'تعطيه': 24374,
 'مره': 48097,
 'واضح': 54742,
 'ينبسط': 67937,
 'بتاعه': 17600,
 'كلهنقطه': 41385,
 'ضعفي': 35163,
 'ذي': 30360,
 'الشغلات': 9969,
 'تغلي': 24577,
 'براحتك': 18772,
 'انا': 14754,
 'ابسط': 1264,
 'نفسي': 52094,
 'بنفسي': 20747,
 'لا': 41723,
 'احبك': 2209,
 'فقطبل': 39156,
 'استند': 3850,
 'كانك': 40799,
 'اكثر': 5568,
 'الاشياء': 6197,
 'ثبا

In [25]:
tfidf = TfidfTransformer(use_idf=True, norm="l2", smooth_idf=True)
X_tfidf = tfidf.fit_transform(vectorizer.fit_transform(X)).toarray()

In [26]:
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y, test_size=0.2, stratify=y, shuffle=True
)

In [27]:
model = MultinomialNB()
model.fit(X_train, y_train)

In [28]:
y_pred = model.predict(X_test)

In [29]:
accuracy = metrics.accuracy_score(y_test, y_pred)
report = metrics.classification_report(y_test, y_pred)

In [30]:
print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")

Accuracy: 0.7801743111189365
Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.80      0.78      5656
           1       0.80      0.76      0.78      5703

    accuracy                           0.78     11359
   macro avg       0.78      0.78      0.78     11359
weighted avg       0.78      0.78      0.78     11359



In [31]:
model.predict(vectorizer.transform(["انا سعيد جدا"]))

array([1])

In [32]:
model.predict(vectorizer.transform(["الله يقلع ام الهلال خرب ام الدوري"]))

array([0])

In [34]:
import pickle

with open("./models/nb_model.pkl", "wb") as file:
    pickle.dump(model, file)