In [39]:
import re
import csv
import string

import mlflow
import polars as pl

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn import metrics
from pathlib import Path

In [40]:
ROOT_DIR = Path().resolve().parents[0]
MODEL_REGISTRY = Path("/tmp/mlflow")
Path(MODEL_REGISTRY).mkdir(parents=True, exist_ok=True)
MLFLOW_TRACKING_URI = str(ROOT_DIR) + str(MODEL_REGISTRY.absolute())
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
print(f"MLFLOW_TRACKING_URI: {MLFLOW_TRACKING_URI}")

MLFLOW_TRACKING_URI: /Users/mamdouh_malaa/Developer/others/Faheem/tmp/mlflow


In [41]:
file_paths = {
    "test_arabic_negative_tweets": "../data/test_arabic_negative_tweets.tsv",
    "test_arabic_positive_tweets": "../data/test_arabic_positive_tweets.tsv",
    "train_arabic_negative_tweets": "../data/train_arabic_negative_tweets.tsv",
    "train_arabic_positive_tweets": "../data/train_arabic_positive_tweets.tsv",
}

In [42]:
def load_data(file_path: str, label: int) -> pl.DataFrame:
    """
    Load data from a TSV file and changing it to a CSV file to easily read from.

    Args:
        file_path (str): The path to the TSV file.
        label (int): The label to assign to each tweet (0 for negative, 1 for positive).

    Returns:
        pl.DataFrame: A Polars DataFrame containing tweets and their labels.
    """
    rows = []
    with open(file_path, newline="", encoding="utf-8") as tsvfile:
        reader = csv.reader(tsvfile, delimiter="\t")
        for row in reader:
            rows.append([row[1], label])
    return pl.DataFrame(rows, schema=["tweet", "label"])


def preprocess_text(text: str) -> str:
    """
    Preprocess a tweet by removing mentions, URLs, punctuation, emojis, diacritics,
    and extra spaces, normalizing Arabic text, and removing repeated characters.

    Args:
        text (str): The tweet text to preprocess.

    Returns:
        str: The cleaned and normalized tweet text.
    """
    arabic_diacritics = re.compile(
        """
                             ّ    | # Tashdid
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ    | # Sukun
                             ـ     # Tatwil/Kashida

                         """,
        re.VERBOSE,
    )

    arabic_punctuations = """`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ"""
    english_punctuations = string.punctuation
    punctuations_list = arabic_punctuations + english_punctuations

    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"\s+", " ", text).strip()

    text = re.sub("[إأآا]", "ا", text)
    text = re.sub("ى", "ي", text)
    text = re.sub("ؤ", "ء", text)
    text = re.sub("ئ", "ء", text)
    text = re.sub("ة", "ه", text)
    text = re.sub("گ", "ك", text)

    text = re.sub(arabic_diacritics, "", text)

    translator = str.maketrans("", "", punctuations_list)
    text = text.translate(translator)

    text = re.sub(r"(.)\1+", r"\1", text)

    return text

In [43]:
train_neg = load_data(file_paths["train_arabic_negative_tweets"], 0)
train_pos = load_data(file_paths["train_arabic_positive_tweets"], 1)
test_neg = load_data(file_paths["test_arabic_negative_tweets"], 0)
test_pos = load_data(file_paths["test_arabic_positive_tweets"], 1)

train_df = pl.concat([train_neg, train_pos])
test_df = pl.concat([test_neg, test_pos])
df = pl.concat([train_df, test_df])
df = df.sample(fraction=1, shuffle=True)
df.head()

tweet,label
str,i64
"""لا زلت أتذكر تلك الرجفه التي ه…",0
"""#صباح_الخير إبتسمو ، فكل ما قد…",1
"""🤔 بيراجع البلدية عساه يلقى موق…",0
"""الإعاقة ليست نقطة ضعف في حياتي…",0
"""فى ناس بتلبس قصير و تشرب خمور …",0


In [44]:
df = df.with_columns(
    [
        pl.col("tweet")
        .map_elements(preprocess_text, return_dtype=str)
        .alias("cleaned_tweet")
    ]
)

In [45]:
X = df["cleaned_tweet"].to_numpy()
y = df["label"].to_numpy()

vectorizer = CountVectorizer()
X_vectorized = vectorizer.fit_transform(X)

In [46]:
vectorizer.vocabulary_

{'لا': 41723,
 'زلت': 31582,
 'اتذكر': 1612,
 'تلك': 25225,
 'الرجفه': 9080,
 'التي': 7743,
 'هزت': 53319,
 'قاع': 39805,
 'قلبي': 40344,
 'حين': 28522,
 'رايت': 30580,
 'فقيدي': 39177,
 'يكفن': 67613,
 'اغفرلابياله': 5090,
 'فقيديابي': 39178,
 'صباحالخير': 34440,
 'ابتسمو': 1156,
 'فكل': 39225,
 'ما': 45117,
 'قدره': 39986,
 'اله': 13880,
 'جميل': 26932,
 'بيراجع': 21349,
 'البلديه': 7075,
 'عساه': 36619,
 'يلقي': 67727,
 'موقف': 50559,
 'الاعاقه': 6258,
 'ليست': 44921,
 'نقطه': 52204,
 'ضعف': 35154,
 'في': 39554,
 'حياتي': 28448,
 'بخفه': 18529,
 'ومهاره': 62285,
 'يلعب': 67699,
 'كونغ': 41609,
 'خانت': 28661,
 'لين': 45014,
 '16': 45,
 'عاما': 36124,
 'بساق': 19074,
 'واحده': 54166,
 'وعكاز': 60497,
 'الاعب': 6259,
 'الوحيد': 14134,
 'ناس': 50860,
 'بتلبس': 18044,
 'قصير': 40228,
 'تشرب': 23724,
 'خمور': 29216,
 'بس': 19063,
 'عندها': 37222,
 'اخلاق': 2836,
 'العكس': 10874,
 'صحيح': 34589,
 'تجد': 22204,
 'بعض': 19678,
 'من': 49897,
 'يصلي': 66750,
 'يصوم': 66767,
 'يحج': 65439,
 'ي

In [47]:
tfidf = TfidfTransformer(use_idf=True, norm="l2", smooth_idf=True)
X_tfidf = tfidf.fit_transform(vectorizer.fit_transform(X)).toarray()

In [48]:
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y, test_size=0.2, stratify=y, shuffle=True
)

In [49]:
import mlflow.sklearn
import matplotlib.pyplot as plt


def plot_confusion_matrix(y_test, y_pred, labels):
    cm = metrics.confusion_matrix(y_test, y_pred, labels=labels)
    disp = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    disp.plot(cmap=plt.cm.Blues)
    plt.savefig("./dump/confusion_matrix.png")
    plt.close()

In [50]:
mlflow.set_experiment("MultinomialNB Experiment")

with mlflow.start_run():
    mlflow.log_param("model_type", "MultinomialNB")

    model = MultinomialNB()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    accuracy = metrics.accuracy_score(y_test, y_pred)
    report = metrics.classification_report(y_test, y_pred, output_dict=True)

    mlflow.log_metric("accuracy", accuracy)
    for label, metrics_dict in report.items():
        if isinstance(metrics_dict, dict):
            for metric_name, metric_value in metrics_dict.items():
                mlflow.log_metric(f"{label}_{metric_name}", metric_value)

    mlflow.sklearn.log_model(model, "model")

    unique_labels = list(set(y_test))
    plot_confusion_matrix(y_test, y_pred, labels=unique_labels)
    mlflow.log_artifact("./dump/confusion_matrix.png")


print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{metrics.classification_report(y_test, y_pred)}")

Accuracy: 0.7843119992957126
Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.80      0.79      5656
           1       0.80      0.76      0.78      5703

    accuracy                           0.78     11359
   macro avg       0.78      0.78      0.78     11359
weighted avg       0.78      0.78      0.78     11359





In [54]:
model.predict(vectorizer.transform(["انا سعيد جدا"]))

array([1])

In [55]:
model.predict(vectorizer.transform(["الله يقلع ام الهلال خرب ام الدوري"]))

array([0])

In [57]:
import pickle

with open("./models/nb_model.pkl", "wb") as file:
    pickle.dump(model, file)