In [2]:
import re
import csv
import string

import mlflow
import polars as pl

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn import metrics
from pathlib import Path

In [3]:
# Project root: the parent of the directory the notebook is run from.
ROOT_DIR = Path().resolve().parents[0]

# Local MLflow file store kept inside the project tree. Building the location
# with Path arithmetic (rather than concatenating two absolute path strings)
# guarantees that the directory created below is the same one MLflow is
# pointed at.
MODEL_REGISTRY = ROOT_DIR / "tmp" / "mlflow"
MODEL_REGISTRY.mkdir(parents=True, exist_ok=True)

MLFLOW_TRACKING_URI = str(MODEL_REGISTRY)
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
print(f"MLFLOW_TRACKING_URI: {MLFLOW_TRACKING_URI}")

MLFLOW_TRACKING_URI: /Users/mamdouh_malaa/Developer/others/Faheem/tmp/mlflow


In [4]:
# Map "<split>_arabic_<polarity>_tweets" to the TSV file of the same name under
# ../data, for every combination of split (test/train) and polarity
# (negative/positive).
file_paths = {
    f"{split}_arabic_{polarity}_tweets": f"../data/{split}_arabic_{polarity}_tweets.tsv"
    for split in ("test", "train")
    for polarity in ("negative", "positive")
}

In [5]:
def load_data(file_path: str, label: int) -> pl.DataFrame:
    """
    Load the tweets stored in a TSV file and attach a fixed label to each one.

    The tweet text is taken from the second tab-separated field of every row;
    the first field is ignored. Rows with fewer than two fields (for example
    blank lines) are skipped instead of raising ``IndexError``.

    Args:
        file_path (str): The path to the TSV file.
        label (int): The label to assign to each tweet (0 for negative, 1 for positive).

    Returns:
        pl.DataFrame: A Polars DataFrame with a string column ``tweet`` and an
        integer column ``label``. The frame is empty (but keeps both columns)
        when the file contains no usable rows.
    """
    tweets = []
    with open(file_path, newline="", encoding="utf-8") as tsvfile:
        for row in csv.reader(tsvfile, delimiter="\t"):
            if len(row) < 2:
                # Blank or truncated line: there is no tweet field to read.
                continue
            tweets.append(row[1])

    # Build the frame column-wise with explicit dtypes so that Polars never has
    # to guess whether a list of two-element lists is row- or column-oriented.
    return pl.DataFrame(
        {"tweet": tweets, "label": [label] * len(tweets)},
        schema={"tweet": pl.Utf8, "label": pl.Int64},
    )


# Arabic marks stripped during cleaning: tanwin, short vowels, shadda and sukun
# (U+064B-U+0652) plus the tatweel/kashida elongation character (U+0640).
# Written with escapes because combining marks are easy to corrupt or overlook
# when they appear literally in source code.
_ARABIC_DIACRITICS_RE = re.compile("[\u064B-\u0652\u0640]")

# Translation table that deletes Arabic and ASCII punctuation. Built once at
# definition time instead of on every call.
_ARABIC_PUNCTUATIONS = """`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ"""
_PUNCTUATION_TABLE = str.maketrans("", "", _ARABIC_PUNCTUATIONS + string.punctuation)


def preprocess_text(text: str) -> str:
    """
    Clean and normalize a tweet.

    Steps, in order:
      1. Remove ``@mentions`` and anything starting with ``http``.
      2. Collapse runs of whitespace into a single space.
      3. Normalize Arabic letter variants (alef forms, alef maqsura, hamza
         carriers, ta marbuta, Persian gaf).
      4. Remove Arabic diacritics and the tatweel character.
      5. Remove Arabic and ASCII punctuation.
      6. Collapse every run of a repeated character into one occurrence.
      7. Strip leading/trailing whitespace left behind by the removals above.

    Characters not covered by these rules (emojis, for instance) are kept,
    apart from the repeat-collapsing in step 6.

    Args:
        text (str): The tweet text to preprocess.

    Returns:
        str: The cleaned and normalized tweet text.
    """
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"\s+", " ", text).strip()

    # Letter normalization: map spelling variants onto one canonical letter.
    text = re.sub("[إأآا]", "ا", text)
    text = re.sub("ى", "ي", text)
    text = re.sub("ؤ", "ء", text)
    text = re.sub("ئ", "ء", text)
    text = re.sub("ة", "ه", text)
    text = re.sub("گ", "ك", text)

    text = _ARABIC_DIACRITICS_RE.sub("", text)
    text = text.translate(_PUNCTUATION_TABLE)

    # Collapses elongations; this also merges adjacent spaces that the
    # punctuation removal may have produced.
    text = re.sub(r"(.)\1+", r"\1", text)

    # Removing punctuation at either end of the tweet can expose a space that
    # the earlier strip() could not see.
    return text.strip()

In [6]:
# Fixed seed so that the shuffled row order (and everything derived from it)
# is the same on every Restart & Run All.
SHUFFLE_SEED = 42

# Negative tweets are labelled 0, positive tweets 1.
train_neg = load_data(file_paths["train_arabic_negative_tweets"], 0)
train_pos = load_data(file_paths["train_arabic_positive_tweets"], 1)
test_neg = load_data(file_paths["test_arabic_negative_tweets"], 0)
test_pos = load_data(file_paths["test_arabic_positive_tweets"], 1)

train_df = pl.concat([train_neg, train_pos])
test_df = pl.concat([test_neg, test_pos])

# The provided train and test files are pooled into one frame and shuffled;
# the train/test split used for modelling is drawn later from this pool.
df = pl.concat([train_df, test_df])
df = df.sample(fraction=1, shuffle=True, seed=SHUFFLE_SEED)
df.head()

tweet,label
str,i64
"""كسب العيش بالحلال صعب 🌚""",0
"""الحمد لله .. كانت مباراه صعبه …",1
"""احلى من اقضي اجازتي وياها 💛""",1
"""انت الخير ، فوزو اليوم 😒""",0
"""عشرات القتلي والجرحي في #طرابل…",1


In [7]:
# Run every raw tweet through preprocess_text and store the result next to the
# original text in a new "cleaned_tweet" column.
df = df.with_columns(
    cleaned_tweet=pl.col("tweet").map_elements(preprocess_text, return_dtype=str)
)

In [8]:
# Model inputs (cleaned text) and targets (0/1 labels) as NumPy arrays.
X = df.get_column("cleaned_tweet").to_numpy()
y = df.get_column("label").to_numpy()

# Bag-of-words counts. The vocabulary is learned from all rows in `df`,
# i.e. before the train/test split that happens in a later cell.
vectorizer = CountVectorizer()
X_vectorized = vectorizer.fit_transform(X)

In [9]:
tfidf = TfidfTransformer(use_idf=True, norm="l2", smooth_idf=True)
# Reuse the count matrix computed in the previous cell instead of fitting the
# vectorizer a second time, and keep the result sparse: train_test_split and
# MultinomialNB both accept SciPy sparse matrices, while a dense
# (n_tweets x vocabulary) array wastes a large amount of memory.
X_tfidf = tfidf.fit_transform(X_vectorized)

In [11]:
# Stratified 80/20 split. random_state is fixed so the split, and therefore the
# reported metrics, are reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y, test_size=0.2, stratify=y, shuffle=True, random_state=42
)

In [12]:
import mlflow.sklearn
import matplotlib.pyplot as plt


def plot_confusion_matrix(
    y_test, y_pred, labels, output_path="./dump/confusion_matrix.png"
):
    """
    Render a confusion matrix and save it as an image file.

    Args:
        y_test: True labels.
        y_pred: Predicted labels, aligned with ``y_test``.
        labels: Label values to include, in the order they should appear on
            the matrix axes.
        output_path: Where to write the image. Defaults to
            ``./dump/confusion_matrix.png``; missing parent directories are
            created.

    Returns:
        None. The figure is written to ``output_path`` and then closed, so
        nothing is displayed inline.
    """
    cm = metrics.confusion_matrix(y_test, y_pred, labels=labels)
    disp = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    disp.plot(cmap=plt.cm.Blues)

    # savefig does not create directories, so make sure the target exists.
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Save and close the figure that was just drawn, not whatever pyplot
    # currently considers the "current" figure.
    disp.figure_.savefig(output_path)
    plt.close(disp.figure_)

In [13]:
mlflow.set_experiment("MultinomialNB Experiment")

with mlflow.start_run():
    mlflow.log_param("model_type", "MultinomialNB")

    model = MultinomialNB()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    accuracy = metrics.accuracy_score(y_test, y_pred)
    report = metrics.classification_report(y_test, y_pred, output_dict=True)

    mlflow.log_metric("accuracy", accuracy)
    # The report maps each class / average name to a dict of metrics; scalar
    # entries (the overall "accuracy") are skipped because they are logged above.
    for label, metrics_dict in report.items():
        if isinstance(metrics_dict, dict):
            for metric_name, metric_value in metrics_dict.items():
                mlflow.log_metric(f"{label}_{metric_name}", metric_value)

    # Only the classifier is logged here; the fitted `vectorizer` and `tfidf`
    # objects are not part of this artifact.
    mlflow.sklearn.log_model(model, "model")

    # Sort the labels so the confusion-matrix axes have a deterministic order
    # (iteration order of a set is not guaranteed).
    unique_labels = sorted(set(y_test.tolist()))

    # The plot is written under ./dump; make sure that directory exists first.
    Path("./dump").mkdir(parents=True, exist_ok=True)
    plot_confusion_matrix(y_test, y_pred, labels=unique_labels)
    mlflow.log_artifact("./dump/confusion_matrix.png")


print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{metrics.classification_report(y_test, y_pred)}")

Accuracy: 0.7787657364204595
Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.79      0.78      5656
           1       0.79      0.77      0.78      5703

    accuracy                           0.78     11359
   macro avg       0.78      0.78      0.78     11359
weighted avg       0.78      0.78      0.78     11359



In [14]:
# Apply the same steps used for training: clean the text, count terms, then
# apply the TF-IDF weighting the classifier was fitted on.
model.predict(tfidf.transform(vectorizer.transform([preprocess_text("انا سعيد جدا")])))

array([1])

In [15]:
# Apply the same steps used for training: clean the text, count terms, then
# apply the TF-IDF weighting the classifier was fitted on.
model.predict(
    tfidf.transform(
        vectorizer.transform([preprocess_text("الله يقلع ام الهلال خرب ام الدوري")])
    )
)

array([0])

In [16]:
import pickle

# open() does not create missing directories, so make sure ./models exists.
model_path = Path("./models/nb_model.pkl")
model_path.parent.mkdir(parents=True, exist_ok=True)

# NOTE(review): only the classifier is pickled. The fitted `vectorizer` and
# `tfidf` objects are needed to turn text into features for it and are not
# saved here - confirm whether the consumer of this file rebuilds them.
with model_path.open("wb") as file:
    pickle.dump(model, file)