In [35]:
import re

import polars as pl
import csv

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [36]:
# Relative locations of the tweet TSV files, keyed by "<split>_arabic_<sentiment>_tweets".
file_paths = dict(
    test_arabic_negative_tweets="../data/test_arabic_negative_tweets.tsv",
    test_arabic_positive_tweets="../data/test_arabic_positive_tweets.tsv",
    train_arabic_negative_tweets="../data/train_arabic_negative_tweets.tsv",
    train_arabic_positive_tweets="../data/train_arabic_positive_tweets.tsv",
)

In [None]:
def load_data(file_path: str, label: int) -> pl.DataFrame:
    """
    Load tweets from a TSV file and assign the same label to every row.

    The tweet text is read from the second tab-separated field of each row;
    all other fields are ignored. Completely blank lines are skipped.

    Args:
        file_path (str): The path to the TSV file.
        label (int): The label to assign to each tweet (0 for negative, 1 for positive).

    Returns:
        pl.DataFrame: A Polars DataFrame with a string "tweet" column and an
            integer "label" column, one row per non-blank line of the file.
            An empty file yields an empty frame with the same typed columns.

    Raises:
        IndexError: If a non-blank row has fewer than two fields.
    """
    with open(file_path, newline="", encoding="utf-8") as tsvfile:
        # NOTE(review): csv.reader applies its default quote handling here, so
        # a double quote at the start of a field is interpreted as CSV quoting.
        # Confirm the files are written that way; if they hold raw tweet text,
        # quoting=csv.QUOTE_NONE may be required.
        reader = csv.reader(tsvfile, delimiter="\t")
        # csv.reader yields an empty list for a blank line; skip those rather
        # than failing on row[1].
        rows = [(row[1], label) for row in reader if row]
    # Explicit dtypes keep the schema stable even when no rows were read, and
    # orient="row" avoids relying on (and being warned about) orientation inference.
    return pl.DataFrame(
        rows,
        schema={"tweet": pl.Utf8, "label": pl.Int64},
        orient="row",
    )


def preprocess_text(text: str) -> str:
    """
    Clean a tweet for downstream tokenisation.

    The substitutions are applied in a fixed order: user mentions, then URLs,
    then every character that is neither a word character nor whitespace, and
    finally runs of whitespace are collapsed to a single space. Leading and
    trailing whitespace is stripped from the result.

    Args:
        text (str): The tweet text to preprocess.

    Returns:
        str: The cleaned tweet text.
    """
    # (pattern, replacement) pairs; order matters because mentions and URLs
    # must be removed before their punctuation is stripped.
    substitutions = (
        (r"@\w+", ""),  # mentions
        (r"http\S+", ""),  # URLs
        (r"[^\w\s]", ""),  # punctuation
        (r"\s+", " "),  # whitespace runs
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()