# Read in Data

In [None]:
import pandas as pd
import numpy as np
from typing import Optional

In [None]:
from custom_datasets import MultiLangDataset, SplitSet
from custom_datasets import ns_dataset
from custom_datasets import Languages
from custom_datasets import clean_tweet
from constants import TokenizerEnum, VectorizerEnum
from custom_vectorizers import get_vectorizer

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense, Dropout,Input, BatchNormalization, Activation
from tensorflow.keras.utils import to_categorical

In [None]:
# Per-language SplitSet objects, fetched from ns_dataset once and reused by
# every experiment cell below.
YOR_DATASET: SplitSet
HAU_DATASET: SplitSet
IBO_DATASET: SplitSet
PCM_DATASET: SplitSet

YOR_DATASET, HAU_DATASET, IBO_DATASET, PCM_DATASET = (
    ns_dataset.get(language)
    for language in (
        Languages.YORUBA,
        Languages.HAUSA,
        Languages.IGBO,
        Languages.NIGERIAN_PIDGIN,
    )
)

In [None]:
# Evaluator
from analysis import compare_results

# Tokenizer
from subword_tokenizer import (
    get_tokenizer,
    wordpiece_tokenize_dataframe,
    get_sentencepiece_tokenizer,
    sentencepiece_tokenize_dataframe,
    get_wordpiece_tokeized_data,
    get_sentencepiece_tokeized_data
)

# Compare Results
from sklearn.metrics import accuracy_score, classification_report

In [None]:
def encode_labels(df: pd.DataFrame) -> None:
    """Add an integer ``label_encoded`` column to ``df`` in place.

    The ``label`` column is lower-cased and mapped as negative -> 0,
    neutral -> 1, positive -> 2. Any other label value maps to NaN.

    Args:
        df: Frame with a string ``label`` column; it is modified in place.
    """
    class_order = ("negative", "neutral", "positive")
    code_by_name = {name: code for code, name in enumerate(class_order)}
    lowered_labels = df["label"].str.lower()
    df["label_encoded"] = lowered_labels.map(code_by_name)

In [None]:
# Extra keyword arguments forwarded to the selected vectorizer by the classic
# (Naive Bayes / Logistic Regression) pipelines. Left empty so the vectorizer
# defaults apply. An alternative configuration that was tried:
#     {"ngram": (1, 2), "max_features": 3700}
VECTORIZER_KWARGS: dict = {}

# Naive Bayes

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB


def naive_bayes(
    dataset: SplitSet, vectorizer: VectorizerEnum, tokenizer: Optional[TokenizerEnum] = None
) -> dict:
    """
    Train and evaluate a Multinomial Naive Bayes text classifier.

    The rows of ``dataset.train`` are optionally sub-word tokenized, then
    vectorized, then split 70/30 (``random_state=42``). The model is fitted on
    the 70% partition and scored on the remaining 30%.

    Args:
        dataset (SplitSet): The dataset to use; only ``dataset.train`` is read.
        vectorizer (VectorizerEnum): The vectorizer to use.
        tokenizer (TokenizerEnum): The tokenizer to use.
            If tokenizer is None, no tokenization is applied and the raw
            ``tweet`` column is vectorized directly.
    Returns:
        dict: The scikit-learn classification report (``output_dict=True``)
        for the held-out 30% partition.
    Raises:
        ValueError: If ``tokenizer`` is given but is neither WORDPIECE nor
            SENTENCEPIECE.
    """
    selected_vectorizer = get_vectorizer(vectorizer)
    data: pd.DataFrame = dataset.train

    if tokenizer is None:
        texts = data["tweet"].tolist()
        labels = data["label"]
    else:
        if tokenizer == TokenizerEnum.WORDPIECE:
            tokenized_data = wordpiece_tokenize_dataframe(data, get_tokenizer(data))
        elif tokenizer == TokenizerEnum.SENTENCEPIECE:
            # Fix: this branch previously reused get_tokenizer(), i.e. the same
            # tokenizer the WordPiece branch builds, so the SentencePiece path
            # never used a SentencePiece tokenizer.
            # NOTE(review): assumes get_sentencepiece_tokenizer accepts the
            # DataFrame the same way get_tokenizer does -- confirm.
            tokenized_data = sentencepiece_tokenize_dataframe(
                data, get_sentencepiece_tokenizer(data)
            )
        else:
            raise ValueError(f"Unsupported tokenizer: {tokenizer}")

        # The vectorizer is given one string per document, so each token list
        # is joined back into a whitespace-separated string.
        texts = [" ".join(tokens) for tokens in tokenized_data["tokenized_tweets"].tolist()]
        labels = tokenized_data["label"].tolist()

    # NOTE(review): vectorization runs before the split; if the vectorizer is
    # fitted here, the held-out rows influence it.
    features, _ = selected_vectorizer(texts, **VECTORIZER_KWARGS)
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.3, random_state=42
    )

    model = MultinomialNB()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return classification_report(y_test, predictions, output_dict=True)

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression


def logistic_regression(
    dataset: SplitSet, vectorizer: VectorizerEnum, tokenizer: Optional[TokenizerEnum] = None
) -> dict:
    """
    Train and evaluate a Logistic Regression text classifier.

    The rows of ``dataset.train`` are optionally sub-word tokenized, then
    vectorized, then split 70/30 (``random_state=42``). The model is fitted on
    the 70% partition and scored on the remaining 30%.

    Args:
        dataset (SplitSet): The dataset to use; only ``dataset.train`` is read.
        vectorizer (VectorizerEnum): The vectorizer to use.
        tokenizer (TokenizerEnum): The tokenizer to use.
            If tokenizer is None, no tokenization is applied and the raw
            ``tweet`` column is vectorized directly.
    Returns:
        dict: The scikit-learn classification report (``output_dict=True``)
        for the held-out 30% partition.
    Raises:
        ValueError: If ``tokenizer`` is given but is neither WORDPIECE nor
            SENTENCEPIECE.
    """
    selected_vectorizer = get_vectorizer(vectorizer)
    data: pd.DataFrame = dataset.train

    if tokenizer is None:
        texts = data["tweet"].tolist()
        labels = data["label"]
    else:
        if tokenizer == TokenizerEnum.WORDPIECE:
            tokenized_data = wordpiece_tokenize_dataframe(data, get_tokenizer(data))
        elif tokenizer == TokenizerEnum.SENTENCEPIECE:
            # Fix: this branch previously reused get_tokenizer(), i.e. the same
            # tokenizer the WordPiece branch builds, so the SentencePiece path
            # never used a SentencePiece tokenizer.
            # NOTE(review): assumes get_sentencepiece_tokenizer accepts the
            # DataFrame the same way get_tokenizer does -- confirm.
            tokenized_data = sentencepiece_tokenize_dataframe(
                data, get_sentencepiece_tokenizer(data)
            )
        else:
            raise ValueError(f"Unsupported tokenizer: {tokenizer}")

        # The vectorizer is given one string per document, so each token list
        # is joined back into a whitespace-separated string.
        texts = [" ".join(tokens) for tokens in tokenized_data["tokenized_tweets"].tolist()]
        labels = tokenized_data["label"].tolist()

    # NOTE(review): vectorization runs before the split; if the vectorizer is
    # fitted here, the held-out rows influence it.
    features, _ = selected_vectorizer(texts, **VECTORIZER_KWARGS)
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.3, random_state=42
    )

    model = LogisticRegression()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return classification_report(y_test, predictions, output_dict=True)

# Neural Network

In [None]:



def neural_network(
    dataset: SplitSet, vectorizer: VectorizerEnum, tokenizer: Optional[TokenizerEnum] = None, clean_tweets: bool = True
) -> dict:
    """
    Train and evaluate a dense feed-forward classifier on sub-word features.

    A copy of ``dataset.train`` is optionally cleaned, label-encoded and handed
    to the WordPiece or SentencePiece data builder. A Keras model
    (512 -> 256 -> 128 -> 3-way softmax) is trained for 10 epochs on the
    builder's train split and scored on its test split.

    Args:
        dataset (SplitSet): The dataset to use; only ``dataset.train`` is read
            and it is not modified.
        vectorizer (VectorizerEnum): Currently unused by this function; kept
            so the signature matches the other classifiers.
        tokenizer (TokenizerEnum): WORDPIECE or SENTENCEPIECE. A sub-word
            tokenizer is required here.
        clean_tweets (bool): If True, ``clean_tweet`` is applied and the
            cleaned text is tokenized; otherwise the raw ``tweet`` column is.
    Returns:
        dict: The scikit-learn classification report (``output_dict=True``)
        for the test split, keyed by class name.
    Raises:
        ValueError: If ``tokenizer`` is None or not a supported tokenizer.
    """
    # Copy so the helper columns added below never leak into the shared
    # dataset. Previously a "cleaned_tweet" column left by an earlier call was
    # silently reused by later clean_tweets=False runs.
    df = dataset.train.copy()
    if clean_tweets:
        df["cleaned_tweet"] = df["tweet"].apply(clean_tweet)
        tweet_column = "cleaned_tweet"
    else:
        # Fix: the raw column is used when cleaning is disabled, instead of
        # always reading "cleaned_tweet".
        tweet_column = "tweet"
    encode_labels(df)

    # Fix: the old `if tokenizer is not None ... elif SENTENCEPIECE` chain made
    # the SentencePiece branch unreachable and left `neural_input` unbound for
    # tokenizer=None.
    if tokenizer == TokenizerEnum.WORDPIECE:
        build_neural_input = get_wordpiece_tokeized_data
    elif tokenizer == TokenizerEnum.SENTENCEPIECE:
        build_neural_input = get_sentencepiece_tokeized_data
    else:
        raise ValueError(f"Unsupported tokenizer: {tokenizer}")

    neural_input = build_neural_input(
        df,
        vocab_size=3700,
        tweet_column=tweet_column,
        vectorizer_kwargs={"ngram": (1, 2), "max_features": None},
    )

    model = Sequential()
    model.add(Input(shape=(neural_input.X_train.shape[1],)))

    # Dense stack over the vectorized input: 512 -> 256 -> 128 -> 3.
    model.add(Dense(512))
    model.add(BatchNormalization())
    model.add(Activation("relu"))
    model.add(Dropout(0.4))
    model.add(Dense(256))
    model.add(BatchNormalization())
    model.add(Activation("relu"))
    model.add(Dropout(0.2))
    # NOTE(review): unlike the two blocks above, this one has no Activation
    # after BatchNormalization; left unchanged to keep results comparable.
    model.add(Dense(128))
    model.add(BatchNormalization())
    model.add(Dense(3, activation="softmax"))

    # Sparse loss: the targets are integer class ids, not one-hot vectors.
    model.compile(
        loss="sparse_categorical_crossentropy",
        optimizer=Adam(),
        metrics=["accuracy"],
    )

    X = np.array(neural_input.get_dense_X_train())
    y = np.array(neural_input.y_train)

    model.fit(X, y, epochs=10, batch_size=64, verbose=1)

    # Evaluate on the builder's test split.
    y_pred = model.predict(
        np.array(neural_input.get_dense_X_test()),
    )
    y_pred_classes = np.argmax(y_pred, axis=1)
    accuracy = accuracy_score(neural_input.y_test, y_pred_classes)
    print(f"Accuracy with filtered tweets {accuracy:.4f}")

    # Fix: classification_report pairs target_names with the sorted label ids,
    # and encode_labels maps negative=0, neutral=1, positive=2. The previous
    # order ("positive", "neutral", "negative") swapped the outer two classes.
    return classification_report(
        neural_input.y_test,
        y_pred_classes,
        target_names=["negative", "neutral", "positive"],
        output_dict=True,
    )
    

In [None]:
# Yoruba neural-network experiment grid: {WordPiece, SentencePiece} x {cleaned, raw}.
nn_yor_kwargs = {
    "nn_yor_wp_clean": dict(
        dataset=YOR_DATASET,
        vectorizer=VectorizerEnum.TFIDF,
        tokenizer=TokenizerEnum.WORDPIECE,
        clean_tweets=True,
    ),
    "nn_yor_wp_no_clean": dict(
        dataset=YOR_DATASET,
        vectorizer=VectorizerEnum.TFIDF,
        tokenizer=TokenizerEnum.WORDPIECE,
        clean_tweets=False,
    ),
    "nn_yor_sp_clean": dict(
        dataset=YOR_DATASET,
        vectorizer=VectorizerEnum.TFIDF,
        tokenizer=TokenizerEnum.SENTENCEPIECE,
        clean_tweets=True,
    ),
    "nn_yor_sp_no_clean": dict(
        dataset=YOR_DATASET,
        vectorizer=VectorizerEnum.TFIDF,
        tokenizer=TokenizerEnum.SENTENCEPIECE,
        clean_tweets=False,
    ),
}

# Run the four configurations in the order they are declared above.
nn_yor_wp = neural_network(**nn_yor_kwargs["nn_yor_wp_clean"])
nn_yor_wp_no_clean = neural_network(**nn_yor_kwargs["nn_yor_wp_no_clean"])
nn_yor_sp = neural_network(**nn_yor_kwargs["nn_yor_sp_clean"])
nn_yor_sp_no_clean = neural_network(**nn_yor_kwargs["nn_yor_sp_no_clean"])

# Cleaned tweets: WordPiece vs SentencePiece, then the same pair with roles swapped.
compare_results(normal_result=nn_yor_wp, subword_result=nn_yor_sp)
compare_results(normal_result=nn_yor_sp, subword_result=nn_yor_wp)

# Raw tweets: WordPiece vs SentencePiece.
compare_results(normal_result=nn_yor_wp_no_clean, subword_result=nn_yor_sp_no_clean)

In [None]:
# Yoruba, WordPiece neural network: cleaned tweets (result1) vs raw tweets (result2).
result1 = neural_network(
    YOR_DATASET, VectorizerEnum.TFIDF, TokenizerEnum.WORDPIECE, clean_tweets=True
)
result2 = neural_network(
    YOR_DATASET, VectorizerEnum.TFIDF, TokenizerEnum.WORDPIECE, clean_tweets=False
)

compare_results(normal_result=result1, subword_result=result2)

In [None]:
# Yoruba, Logistic Regression: no tokenizer (result1) vs WordPiece (result2).
result1 = logistic_regression(YOR_DATASET, VectorizerEnum.TFIDF, tokenizer=None)
result2 = logistic_regression(
    YOR_DATASET, VectorizerEnum.TFIDF, tokenizer=TokenizerEnum.WORDPIECE
)

compare_results(normal_result=result1, subword_result=result2)

In [None]:
# Yoruba, Naive Bayes: no tokenizer (result1) vs WordPiece (result2).
result1 = naive_bayes(YOR_DATASET, VectorizerEnum.TFIDF, tokenizer=None)
result2 = naive_bayes(
    YOR_DATASET, VectorizerEnum.TFIDF, tokenizer=TokenizerEnum.WORDPIECE
)

compare_results(normal_result=result1, subword_result=result2)