In [None]:
import os
import pandas as pd
import subprocess
from typing import Dict, List, Optional
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.normalizers import Sequence, Lowercase, NFD, StripAccents
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer

import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense, Dropout,Input, BatchNormalization, Activation
from tensorflow.keras.utils import to_categorical

from models import BasicModelEncapsulator, NeuralNetworkModel
from custom_vectorizers import initialise_count_vectorizer, initialise_tfidf_vectorizer
from constants import LOCAL_DIR_AS, LOCAL_DIR_NS, REPO_URL_AS, REPO_URL_NS, NS_LANGUAGES
from custom_datasets import MultiLangDataset, load_local_datasets

from custom_datasets import Languages


In [None]:
def clone_repo(repo_url: str, local_dir: str) -> None:
    """Clone ``repo_url`` into ``local_dir``, or update the checkout already there.

    Args:
        repo_url: URL handed to ``git clone``.
        local_dir: Directory that holds (or will hold) the working copy.

    Raises:
        subprocess.CalledProcessError: If the git command exits with a
            non-zero status (``check=True``).
    """
    if os.path.isdir(local_dir):
        print("Repository exists. Updating...")
        subprocess.run(["git", "-C", local_dir, "pull", "origin", "main"], check=True)
    else:
        print("Repository not found. Cloning...")
        # Pass the destination explicitly: without it git derives the folder
        # name from the URL, which need not be the directory tested above.
        subprocess.run(["git", "clone", repo_url, local_dir], check=True)

# Fetch or refresh both dataset checkouts before anything is loaded from disk.
clone_repo(REPO_URL_NS, LOCAL_DIR_NS)
clone_repo(REPO_URL_AS, LOCAL_DIR_AS)


In [None]:
# Load the annotated tweets from the NS checkout for every language in NS_LANGUAGES.
ns_dataset: MultiLangDataset = load_local_datasets(
    local_base_dir=f"{LOCAL_DIR_NS}/data/annotated_tweets",
    languages=NS_LANGUAGES,
)

In [None]:
# NOTE(review): this path is hard-coded rather than derived from LOCAL_DIR_AS,
# and the language list is NS_LANGUAGES - confirm both are intended.
as_dataset: MultiLangDataset = load_local_datasets(
    local_base_dir="afrisent-semeval-2023/data",
    languages=NS_LANGUAGES,
)

In [None]:
# Sanity check: list the languages each loaded dataset reports.
for _caption, _dataset in (
    ("NaijaSenti dataset loaded with languages:", ns_dataset),
    ("Afrisenti dataset loaded with languages:", as_dataset),
):
    print(_caption, _dataset.all_languages())

In [None]:


print("NaijaSenti hau: ", ns_dataset.get(Languages.HAUSA).test)
# Print the 'tweet' column of every row in the Hausa *test* split.
for index, tweet in ns_dataset.get(Languages.HAUSA).test['tweet'].items():
    print(f"Index: {index}, Tweet: {tweet}")

# Write the Hausa *dev* split tweets, one per line, into data/.
# exist_ok replaces the check-then-create pair, which could race with another
# process creating the directory in between.
os.makedirs('data', exist_ok=True)
with open('data/naija_senti_hau_dev_tweets.txt', 'w', encoding='utf-8') as f:
    f.writelines(
        f"{dev_tweet}\n" for dev_tweet in ns_dataset.get(Languages.HAUSA).dev['tweet']
    )

In [None]:
df = ns_dataset.get(Languages.HAUSA).train
# Seeded so the 70/30 split - and every score computed from it - is identical
# after Restart & Run All. 42 matches the seed get_wordpiece_tokeized_data uses.
text_train, text_test, y_train, y_test = train_test_split(
    df.tweet, df.label, test_size=0.3, random_state=42
)
# Both vectorisers are fitted on the training texts only.
X_train_tfidf, vectorizer_tfidf = initialise_tfidf_vectorizer(text_train)
X_train_count, vectorizer_count = initialise_count_vectorizer(text_train)

In [None]:
def encode_labels(df):
    """Add an integer ``label_encoded`` column derived from ``df['label']``.

    Labels are lower-cased and mapped negative -> 0, neutral -> 1,
    positive -> 2. Any other label becomes NaN.

    Args:
        df: DataFrame with a string ``label`` column. It is modified in place.

    Returns:
        The same DataFrame object, so both ``encode_labels(df)`` and
        ``df = encode_labels(df)`` work.
    """
    label_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}
    df['label_encoded'] = df['label'].str.lower().map(label_mapping)
    return df

In [None]:
from subword_tokenizer import get_tokenizer, wordpiece_tokenize_dataframe
class NeuralNetworkInput:
    """One train/test split of vectorised text, plus the objects that produced it.

    Public fields:
        X_train, X_test: feature matrices exposing ``.toarray()`` and ``.shape``.
        y_train, y_test: labels aligned with the rows of the matrices.
        num_classes: number of distinct labels.
        num_features: number of feature columns.
        tokenizer, vectorizer: the fitted preprocessing objects, or ``None``
            when the caller did not supply them. Inference on new text needs
            exactly these instances.
    """

    def __init__(self, X_train, y_train, X_test, y_test, num_classes, num_features,
                 tokenizer=None, vectorizer=None):
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        self.num_classes = num_classes
        self.num_features = num_features
        self.tokenizer = tokenizer
        self.vectorizer = vectorizer

    def get_dense_X_train(self):
        """Return the training features as a dense float32 array."""
        X_train = self.X_train.toarray()
        return X_train.astype(np.float32)

    def get_dense_X_test(self):
        """Return the test features as a dense float32 array."""
        X_test = self.X_test.toarray()
        return X_test.astype(np.float32)

    def get_input_shape(self):
        """Return the per-sample input shape, ``(n_features,)``."""
        return (self.X_train.shape[1],)

    def get_num_classes(self):
        """Return the number of distinct labels."""
        return self.num_classes

    def get_num_features(self):
        """Return the number of feature columns."""
        return self.num_features


def get_wordpiece_tokeized_data(
    df,
    vocab_size=None,
    vectorizer_kwargs: Optional[Dict] = None,
    tweet_column: str = "tweet",
) -> NeuralNetworkInput:
    """Build a WordPiece + TF-IDF train/test split from a labelled DataFrame.

    The rows are split 70/30 (``random_state=42``) *before* the TF-IDF
    vectoriser is fitted, so vocabulary and IDF weights come from the training
    rows only and the test rows are merely transformed. Fitting on all rows and
    splitting afterwards would leak test-set statistics into the features.

    Args:
        df: Frame passed to ``get_tokenizer`` and ``wordpiece_tokenize_dataframe``.
            The tokenised frame must expose ``tokenized_tweets`` and
            ``label_encoded`` columns.
        vocab_size: Forwarded to ``get_tokenizer``.
        vectorizer_kwargs: Extra keyword arguments for
            ``initialise_tfidf_vectorizer``; ``None`` means none.
        tweet_column: Forwarded to ``get_tokenizer``.

    Returns:
        A ``NeuralNetworkInput`` that also carries the fitted tokenizer and
        vectorizer.
    """
    # NOTE(review): the tokenizer is still trained on every row of df,
    # including the rows that end up in the test split - confirm acceptable.
    tokenizer = get_tokenizer(df=df, vocab_size=vocab_size, tweet_column=tweet_column)

    # NOTE(review): tweet_column is not forwarded here (unchanged from before);
    # confirm which column wordpiece_tokenize_dataframe tokenises by default.
    wp_df = wordpiece_tokenize_dataframe(df, tokenizer)

    # The vectoriser expects strings, so join each token list with spaces.
    documents = [' '.join(tokens) for tokens in wp_df['tokenized_tweets'].tolist()]
    labels = wp_df['label_encoded'].tolist()

    train_docs, test_docs, y_train, y_test = train_test_split(
        documents, labels, test_size=0.3, random_state=42
    )

    X_train, vectorizer = initialise_tfidf_vectorizer(train_docs, **(vectorizer_kwargs or {}))
    X_test = vectorizer.transform(test_docs)

    return NeuralNetworkInput(
        X_train,
        y_train,
        X_test,
        y_test,
        len(np.unique(labels)),
        X_train.shape[1],
        tokenizer=tokenizer,
        vectorizer=vectorizer,
    )
    

In [None]:
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, classification_report

import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    Dense,
    Dropout,
    Input,
    BatchNormalization,
    Activation,
)
from tensorflow.keras.utils import to_categorical




In [None]:
# vocab_list = [1000,1500, 2000,2250, 2500,2750, 3000, 4000, 5000,3000, 6000, 8000, 10000, 12000, 15000, 20000]
# Results of the (currently commented-out) vocabulary-size sweep below: size -> accuracy.
vocab_accuracy = {}
# for index in vocab_list:
# for index in range(2000, 12000, 100):
#     print(f"Training model with vocabulary size: {index}")
#     df = ns_dataset.get(Languages.HAUSA).train
#     encode_labels(df)
#     neural_input = get_wordpiece_tokeized_data(df, vocab_size=index)

#     model = Sequential()
#     model.add(Input(shape=(neural_input.X_train.shape[1],))) 

#     # Dense layers for TF-IDF input
#     model.add(Dense(256))
#     model.add(BatchNormalization())
#     model.add(Activation("relu"))
#     model.add(Dropout(0.4))
#     model.add(Dense(128))
#     model.add(BatchNormalization())
#     model.add(Activation("relu"))
#     model.add(Dropout(0.3))
#     model.add(Dense(64))
#     model.add(BatchNormalization())
#     model.add(Dense(3, activation="softmax"))

#     model.compile(
#     loss="sparse_categorical_crossentropy",
#     optimizer=Adam(),
#     metrics=["accuracy"],
#     )
   
#     X = np.array(neural_input.get_dense_X_train())
#     y = np.array(neural_input.y_train)

    
#     model.fit(X, y, epochs=10, batch_size=32, verbose=0)

#     # Evaluate the model
# #model.evaluate(neural_input.X_test, neural_input.y_test)
#     y_pred = model.predict(np.array(neural_input.get_dense_X_test()), )
#     y_pred_classes = np.argmax(y_pred, axis=1)
#     accuracy = accuracy_score(neural_input.y_test, y_pred_classes)
#     print(f"Accuracy for vocabulary size {index}: {accuracy:.4f}")
#     vocab_accuracy[index] = accuracy
#     print(classification_report(neural_input.y_test, y_pred_classes, target_names=['positive', 'neutral', 'negative']))
#     print("\n" + "="*50 + "\n")

In [None]:
# Report the results in ascending key (vocabulary size) order.
print("Vocabulary size vs Accuracy:")
for vocab_size in sorted(vocab_accuracy):
    accuracy = vocab_accuracy[vocab_size]
    print(f"Vocabulary Size: {vocab_size}, Accuracy: {accuracy:.4f}")

# Save the vocab_accuracy dictionary to a file. JSON object keys are strings,
# so integer keys are stringified on dump.
import json
with open('vocab_accuracy.json', 'w') as f:
    json.dump(vocab_accuracy, f)

In [None]:
# vectorizer_params = [{'ngram': (1,2), 'max_features': 5000},
#     {'ngram': (1,3), 'max_features': 5000},
#     {'ngram': (1,2), 'max_features': 10000},
#     {'ngram': (1,3), 'max_features': 10000},
#     {'ngram': (1,2), 'max_features': None},
#     {'ngram': (1,3), 'max_features': None},
#     {'ngram': (1,2), 'max_features': 2000},
#     {'ngram': (1,3), 'max_features': 2000},
#     {'ngram': (1,2), 'max_features': 3000},
#     {'ngram': (1,3), 'max_features': 3000},
#     {'ngram': (1,2), 'max_features': 4000},
#     {'ngram': (1,3), 'max_features': 4000},
#     {'ngram': (1,2), 'max_features': 6000},
#     {'ngram': (1,3), 'max_features': 6000},
#     {'ngram': (1,2), 'max_features': 7000},
#     {'ngram': (1,3), 'max_features': 7000},
#     {'ngram': (1,2), 'max_features': 8000},
#     {'ngram': (1,3), 'max_features': 8000},
#     {'ngram': (1,2), 'max_features': 9000},
#     {'ngram': (1,3), 'max_features': 9000},
#     {'ngram': (1,2), 'max_features': 10000},
#     {'ngram': (1,3), 'max_features': 10000},
#     {'ngram': (1,2), 'max_features': 12000},
#     {'ngram': (1,4), 'max_features': 5000},
#     {'ngram': (1,4), 'max_features': 10000},
#     {'ngram': (1,4), 'max_features': None},
#     {'ngram': (1,4), 'max_features': 2000},
#     {'ngram': (1,4), 'max_features': 3000},
#     {'ngram': (1,4), 'max_features': 4000},
#     {'ngram': (1,4), 'max_features': 6000},
#     {'ngram': (1,4), 'max_features': 7000},
#     {'ngram': (1,4), 'max_features': 8000},
#     {'ngram': (1,4), 'max_features': 9000},
#     {'ngram': (1,4), 'max_features': 10000},
#     {'ngram': (1,4), 'max_features': 12000},
#     {'ngram': (2,5), 'max_features': 5000},
#     {'ngram': (2,5), 'max_features': 10000},
#     {'ngram': (2,5), 'max_features': None},
#     {'ngram': (2,5), 'max_features': 2000},
#     {'ngram': (2,5), 'max_features': 3000},
#     {'ngram': (2,5), 'max_features': 4000},
#     {'ngram': (2,5), 'max_features': 6000},
#     {'ngram': (2,5), 'max_features': 7000},
#     {'ngram': (2,5), 'max_features': 8000},
#     {'ngram': (2,5), 'max_features': 9000},
#     {'ngram': (2,5), 'max_features': 10000},
#     {'ngram': (2,5), 'max_features': 12000},
#     {'ngram': (3,5), 'max_features': 5000},
#     {'ngram': (3,5), 'max_features': 10000},
#     {'ngram': (3,5), 'max_features': None},
#     {'ngram': (3,5), 'max_features': 2000},
#     {'ngram': (3,5), 'max_features': 3000},
#     {'ngram': (3,5), 'max_features': 4000},
#     {'ngram': (3,5), 'max_features': 6000},
#     {'ngram': (3,5), 'max_features': 7000},
#     {'ngram': (3,5), 'max_features': 8000},
#     {'ngram': (3,5), 'max_features': 9000},
#     {'ngram': (3,5), 'max_features': 10000},
#     {'ngram': (3,5), 'max_features': 12000}]
# ngram = (1, 2) and max features adjustments
# Every max_features value is listed once. A repeated entry would only retrain
# the same configuration and overwrite its own result, because the disabled
# sweep below keys results by str(params).
_MAX_FEATURES_SWEEP = [
    5000, 10000, None, 2000, 3000, 4000, 6000, 7000, 8000, 9000,
    12000, 15000, 20000, 500, 1000, 1500, 2500,
]
vectorizer_params = [
    {'ngram': (1, 2), 'max_features': max_features}
    for max_features in _MAX_FEATURES_SWEEP
]
vocab_accuracy = {}


# for index in vocab_list:
# for index in vectorizer_params:
#     print(f"Training model with vocabulary size: {index}")
#     df = ns_dataset.get(Languages.HAUSA).train
#     encode_labels(df)
#     neural_input = get_wordpiece_tokeized_data(df, vocab_size=8000, vectorizer_kwargs=index)

#     model = Sequential()
#     model.add(Input(shape=(neural_input.X_train.shape[1],))) 

#     # Dense layers for TF-IDF input
#     model.add(Dense(256))
#     model.add(BatchNormalization())
#     model.add(Activation("relu"))
#     model.add(Dropout(0.4))
#     model.add(Dense(128))
#     model.add(BatchNormalization())
#     model.add(Activation("relu"))
#     model.add(Dropout(0.3))
#     model.add(Dense(64))
#     model.add(BatchNormalization())
#     model.add(Dense(3, activation="softmax"))

#     model.compile(
#     loss="sparse_categorical_crossentropy",
#     optimizer=Adam(),
#     metrics=["accuracy"],
#     )
   
#     X = np.array(neural_input.get_dense_X_train())
#     y = np.array(neural_input.y_train)

    
#     model.fit(X, y, epochs=10, batch_size=32, verbose=0)

#     # Evaluate the model
# #model.evaluate(neural_input.X_test, neural_input.y_test)
#     y_pred = model.predict(np.array(neural_input.get_dense_X_test()), )
#     y_pred_classes = np.argmax(y_pred, axis=1)
#     accuracy = accuracy_score(neural_input.y_test, y_pred_classes)
#     print(f"Accuracy for vocabulary size {index}: {accuracy:.4f}")
#     vocab_accuracy[str(index)] = accuracy
#     print(classification_report(neural_input.y_test, y_pred_classes, target_names=['positive', 'neutral', 'negative']))
#     print("\n" + "="*50 + "\n")

In [None]:
# print("Vocabulary size vs Accuracy:")
# Keys here are stringified vectoriser parameter dicts (see the sweep above).
for params_key in vocab_accuracy:
    print(f"Vocabulary Size: {params_key}, Accuracy: {vocab_accuracy[params_key]:.4f}")

# Persist the sweep results as indented JSON.
import json
with open('vectorizer_params_accuracy.json', 'w') as out_file:
    json.dump(vocab_accuracy, out_file, indent=4)

In [None]:
# Dense layers size tuple list (3 items each)
# Each architecture is listed once. (256, 128, 64), (512, 256, 128) and
# (1024, 512, 256) used to appear twice, which only retrained the same network
# and overwrote its entry in the str(sizes)-keyed results.
dense_layer_sizes = [
    # widths halve from layer to layer
    (256, 128, 64),
    (512, 256, 128),
    (1024, 512, 256),
    (128, 64, 32),
    (64, 32, 16),
    (32, 16, 8),
    (16, 8, 4),
    (8, 4, 2),
    # first two layers share a width
    (256, 256, 128),
    (512, 512, 256),
    (1024, 1024, 512),
    (128, 128, 64),
    (64, 64, 32),
    (32, 32, 16),
    (16, 16, 8),
    (8, 8, 4),
    # largest configuration
    (2048, 1024, 512),
]
dense_layer_accuracy = {}

# for index in vocab_list:
# for index in dense_layer_sizes:
#     print(f"Training model with vocabulary size: {index}")
#     df = ns_dataset.get(Languages.HAUSA).train
#     encode_labels(df)
#     neural_input = get_wordpiece_tokeized_data(
#         df, vocab_size=8000,
#     )

#     model = Sequential()
#     model.add(Input(shape=(neural_input.X_train.shape[1],)))

#     # Dense layers for TF-IDF input
#     model.add(Dense(index[0]))
#     model.add(BatchNormalization())
#     model.add(Activation("relu"))
#     model.add(Dropout(0.4))
#     model.add(Dense(index[1]))
#     model.add(BatchNormalization())
#     model.add(Activation("relu"))
#     model.add(Dropout(0.3))
#     model.add(Dense(index[2]))
#     model.add(BatchNormalization())
#     model.add(Dense(3, activation="softmax"))

#     model.compile(
#         loss="sparse_categorical_crossentropy",
#         optimizer=Adam(),
#         metrics=["accuracy"],
#     )

#     X = np.array(neural_input.get_dense_X_train())
#     y = np.array(neural_input.y_train)

#     model.fit(X, y, epochs=10, batch_size=32, verbose=0)

#     # Evaluate the model
#     # model.evaluate(neural_input.X_test, neural_input.y_test)
#     y_pred = model.predict(
#         np.array(neural_input.get_dense_X_test()),
#     )
#     y_pred_classes = np.argmax(y_pred, axis=1)
#     accuracy = accuracy_score(neural_input.y_test, y_pred_classes)
#     print(f"Accuracy for vocabulary size {index}: {accuracy:.4f}")
#     dense_layer_accuracy[str(index)] = accuracy
#     print(
#         classification_report(
#             neural_input.y_test,
#             y_pred_classes,
#             target_names=["positive", "neutral", "negative"],
#         )
#     )
#     print("\n" + "=" * 50 + "\n")

In [None]:
# Print one line per dense-layer configuration that was evaluated.
for layer_key in dense_layer_accuracy:
    print(f"Dense layer Size: {layer_key}, Accuracy: {dense_layer_accuracy[layer_key]:.4f}")

# Save the dense_layer_accuracy dictionary to a file.
import json
with open('dense_layer_accuracy.json', 'w') as out_file:
    json.dump(dense_layer_accuracy, out_file, indent=4)

In [None]:
# Try cleaned tweets with wordpiece tokenization


import re


# Compiled once at definition time instead of on every call.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001f600-\U0001f64f"  # emoticons
    "\U0001f300-\U0001f5ff"  # symbols & pictographs
    "\U0001f680-\U0001f6ff"  # transport & map symbols
    "\U0001f1e0-\U0001f1ff"  # flags (iOS)
    "\U00002500-\U00002bef"
    "\U00002702-\U000027b0"
    "\U000024c2-\U0001f251"
    "\U0001f926-\U0001f937"
    "\U00010000-\U0010ffff"
    "\u2640-\u2642"
    "\u2600-\u2b55"
    "\u200d"
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\ufe0f"
    "\u3030"
    "]+",
    flags=re.UNICODE,
)


def clean_tweet(tweet):
    """Replace @mentions, punctuation and emoji-range characters with spaces.

    Args:
        tweet: The raw tweet. Non-string values are converted with ``str``.
            Missing values (``pd.isna``) are returned unchanged.

    Returns:
        The cleaned text with whitespace runs collapsed to single spaces and
        the ends stripped, or the original missing value.
    """
    if pd.isna(tweet):
        return tweet

    # Convert to string in case of mixed types
    tweet = str(tweet)

    # Order matters: mentions must go before punctuation, otherwise the '@'
    # is stripped first and the handle survives as an ordinary word.
    tweet = re.sub(r"@\w+", " ", tweet)
    tweet = re.sub(r"[^\w\s]", " ", tweet)
    tweet = _EMOJI_PATTERN.sub(" ", tweet)

    # Collapse whitespace runs, then trim. No per-tweet print here: this
    # function is meant to run under .apply() over a whole column.
    return re.sub(r"\s+", " ", tweet).strip()


df = ns_dataset.get(Languages.HAUSA).train
encode_labels(df)

# The pipeline below is told to read 'cleaned_tweet', so the column has to
# exist before it is called.
df['cleaned_tweet'] = df['tweet'].apply(clean_tweet)

neural_input = get_wordpiece_tokeized_data(
    df,
    vocab_size=3700,
    tweet_column='cleaned_tweet',
    vectorizer_kwargs={'ngram': (1, 2), 'max_features': None},
)

model = Sequential()
model.add(Input(shape=(neural_input.X_train.shape[1],)))

# Dense layers for TF-IDF input
# (512, 256, 128)
# (8, 4, 2)
model.add(Dense(16))
model.add(BatchNormalization())
model.add(Activation("relu"))
model.add(Dropout(0.4))
model.add(Dense(16))
model.add(BatchNormalization())
model.add(Activation("relu"))
model.add(Dropout(0.2))
model.add(Dense(8))
model.add(BatchNormalization())
model.add(Dense(3, activation="softmax"))

model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=Adam(),
    metrics=["accuracy"],
)

X = np.array(neural_input.get_dense_X_train())
y = np.array(neural_input.y_train)

model.fit(X, y, epochs=100, batch_size=64, verbose=1)

# Evaluate the model
# model.evaluate(neural_input.X_test, neural_input.y_test)
y_pred = model.predict(
    np.array(neural_input.get_dense_X_test()),
)
y_pred_classes = np.argmax(y_pred, axis=1)
accuracy = accuracy_score(neural_input.y_test, y_pred_classes)
print(f"Accuracy with filtered tweets {accuracy:.4f}")

# target_names are listed in class-index order, which encode_labels fixes as
# 0 = negative, 1 = neutral, 2 = positive. Passing labels explicitly keeps the
# names aligned even if a class is absent from the hold-out split.
print(
    classification_report(
        neural_input.y_test,
        y_pred_classes,
        labels=[0, 1, 2],
        target_names=["negative", "neutral", "positive"],
    )
)
print("\n" + "=" * 50 + "\n")

In [None]:
# Score the model on the hold-out split that was produced together with its
# training data. Dense float32 features and integer labels match what
# model.fit received.
loss, accuracy = model.evaluate(
    neural_input.get_dense_X_test(),
    np.array(neural_input.y_test),
)
print(f"Test Accuracy: {accuracy:.2f}")

In [None]:
def predict_sentiment(tweet: str):
    """Predict the sentiment label of a single tweet with the notebook's model.

    Args:
        tweet: Raw tweet text.

    Returns:
        A ``(label, probs)`` tuple: ``label`` is 'negative', 'neutral' or
        'positive', and ``probs`` is the model's output row for the tweet.
    """
    # NOTE(review): `tokenizer` and `vectorizer_wp` are read as notebook
    # globals, but no active cell visible here assigns them. They must be bound
    # to the tokenizer/vectorizer the model was trained with.
    subwords = ' '.join(tokenizer.encode(tweet).tokens)

    # TF-IDF vectorization
    vector = vectorizer_wp.transform([subwords]).toarray()

    # Predict: first (only) row of the batch
    probs = model.predict(vector)[0]
    predicted_class = int(np.argmax(probs))

    # Class indices follow encode_labels: 0 = negative, 1 = neutral, 2 = positive.
    index_to_label = ('negative', 'neutral', 'positive')
    return index_to_label[predicted_class], probs

curr_df = ns_dataset.get(Languages.HAUSA).test
# encode_labels adds 'label_encoded' in place. Its result is not assigned, so
# curr_df keeps pointing at the DataFrame.
encode_labels(curr_df)

correct_count = 0
total_count = len(curr_df)

for index, row in curr_df.iterrows():
    tweet = row['tweet']
    actual_label = str(row['label']).lower()

    sentiment, probabilities = predict_sentiment(tweet)
    print(f"Index: {index}")
    print(f"Tweet: {tweet}\nPredicted Sentiment: {sentiment}, Probabilities: {probabilities}\n")
    print(f"Actual Sentiment: {row['label']}\n")

    # Compare against this row's label only, not the whole row.
    if sentiment.lower() == actual_label:
        correct_count += 1

# An empty split reports 0% instead of raising ZeroDivisionError.
accuracy = correct_count / total_count if total_count else 0.0
print(f"Accuracy: {accuracy * 100:.2f}%")


In [None]:
# Get the number of features from your TF-IDF matrix
# tfidf_features = X_train_tfidf.shape[1]  # Number of TF-IDF features
# num_classes = len(np.unique(y_train))    # Number of sentiment classes

# Initialize models


logistic_regression_model = BasicModelEncapsulator(LogisticRegression(max_iter=1000), name="Logistic Regression")
naive_bayes_model = BasicModelEncapsulator(MultinomialNB(), name="Naive Bayes")

# Word-level TF-IDF dimensions (from the first split cell).
tfidf_features = X_train_tfidf.shape[1]  # Number of TF-IDF features
num_classes = len(np.unique(y_train))    # Number of classes


# Initialize the corrected neural network
neural_network_model = NeuralNetworkModel(
    input_dim=tfidf_features,
    num_classes=num_classes,
    name="Neural Network",
)

# WordPiece dimensions are taken from neural_input, the only place the
# WordPiece TF-IDF matrix is kept at notebook level.
wp_tfidf_features = neural_input.get_num_features()
wp_num_classes = neural_input.get_num_classes()

wordpiece_neural_network_model = NeuralNetworkModel(
    input_dim=wp_tfidf_features,
    num_classes=wp_num_classes
)

# accuracy_nn_count, report_nn_count = wordpiece_neural_network_model.perform_pipeline(tfidf_wp_train, wp_y_train)
# print("Neural Network with Count Vectorizer Accuracy:", accuracy_nn_count)
# print("Neural Network with Count Vectorizer Classification Report:\n", report_nn_count)


# Perform pipelines
print("Training models...")

In [None]:
# Evaluator class
# Map integer predictions back to string labels

model = wordpiece_neural_network_model.model

# Class index -> name, in the order encode_labels assigns them.
label_mapping = {0: "negative", 1: "neutral", 2: "positive"}

# Bind the WordPiece split from neural_input; no active cell defines these
# names otherwise. Dense arrays match what the earlier Keras model was fed.
tfidf_wp_train = neural_input.get_dense_X_train()
tfidf_wp_test = neural_input.get_dense_X_test()
wp_y_train = np.array(neural_input.y_train)
wp_y_test = np.array(neural_input.y_test)

# NOTE(review): assumes .model has a Keras-style fit(X, y) and that
# wordpiece_neural_network_model.predict returns class indices (the lookup
# below requires that) - confirm against models.NeuralNetworkModel.
model.fit(tfidf_wp_train, wp_y_train)
predictions = wordpiece_neural_network_model.predict(tfidf_wp_test)
y_test_str = [label_mapping[label] for label in wp_y_test]
predictions_str = [label_mapping[label] for label in predictions]

accuracy = accuracy_score(y_test_str, predictions_str)
report = classification_report(y_test_str, predictions_str, output_dict=True)
from evaluator import Evaluator
# evaluator = Evaluator(
#     {  "Logistic_regression" : logistic_regression_model,
#        "Naive_Bayes" : naive_bayes_model,
#         "Neural_Network" :neural_network_model
#     }
#     )

# results, timings = evaluator.evaluate(X_train_tfidf, y_train)



# evaluator.compare_classification_reports(reports=results, timings=timings)


In [None]:
# wp_evaluator = Evaluator(
#     {  "Logistic_regression" : logistic_regression_model,
#        "Naive_Bayes" : naive_bayes_model,
#         "Wordpiece_Neural_Network" : wordpiece_neural_network_model
#     }
#     )

# results, timings = wp_evaluator.evaluate(tfidf_wp_train, wp_y_train)



# wp_evaluator.compare_classification_reports(reports=results, timings=timings)

In [None]:
# Logistic Regression on the TF-IDF features
accuracy_lr, report_lr = logistic_regression_model.perform_pipeline(X_train_tfidf, y_train)
for _caption, _value in (
    ("Logistic Regression Accuracy:", accuracy_lr),
    ("Logistic Regression Classification Report:\n", report_lr),
):
    print(_caption, _value)

# Logistic Regression on raw term counts (vectoriser refitted on the training texts)
X_train_count, vectorizer_count = initialise_count_vectorizer(text_train)
accuracy_lr_count, report_lr_count = logistic_regression_model.perform_pipeline(X_train_count, y_train)
for _caption, _value in (
    ("Logistic Regression with Count Vectorizer Accuracy:", accuracy_lr_count),
    ("Logistic Regression with Count Vectorizer Classification Report:\n", report_lr_count),
):
    print(_caption, _value)

In [None]:
# Naive Bayes on the TF-IDF features
accuracy_nb, report_nb = naive_bayes_model.perform_pipeline(X_train_tfidf, y_train)
for _caption, _value in (
    ("Naive Bayes Accuracy:", accuracy_nb),
    ("Naive Bayes Classification Report:\n", report_nb),
):
    print(_caption, _value)

# Naive Bayes on raw term counts (vectoriser refitted on the training texts)
X_train_count, vectorizer_count = initialise_count_vectorizer(text_train)
accuracy_nb_count, report_nb_count = naive_bayes_model.perform_pipeline(X_train_count, y_train)
for _caption, _value in (
    ("Naive Bayes with Count Vectorizer Accuracy:", accuracy_nb_count),
    ("Naive Bayes with Count Vectorizer Classification Report:\n", report_nb_count),
):
    print(_caption, _value)

In [None]:
# Neural Network on the TF-IDF features
# Integer codes: neutral -> 0, positive -> 1, anything else -> 2.
# NOTE(review): despite the name these are not one-hot vectors, and nothing in
# the visible cells reads ohe_labels - the pipelines below receive y_train.
_label_codes = {'neutral': 0, 'positive': 1}
ohe_labels = [_label_codes.get(label, 2) for label in y_train]

accuracy_nn, report_nn = neural_network_model.perform_pipeline(X_train_tfidf, y_train)
for _caption, _value in (
    ("Neural Network Accuracy:", accuracy_nn),
    ("Neural Network Classification Report:\n", report_nn),
):
    print(_caption, _value)

# Neural Network on raw term counts
accuracy_nn_count, report_nn_count = neural_network_model.perform_pipeline(X_train_count, y_train)
for _caption, _value in (
    ("Neural Network with Count Vectorizer Accuracy:", accuracy_nn_count),
    ("Neural Network with Count Vectorizer Classification Report:\n", report_nn_count),
):
    print(_caption, _value)


In [None]:
# Wordpiece tokenized models TFIDF

# from subword_tokenizer import get_tokenizer, wordpiece_tokenize_dataframe

# tokenizer = get_tokenizer(df=ns_dataset.get(Languages.HAUSA).train)

# train_df = ns_dataset.get(Languages.HAUSA).train
# test_df = ns_dataset.get(Languages.HAUSA).test
# # Naive Bayes with wordpiece tokenized data
# wp_train_df = wordpiece_tokenize_dataframe(train_df, tokenizer)
# wp_test_df = wordpiece_tokenize_dataframe(test_df, tokenizer)

# wp_X_train_list = wp_train_df['tokenized_tweets'].tolist()
# wp_X_test_list = wp_test_df['tokenized_tweets'].tolist()

# # join sub lists into strings
# wp_X_train_list = [' '.join(tokens) for tokens in wp_X_train_list]
# wp_X_test_list = [' '.join(tokens) for tokens in wp_X_test_list]
# # Convert labels to numerical format (0 for neutral, 1 for positive, 2 for negative)
# wp_train_df['label'] = wp_train_df['label'].apply(lambda x: 0 if x == 'neutral' else 1 if x == 'positive' else 2)
# wp_test_df['label'] = wp_test_df['label'].apply(lambda x: 0 if x == 'neutral' else 1 if x == 'positive' else 2)

# wp_y_train = wp_train_df['label'].tolist()
# wp_y_test = wp_test_df['label'].tolist()

# tfidf_wp_train, vectorizer_wp = initialise_tfidf_vectorizer(wp_X_train_list)
# tfidf_wp_test, _ = initialise_tfidf_vectorizer(wp_X_test_list)

# tfidf_features = tfidf_wp_train.shape[1]  # Number of TF-IDF features
# num_classes = len(np.unique(wp_y_train))    # Number of classes


# The commented-out code above used to define the WordPiece matrices and
# sizes. Bind them from neural_input so this cell runs on a fresh kernel and
# the network is sized for the WordPiece features, not the word-level TF-IDF.
# NOTE(review): this is the 70% training portion held by neural_input, and
# perform_pipeline is assumed to do its own hold-out - confirm.
tfidf_wp_train = neural_input.X_train
wp_y_train = neural_input.y_train
tfidf_features = neural_input.get_num_features()
num_classes = neural_input.get_num_classes()

# Initialize the corrected neural network
neural_network_model = NeuralNetworkModel(
    input_dim=tfidf_features,
    num_classes=num_classes
)


# Naive Bayes with WordPiece tokenized data
accuracy_nb, report_nb = naive_bayes_model.perform_pipeline(tfidf_wp_train, wp_y_train)
print("Naive Bayes Accuracy:", accuracy_nb)
print("Naive Bayes Classification Report:\n", report_nb)

# Logistic Regression with WordPiece tokenized data
accuracy_lr_wp, report_lr_wp = logistic_regression_model.perform_pipeline(tfidf_wp_train, wp_y_train)
print("Logistic Regression Accuracy:", accuracy_lr_wp)
print("Logistic Regression Classification Report:\n", report_lr_wp)

# Neural Network with WordPiece tokenized data
accuracy_nn_wp, report_nn_wp = neural_network_model.perform_pipeline(tfidf_wp_train, wp_y_train)
print("Neural Network Accuracy:", accuracy_nn_wp)
print("Neural Network Classification Report:\n", report_nn_wp)

In [None]:
# Method to optimize n-grams and max features for TF-IDF
def tfidf_score(input_x, y_train, score = None):
    """Cross-validate a fresh ``LogisticRegression(max_iter=1000)`` on the given features.

    Args:
        input_x: Feature matrix, passed to ``cross_val_score`` as ``X``.
        y_train: Target labels, passed as ``y``.
        score: Value for ``scoring``; ``None`` leaves the choice to
            ``cross_val_score``.

    Returns:
        Whatever ``cross_val_score`` returns (callers use ``.mean()`` and ``.std()``).
    """
    return cross_val_score(
        LogisticRegression(max_iter=1000),
        X=input_x,
        y=y_train,
        scoring=score,
    )
# Default scoring first, then macro-averaged F1; spread is reported as two standard deviations.
scores_tfidf = tfidf_score(X_train_tfidf, y_train)
_acc_mean, _acc_spread = scores_tfidf.mean(), scores_tfidf.std() * 2
print("5-fold Cross-Validation Accuracy for TFIDF: %0.2f (+/- %0.2f)" % (_acc_mean, _acc_spread))

scores_tfidf_f1 = tfidf_score(X_train_tfidf, y_train, score= 'f1_macro')
_f1_mean, _f1_spread = scores_tfidf_f1.mean(), scores_tfidf_f1.std() * 2

print("5-fold Cross-Validation F1 score for TFIDF: %0.2f (+/- %0.2f)" % (_f1_mean, _f1_spread))

def test_param_combos(X_train, y_train, param_combos):
    """Score every TF-IDF parameter combination with ``tfidf_score``.

    Args:
        X_train: Input handed to ``initialise_tfidf_vectorizer``.
        y_train: Labels handed to ``tfidf_score``.
        param_combos: Iterable of dicts read via ``.get('ngram')`` and
            ``.get('max_features')``.

    Returns:
        A DataFrame with one row per combination and the columns ``ngram``,
        ``max_features``, ``score`` (mean) and ``std_dev``.
    """
    def _evaluate(params):
        # Vectorise with this combination, then cross-validate on the result.
        ngram = params.get('ngram')
        max_features = params.get('max_features')
        features, _ = initialise_tfidf_vectorizer(X_train, ngram=ngram, max_features=max_features)
        fold_scores = tfidf_score(features, y_train)
        return {
            'ngram': ngram,
            'max_features': max_features,
            'score': fold_scores.mean(),
            'std_dev': fold_scores.std(),
        }

    return pd.DataFrame([_evaluate(params) for params in param_combos])

# Example parameter combinations to test
# The grid is generated so each (ngram, max_features) pair appears exactly
# once. The hand-written list repeated max_features=10000 for every n-gram
# range, which cross-validated the same setting twice and put duplicate rows
# in the results.
_NGRAM_GRID = ((1, 2), (1, 3), (1, 4), (2, 5), (3, 5))
_MAX_FEATURES_GRID = (5000, 10000, None, 2000, 3000, 4000, 6000, 7000, 8000, 9000, 12000)
param_combos = [
    {'ngram': ngram, 'max_features': max_features}
    for ngram in _NGRAM_GRID
    for max_features in _MAX_FEATURES_GRID
    # The original grid never paired (1, 3) with 12000; keep the same set of points.
    if (ngram, max_features) != ((1, 3), 12000)
]
# Evaluate every combination and rank by mean score, best first.
results_df = test_param_combos(text_train, y_train, param_combos).sort_values(
    by='score', ascending=False
)
# Keep the full ranking on disk, then show only the ten best.
results_df.to_csv('data/tfidf_param_combos_results.csv', index=False)
print("Top parameter combinations based on accuracy:")
print(results_df.head(10))
# Print the results DataFrame




In [None]:
# Plot playground (based off neural network training history)
# import matplotlib.pyplot as plt

# plt.figure(figsize=(12, 4))

# plt.subplot(1, 2, 1)
# plt.plot(history.history['accuracy'], label='Training Accuracy')
# plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
# plt.title('Model Accuracy')
# plt.xlabel('Epoch')
# plt.ylabel('Accuracy')
# plt.legend()

# plt.subplot(1, 2, 2)
# plt.plot(history.history['loss'], label='Training Loss')
# plt.plot(history.history['val_loss'], label='Validation Loss')
# plt.title('Model Loss')
# plt.xlabel('Epoch')
# plt.ylabel('Loss')
# plt.legend()

# plt.tight_layout()
# plt.show()