In [None]:
import os
import pandas as pd
import subprocess
from typing import Dict, List, Optional
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from tokenizers import Tokenizer as WPTokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.normalizers import Sequence, Lowercase, NFD, StripAccents
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer

import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense, Dropout,Input, BatchNormalization, Activation
from tensorflow.keras.utils import to_categorical


In [None]:
# NaijaSenti: Git remote and the local directory used for its checkout.
REPO_URL_NS = "https://github.com/hausanlp/NaijaSenti.git"
LOCAL_DIR_NS = "NaijaSenti"

# AfriSenti (SemEval-2023): Git remote and the local directory used for its checkout.
REPO_URL_AS = "https://github.com/afrisenti-semeval/afrisent-semeval-2023.git"
LOCAL_DIR_AS = "afrisent-semeval-2023"

def clone_repo(repo_url: str, local_dir: str) -> None:
    """Clone ``repo_url`` into ``local_dir``, or update the checkout if it already exists.

    Args:
        repo_url: URL of the Git repository.
        local_dir: Directory that holds (or will hold) the working copy.

    Raises:
        subprocess.CalledProcessError: If the ``git`` command exits with a non-zero status.
    """
    if os.path.isdir(local_dir):
        print("Repository exists. Updating...")
        subprocess.run(["git", "-C", local_dir, "pull", "origin", "main"], check=True)
    else:
        print("Repository not found. Cloning...")
        # Name the destination explicitly: without it git derives the directory
        # from the URL, which only matches the isdir() check above by coincidence.
        subprocess.run(["git", "clone", repo_url, local_dir], check=True)

# Fetch (or refresh) both dataset repositories, NaijaSenti first.
for repo_url, checkout_dir in ((REPO_URL_NS, LOCAL_DIR_NS), (REPO_URL_AS, LOCAL_DIR_AS)):
    clone_repo(repo_url, checkout_dir)


In [None]:
class SplitSet:
    """
    Container for one language's data: the train/test/dev frames plus a stopword list.
    """
    def __init__(self,
                 train: pd.DataFrame,
                 test: pd.DataFrame,
                 dev: pd.DataFrame,
                 stopwords: Optional[List[str]] = None):
        self.train, self.test, self.dev = train, test, dev
        # A missing or empty stopword argument is stored as a new empty list.
        self.stopwords = stopwords or []

    def summary(self):
        """Return the row count of each split and the number of stopwords."""
        split_sizes = dict(
            train_size=len(self.train),
            test_size=len(self.test),
            dev_size=len(self.dev),
        )
        split_sizes["num_stopwords"] = len(self.stopwords)
        return split_sizes


class MultiLangDataset:
    """
    Registry of SplitSet objects keyed by language code.
    """
    def __init__(self):
        self.languages: Dict[str, SplitSet] = {}

    def add_language(self, lang_code: str, split_set: SplitSet):
        """Store ``split_set`` under ``lang_code``, replacing any existing entry."""
        self.languages[lang_code] = split_set

    def get(self, lang_code: str) -> Optional[SplitSet]:
        """Return the SplitSet registered for ``lang_code``, or None when there is none."""
        if lang_code in self.languages:
            return self.languages[lang_code]
        return None

    def summary(self) -> Dict[str, Dict[str, int]]:
        """Map every registered language code to the summary of its SplitSet."""
        per_language = {}
        for code, splits in self.languages.items():
            per_language[code] = splits.summary()
        return per_language

    def all_languages(self) -> List[str]:
        """Return the registered language codes in insertion order."""
        return [code for code in self.languages]

In [None]:
class Languages:
    """
    Language codes used by the NaijaSenti dataset.
    """
    HAUSA = 'hau'
    IGBO = 'ibo'
    NIGERIAN_PIDGIN = 'pcm'
    YORUBA = 'yor'


# All language codes above, used as the default language list when loading datasets.
ns_languages = [Languages.HAUSA, Languages.IGBO, Languages.NIGERIAN_PIDGIN, Languages.YORUBA]

In [None]:
def load_local_datasets(local_base_dir, languages=ns_languages, splits=('dev', 'test', 'train')):
    """Load per-language TSV splits (and, for NaijaSenti, stopwords) from a local checkout.

    For every language, ``<local_base_dir>/<lang>/<split>.tsv`` is read for each
    requested split. Loading is best effort: a file that cannot be read is
    reported on stdout and the corresponding split is left as an empty DataFrame.

    Args:
        local_base_dir: Directory containing one sub-directory per language code.
        languages: Language codes to load.
        splits: Split names to read. Only 'train', 'test' and 'dev' are stored
            in the resulting SplitSet; other names are read but not kept.

    Returns:
        A MultiLangDataset with one SplitSet per requested language.
    """
    dataset = MultiLangDataset()

    for lang in languages:
        split_data = {}
        for split in splits:
            path = os.path.join(local_base_dir, lang, f"{split}.tsv")
            try:
                split_data[split] = pd.read_csv(path, sep='\t', encoding='utf-8')
            except Exception as e:
                # Deliberately broad: one unreadable split must not abort the whole load.
                print(f"Failed to load {path}: {e}")

        # Stopwords are only looked up when the base directory lies inside the
        # NaijaSenti checkout; they are expected in a 'word' column.
        if local_base_dir.startswith(LOCAL_DIR_NS):
            path = os.path.join(LOCAL_DIR_NS, 'data', 'stopwords', f'{lang}.csv')
            try:
                stopwords_df = pd.read_csv(path, encoding='utf-8')
                split_data['stopwords'] = stopwords_df['word'].tolist()
            except Exception as e:
                print(f"Failed to load stopwords for {lang} from {path}: {e}")

        split_set = SplitSet(
            train=split_data.get('train', pd.DataFrame()),
            test=split_data.get('test', pd.DataFrame()),
            dev=split_data.get('dev', pd.DataFrame()),
            stopwords=split_data.get('stopwords', [])
        )
        dataset.add_language(lang, split_set)
    return dataset

In [None]:
# Annotated NaijaSenti tweets for every language in ns_languages.
ns_tweets_dir = LOCAL_DIR_NS + '/data/annotated_tweets'
ns_dataset: MultiLangDataset = load_local_datasets(local_base_dir=ns_tweets_dir, languages=ns_languages)

In [None]:
# AfriSenti splits for the same languages. The path is built from LOCAL_DIR_AS
# so it always follows the directory the repository is cloned into.
as_dataset: MultiLangDataset = load_local_datasets(
    local_base_dir=f'{LOCAL_DIR_AS}/data',
    languages=ns_languages,
)

In [None]:
# Report which language codes each dataset contains.
ns_loaded_languages = ns_dataset.all_languages()
print("NaijaSenti dataset loaded with languages:", ns_loaded_languages)
as_loaded_languages = as_dataset.all_languages()
print("Afrisenti dataset loaded with languages:", as_loaded_languages)

In [None]:
hausa_splits = ns_dataset.get(Languages.HAUSA)

# Show the Hausa test split, then every tweet in it together with its row index.
print("NaijaSenti hau: ", hausa_splits.test)
for index, tweet in hausa_splits.test['tweet'].items():
    print(f"Index: {index}, Tweet: {tweet}")

# Write the Hausa dev tweets to a text file, one tweet per line.
# exist_ok avoids the separate existence check before creating the directory.
os.makedirs('data', exist_ok=True)
with open('data/naija_senti_hau_dev_tweets.txt', 'w', encoding='utf-8') as f:
    for tweet in hausa_splits.dev['tweet']:
        f.write(f"{tweet}\n")

In [None]:


# Train a WordPiece tokenizer on the Hausa dev tweets.
tokenizer = WPTokenizer(WordPiece(unk_token="[UNK]"))
tokenizer.normalizer = Sequence([NFD(), Lowercase(), StripAccents()])
tokenizer.pre_tokenizer = Whitespace()

trainer = WordPieceTrainer(vocab_size=8000, special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"])
df = ns_dataset.get(Languages.HAUSA).dev
# Collect all tweets into a single list for training
tweets = df['tweet'].tolist()
tokenizer.train_from_iterator(tweets, trainer)
# Make sure the output directory exists so this cell does not depend on an earlier one.
os.makedirs('data', exist_ok=True)
tokenizer.save("data/wordpiece.json")

# Encode every tweet once and derive both the token and the id column from
# that single encoding. The columns are added to the dev frame itself.
encodings = [tokenizer.encode(tweet) for tweet in tweets]
df['tokenized_tweets'] = pd.Series([encoding.tokens for encoding in encodings], index=df.index)
df['token_ids'] = pd.Series([encoding.ids for encoding in encodings], index=df.index)

# Display results
print("Original vs Tokenized:")
print("=" * 80)
for i in range(min(5, len(df))):  # Show first 5 examples
    print(f"Original: {df.iloc[i]['tweet']}")
    print(f"Tokens:   {df.iloc[i]['tokenized_tweets']}")
    print(f"IDs:      {df.iloc[i]['token_ids']}")
    print("-" * 80)

# Write the tokenized tweets, one space-separated token sequence per line.
with open('data/naija_senti_hau_dev_tweets_tokenized.txt', 'w', encoding='utf-8') as f:
    for tokens in df['tokenized_tweets']:
        f.write(" ".join(tokens) + "\n")

# adjust the below to read into a list of strings

# def read_tokenized_file(file_path: str) -> List[List[str]]:
#     """
#     Reads a tokenized file and returns a list of token lists.
#     """
#     with open(file_path, 'r', encoding='utf-8') as f:
#         return [line.strip().split() for line in f.readlines()]
    
def read_tokenized_file(file_path: str) -> List[str]:
    """
    Load a tokenized text file as a list with one string per line.
    Leading and trailing whitespace (including the newline) is stripped from each line.
    """
    stripped_lines = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped_lines.append(raw_line.strip())
    return stripped_lines


In [None]:
import re
def preprocess_tweet(tweet):
    """Delete every ``@`` followed by word characters, then trim surrounding whitespace."""
    mention_pattern = re.compile(r'@\w+')
    without_mentions = mention_pattern.sub('', tweet)
    return without_mentions.strip()

def wordpiece_tokenize_dataframe(df: pd.DataFrame, tokenizer: Optional[WPTokenizer] = None) -> pd.DataFrame:
    """
    Tokenizes the 'tweet' column of a DataFrame using a WordPiece tokenizer.

    Tweets are cleaned with ``preprocess_tweet`` first, and the cleaned text is
    used both to train the tokenizer (when needed) and to encode, so the
    vocabulary and the encoded text come from the same input.

    Args:
        df: Frame with a 'tweet' column of strings. It is not modified.
        tokenizer: A ``tokenizers.Tokenizer`` that already has a vocabulary; it
            is then reused as-is. For any other value (``None``, an untrained
            tokenizer, an object of a different type) a new WordPiece tokenizer
            with vocab_size=8000 is trained on the tweets of ``df``.

    Returns:
        A copy of ``df`` with two added columns: 'tokenized_tweets' and 'token_ids'.
    """
    cleaned_tweets = [preprocess_tweet(tweet) for tweet in df['tweet'].tolist()]

    # Only reuse the argument when it really is a WordPiece-capable tokenizer
    # with a vocabulary; otherwise fall back to training a fresh one.
    reusable = isinstance(tokenizer, WPTokenizer) and tokenizer.get_vocab_size() > 0
    if not reusable:
        tokenizer = WPTokenizer(WordPiece(unk_token="[UNK]"))
        tokenizer.normalizer = Sequence([NFD(), Lowercase(), StripAccents()])
        tokenizer.pre_tokenizer = Whitespace()
        trainer = WordPieceTrainer(vocab_size=8000, special_tokens=["[UNK]"])
        tokenizer.train_from_iterator(cleaned_tweets, trainer)

    # Encode each tweet once and derive both columns from that encoding.
    encodings = [tokenizer.encode(text) for text in cleaned_tweets]

    tokenized_df = df.copy()
    tokenized_df['tokenized_tweets'] = pd.Series(
        [encoding.tokens for encoding in encodings], index=tokenized_df.index
    )
    tokenized_df['token_ids'] = pd.Series(
        [encoding.ids for encoding in encodings], index=tokenized_df.index
    )
    return tokenized_df

In [None]:
class ModelEncapsulator:
    """Abstract wrapper around a model; subclasses supply fit, predict and the pipeline."""
    def __init__(self, model):
        self.model = model

    def predict(self, X):
        """Return predicted labels for the features ``X`` (subclass responsibility)."""
        raise NotImplementedError("This method should be implemented by subclasses.")

    def fit(self, X, y):
        """Train the wrapped model on ``X`` and ``y`` (subclass responsibility)."""
        raise NotImplementedError("This method should be implemented by subclasses.")

    def perform_pipeline(self, X, y):
        """
        Train and evaluate in one call (subclass responsibility).
        Implementations return the accuracy and a classification report.
        """
        raise NotImplementedError("This method should be implemented by subclasses.")

class BasicModelEncapsulator(ModelEncapsulator):
    """Wraps any estimator exposing ``fit(X, y)`` and ``predict(X)``."""
    def __init__(self, model):
        super().__init__(model)

    def predict(self, X):
        """Predict labels for features X."""
        return self.model.predict(X)

    def fit(self, X, y):
        """Train the wrapped model on features X and labels y."""
        self.model.fit(X, y)

    def perform_pipeline(self, X, y, test_size=0.3, random_state=42):
        """
        Split the data, train on one part and evaluate on the held-out part.

        Args:
            X: Feature matrix.
            y: Labels aligned with the rows of X.
            test_size: Fraction of the data held out for evaluation.
            random_state: Seed for the split. The default matches the seed used
                by NeuralNetworkModel.perform_pipeline so results are repeatable
                and comparable; pass None for a different split on each call.

        Returns:
            Tuple of (accuracy, classification report string).
        """
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )
        self.fit(X_train, y_train)
        predictions = self.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        report = classification_report(y_test, predictions)
        return accuracy, report

class NeuralNetworkModel(ModelEncapsulator):
    """Dense Keras classifier for fixed-width feature vectors such as TF-IDF rows."""
    def __init__(self, input_dim: int, num_classes: int = 3):
        self.input_dim = input_dim
        self.num_classes = num_classes
        self.model = self._build_model()

    def _build_model(self):
        """Assemble and compile the network: two Dense/BatchNorm/ReLU/Dropout stages and a softmax head."""
        layer_stack = [
            # An explicit Input layer declares the expected feature width.
            Input(shape=(self.input_dim,)),
            Dense(512),
            BatchNormalization(),
            Activation('relu'),
            Dropout(0.4),
            Dense(256),
            BatchNormalization(),
            Activation('relu'),
            Dropout(0.3),
            Dense(self.num_classes, activation='softmax'),
        ]
        network = Sequential()
        for layer in layer_stack:
            network.add(layer)

        network.compile(
            loss='categorical_crossentropy',
            optimizer=Adam(learning_rate=1e-4),
            metrics=['accuracy']
        )
        return network

    def fit(self, X, y, epochs=10, batch_size=32, validation_split=0.2):
        """Train on X with integer class labels y and return the Keras training history."""
        # Anything offering toarray() (e.g. a sparse matrix) is densified first.
        features = X.toarray() if hasattr(X, 'toarray') else X
        # The categorical cross-entropy loss expects one-hot targets.
        one_hot_targets = to_categorical(y, num_classes=self.num_classes)
        return self.model.fit(
            features, one_hot_targets,
            epochs=epochs,
            batch_size=batch_size,
            validation_split=validation_split,
            verbose=0
        )

    def predict(self, X):
        """Return the most probable class index for every row of X."""
        features = X.toarray() if hasattr(X, 'toarray') else X
        class_scores = self.model.predict(features)
        # The position of the highest score along the last axis is the predicted class.
        return np.argmax(class_scores, axis=-1)

    def perform_pipeline(self, X, y, epochs=10, batch_size=32):
        """
        Hold out 30% of the data (seed 42), train on the rest and evaluate.
        Returns the accuracy and the classification report.
        """
        X_fit, X_eval, y_fit, y_eval = train_test_split(
            X, y, test_size=0.3, random_state=42
        )
        self.fit(X_fit, y_fit, epochs=epochs, batch_size=batch_size)
        predicted = self.predict(X_eval)
        return accuracy_score(y_eval, predicted), classification_report(y_eval, predicted)



In [None]:

def initialise_tfidf_vectorizer(data, ngram = None, max_features = None):
    """Fit a TfidfVectorizer on ``data`` and return the transformed matrix with the vectorizer.

    Args:
        data: Iterable of text documents.
        ngram: ``(min_n, max_n)`` tuple forwarded as ``ngram_range``. A falsy
            value keeps the vectorizer's default.
        max_features: Vocabulary size cap forwarded as ``max_features``. A falsy
            value keeps the vectorizer's default.

    Returns:
        Tuple of (document-term matrix, fitted TfidfVectorizer).
    """
    # Each option is handled on its own, so a falsy value for one no longer
    # causes the other, valid option to be dropped.
    options = {}
    if ngram:
        options['ngram_range'] = ngram
    if max_features:
        options['max_features'] = max_features

    vectorizer_tf = TfidfVectorizer(**options)
    X = vectorizer_tf.fit_transform(data)
    return X, vectorizer_tf

def initialise_count_vectorizer(data, ngram = None, max_features = None):
    """Fit a CountVectorizer on ``data`` and return the transformed matrix with the vectorizer.

    Args:
        data: Iterable of text documents.
        ngram: ``(min_n, max_n)`` tuple forwarded as ``ngram_range``. A falsy
            value keeps the vectorizer's default.
        max_features: Vocabulary size cap forwarded as ``max_features``. A falsy
            value keeps the vectorizer's default.

    Returns:
        Tuple of (document-term matrix, fitted CountVectorizer).
    """
    # Each option is handled on its own, so a falsy value for one no longer
    # causes the other, valid option to be dropped.
    options = {}
    if ngram:
        options['ngram_range'] = ngram
    if max_features:
        options['max_features'] = max_features

    vectorizer_count = CountVectorizer(**options)
    X = vectorizer_count.fit_transform(data)
    return X, vectorizer_count

In [None]:

# Hold out 30% of the Hausa training split. The split is seeded so that a
# re-run of the notebook reproduces the same partition and the same scores.
df = ns_dataset.get(Languages.HAUSA).train
text_train, text_test, y_train, y_test = train_test_split(
    df.tweet, df.label, test_size=0.3, random_state=42
)
# Both vectorizers are fitted on the training portion only.
X_train_tfidf, vectorizer_tfidf = initialise_tfidf_vectorizer(text_train)
X_train_count, vectorizer_count = initialise_count_vectorizer(text_train)

In [None]:
# Neural network model (embedding based). Only the model definition runs;
# the split / fit / evaluate steps further down are still commented out.
# 1. Tweets and labels from the Hausa dev split

df = ns_dataset.get(Languages.HAUSA).dev
tweets = df['tweet'].tolist()
labels = df['label'].tolist()

# Convert labels to numerical class ids (three-way, not binary, classification):
# 0 = neutral, 1 = positive, 2 = negative (any other label value also becomes 2)
labels = [0 if label == 'neutral' else 1 if label == 'positive' else 2 for label in labels]


print("Class distribution:", np.bincount(labels))

# 2. Preprocess text
# One constant keeps the tokenizer's num_words and the embedding's input_dim in sync.
vocab_size = 5000
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(tweets)
sequences = tokenizer.texts_to_sequences(tweets)

# Pad/truncate to the 95th percentile of sequence lengths, capped at 100 tokens.
max_length = min(100, int(np.percentile([len(seq) for seq in sequences], 95)))
print(f"Using max_length: {max_length}")
padded = pad_sequences(sequences, maxlen=max_length, padding='post')

# 3. One-hot encode labels (needed for softmax)
labels = to_categorical(labels, num_classes=3)

# 4. Train-test split
# X_temp, X_test, y_temp, y_test = train_test_split(
#     padded, labels, test_size=0.2, random_state=42, stratify=labels
# )
# X_train, X_val, y_train, y_val = train_test_split(
#     X_temp, y_temp, test_size=0.25, random_state=42, stratify=y_temp  # 0.25 * 0.8 = 0.2 of total
# )

# 5. Build model
# `input_length` is no longer passed to Embedding: it is deprecated in current
# Keras releases and the sequence length is taken from the input itself.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=32),
    GlobalAveragePooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.5),  # Regularization
    Dense(32, activation='relu'),
    Dropout(0.3),  # Regularization
    Dense(3, activation='softmax')
])
# 6. Compile and train
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# NOTE: `history` only exists once the fit call below is enabled.
# model.summary()
# history = model.fit(X_train, y_train, epochs=50, validation_data=(X_val, y_val))

# # 7. Evaluate
# loss, accuracy = model.evaluate(X_test, y_test)
# print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

In [None]:
# Classical baselines, wrapped so they share the perform_pipeline interface.
naive_bayes_model = BasicModelEncapsulator(MultinomialNB())
logistic_regression_model = BasicModelEncapsulator(LogisticRegression(max_iter=1000))

# Size the neural network from the data: one input per TF-IDF feature and one
# output per distinct label in y_train.
tfidf_features = X_train_tfidf.shape[-1]
num_classes = np.unique(y_train).size
neural_network_model = NeuralNetworkModel(input_dim=tfidf_features, num_classes=num_classes)

# Perform pipelines
print("Training models...")

In [None]:
# Logistic Regression with TF-IDF
accuracy_lr, report_lr = logistic_regression_model.perform_pipeline(X_train_tfidf, y_train)
print("Logistic Regression Accuracy:", accuracy_lr)
print("Logistic Regression Classification Report:\n", report_lr)

# Logistic Regression with Count Vectorizer
# X_train_count was already built from text_train when the vectorizers were
# initialised, so it is reused instead of fitting an identical vectorizer again.
accuracy_lr_count, report_lr_count = logistic_regression_model.perform_pipeline(X_train_count, y_train)
print("Logistic Regression with Count Vectorizer Accuracy:", accuracy_lr_count)
print("Logistic Regression with Count Vectorizer Classification Report:\n", report_lr_count)

In [None]:
# Naive Bayes with TF-IDF
accuracy_nb, report_nb = naive_bayes_model.perform_pipeline(X_train_tfidf, y_train)
print("Naive Bayes Accuracy:", accuracy_nb)
print("Naive Bayes Classification Report:\n", report_nb)

# Naive Bayes with Count Vectorizer
# X_train_count was already built from text_train when the vectorizers were
# initialised, so it is reused instead of fitting an identical vectorizer again.
accuracy_nb_count, report_nb_count = naive_bayes_model.perform_pipeline(X_train_count, y_train)
print("Naive Bayes with Count Vectorizer Accuracy:", accuracy_nb_count)
print("Naive Bayes with Count Vectorizer Classification Report:\n", report_nb_count)

In [None]:
# Neural Network with TF-IDF
# Integer class ids (0 = neutral, 1 = positive, 2 = any other label). Despite
# the name these are not one-hot vectors; NeuralNetworkModel.fit does the
# one-hot encoding itself.
ohe_labels = [0 if label == 'neutral' else 1 if label == 'positive' else 2 for label in y_train]

accuracy_nn, report_nn = neural_network_model.perform_pipeline(X_train_tfidf, ohe_labels)
print("Neural Network Accuracy:", accuracy_nn)
print("Neural Network Classification Report:\n", report_nn)

# Neural Network with Count Vectorizer
# A freshly initialised network is used here: calling perform_pipeline on
# neural_network_model again would continue from the weights already trained
# on the TF-IDF features instead of starting from scratch.
neural_network_count_model = NeuralNetworkModel(
    input_dim=X_train_count.shape[1],
    num_classes=neural_network_model.num_classes,
)
accuracy_nn_count, report_nn_count = neural_network_count_model.perform_pipeline(X_train_count, ohe_labels)
print("Neural Network with Count Vectorizer Accuracy:", accuracy_nn_count)
print("Neural Network with Count Vectorizer Classification Report:\n", report_nn_count)


In [None]:
# Wordpiece tokenized models TFIDF

train_df = ns_dataset.get(Languages.HAUSA).train
test_df = ns_dataset.get(Languages.HAUSA).test
wp_train_df = wordpiece_tokenize_dataframe(train_df, tokenizer)
wp_test_df = wordpiece_tokenize_dataframe(test_df, tokenizer)

# Join each token list back into one whitespace-separated string for the vectorizer
wp_X_train_list = [' '.join(tokens) for tokens in wp_train_df['tokenized_tweets'].tolist()]
wp_X_test_list = [' '.join(tokens) for tokens in wp_test_df['tokenized_tweets'].tolist()]

# Convert labels to numerical format (0 for neutral, 1 for positive, 2 for negative).
# The ids are built as new lists rather than written back into the 'label'
# column, so running this cell again does not re-map already converted labels.
wp_y_train = [0 if label == 'neutral' else 1 if label == 'positive' else 2 for label in wp_train_df['label']]
wp_y_test = [0 if label == 'neutral' else 1 if label == 'positive' else 2 for label in wp_test_df['label']]

tfidf_wp_train, vectorizer_wp = initialise_tfidf_vectorizer(wp_X_train_list)
# Transform the test text with the vectorizer fitted on the training text so
# both matrices share the same feature columns.
tfidf_wp_test = vectorizer_wp.transform(wp_X_test_list)

tfidf_features = tfidf_wp_train.shape[1]  # Number of TF-IDF features
num_classes = len(np.unique(wp_y_train))    # Number of classes


# Fresh neural network sized for the WordPiece TF-IDF features
neural_network_model = NeuralNetworkModel(
    input_dim=tfidf_features,
    num_classes=num_classes
)


# Naive Bayes with WordPiece tokenized data
# Stored under *_wp names so the earlier Naive Bayes TF-IDF results are kept.
accuracy_nb_wp, report_nb_wp = naive_bayes_model.perform_pipeline(tfidf_wp_train, wp_y_train)
print("Naive Bayes Accuracy:", accuracy_nb_wp)
print("Naive Bayes Classification Report:\n", report_nb_wp)

# Logistic Regression with WordPiece tokenized data
accuracy_lr_wp, report_lr_wp = logistic_regression_model.perform_pipeline(tfidf_wp_train, wp_y_train)
print("Logistic Regression Accuracy:", accuracy_lr_wp)
print("Logistic Regression Classification Report:\n", report_lr_wp)

# Neural Network with WordPiece tokenized data
accuracy_nn_wp, report_nn_wp = neural_network_model.perform_pipeline(tfidf_wp_train, wp_y_train)
print("Neural Network Accuracy:", accuracy_nn_wp)
print("Neural Network Classification Report:\n", report_nn_wp)

In [None]:
# Method to optimize n-grams and max features for TF-IDF
def tfidf_score(input_x, y_train, score = None):
    """Cross-validate a LogisticRegression (max_iter=1000) on the given features.

    ``score`` is forwarded as the ``scoring`` argument of ``cross_val_score``;
    the per-fold scores are returned.
    """
    classifier = LogisticRegression(max_iter=1000)
    fold_scores = cross_val_score(classifier, X=input_x, y=y_train, scoring=score)
    return fold_scores
# Baseline with the default scorer: mean across folds, +/- two standard deviations.
scores_tfidf = tfidf_score(X_train_tfidf, y_train)
accuracy_mean, accuracy_spread = scores_tfidf.mean(), scores_tfidf.std() * 2
print("5-fold Cross-Validation Accuracy for TFIDF: %0.2f (+/- %0.2f)" % (accuracy_mean, accuracy_spread))

# Same evaluation scored with macro-averaged F1.
scores_tfidf_f1 = tfidf_score(X_train_tfidf, y_train, score= 'f1_macro')
f1_mean, f1_spread = scores_tfidf_f1.mean(), scores_tfidf_f1.std() * 2
print("5-fold Cross-Validation F1 score for TFIDF: %0.2f (+/- %0.2f)" % (f1_mean, f1_spread))

def test_param_combos(X_train, y_train, param_combos, score=None):
    """
    Cross-validate a TF-IDF + logistic regression setup for each parameter combination.

    :param X_train: Training texts to vectorize
    :param y_train: Training data labels
    :param param_combos: List of dictionaries with the keys 'ngram_range' and
        'max_features' (a missing key is treated as None)
    :param score: Scoring metric forwarded to tfidf_score; None uses its default
    :return: DataFrame with one row per combination holding the parameters, the
        mean cross-validation score and its standard deviation
    """
    results = []
    for params in param_combos:
        ngram_range = params.get('ngram_range')
        max_features = params.get('max_features')
        # Only the feature matrix is needed here; the fitted vectorizer is discarded.
        features, _ = initialise_tfidf_vectorizer(X_train, ngram=ngram_range, max_features=max_features)
        fold_scores = tfidf_score(features, y_train, score=score)
        results.append({
            'ngram_range': ngram_range,
            'max_features': max_features,
            'score': fold_scores.mean(),
            'std_dev': fold_scores.std(),
        })

    return pd.DataFrame(results)

# Parameter combinations to test: every n-gram range paired with every
# vocabulary cap (None means no max_features cap is applied).
# Generating the grid avoids the duplicated entries of a hand-written list and
# guarantees that every n-gram range is tried with every cap.
ngram_range_options = [(1,2), (1,3), (1,4), (2,5), (3,5)]
max_features_options = [2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000, 12000, None]
param_combos = [
    {'ngram_range': ngram_range, 'max_features': max_features}
    for ngram_range in ngram_range_options
    for max_features in max_features_options
]
# Evaluate every combination and order the rows by mean score, best first.
results_df = test_param_combos(text_train, y_train, param_combos).sort_values(by='score', ascending=False)
# Keep the full ranking on disk.
results_df.to_csv('data/tfidf_param_combos_results.csv', index=False)
# Show the ten best combinations.
print("Top parameter combinations based on accuracy:")
print(results_df.head(10))




In [None]:
# Naive Bayes Classifier (multinomial) on bag-of-words counts of the Hausa dev split

df = ns_dataset.get(Languages.HAUSA).dev
X = df['tweet']
y = df['label']
# Seeded so the reported accuracy is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

vectorizer = CountVectorizer() # No stop words since its a small african language dataset
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Model training
model = MultinomialNB()
model.fit(X_train_vec, y_train)

y_pred = model.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# Bag of words
df = ns_dataset.get(Languages.HAUSA).dev
tweets = df['tweet']
sentiment = df['label']
# NOTE(review): this removes English stop words from Hausa tweets - confirm that
# is intended.
vectorizer = CountVectorizer(lowercase=True, stop_words='english')  # remove stopwords
# Vectorize the tweets loaded in this cell. The previous code vectorized
# whatever `X` an earlier cell had left behind and failed when run twice.
X = vectorizer.fit_transform(tweets)
# Seeded so the report is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, sentiment, test_size=0.3, random_state=42)

clf = MultinomialNB()
clf.fit(X_train, y_train)

# Step 4: Evaluate
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# 11. Plot training history (optional)
import matplotlib.pyplot as plt

# `history` only exists after the result of a Keras fit() call has been stored
# under that name. Without it the plots are skipped instead of raising NameError.
training_history = globals().get('history')

if training_history is None:
    print("No training history available: assign the result of a Keras fit() call to `history` to plot it.")
else:
    plt.figure(figsize=(12, 4))

    # Left panel: accuracy per epoch
    plt.subplot(1, 2, 1)
    plt.plot(training_history.history['accuracy'], label='Training Accuracy')
    plt.plot(training_history.history['val_accuracy'], label='Validation Accuracy')
    plt.title('Model Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()

    # Right panel: loss per epoch
    plt.subplot(1, 2, 2)
    plt.plot(training_history.history['loss'], label='Training Loss')
    plt.plot(training_history.history['val_loss'], label='Validation Loss')
    plt.title('Model Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()

    plt.tight_layout()
    plt.show()