In [None]:
import os
import pandas as pd
import subprocess
from typing import Dict, List, Optional


In [None]:
# Git remote and local checkout directory of the NaijaSenti repository.
REPO_URL_NS = "https://github.com/hausanlp/NaijaSenti.git"
LOCAL_DIR_NS = "NaijaSenti"

# Git remote and local checkout directory of the AfriSenti SemEval-2023 repository.
REPO_URL_AS = "https://github.com/afrisenti-semeval/afrisent-semeval-2023.git"
LOCAL_DIR_AS = "afrisent-semeval-2023"

def clone_repo(repo_url: str, local_dir: str, branch: str = "main") -> None:
    """
    Make sure a git checkout of ``repo_url`` exists at ``local_dir``.

    If ``local_dir`` is already a directory it is refreshed with
    ``git pull origin <branch>``; otherwise the repository is cloned into
    ``local_dir``.

    Args:
        repo_url: URL of the remote repository.
        local_dir: Path of the local checkout.
        branch: Branch pulled from ``origin`` when an existing checkout is updated.

    Raises:
        subprocess.CalledProcessError: If the git command exits with a non-zero status.
    """
    if os.path.isdir(local_dir):
        print("Repository exists. Updating...")
        subprocess.run(["git", "-C", local_dir, "pull", "origin", branch], check=True)
    else:
        print("Repository not found. Cloning...")
        # Name the target directory explicitly: without it git derives the
        # directory from the URL, which matches ``local_dir`` only by coincidence.
        subprocess.run(["git", "clone", repo_url, local_dir], check=True)

# Clone each dataset repository, or update it when a checkout already exists.
for remote, checkout in ((REPO_URL_NS, LOCAL_DIR_NS), (REPO_URL_AS, LOCAL_DIR_AS)):
    clone_repo(remote, checkout)


In [None]:
class SplitSet:
    """
    Container for one language's data: the train, test and dev frames plus
    an optional list of stopwords.
    """

    def __init__(self,
                 train: pd.DataFrame,
                 test: pd.DataFrame,
                 dev: pd.DataFrame,
                 stopwords: Optional[List[str]] = None):
        self.train, self.test, self.dev = train, test, dev
        # A missing (None) or empty stopword argument is stored as a new empty list.
        self.stopwords = stopwords or []

    def summary(self):
        """Return the length of each split and the number of stopwords."""
        return dict(
            train_size=len(self.train),
            test_size=len(self.test),
            dev_size=len(self.dev),
            num_stopwords=len(self.stopwords),
        )


class MultiLangDataset:
    """
    Registry of per-language datasets: maps a language code to the SplitSet
    holding that language's train/test/dev splits and stopwords.
    """

    def __init__(self):
        self.languages: Dict[str, SplitSet] = dict()

    def add_language(self, lang_code: str, split_set: SplitSet):
        """Store ``split_set`` under ``lang_code``, replacing any previous entry."""
        self.languages[lang_code] = split_set

    def get(self, lang_code: str) -> Optional[SplitSet]:
        """Return the SplitSet stored for ``lang_code``, or None when there is none."""
        if lang_code in self.languages:
            return self.languages[lang_code]
        return None

    def summary(self) -> Dict[str, Dict[str, int]]:
        """Return every language's ``SplitSet.summary()`` keyed by language code."""
        report = {}
        for code, splits in self.languages.items():
            report[code] = splits.summary()
        return report

    def all_languages(self) -> List[str]:
        """Return the stored language codes as a list."""
        return [code for code in self.languages]

In [None]:
class Languages:
    """
    Language codes of the NaijaSenti dataset.
    """
    HAUSA = 'hau'
    IGBO = 'ibo'
    NIGERIAN_PIDGIN = 'pcm'
    YORUBA = 'yor'


# The same four codes as a list, built from the constants above.
ns_languages = [
    Languages.HAUSA,
    Languages.IGBO,
    Languages.NIGERIAN_PIDGIN,
    Languages.YORUBA,
]

In [None]:
def load_local_datasets(local_base_dir, languages=ns_languages, splits=('dev', 'test', 'train')):
    """
    Load per-language TSV splits (and NaijaSenti stopwords) from a local checkout.

    For every language code, each requested split is read from
    ``<local_base_dir>/<lang>/<split>.tsv``. When ``local_base_dir`` starts with
    ``LOCAL_DIR_NS``, the stopword list is additionally read from the ``word``
    column of ``<LOCAL_DIR_NS>/data/stopwords/<lang>.csv``.

    Loading is best-effort: a file that cannot be read is reported on stdout,
    the affected split falls back to an empty DataFrame (stopwords to an empty
    list), and the language is still added to the result.

    Args:
        local_base_dir: Directory path (str) holding one sub-directory per language code.
        languages: Iterable of language codes to load.
        splits: Names of the split files (without extension) to read. Only
            'train', 'test' and 'dev' are passed on to the SplitSet.

    Returns:
        MultiLangDataset with one SplitSet per entry of ``languages``.
    """
    dataset = MultiLangDataset()
    # Stopwords are only looked up when loading from the NaijaSenti checkout;
    # the check does not depend on the language, so it is done once.
    wants_stopwords = local_base_dir.startswith(LOCAL_DIR_NS)

    for lang in languages:
        split_data = {}
        for split in splits:
            path = os.path.join(local_base_dir, lang, f"{split}.tsv")
            try:
                split_data[split] = pd.read_csv(path, sep='\t', encoding='utf-8')
            except Exception as e:
                # Best-effort: report the problem and leave this split empty.
                print(f"Failed to load {path}: {e}")

        stopwords = []
        if wants_stopwords:
            path = os.path.join(LOCAL_DIR_NS, 'data', 'stopwords', f'{lang}.csv')
            try:
                stopwords = pd.read_csv(path, encoding='utf-8')['word'].tolist()
            except Exception as e:
                print(f"Failed to load stopwords for {lang} from {path}: {e}")

        split_set = SplitSet(
            train=split_data.get('train', pd.DataFrame()),
            test=split_data.get('test', pd.DataFrame()),
            dev=split_data.get('dev', pd.DataFrame()),
            stopwords=stopwords,
        )
        dataset.add_language(lang, split_set)
    return dataset

In [None]:
# Load the annotated NaijaSenti tweets for every language in ns_languages.
ns_dataset: MultiLangDataset = load_local_datasets(
    local_base_dir=f"{LOCAL_DIR_NS}/data/annotated_tweets",
    languages=ns_languages,
)

In [None]:
# Load the AfriSenti SemEval-2023 splits for the same language codes.
# The checkout directory comes from LOCAL_DIR_AS so it stays in sync with clone_repo.
# NOTE(review): assumes the AfriSenti checkout names its language directories
# with the same codes as ns_languages; load failures are printed, so confirm.
as_dataset: MultiLangDataset = load_local_datasets(
    local_base_dir=f"{LOCAL_DIR_AS}/data",
    languages=ns_languages,
)

In [None]:
# Report which language codes ended up in each dataset.
for banner, loaded in (
    ("NaijaSenti dataset loaded with languages:", ns_dataset),
    ("Afrisenti dataset loaded with languages:", as_dataset),
):
    print(banner, loaded.all_languages())

In [None]:
# Hausa dev split of NaijaSenti; looked up once and reused below.
hausa_dev = ns_dataset.get(Languages.HAUSA).dev
print("NaijaSenti hau: ", hausa_dev)
# Print the 'tweet' column of the dev set row by row. Iterating the column
# directly avoids building a full row Series per tweet as iterrows() does.
for index, tweet in hausa_dev['tweet'].items():
    print(f"Index: {index}, Tweet: {tweet}")

# Write all the tweets into a text file, each followed by a newline.
# exist_ok=True creates the directory only when it is missing, without a
# separate check-then-create step.
os.makedirs('data', exist_ok=True)
with open('data/naija_senti_hau_dev_tweets.txt', 'w', encoding='utf-8') as f:
    for tweet in hausa_dev['tweet']:
        f.write(f"{tweet}\n")

In [None]:
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.normalizers import Sequence, Lowercase, NFD, StripAccents

# WordPiece tokenizer with an NFD -> lowercase -> strip-accents normalizer and
# the Whitespace pre-tokenizer.
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
tokenizer.normalizer = Sequence([NFD(), Lowercase(), StripAccents()])
tokenizer.pre_tokenizer = Whitespace()

# Train on the Hausa dev tweets written earlier and save the result.
trainer = WordPieceTrainer(
    vocab_size=8000,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
)
tokenizer.train(["data/naija_senti_hau_dev_tweets.txt"], trainer)
tokenizer.save("data/wordpiece.json")

# Write one line per dev tweet: its tokens joined by single spaces.
with open('data/naija_senti_hau_dev_tweets_tokenized.txt', 'w', encoding='utf-8') as f:
    for tweet in ns_dataset.get(Languages.HAUSA).dev['tweet']:
        pieces = tokenizer.encode(tweet).tokens
        f.write(" ".join(pieces) + "\n")

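# NOTE: the active read_tokenized_file below returns one space-joined string per
# line; the commented-out variant returns a list of tokens per line instead.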

# def read_tokenized_file(file_path: str) -> List[List[str]]:
#     """
#     Reads a tokenized file and returns a list of token lists.
#     """
#     with open(file_path, 'r', encoding='utf-8') as f:
#         return [line.strip().split() for line in f.readlines()]
    
def read_tokenized_file(file_path: str) -> List[str]:
    """
    Load a tokenized text file as a list of lines.

    Every line is returned with its surrounding whitespace stripped, i.e. as
    one string of space-separated tokens; a blank line yields an empty string.
    """
    stripped_lines = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped_lines.append(raw_line.strip())
    return stripped_lines


In [None]:
from collections import Counter

def compute_tf(texts):
    """
    Compute relative frequencies of the elements of each text.

    Each text is counted element by element and every count is divided by
    ``len(text)``. Note that a plain string is therefore counted character by
    character; pass a list of tokens to get per-token frequencies.

    Returns a list with one ``{element: frequency}`` dict per text, in input order.
    """
    def _relative_frequencies(text):
        length = len(text)
        return {term: occurrences / length for term, occurrences in Counter(text).items()}

    return [_relative_frequencies(text) for text in texts]

In [None]:
import math
from collections import defaultdict

def compute_idf(texts):
    """
    Compute an inverse document frequency for every element seen in ``texts``.

    With N = ``len(texts)`` and df = number of texts containing the element,
    the weight is ``log(N / (1 + df))``. An element present in every text
    therefore gets a negative weight, since N / (1 + N) < 1.

    Returns a dict mapping each element to its weight.
    """
    n_docs = len(texts)
    doc_freq = defaultdict(int)
    for doc in texts:
        # set() so that repeated occurrences inside one text count only once.
        for term in set(doc):
            doc_freq[term] += 1

    weights = {}
    for term, freq in doc_freq.items():
        weights[term] = math.log(n_docs / (1 + freq))
    return weights

In [None]:
def compute_tfidf(tf_list, idf):
    """
    Weight each term frequency by the term's inverse document frequency.

    ``tf_list`` is a list of ``{term: tf}`` dicts and ``idf`` a ``{term: idf}``
    dict; a term missing from ``idf`` is weighted with 0.0.

    Returns a list of ``{term: tf * idf}`` dicts parallel to ``tf_list``.
    """
    return [
        {term: freq * idf.get(term, 0.0) for term, freq in term_freqs.items()}
        for term_freqs in tf_list
    ]

In [None]:
# read_tokenized_file returns one space-joined string per tweet, while
# compute_tf / compute_idf count the elements of each text. Split every line
# into its tokens first; passing the raw strings would count characters.
tokenized_data = [
    line.split()
    for line in read_tokenized_file('data/naija_senti_hau_dev_tweets_tokenized.txt')
]
tf_list = compute_tf(tokenized_data)
idf = compute_idf(tokenized_data)
tfidf_list = compute_tfidf(tf_list, idf)

# Save the TF-IDF results to a file
with open('data/naija_senti_hau_dev_tweets_tfidf.txt', 'w', encoding='utf-8') as f:
    for i, tfidf in enumerate(tfidf_list):
        f.write(f"Tweet {i}:\n")
        for subword, score in tfidf.items():
            f.write(f"{subword}: {score}\n")
        f.write("\n")

In [None]:
from tokenizers import Tokenizer

from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Reload the WordPiece tokenizer that was saved to data/wordpiece.json.
tokenizer = Tokenizer.from_file("data/wordpiece.json")


def wordpiece_tokenizer(text):
    """Return the tokens the loaded WordPiece tokenizer produces for ``text``."""
    return tokenizer.encode(text).tokens


# The vectorizer delegates tokenisation to wordpiece_tokenizer, so its own
# lowercasing, preprocessing and token pattern are switched off.
vectorizer = TfidfVectorizer(
    tokenizer=wordpiece_tokenizer,
    lowercase=False,
    preprocessor=None,
    token_pattern=None,
)

# Feed the vectorizer the raw tweets. The *_tokenized.txt file already contains
# WordPiece output; running wordpiece_tokenizer over it again would tokenize the
# tokens a second time (e.g. splitting the "##" continuation markers).
tweets = ns_dataset.get(Languages.HAUSA).dev['tweet'].tolist()

tfidf_matrix = vectorizer.fit_transform(tweets)

# Export the dense TF-IDF matrix (one row per tweet, one column per token) as CSV.
df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
df.to_csv("tfidf_matrix.csv", index=False)
