In [None]:
import os
import subprocess
from typing import Optional, TypedDict

import pandas as pd


In [None]:
# Remote URL and local checkout directory for the NaijaSenti repository.
REPO_URL_NS = "https://github.com/hausanlp/NaijaSenti.git"
LOCAL_DIR_NS = "NaijaSenti"

# Remote URL and local checkout directory for the AfriSenti (SemEval-2023) repository.
REPO_URL_AS = "https://github.com/afrisenti-semeval/afrisent-semeval-2023.git"
LOCAL_DIR_AS = "AfriSenti"

def clone_repo(repo_url: str, local_dir: str, branch: str = "main") -> None:
    """Clone a git repository into ``local_dir``, or update it if already present.

    Args:
        repo_url: Remote URL handed to ``git clone``.
        local_dir: Directory the checkout lives in. It is used both for the
            existence check and as the explicit clone destination, so the two
            can never disagree.
        branch: Branch pulled from ``origin`` when the checkout already exists.

    Raises:
        subprocess.CalledProcessError: If the git command exits non-zero.
    """
    if os.path.isdir(local_dir):
        print("Repository exists. Updating...")
        subprocess.run(["git", "-C", local_dir, "pull", "origin", branch], check=True)
    else:
        print("Repository not found. Cloning...")
        # Pass the destination explicitly: without it git derives the directory
        # name from the URL, which need not match `local_dir`, and the next run
        # would then attempt (and fail) to clone again.
        subprocess.run(["git", "clone", repo_url, local_dir], check=True)

# Fetch (or refresh) both repositories, NaijaSenti first, then AfriSenti.
for _url, _checkout_dir in (
    (REPO_URL_NS, LOCAL_DIR_NS),
    (REPO_URL_AS, LOCAL_DIR_AS),
):
    clone_repo(_url, _checkout_dir)


Repository exists. Updating...
Repository not found. Cloning...


In [None]:
class NsSplit(TypedDict):
    """Data held for a single language: three data splits plus a stopword list.

    NOTE(review): load_local_datasets leaves a key out when its file fails to
    load, so a value annotated as NsSplit may be missing entries at runtime.
    """
    # One DataFrame per split.
    dev: pd.DataFrame
    test: pd.DataFrame
    train: pd.DataFrame
    # Stopwords for the language as plain strings.
    stopwords: list[str]

class NsDataset(TypedDict):
    """Top-level dataset layout: one NsSplit per language code.

    The keys are presumably ISO 639-3 codes (Hausa, Igbo, Nigerian Pidgin,
    Yoruba) -- TODO confirm.

    NOTE(review): `ns_languages` is filtered down to directories that exist on
    disk, so a loaded dataset may not contain every key declared here.
    """
    hau: NsSplit
    ibo: NsSplit
    pcm: NsSplit
    yor: NsSplit

In [None]:
# Keep only the candidate language codes that have a directory under the
# NaijaSenti annotated-tweets folder.
ns_languages = list(filter(
    lambda code: os.path.isdir(
        os.path.join(f'{LOCAL_DIR_NS}/data/annotated_tweets', code)
    ),
    ['hau', 'ibo', 'pcm', 'yor'],
))

In [None]:
def load_local_datasets(
    local_base_dir: str,
    languages: Optional[list[str]] = None,
    splits: Optional[list[str]] = None,
    stopwords_dir: Optional[str] = None,
) -> dict[str, dict]:
    """Load per-language TSV splits and stopword lists from a local checkout.

    For every language, ``<local_base_dir>/<lang>/<split>.tsv`` is read into a
    DataFrame and ``<stopwords_dir>/<lang>.csv`` is read for its ``word``
    column. Loading is best-effort: a file that cannot be read is reported on
    stdout and its key is simply left out of the result.

    Args:
        local_base_dir: Directory containing one sub-directory per language.
        languages: Language codes to load. Defaults to the notebook-level
            ``ns_languages``, looked up when the function is called.
        splits: Split names to load. Defaults to ``['dev', 'test', 'train']``.
        stopwords_dir: Directory holding ``<lang>.csv`` stopword files.
            Defaults to ``<local_base_dir>/stopwords``.

    Returns:
        ``{lang: {split: DataFrame, ..., 'stopwords': list[str]}}`` containing
        only the entries that loaded successfully.
    """
    if languages is None:
        # Resolve at call time so re-running the cell that builds
        # `ns_languages` is picked up without re-defining this function.
        languages = ns_languages
    if splits is None:
        splits = ['dev', 'test', 'train']
    if stopwords_dir is None:
        stopwords_dir = f'{local_base_dir}/stopwords'

    dataset: dict[str, dict] = {}
    for lang in languages:
        dataset[lang] = {}
        for split in splits:
            path = os.path.join(local_base_dir, lang, f"{split}.tsv")
            try:
                dataset[lang][split] = pd.read_csv(path, sep='\t', encoding='utf-8')
            except Exception as e:
                print(f"Failed to load {path}: {e}")

        # Read in stopwords
        path = os.path.join(stopwords_dir, f"{lang}.csv")
        try:
            # Read as raw strings with NA detection disabled: words such as
            # "nan" or "null" would otherwise be converted to float NaN.
            stopwords_df = pd.read_csv(
                path, encoding='utf-8', dtype=str, keep_default_na=False
            )
            dataset[lang]['stopwords'] = stopwords_df['word'].tolist()
        except Exception as e:
            print(f"Failed to load stopwords for {lang} from {path}: {e}")
    return dataset




In [39]:
# Load the NaijaSenti splits for every language found on disk.
ns_dataset: NsDataset = load_local_datasets(
    local_base_dir=f"{LOCAL_DIR_NS}/data/annotated_tweets",
    languages=ns_languages,
)

Failed to load stopwords for hau from NaijaSenti/data/annotated_tweets/stopwords\hau.csv: [Errno 2] No such file or directory: 'NaijaSenti/data/annotated_tweets/stopwords\\hau.csv'
Failed to load stopwords for ibo from NaijaSenti/data/annotated_tweets/stopwords\ibo.csv: [Errno 2] No such file or directory: 'NaijaSenti/data/annotated_tweets/stopwords\\ibo.csv'
Failed to load stopwords for pcm from NaijaSenti/data/annotated_tweets/stopwords\pcm.csv: [Errno 2] No such file or directory: 'NaijaSenti/data/annotated_tweets/stopwords\\pcm.csv'
Failed to load stopwords for yor from NaijaSenti/data/annotated_tweets/stopwords\yor.csv: [Errno 2] No such file or directory: 'NaijaSenti/data/annotated_tweets/stopwords\\yor.csv'


In [None]:
# Prefer the configured checkout directory. Fall back to the directory name a
# plain `git clone <url>` derives from the repository URL, in case the
# repository was cloned without an explicit destination.
as_repo_dir = LOCAL_DIR_AS if os.path.isdir(LOCAL_DIR_AS) else 'afrisent-semeval-2023'
as_dataset = load_local_datasets(
    local_base_dir=f'{as_repo_dir}/data',
    languages=ns_languages,
)

Failed to load AfriSenti/data\hau\dev.tsv: [Errno 2] No such file or directory: 'AfriSenti/data\\hau\\dev.tsv'
Failed to load AfriSenti/data\hau\test.tsv: [Errno 2] No such file or directory: 'AfriSenti/data\\hau\\test.tsv'
Failed to load AfriSenti/data\hau\train.tsv: [Errno 2] No such file or directory: 'AfriSenti/data\\hau\\train.tsv'
Failed to load stopwords for hau from AfriSenti/data/stopwords\hau.csv: [Errno 2] No such file or directory: 'AfriSenti/data/stopwords\\hau.csv'
Failed to load AfriSenti/data\ibo\dev.tsv: [Errno 2] No such file or directory: 'AfriSenti/data\\ibo\\dev.tsv'
Failed to load AfriSenti/data\ibo\test.tsv: [Errno 2] No such file or directory: 'AfriSenti/data\\ibo\\test.tsv'
Failed to load AfriSenti/data\ibo\train.tsv: [Errno 2] No such file or directory: 'AfriSenti/data\\ibo\\train.tsv'
Failed to load stopwords for ibo from AfriSenti/data/stopwords\ibo.csv: [Errno 2] No such file or directory: 'AfriSenti/data/stopwords\\ibo.csv'
Failed to load AfriSenti/data\pc