In [1]:
import torch
import openai

In [2]:
import pandas as pd
import numpy as np
from datasets import load_dataset, arrow_dataset
from sklearn.model_selection import train_test_split
#import jsonlines
import random

# Seed Python's built-in `random` module so later random.sample / random.shuffle calls are repeatable.
# NOTE(review): this does not seed NumPy's global RNG (np.random) — confirm whether that is intended.
SEED = 42
random.seed(SEED)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the "ibo" configuration of castorini/wura at document level with verification checks disabled.
# The "yor" variant is kept commented out so the language can be switched by hand.
# data = load_dataset("castorini/wura", "yor", level="document", verification_mode="no_checks", trust_remote_code=True)
data = load_dataset("castorini/wura", "ibo", level="document", verification_mode="no_checks", trust_remote_code=True)

In [5]:
def prepare_wura(dataset):
    """Filter a Wura split and return the surviving rows as a DataFrame.

    A row is kept only when all of the following hold:
      * its headline has more than one whitespace-separated word,
      * its URL has more than five characters once surrounding whitespace is removed,
      * its domain occurs more than ten times in `dataset` and is not block-listed.

    Args:
        dataset: an `arrow_dataset.Dataset` with the columns "headline",
            "content", "category" and "url".

    Returns:
        A pandas DataFrame with the columns "title", "url" (normalised to end
        with exactly one "/"), "text" and "category".

    Raises:
        ValueError: if `dataset` is not an `arrow_dataset.Dataset` or lacks one
            of the expected columns.
    """
    if not isinstance(dataset, arrow_dataset.Dataset):
        raise ValueError(f"The parameter `dataset` only accepts `arrow_dataset.Dataset` objects. Got {type(dataset)} instead.")

    expected_columns = {"headline", "content", "category", "url"}
    missing_columns = expected_columns.difference(set(dataset.features))
    if missing_columns:
        raise ValueError(f"The dataset must contain all of the following features: {expected_columns}. Missing features: {missing_columns}")

    # Resolve every row's domain once; the result feeds both the frequency
    # count and the per-row filter below.
    domains = [extract_domain_name(row["url"]) for row in dataset]
    domain_counts = {}
    for domain in domains:
        domain_counts[domain] = domain_counts.get(domain, 0) + 1

    invalid_domains = {
        "jw.org"  # Has really weird links, for example: https://www.jw.org/yo/elerii-jehofa/kan-si-wa/venezuela/, https://www.jw.org/yo/elerii-jehofa/kan-si-wa/tonga/, https://www.jw.org/yo/elerii-jehofa/kan-si-wa/taiwan/ all have the title "Kan Si Wa"
    }

    def is_headline_valid(value):
        return len((value or " ").split()) > 1

    def is_url_valid(value):
        return len((value or " ").strip()) > 5

    def is_domain_valid(value):
        # If the domain does not appear enough times that is a sign that the
        # site is not committed to publishing in the language. So it is
        # probably a weird url or the English was translated using Google
        # translate e.g. https://downloadfacetime.com/facetime/facetime-for-ipad/
        return domain_counts[value] > 10 and value not in invalid_domains

    # NOTE(review): an earlier revision defined a ">30 words" check on the text
    # that was never applied; it is left out so the output stays unchanged —
    # confirm whether short articles should be filtered as well.
    records = []
    for row, domain in zip(dataset, domains):
        if not (is_headline_valid(row["headline"])
                and is_url_valid(row["url"])
                and is_domain_valid(domain)):
            continue

        records.append({
            "title": row["headline"],
            "url": row["url"].strip("/") + "/", "text": row["content"],
            "category": row["category"]
        })

    return pd.DataFrame(records)


def split_wura_validation_all_langs():
    """Split the Wura validation set of each language into eval and test files.

    For "yor", "igbo" and "hau" the validation split is loaded, filtered with
    `prepare_wura`, renamed to the query/pos schema and divided 60/40 into
    eval and test parts, which are written as JSON-lines files in the working
    directory.

    Returns:
        dict mapping each language to {"eval": DataFrame, "test": DataFrame}.

    Raises:
        ValueError: if a language has no (or an empty) validation split.
    """
    splits_by_lang = {}
    for lang in ("yor", "igbo", "hau"):
        # The Wura configuration for Igbo is named "ibo".
        wura_lang = lang if lang != "igbo" else "ibo"
        dataset = load_dataset("castorini/wura", wura_lang, level="document", trust_remote_code=True)

        validation_data = dataset.get("validation")
        if not validation_data:
            raise ValueError(f"Dataset {wura_lang} does not have a validation split. Only found {dataset.keys()} splits.")

        lang_df = prepare_wura(validation_data).rename(columns={"text": "pos", "title": "query"})
        eval_df, test_df = train_test_split(lang_df, test_size=0.4, random_state=SEED, shuffle=True)

        eval_df.to_json(f"{lang}_eval_dataset.jsonl", orient="records", lines=True)
        test_df.to_json(f"{lang}_test_dataset.jsonl", orient="records", lines=True)
        splits_by_lang[lang] = {"eval": eval_df, "test": test_df}

    return splits_by_lang

In [6]:
# Domain used for manual inspection; the commented-out lines below filter a
# DataFrame `df` on its `domain_name` column and list URLs / passages for it.
domain = "bbc.com"
# df[df["domain_name"] == domain].url.tolist()
# df[df["domain_name"] == domain].head(15).pos.tolist()
# df[df["domain_name"] == domain].head(15).url.tolist()

In [7]:
from urllib.parse import urlparse


def extract_domain_name(url):
    """Return the network location of `url` without a leading "www." prefix.

    Args:
        url: the URL string to inspect.

    Returns:
        The netloc of `url` with one leading "www." removed, or None when
        `url` is not a string or `urlparse` rejects it with a ValueError.
    """
    if not isinstance(url, str):
        # urlparse would otherwise hand back bytes for non-string input, and
        # stringifying them produced values such as "b''".
        return None

    try:
        netloc = urlparse(url).netloc
    except ValueError:
        return None

    # str.strip("www.") treats its argument as a set of characters and removes
    # every leading/trailing "w" or "." (e.g. "www.waploaded.com" became
    # "aploaded.com", "example.tw" became "example.t"). Drop the exact prefix.
    prefix = "www."
    if netloc.startswith(prefix):
        netloc = netloc[len(prefix):]
    return netloc

In [9]:
def wura_remove_validation_rows(df, wura_ds):
    """Drop the rows of `df` whose URL also appears in `wura_ds`.

    URLs on both sides are normalised by stripping leading/trailing "/" and
    appending a single "/" before they are compared.

    Args:
        df: DataFrame with a `url` column. Missing URLs are replaced by ""
            (such rows are kept).
        wura_ds: object whose `["url"]` item yields the URLs to exclude;
            entries that are not strings (e.g. None) are ignored.

    Returns:
        A new DataFrame with the normalised `url` column, without the matching
        rows, and with a fresh 0..n-1 index. `df` itself is not modified.
    """
    def normalise(url):
        return url.strip("/") + "/"

    wura_val_urls = {normalise(url) for url in wura_ds["url"] if isinstance(url, str)}

    # Element-wise map over the one column instead of rebuilding every row.
    normalised_urls = df["url"].map(lambda url: "" if pd.isna(url) else normalise(url))
    df = df.assign(url=normalised_urls)

    return df[~df["url"].isin(wura_val_urls)].reset_index(drop=True)

def make_wura_df(wura_ds):
    """Convert Wura rows into a DataFrame, keeping only usable rows.

    A row is kept when its headline has more than five whitespace-separated
    words and its URL has more than five characters after stripping
    whitespace. Kept rows are emitted with the columns "title", "sub_topic"
    (always None), "url" (normalised to end with one "/"), "text" and
    "category".
    """
    def has_long_headline(headline):
        return len((headline or " ").split()) > 5

    def has_usable_url(url):
        return len((url or " ").strip()) > 5

    records = [
        {
            "title": row["headline"],
            "sub_topic": None,
            "url": row["url"].strip("/") + "/",
            "text": row["content"],
            "category": row["category"],
        }
        for row in wura_ds
        if has_long_headline(row["headline"]) and has_usable_url(row["url"])
    ]
    return pd.DataFrame(records)


def align_with_wura(df, wura_data):
    """Merge the collected articles with the Wura train split.

    Steps:
      1. Remove from `df` every row whose URL is in the Wura validation split.
      2. Filter the Wura train split with `prepare_wura`.
      3. Append the Wura train rows whose URL is not already in `df`.
      4. For URLs present in both sources, copy the Wura category onto the
         collected row; every other row keeps the category it already had.

    Args:
        df: DataFrame of collected articles with at least a `url` column.
        wura_data: mapping with "validation" and "train" Wura splits.

    Returns:
        The combined DataFrame with a fresh 0..n-1 index.
    """
    df = wura_remove_validation_rows(df, wura_data["validation"])
    # Combined collected dataset with Wura train dataset
    wura_df = prepare_wura(wura_data["train"])

    already_collected = wura_df["url"].isin(set(df["url"]))
    new_wura_df = wura_df[~already_collected]
    old_wura_df = wura_df[already_collected]
    # ignore_index avoids duplicate index labels coming from the two sources.
    combined = pd.concat([df, new_wura_df], ignore_index=True)

    # Extracting the category data available in Wura, so we don't miss out on
    # that data. Duplicate URLs are collapsed first because Series.map needs a
    # uniquely indexed lookup.
    wura_category = old_wura_df.drop_duplicates(subset="url").set_index("url")["category"]
    looked_up = combined["url"].map(wura_category)
    if "category" in combined.columns:
        # Only fill from Wura where a match exists; overwriting the whole
        # column would blank the categories of the appended Wura rows.
        combined["category"] = looked_up.combine_first(combined["category"])
    else:
        combined["category"] = looked_up
    return combined


def unify_datasources(dfs: list, wura_data):
    """Concatenate the collected frames, align them with Wura and drop incomplete rows.

    Each frame in `dfs` is modified in place: its column names are lower-cased
    and a `sub_topic` column of None is added when absent. The frames are then
    concatenated, passed through `align_with_wura`, and rows lacking a title
    or text are removed.

    Args:
        dfs: list of DataFrames of collected articles.
        wura_data: Wura splits forwarded to `align_with_wura`.

    Returns:
        The unified DataFrame.
    """
    for frame in dfs:
        frame.columns = frame.columns.str.lower()
        if "sub_topic" not in frame.columns:
            frame["sub_topic"] = None

    aligned = align_with_wura(pd.concat(dfs), wura_data)

    # dropna for title and text columns
    return aligned.dropna(subset=["title", "text"])


def make_yoruba_df():
    """Build the Yoruba frame from the three collected TSV files plus Wura.

    Reads alaroye_mato_10k.tsv, von_mato_6k.tsv (its `link` column becomes
    `url`) and masakhanews_1k.tsv (its `headline` column becomes `title`),
    then hands them to `unify_datasources` together with the "yor" Wura
    dataset, which drops collected rows whose URL is in Wura validation.
    """
    wura_data = load_dataset("castorini/wura", "yor", level="document", verification_mode="no_checks", trust_remote_code=True)

    alaroye_df = pd.read_csv('alaroye_mato_10k.tsv', delimiter="\t")
    von_df = pd.read_csv('von_mato_6k.tsv', delimiter="\t").rename(columns={'link': 'url'})
    masakhanews_df = pd.read_csv('masakhanews_1k.tsv', delimiter="\t").rename(columns={'headline': 'title'})

    return unify_datasources([alaroye_df, von_df, masakhanews_df], wura_data)


def make_igbo_df():
    """Build the Igbo frame from igbo_mato_3k.tsv plus Wura.

    The TSV's `link` column is renamed to `url`; the frame is then unified
    with the "ibo" Wura dataset via `unify_datasources`, which drops collected
    rows whose URL is in Wura validation.
    """
    wura_data = load_dataset("castorini/wura", "ibo", level="document", verification_mode="no_checks", trust_remote_code=True)
    collected = pd.read_csv("igbo_mato_3k.tsv", delimiter="\t").rename(columns={"link": "url"})
    return unify_datasources([collected], wura_data)


def make_hausa_df():
    """Build the Hausa frame from hausa_mato_81k.tsv plus Wura.

    Rows sharing a `link` are de-duplicated, `link` is renamed to `url`, and
    the frame is unified with the "hau" Wura dataset via `unify_datasources`.
    """
    wura_data = load_dataset("castorini/wura", "hau", level="document", verification_mode="no_checks", trust_remote_code=True)

    # Key to note that drop duplicates is being done.
    # Later on, this should be handled better. Duplicates are being dropped here to avoid potentially
    # using the same link as a negative, as at the moment, negatives are being sampled using n-1.
    collected = (
        pd.read_csv("hausa_mato_81k.tsv", delimiter="\t")
        .drop_duplicates(["link"])
        .rename(columns={"link": "url"})
    )
    return unify_datasources([collected], wura_data)


def make_igbo_df_v0():
    """Load igbo_mato_3k.tsv without any Wura alignment.

    Rows missing a title or text are dropped, `link` is renamed to `url`, and
    `sub_topic` and `category` columns filled with None are set.
    """
    raw = pd.read_csv("igbo_mato_3k.tsv", delimiter="\t")
    has_title_and_text = raw["title"].notna() & raw["text"].notna()
    return (
        raw[has_title_and_text]
        .rename(columns={"link": "url"})
        .assign(sub_topic=None, category=None)
    )

def make_hausa_df_v0():
    """Load hausa_mato_81k.tsv without any Wura alignment.

    Rows missing a title or text are dropped, rows sharing a `link` are
    de-duplicated (first occurrence kept), `link` is renamed to `url`, and a
    `category` column filled with None is set.
    """
    raw = pd.read_csv("hausa_mato_81k.tsv", delimiter="\t")
    has_title_and_text = raw["title"].notna() & raw["text"].notna()
    # Key to note that drop duplicates is being done.
    # Later on, this should be handled better. Duplicates are being dropped here to avoid potentially
    # using the same link as a negative, as at the moment, negatives are being sampled using n-1.
    deduplicated = raw[has_title_and_text].drop_duplicates(["link"])
    return deduplicated.rename(columns={"link": "url"}).assign(category=None)

In [10]:
# split_key = "train"

# domains = {extract_domain_name(row["url"]) for row in data[split_key]}

# data_split = data[split_key].add_column("domain", [extract_domain_name(row["url"]) for row in data[split_key]])

# # weird_domains = {"smartkidparenting.com", "transferservice-basel.ch"}

# is_valid_value = lambda value: len((value or " ").strip()) > 5

# titled_rows = [row for row in data_split if is_valid_value(row["headline"]) and is_valid_value(row["url"])]

# titled_domains = {}

# for row in titled_rows:
#     url = titled_domains.get(row["domain"], set())
#     url.add(row["url"])
#     titled_domains[row["domain"]] = url

# crawled = {"yoruba.von.gov.ng", "bbc.com", "alaroye.org"}
# crawled_complement = set(titled_domains.keys()).difference(crawled)
# eval_data = [row for row in data_split if row["domain"] in crawled_complement]

In [11]:
def make_dataset_v2(df, duplicate_rows=False):
    """Build query/pos/neg training records and write dataset, train and eval files.

    `title` becomes `query` and `text` becomes `pos`. Every row receives seven
    negatives sampled (seeded, without replacement) from the passages of the
    *other* rows. Rows that have a sub-topic either get it as their query
    (`duplicate_rows=False`) or are duplicated with the sub-topic as the query
    of the copy (`duplicate_rows=True`).

    Writes dataset.jsonl (all records, shuffled) plus a 90/10 split into
    train_dataset.jsonl and eval_dataset.jsonl in the working directory.

    Args:
        df: DataFrame with `title`, `text` and `sub_topic` columns. It is not
            modified; the function works on a re-indexed copy.
        duplicate_rows: see above.

    Raises:
        ValueError: if `df` has too few rows to draw seven distinct negatives
            from the other rows.
    """
    num_negatives = 7
    seed = 42

    # A fresh 0..n-1 index makes positions and labels coincide, so the
    # "exclude my own row" logic below is correct even after the caller
    # filtered or concatenated frames.
    df = df.reset_index(drop=True).rename(columns={"text": "pos", "title": "query"})
    df_count = len(df)
    if df_count <= num_negatives:
        raise ValueError(f"Need more than {num_negatives} rows to sample {num_negatives} negatives per row. Got {df_count}.")

    rng = np.random.default_rng(seed)
    passages = df["pos"].to_numpy()

    def pick_negative_values(position):
        # Sample from the n-1 other rows: draw from range(n-1) and shift every
        # pick at or after `position` by one, which skips the row itself.
        picks = rng.choice(df_count - 1, size=num_negatives, replace=False)
        picks[picks >= position] += 1
        return passages[picks].tolist()

    df["neg"] = pd.Series(
        [pick_negative_values(position) for position in range(df_count)],
        index=df.index,
        dtype=object,
    )

    # Extracting subtopics and using them as a query in duplicate rows
    rows_wo_subtopic = df["sub_topic"].isna()
    if duplicate_rows:
        sub_topic_df = df[~rows_wo_subtopic].copy()
        sub_topic_df["query"] = sub_topic_df["sub_topic"]
        df = pd.concat([df, sub_topic_df])
    else:
        df.loc[~rows_wo_subtopic, "query"] = df.loc[~rows_wo_subtopic, "sub_topic"]

    # The BGE M3 expects a list of values
    df["pos"] = df["pos"].apply(lambda x: [x])
    df = df.loc[:, ["query", "pos", "neg"]]
    df = df.sample(frac=1, random_state=seed).reset_index(drop=True)
    df.to_json("dataset.jsonl", orient="records", lines=True)
    # DataFrame.info() prints its report itself and returns None.
    df.info()

    train_df, eval_df = train_test_split(df, test_size=0.1, random_state=seed, shuffle=True)
    train_df.to_json("train_dataset.jsonl", orient="records", lines=True)
    eval_df.to_json("eval_dataset.jsonl", orient="records", lines=True)


def make_dataset_v3(df, duplicate_rows=False, filename="train_dataset.jsonl"):
    """Build query/pos/neg training records and write them to `filename`.

    Unlike the earlier version there is no train/eval split, because eval and
    test datasets are currently taken from Wura.

    `title` becomes `query` and `text` becomes `pos`. Every row receives seven
    negatives sampled (seeded, without replacement) from the passages of the
    *other* rows. Rows that have a sub-topic either get it as their query
    (`duplicate_rows=False`) or are duplicated with the sub-topic as the query
    of the copy (`duplicate_rows=True`). The records are shuffled and written
    as JSON lines.

    Args:
        df: DataFrame with `title`, `text` and `sub_topic` columns. It is not
            modified; the function works on a re-indexed copy.
        duplicate_rows: see above.
        filename: path of the JSON-lines file to write.

    Raises:
        ValueError: if `df` has too few rows to draw seven distinct negatives
            from the other rows.
    """
    num_negatives = 7
    seed = 42

    # A fresh 0..n-1 index makes positions and labels coincide, so the
    # "exclude my own row" logic below is correct even after the caller
    # filtered or concatenated frames.
    df = df.reset_index(drop=True).rename(columns={"text": "pos", "title": "query"})
    df_count = len(df)
    if df_count <= num_negatives:
        raise ValueError(f"Need more than {num_negatives} rows to sample {num_negatives} negatives per row. Got {df_count}.")

    rng = np.random.default_rng(seed)
    passages = df["pos"].to_numpy()

    def pick_negative_values(position):
        # Sample from the n-1 other rows: draw from range(n-1) and shift every
        # pick at or after `position` by one, which skips the row itself.
        picks = rng.choice(df_count - 1, size=num_negatives, replace=False)
        picks[picks >= position] += 1
        return passages[picks].tolist()

    df["neg"] = pd.Series(
        [pick_negative_values(position) for position in range(df_count)],
        index=df.index,
        dtype=object,
    )

    # Extracting subtopics and using them as a query in duplicate rows
    rows_wo_subtopic = df["sub_topic"].isna()
    if duplicate_rows:
        sub_topic_df = df[~rows_wo_subtopic].copy()
        sub_topic_df["query"] = sub_topic_df["sub_topic"]
        df = pd.concat([df, sub_topic_df])
    else:
        df.loc[~rows_wo_subtopic, "query"] = df.loc[~rows_wo_subtopic, "sub_topic"]

    # The BGE M3 expects a list of values
    df["pos"] = df["pos"].apply(lambda x: [x])
    df = df.loc[:, ["query", "pos", "neg"]]
    df = df.sample(frac=1, random_state=seed).reset_index(drop=True)
    df.to_json(filename, orient="records", lines=True)

In [12]:
def make_dataset():
    """Build the first-generation dataset from the three collected TSV files.

    Each source is de-duplicated, stripped of rows with missing values and
    reduced to `query`/`pos` columns. Every row then receives seven negatives
    sampled (seeded, without replacement) from the passages of the *other*
    rows. The shuffled records are written to dataset.jsonl and split 80/10/10
    into train_dataset.jsonl, test_dataset.jsonl and eval_dataset.jsonl.

    Raises:
        ValueError: if the combined frame has too few rows to draw seven
            distinct negatives from the other rows.
    """
    num_negatives = 7
    seed = 42

    # masakhanews_1k.tsv is from Masakhanews
    df1 = (
        pd.read_csv('masakhanews_1k.tsv', delimiter="\t")
        .drop_duplicates(["headline", "text"])
        .dropna()
        .rename(columns={'headline': 'query', 'text': 'pos'})
        .drop(columns=["category", "url"])
        .assign(neg=None)
    )

    # alaroye_mato_10k.tsv is from AbdulMatin's crawl of Alaroye
    df2 = (
        pd.read_csv('alaroye_mato_10k.tsv', delimiter="\t")
        .drop_duplicates(["Url"])
        .dropna()
        .rename(columns={'Title': 'query', 'Text': 'pos'})
        .drop(columns=["Url"])
        .assign(neg=None)
    )

    # von_mato_6k.tsv is from AbdulMatin's crawl of VON
    df3 = (
        pd.read_csv('von_mato_6k.tsv', delimiter="\t")
        .drop_duplicates(["link"])
        .dropna()
        .rename(columns={'sub_topic': 'query', 'text': 'pos'})
        .drop(columns=["title", "link"])
        .assign(neg=None)
    )

    # ignore_index gives unique 0..n-1 labels; the three sources otherwise
    # carry overlapping labels.
    df = pd.concat([df1, df2, df3], ignore_index=True)

    df_count = len(df)
    if df_count <= num_negatives:
        raise ValueError(f"Need more than {num_negatives} rows to sample {num_negatives} negatives per row. Got {df_count}.")

    rng = np.random.default_rng(seed)
    passages = df["pos"].to_numpy()

    def pick_negative_values(position):
        # Sample from the n-1 other rows: draw from range(n-1) and shift every
        # pick at or after `position` by one, which skips the row itself.
        picks = rng.choice(df_count - 1, size=num_negatives, replace=False)
        picks[picks >= position] += 1
        return passages[picks].tolist()

    df["neg"] = pd.Series(
        [pick_negative_values(position) for position in range(df_count)],
        index=df.index,
        dtype=object,
    )
    df["pos"] = df["pos"].apply(lambda x: [x])
    df = df.sample(frac=1, random_state=seed).reset_index(drop=True)
    df.to_json("dataset.jsonl", orient="records", lines=True)
    # DataFrame.info() prints its report itself and returns None.
    df.info()

    train_df, test_df = train_test_split(df, test_size=0.2, random_state=seed, shuffle=True)
    train_df.to_json("train_dataset.jsonl", orient="records", lines=True)

    test_df, eval_df = train_test_split(test_df, test_size=0.5, random_state=seed, shuffle=True)
    eval_df.to_json("eval_dataset.jsonl", orient="records", lines=True)
    test_df.to_json("test_dataset.jsonl", orient="records", lines=True)

In [13]:
import shutil


def combine_wura_with_all_mato_igbo():
    """Just adding igbo data to yoruba's train data, to see if it improves quality of yoruba data.

    Generates the Igbo records, replaces the local train_dataset.jsonl with
    the combined-Wura training file, then rewrites train_dataset.jsonl as the
    seeded shuffle of both record sets.
    """
    # Creates train_dataset.jsonl, dataset.jsonl and eval_dataset.jsonl. But dataset.jsonl is the important one.
    make_dataset_v2(make_igbo_df_v0())

    # Overwrite the train_dataset.jsonl
    combine_wura_train = "/content/drive/MyDrive/Side Projects/NaijEmbeddings/datasets/combined_wura/train_dataset.jsonl"
    shutil.copyfile(combine_wura_train, "train_dataset.jsonl")

    data = []
    for source in ("train_dataset.jsonl", "dataset.jsonl"):
        with jsonlines.open(source) as reader:
            data.extend(reader)

    import random

    random.seed(42)
    random.shuffle(data)

    with jsonlines.open("train_dataset.jsonl", "w") as writer:
        writer.write_all(data)


def combine_wura_with_all_mato_igbo_hausa():
    """Just adding igbo+hausa data to yoruba's train data, to see if it improves quality of yoruba data.

    Generates the Igbo+Hausa records, replaces the local train_dataset.jsonl
    with the combined-Wura training file, rewrites train_dataset.jsonl as the
    seeded shuffle of both record sets, and copies the result to the
    hausa_igbo_comwura folder.
    """
    # Creates train_dataset.jsonl, dataset.jsonl and eval_dataset.jsonl. But dataset.jsonl is the important one.
    igbo_hausa_df = pd.concat([make_igbo_df_v0(), make_hausa_df_v0()])
    make_dataset_v2(igbo_hausa_df)

    # Overwrite the train_dataset.jsonl
    combine_wura_train = "/content/drive/MyDrive/Side Projects/NaijEmbeddings/datasets/combined_wura/train_dataset.jsonl"
    shutil.copyfile(combine_wura_train, "train_dataset.jsonl")

    data = []
    for source in ("train_dataset.jsonl", "dataset.jsonl"):
        with jsonlines.open(source) as reader:
            data.extend(reader)

    import random

    random.seed(42)
    random.shuffle(data)

    with jsonlines.open("train_dataset.jsonl", "w") as writer:
        writer.write_all(data)

    hausa_igbo_comwura_train = "/content/drive/MyDrive/Side Projects/NaijEmbeddings/datasets/hausa_igbo_comwura/train_dataset.jsonl"
    shutil.copyfile("train_dataset.jsonl", hausa_igbo_comwura_train)


def make_incremental_igbo_hausa_eval_datasets():
    """Sample 2000-row eval sets from the Igbo and Hausa Wura train splits.

    For each language the train split is converted with `make_wura_df`,
    2000 rows are drawn with `random.sample` after re-seeding with SEED, and
    the sample is written as `<lang_id>_eval_dataset.jsonl`. Both files are
    then copied to their per-language dataset folders.
    """
    def make_incremental(lang_id):
        wura = load_dataset("castorini/wura", lang_id, level="document", verification_mode="no_checks", trust_remote_code=True)
        train_frame = make_wura_df(wura["train"])

        random.seed(SEED)
        eval_idxs = random.sample(range(len(train_frame)), 2000)

        sampled = train_frame.iloc[eval_idxs].rename(columns={"text": "pos", "title": "query"})
        sampled.to_json(f"{lang_id}_eval_dataset.jsonl", orient="records", lines=True)

    for lang_id in ("ibo", "hau"):
        make_incremental(lang_id)

    shutil.copyfile(
        "ibo_eval_dataset.jsonl",
        "/content/drive/MyDrive/Side Projects/NaijEmbeddings/datasets/igbo/eval_dataset.jsonl",
    )
    shutil.copyfile(
        "hau_eval_dataset.jsonl",
        "/content/drive/MyDrive/Side Projects/NaijEmbeddings/datasets/hausa/eval_dataset.jsonl",
    )

In [5]:
import json
import uuid
from pathlib import Path
import jsonlines


def text_to_guid(text: str) -> str:
    """
    Generate a deterministic GUID (UUID v5) from a given text.

    The same text always yields the same id, so identical queries (or
    identical passages) share one id.
    """
    namespace = uuid.NAMESPACE_DNS  # Standard namespace, or use your own UUID
    return str(uuid.uuid5(namespace, text))


def format_evaluation_jsonl(filepath):
    """Convert a query/pos JSON-lines file into a queries/corpus/relevant_docs JSON file.

    Each non-blank line must be a JSON object with a "query" string and a
    "pos" value that is either a string or a non-empty list (only its first
    element is used). The result is written next to the input as
    `<stem>_formatted.json` with the keys "queries", "corpus",
    "relevant_docs" and "mode".

    When the same query text occurs on several lines, all of its passages are
    collected in `relevant_docs` rather than only the last one.

    Args:
        filepath: path (str or Path) of the JSON-lines file to convert.

    Raises:
        ValueError: if a "pos" value is neither a string nor a list, or is an
            empty list.
    """
    filepath = Path(filepath)
    dataset = {"queries": {}, "corpus": {}, "relevant_docs": {}, "mode": "text"}

    with open(filepath, encoding="utf-8") as reader:
        for line_number, raw_line in enumerate(reader, start=1):
            if not raw_line.strip():
                continue
            line = json.loads(raw_line)

            pos = line["pos"]
            if isinstance(pos, list):
                if not pos:
                    raise ValueError(f"Line {line_number} of {filepath} has an empty 'pos' list.")
                pos = pos[0]
            elif not isinstance(pos, str):
                raise ValueError(f"Unexpected type for 'pos': {type(line['pos'])}. Expected a list or string.")

            query_id = text_to_guid(line["query"])
            pos_id = text_to_guid(pos)
            dataset["queries"][query_id] = line["query"]
            dataset["corpus"][pos_id] = pos
            # Ids are derived from the text, so a repeated query maps to the
            # same id; accumulate its passages instead of overwriting them.
            relevant = dataset["relevant_docs"].setdefault(query_id, [])
            if pos_id not in relevant:
                relevant.append(pos_id)

    new_path = filepath.parent / (filepath.stem + "_formatted.json")
    with open(new_path, "w", encoding="utf-8") as f:
        json.dump(dataset, f, indent=4)

In [16]:
# df = split_wura_validation_all_langs()

# !cp hau_eval_dataset.jsonl /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/datasets/static_wura/hausa/eval_dataset.jsonl
# !cp hau_test_dataset.jsonl /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/datasets/static_wura/hausa/test_dataset.jsonl

# !cp igbo_eval_dataset.jsonl /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/datasets/static_wura/igbo/eval_dataset.jsonl
# !cp igbo_test_dataset.jsonl /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/datasets/static_wura/igbo/test_dataset.jsonl

# !cp yor_eval_dataset.jsonl /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/datasets/static_wura/yoruba/eval_dataset.jsonl
# !cp yor_test_dataset.jsonl /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/datasets/static_wura/yoruba/test_dataset.jsonl

In [1]:
#make_dataset_v3(make_yoruba_df(), filename="yor_train_dataset.jsonl")
#make_dataset_v3(make_igbo_df(), filename="igbo_train_dataset.jsonl")
#make_dataset_v3(make_hausa_df(), filename="hausa_train_dataset.jsonl")
# format_evaluation_jsonl("eval_dataset.jsonl")

# OR


# Download dataset
# !gdown https://drive.google.com/uc?id=1xJ6EHSyaZeMtosQ7RF_R9OHJssuXl0Eq

# !gdown https://drive.google.com/uc?id=1qR1n_kb5mtCfbAPitw3bffRKQtN0H-ZL


# !gdown https://drive.google.com/uc?id=10RHg1qWjopgjo0Ns0TZ53zhhAmVuO6u4

In [20]:
# Download a Drive file by id with gdown; per the recorded cell output it is saved as filtered_english_train_dataset.jsonl.
!gdown https://drive.google.com/uc?id=1-9s4lsREcIemnva5yMzsTng2cwyvUk0n

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Downloading...
From (original): https://drive.google.com/uc?id=1-9s4lsREcIemnva5yMzsTng2cwyvUk0n
From (redirected): https://drive.google.com/uc?id=1-9s4lsREcIemnva5yMzsTng2cwyvUk0n&confirm=t&uuid=4b646be8-cd18-4ca9-b0df-a7a2f47c808b
To: /home/omotoso.abdulmatin4/filtered_english_train_dataset.jsonl
100%|█████████████████████████████████████████| 706M/706M [00:06<00:00, 108MB/s]


In [None]:
# Copy the Hausa, Igbo and Yoruba training sets from the combine_wura_all_langs folder into the working directory.
!cp /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/datasets/combine_wura_all_langs/hausa_train_dataset.jsonl .
!cp /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/datasets/combine_wura_all_langs/igbo_train_dataset.jsonl .
!cp /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/datasets/combine_wura_all_langs/yoruba_train_dataset.jsonl .

In [None]:
# Gather the records of the three per-language training files into one list,
# in Yoruba, Hausa, Igbo order.
data = []

for train_file in ("yoruba_train_dataset.jsonl", "hausa_train_dataset.jsonl", "igbo_train_dataset.jsonl"):
    with jsonlines.open(train_file) as reader:
        data.extend(reader)

In [None]:
sizes = [10_000, 50_000, 100_000, 300_000]

# Write the first `size` records of `data` to `<size>_train_dataset.jsonl`.
# NOTE(review): `data` appears to be assembled language by language (Yoruba,
# then Hausa, then Igbo), so a prefix is not an even mix of languages, and a
# slice longer than `data` silently yields fewer records — confirm intent.
for size in sizes:
    with jsonlines.open(f"{size}_train_dataset.jsonl", "w") as writer:
        writer.write_all(data[:size])

# Alternative: download previously generated subsets. These stay commented
# out because a bare `gdown ...` line is not valid Python and would make the
# whole cell fail with a SyntaxError.
# 10k
# !gdown https://drive.google.com/uc?id=1qR1n_kb5mtCfbAPitw3bffRKQtN0H-ZL
# 50k
# !gdown https://drive.google.com/uc?id=1-2UiPWc6Z0Qn0coN1yYIgOSciNFcEncB
# 100k
# !gdown https://drive.google.com/uc?id=1-4WNTv69iQR528_lS7iKQxo24jkBnECr

In [None]:
# Copy the four size-limited training subsets into the combine_wura_all_langs dataset folder.
!cp 10000_train_dataset.jsonl /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/datasets/combine_wura_all_langs/10000_train_dataset.jsonl
!cp 50000_train_dataset.jsonl /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/datasets/combine_wura_all_langs/50000_train_dataset.jsonl
!cp 100000_train_dataset.jsonl /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/datasets/combine_wura_all_langs/100000_train_dataset.jsonl
!cp 300000_train_dataset.jsonl /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/datasets/combine_wura_all_langs/300000_train_dataset.jsonl

In [None]:
!ls /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/combine_wura_all_langs/

In [None]:
# Number of records currently held in `data`.
len(data)

In [None]:
# !cp /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/datasets/original_datasets/hausa_mato_81k.tsv .
# !cp /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/datasets/original_datasets/igbo_mato_3k.tsv .
# !cp /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/datasets/original_datasets/alaroye_mato_10k.tsv .
# !cp /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/datasets/original_datasets/von_mato_6k.tsv .

In [None]:
# Only the Hausa eval set copy is active; the other copy commands are kept
# commented out for switching between dataset variants by hand.
# !cp train_dataset.jsonl /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/datasets
# !cp eval_dataset.jsonl /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/datasets
# !cp train_dataset.jsonl /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/datasets/igbo_comwura/
!cp /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/datasets/hausa/eval_dataset.jsonl .
# !cp /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/datasets/hausa_igbo_comwura/train_dataset.jsonl .
# !cp /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/datasets/combined_wura/eval_dataset.jsonl .
# !cp /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/datasets/test_dataset.jsonl .

In [None]:
# Convert eval_dataset.jsonl in the working directory; the result is written as eval_dataset_formatted.json.
format_evaluation_jsonl("eval_dataset.jsonl")

In [3]:
!wget https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/refs/heads/master/examples/finetune/ds_stage0.json

--2025-04-21 18:54:46--  https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/refs/heads/master/examples/finetune/ds_stage0.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 963 [text/plain]
Saving to: ‘ds_stage0.json.4’


2025-04-21 18:54:46 (105 MB/s) - ‘ds_stage0.json.4’ saved [963/963]



In [21]:
import json
import random

# Input files
input_files = [
    '/home/omotoso.abdulmatin4/filtered_hausa_train_dataset.jsonl',
    '/home/omotoso.abdulmatin4/filtered_igbo_train_dataset.jsonl',
    '/home/omotoso.abdulmatin4/filtered_yoruba_train_dataset.jsonl',
    '/home/omotoso.abdulmatin4/filtered_english_train_dataset.jsonl'
]

# Output file
output_file = 'combined_english_shuffled_dataset.jsonl'

# Read and combine all lines
all_data = []
for file_path in input_files:
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # Skip blank lines (e.g. a trailing empty line) rather than
            # failing inside json.loads.
            if line:
                all_data.append(json.loads(line))

# Shuffle the combined data with a dedicated, seeded generator so the output
# order is the same on every run regardless of earlier use of `random`.
random.Random(42).shuffle(all_data)

# Write the shuffled data to the output file
with open(output_file, 'w', encoding='utf-8') as f:
    for item in all_data:
        f.write(json.dumps(item, ensure_ascii=False) + '\n')

print(f"Combined and shuffled {len(all_data)} entries into '{output_file}'")


Combined and shuffled 180530 entries into 'combined_english_shuffled_dataset.jsonl'


In [22]:
# # Train a model, terminal command
import re

# Assemble the torchrun fine-tuning command as a single line. The pieces are
# joined explicitly: inside a regular triple-quoted string Python already
# swallows each backslash-newline, so a regex looking for them never matches
# and the string would also keep its leading newline.
command_parts = [
    "torchrun --standalone --nproc_per_node 8",
    "-m FlagEmbedding.finetune.embedder.encoder_only.m3",
    "--model_name_or_path BAAI/bge-m3",
    "--output_dir ./bge-m3",
    "--cache_dir ./cache/model",
    "--cache_path ./cache/data",
    "--train_data /home/omotoso.abdulmatin4/combined_english_shuffled_dataset.jsonl",
    "--trust_remote_code True",
    "--train_group_size 2",
    "--query_max_len 512",
    "--passage_max_len 2048",
    "--overwrite_output_dir",
    "--learning_rate 1e-5",
    "--fp16",
    "--dataloader_num_workers 12",
    "--gradient_checkpointing",
    "--deepspeed ds_stage0.json",
    "--num_train_epochs 3",
    "--per_device_train_batch_size 8",
    "--dataloader_drop_last False",
    "--warmup_ratio 0.1",
    "--report_to none",
    "--logging_steps 100",
    "--save_steps 500",
    "--temperature 0.01",
    "--sentence_pooling_method cls",
    "--normalize_embeddings True",
    "--knowledge_distillation False",
    "--kd_loss_type m3_kd_loss",
    "--unified_finetuning False",
    "--use_self_distill False",
    "--fix_encoder False",
]

command = " ".join(command_parts)

print(command)

# OR

# Download existing model weights
# !gdown https://drive.google.com/uc?id=1hC2nReprpHpCNWq9yergzGJLSHz_VKia
# !tar -xzvf bge-m3-5-epochs-unified.tar.gz

#gdown https://drive.google.com/uc?id=1-2UiPWc6Z0Qn0coN1yYIgOSciNFcEncB
#gdown https://drive.google.com/uc?id=1-4WNTv69iQR528_lS7iKQxo24jkBnECr


torchrun --standalone --nproc_per_node 8 -m FlagEmbedding.finetune.embedder.encoder_only.m3 --model_name_or_path BAAI/bge-m3 --output_dir ./bge-m3 --cache_dir ./cache/model --cache_path ./cache/data --train_data /home/omotoso.abdulmatin4/combined_english_shuffled_dataset.jsonl --trust_remote_code True --train_group_size 2 --query_max_len 512 --passage_max_len 2048 --overwrite_output_dir --learning_rate 1e-5 --fp16 --dataloader_num_workers 12 --gradient_checkpointing --deepspeed ds_stage0.json --num_train_epochs 3 --per_device_train_batch_size 8 --dataloader_drop_last False --warmup_ratio 0.1 --report_to none --logging_steps 100 --save_steps 500 --temperature 0.01 --sentence_pooling_method cls --normalize_embeddings True --knowledge_distillation False --kd_loss_type m3_kd_loss --unified_finetuning False --use_self_distill False --fix_encoder False


In [23]:
# Run the assembled `command` string in a shell.
!{command}

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


W0505 07:29:57.464000 167865 torch/distributed/run.py:792] 
W0505 07:29:57.464000 167865 torch/distributed/run.py:792] *****************************************
W0505 07:29:57.464000 167865 torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
W0505 07:29:57.464000 167865 torch/distributed/run.py:792] *****************************************
[2025-05-05 07:30:03,362] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-05-05 07:30:03,364] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-05-05 07:30:03,366] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2025-05-05 07:30:03,470] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (au

In [1]:
import torch
import torchvision
# Report the installed torch and torchvision versions.
print(torch.__version__)
print(torchvision.__version__)


  from .autonotebook import tqdm as notebook_tqdm


2.1.0+cu118
0.16.0+cu118


In [24]:


# Archive ./bge-m3 into `<model_id>.tar.gz`, excluding paths that match ./bge-m3/checkpoint-*.
# The commented-out lines are an earlier variant that archived checkpoint-3000 and copied the tarball to Drive.
#model_id = "bge-m3-yoruba-igbo-hausa-alldatatogether-3-epochs-e5-lr-0_1-warmup-128-batchsize-0_01-temperature-2-groupsize"
#!tar --exclude='global_*' -czvf {model_id}.tar.gz ./bge-m3/checkpoint-3000
#!cp {model_id}.tar.gz /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/experiments/model_weights/

model_id = "bge-m3-yoruba-igbo-hausa-english-alldatatogether-3-epochs-e5-lr-0_1-warmup-128-batchsize-0_01-temperature-2-groupsize"
!tar --exclude='./bge-m3/checkpoint-*' -czvf {model_id}.tar.gz ./bge-m3
#!cp {model_id}.tar.gz /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/experiments/model_weights/

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


./bge-m3/
./bge-m3/tokenizer_config.json
./bge-m3/model.safetensors
./bge-m3/tokenizer.json
./bge-m3/sentencepiece.bpe.model
./bge-m3/config.json
./bge-m3/training_args.bin
./bge-m3/special_tokens_map.json


In [25]:
model_ids = [
    "bge-m3-yoruba-igbo-hausa-english-alldatatogether-3-epochs-e5-lr-0_1-warmup-128-batchsize-0_01-temperature-2-groupsize",

]

for id_ in model_ids:
    #!cp /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/experiments/model_weights/{id_}.tar.gz .
    !mkdir {id_}
    !tar -xzvf {id_}.tar.gz -C {id_}
    pass

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


./bge-m3/
./bge-m3/tokenizer_config.json
./bge-m3/model.safetensors
./bge-m3/tokenizer.json
./bge-m3/sentencepiece.bpe.model
./bge-m3/config.json
./bge-m3/training_args.bin
./bge-m3/special_tokens_map.json


In [3]:
import torch
import torch.nn.functional as F

from torch import Tensor
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import util

from FlagEmbedding import BGEM3FlagModel
from FlagEmbedding.inference.embedder.encoder_only.m3 import M3Embedder
import tqdm

[2025-05-08 10:44:35,208] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/usr/bin/ld: cannot find -laio
collect2: error: ld returned 1 exit status
/usr/bin/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlvsym'
/usr/bin/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlopen'
/usr/bin/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlclose'
/usr/bin/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlerror'
/usr/bin/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlsym'
/usr/bin/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `shm_open'
/usr/bin/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `shm_unlink'
collect2: error: ld returned 1 exit status


In [4]:
def mean_pooling(model_output, attention_mask=None):
    """Average the token vectors in ``model_output["last_hidden_state"]`` over dim 1.

    Args:
        model_output: Mapping-like model output exposing ``"last_hidden_state"``.
        attention_mask: Optional mask with one entry per token (non-zero = keep).
            When given, masked-out positions are excluded from the average, which
            is what padded batches need. When omitted, every position is averaged
            (the original behaviour, correct for a single unpadded sequence).

    Returns:
        A tensor with dim 1 of the hidden states reduced away.
    """
    hidden_states = model_output["last_hidden_state"]
    if attention_mask is None:
        return torch.mean(hidden_states, dim=1)

    # Broadcast the mask over the feature axis and match the hidden-state dtype.
    mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
    summed = (hidden_states * mask).sum(dim=1)
    # Clamp so a fully masked row yields zeros instead of dividing by zero.
    token_counts = mask.sum(dim=1).clamp(min=1)
    return summed / token_counts

def cls_pooling(model_output):
    """Pool by selecting position 0 along dim 1 of ``model_output[0]``.

    Presumably ``model_output[0]`` is the last hidden state laid out as
    (batch, sequence, hidden), making position 0 the CLS token — confirm
    against the model in use.
    """
    token_states = model_output[0]
    return token_states[:, 0]

def last_token_pooling(model_output):
    """Pool by selecting the final position along dim 1 of ``model_output[0]``.

    No attention mask is consulted, so the final position is used as-is
    (assumes the sequence is not right-padded — verify against the caller).
    """
    token_states = model_output[0]
    return token_states[:, -1]

def get_sentence_embedding(text, tokenizer, embed_model, normalize, max_length, pooling_type='cls'):
    """Embed one text with a local encoder model and pool it into a single vector.

    Args:
        text: The string to embed.
        tokenizer: Tokenizer called on ``text``. For ``"last_token"`` pooling it
            must also provide ``eos_token_id`` and ``pad``.
        embed_model: Model called with the tokenizer output as keyword arguments.
        normalize: When true, the pooled embedding is passed through
            ``F.normalize`` (library defaults).
        max_length: Truncation limit forwarded to the tokenizer.
        pooling_type: One of ``"cls"``, ``"mean"`` or ``"last_token"``.

    Returns:
        The tensor produced by the selected pooling function, optionally normalized.

    Raises:
        ValueError: If ``pooling_type`` is not a supported value. Previously an
            unknown value only failed after the forward pass, with an
            ``UnboundLocalError``.
    """
    pooling_functions = {
        "cls": cls_pooling,
        "mean": mean_pooling,
        "last_token": last_token_pooling,
    }
    if pooling_type not in pooling_functions:
        raise ValueError(
            f"Unsupported pooling_type {pooling_type!r}. Expected one of {sorted(pooling_functions)}."
        )

    # Put the inputs where the model lives; fall back to the previously
    # hard-coded "cuda" when the model object does not expose a device.
    device = getattr(embed_model, "device", "cuda")

    if pooling_type == "last_token":
        encoded_input = tokenizer(text, max_length=max_length, return_attention_mask=False, padding=False, truncation=True)
        # EOS is appended after truncation, so the sequence can be one token
        # longer than max_length.
        encoded_input['input_ids'] = encoded_input['input_ids'] + [tokenizer.eos_token_id]
        encoded_input = tokenizer.pad([encoded_input], padding=True, return_attention_mask=True, return_tensors='pt').to(device)
    else:
        encoded_input = tokenizer(text, return_tensors="pt", max_length=max_length, truncation=True).to(device)

    with torch.no_grad():
        model_output = embed_model(**encoded_input)

    sentence_embeddings = pooling_functions[pooling_type](model_output)

    if normalize:
        sentence_embeddings = F.normalize(sentence_embeddings)

    return sentence_embeddings

In [18]:
import tiktoken
def truncate_text(text, model="text-embedding-3-large", max_tokens=8192):
    """Cut ``text`` down to at most ``max_tokens`` tokens of ``model``'s tiktoken encoding.

    Args:
        text: The string to truncate.
        model: Model name used to look up the tiktoken encoding.
        max_tokens: Maximum number of tokens to keep.

    Returns:
        The text decoded from the first ``max_tokens`` tokens (all tokens when
        the text is already short enough).
    """
    enc = tiktoken.encoding_for_model(model)
    # By default `encode` raises ValueError when the text contains a special-token
    # string such as "<|endoftext|>". Arbitrary corpus text may contain those, so
    # encode them as ordinary text instead of failing the whole evaluation.
    tokens = enc.encode(text, disallowed_special=())
    # Slicing is a no-op when the text is already within the limit.
    return enc.decode(tokens[:max_tokens])

In [19]:
import openai
from openai import OpenAI

# Lazily created client shared by every call to `get_openai_embedding`.
_OPENAI_CLIENT = None


def _get_openai_client():
    """Return the shared OpenAI client, creating it on first use."""
    global _OPENAI_CLIENT
    if _OPENAI_CLIENT is None:
        # SECURITY: never hard-code the API key in the notebook. With no `api_key`
        # argument the client reads it from the OPENAI_API_KEY environment variable.
        # The key that used to be embedded here is exposed wherever this notebook
        # was shared and should be revoked.
        _OPENAI_CLIENT = OpenAI()
    return _OPENAI_CLIENT


def get_openai_embedding(text, model="text-embedding-3-large", normalize=True):
    """Embed one text with the OpenAI embeddings API.

    Args:
        text: The string to embed; it is first truncated to 8192 tokens with
            ``truncate_text``.
        model: OpenAI embedding model name.
        normalize: When true, L2-normalize the embedding along dim 1.

    Returns:
        A tensor of shape (1, embedding_size) built from the API response.

    Requires the OPENAI_API_KEY environment variable to be set.
    """
    text = truncate_text(text, model=model, max_tokens=8192)

    response = _get_openai_client().embeddings.create(
        input=[text],
        model=model,
    )
    # Add a leading batch axis so results can be concatenated with torch.cat.
    embedding = torch.tensor(response.data[0].embedding).unsqueeze(0)

    if normalize:
        embedding = torch.nn.functional.normalize(embedding, p=2, dim=1)
    return embedding


In [20]:
def evaluate(qa_dataset, tokenizer, model_name, embed_model, normalize, max_length=None, pooling_type="cls", top_k=5, verbose=False):
    """Score an embedding model on a retrieval dataset with mean reciprocal rank.

    Every corpus passage is embedded once, then each query is embedded and
    matched against the corpus with ``util.semantic_search``. A query scores
    ``1 / rank`` of its best-ranked relevant passage within the ``top_k`` hits,
    or 0 when none of them is relevant.

    Args:
        qa_dataset: Object exposing ``corpus`` (id -> text), ``queries``
            (id -> text) and ``relevant_docs`` (query id -> list of corpus ids).
        tokenizer: Tokenizer for local models; unused on the OpenAI path.
        model_name: Model identifier. ``"intfloat/multilingual-e5-large"``
            gets ``"passage: "`` / ``"query: "`` prefixes added to its inputs.
        embed_model: A local model, or the string ``"openai"`` to embed through
            ``get_openai_embedding``.
        normalize: Forwarded to the embedding function.
        max_length: Forwarded to ``get_sentence_embedding``.
        pooling_type: Forwarded to ``get_sentence_embedding``.
        top_k: Number of hits retrieved per query.
        verbose: Accepted for compatibility; not used.

    Returns:
        The average reciprocal rank over all queries.
    """
    # Imported locally on purpose: another cell runs `import tqdm`, which binds the
    # *module* to this name, so calling `tqdm(...)` only worked if a later cell had
    # re-bound it with `from tqdm import tqdm`.
    from tqdm import tqdm

    uses_e5_prefixes = model_name == "intfloat/multilingual-e5-large"

    def embed(text, e5_role):
        """Embed one text, adding the E5 role prefix when the model expects it."""
        if uses_e5_prefixes:
            text = f"{e5_role}: {text}"
        if embed_model == "openai":
            return get_openai_embedding(text, model=model_name, normalize=normalize)
        return get_sentence_embedding(text, tokenizer, embed_model, normalize, max_length, pooling_type)

    corpus_ids = list(qa_dataset.corpus.keys())
    corpus_embeddings = torch.cat([embed(passage, "passage") for passage in qa_dataset.corpus.values()])

    reciprocal_ranks = []
    for query_id, query in tqdm(qa_dataset.queries.items()):
        hits = util.semantic_search(embed(query, "query"), corpus_embeddings, top_k=top_k)[0]

        # Consider every relevant passage, not just the first one listed; with a
        # single relevant passage per query this matches the previous behaviour.
        relevant_ids = set(qa_dataset.relevant_docs[query_id])

        reciprocal_rank = 0
        for rank, hit in enumerate(hits, start=1):
            if corpus_ids[int(hit["corpus_id"])] in relevant_ids:
                reciprocal_rank = 1 / rank
                break
        reciprocal_ranks.append(reciprocal_rank)

    return np.average(reciprocal_ranks)


In [21]:
# Registry of the embedding models to benchmark, keyed by display name.
# The evaluation cell reads `model_name`, `normalize`, `max_length`, `pooling_type`
# and (for local models) `kwargs`, which is forwarded to `AutoModel.from_pretrained`.
embeddings_model_spec = {
    'ML-E5-large': {
        'model_name': 'intfloat/multilingual-e5-large',
        'max_length': 512,
        'pooling_type': 'mean',
        'normalize': True,
        'batch_size': 8,
        'kwargs': {'device_map': 'cuda', 'torch_dtype': torch.float16},
    },
    'BGE-M3': {
        'model_name': 'BAAI/bge-m3',
        'max_length': 8192,
        'pooling_type': 'cls',
        'vector_type': 'multi-vector',
        'normalize': True,
        'batch_size': 8,
        'kwargs': {'device_map': 'cuda', 'torch_dtype': torch.float16},
    },
    'LaBSE': {
        'model_name': 'sentence-transformers/LaBSE',
        'max_length': 256,
        'pooling_type': 'cls',
        'normalize': True,
        'batch_size': 8,
        'kwargs': {'device_map': 'cuda', 'torch_dtype': torch.float16},
    },
    # Earlier fine-tuned checkpoints, currently disabled:
    #"BGE-M3-yoruba-alldata-Epochs-3": {'model_name':'bge-m3-yoruba-alldata-3-epochs-e5-lr-0_1-warmup-128-batchsize-0_01-temperature-2-groupsize/bge-m3','max_length':8192, 'pooling_type':'cls', 'vector_type': 'multi-vector',
    #                                   'normalize': True, 'batch_size':8, 'kwargs': {'device_map': 'cuda', 'torch_dtype':torch.float16}},
    #"BGE-M3-yoruba-igbo-alldata-Epochs-3": {'model_name':'bge-m3-yoruba-igbo-alldata-3-epochs-e5-lr-0_1-warmup-128-batchsize-0_01-temperature-2-groupsize/bge-m3','max_length':8192, 'pooling_type':'cls', 'vector_type': 'multi-vector',
    #                                        'normalize': True, 'batch_size':8, 'kwargs': {'device_map': 'cuda', 'torch_dtype':torch.float16}},
    # Fine-tuned checkpoint, loaded from the locally extracted archive directory.
    "BGE-M3-yoruba-igbo-hausa-english-alldatatogether-Epochs-3": {
        'model_name': 'bge-m3-yoruba-igbo-hausa-english-alldatatogether-3-epochs-e5-lr-0_1-warmup-128-batchsize-0_01-temperature-2-groupsize/bge-m3',
        'max_length': 8192,
        'pooling_type': 'cls',
        'vector_type': 'multi-vector',
        'normalize': True,
        'batch_size': 8,
        'kwargs': {'device_map': 'cuda', 'torch_dtype': torch.float16},
    },
    # Hosted model: the pooling and kwargs entries exist only so that every spec
    # has the same keys; the OpenAI path does not use them.
    "OpenAI-text-embedding-3-large": {
        'model_name': 'text-embedding-3-large',
        'max_length': 8192,
        'pooling_type': 'cls',
        'normalize': True,
        'batch_size': 1,
        'kwargs': {},
    },
}


In [7]:
import json
import uuid
from pathlib import Path
import jsonlines


def text_to_guid(text: str) -> str:
    """
    Map ``text`` to a stable identifier string.

    The identifier is the version-5 UUID of ``text`` under ``uuid.NAMESPACE_DNS``,
    so the same text always yields the same id.
    """
    guid = uuid.uuid5(uuid.NAMESPACE_DNS, text)
    return str(guid)


def format_evaluation_jsonl(filepath):
    """Convert a query/positive JSONL file into the JSON layout used for evaluation.

    Each input row must have a ``"query"`` string and a ``"pos"`` value that is
    either a string or a list whose first element is used as the positive
    passage. Ids are derived from the texts with ``text_to_guid``, so rows that
    repeat a query or passage collapse onto the same id (later rows overwrite
    earlier ones).

    The result, a dict with ``"queries"``, ``"corpus"``, ``"relevant_docs"`` and
    ``"mode"``, is written next to the input as ``<stem>_formatted.json``.

    Args:
        filepath: Path to the JSONL file.

    Returns:
        The ``Path`` of the written JSON file.

    Raises:
        ValueError: If a row's ``"pos"`` is neither a string nor a list, or is an
            empty list.
    """
    filepath = Path(filepath)
    dataset = {"queries": {}, "corpus": {}, "relevant_docs": {}, "mode": "text"}

    # Rows are processed as they are read; nothing is written unless all succeed.
    with jsonlines.open(filepath) as reader:
        for line in reader:
            query_id = text_to_guid(line["query"])

            pos = line["pos"]
            if isinstance(pos, list):
                if not pos:
                    # Fail with context instead of an opaque IndexError.
                    raise ValueError(f"Empty 'pos' list for query {line['query']!r}. Expected at least one positive passage.")
                pos = pos[0]
            elif not isinstance(pos, str):
                raise ValueError(f"Unexpected type for 'pos': {type(line['pos'])}. Expected a list or string.")

            pos_id = text_to_guid(pos)
            dataset["queries"][query_id] = line["query"]
            dataset["corpus"][pos_id] = pos
            dataset["relevant_docs"][query_id] = [pos_id]

    new_path = filepath.parent / (filepath.stem + "_formatted.json")
    with open(new_path, "w", encoding="utf-8") as f:
        json.dump(dataset, f, indent=4)

    # Returned so callers can load the file without rebuilding its name.
    return new_path

In [16]:
# Download a file from Google Drive by id; per the captured output it is saved as
# Aremu_YO_dataset.json in the working directory.
!gdown https://drive.google.com/uc?id=19ddg2yh5NPj55-5U8yN1ev-uN6je0lUH

Downloading...
From: https://drive.google.com/uc?id=19ddg2yh5NPj55-5U8yN1ev-uN6je0lUH
To: /home/omotoso.abdulmatin4/Aremu_YO_dataset.json
100%|████████████████████████████████████████| 139k/139k [00:00<00:00, 85.0MB/s]


In [None]:
# Download the per-language test sets from Google Drive. The `-O` names are the
# ones the formatting cell passes to `format_evaluation_jsonl`.

# Yoruba
!gdown https://drive.google.com/uc?id=1--w3T7vraOUZ3vuD_PF32uLzgq4qh9Ds -O yoruba_test_dataset.jsonl

# Igbo
!gdown https://drive.google.com/uc?id=1J8FcO2F5h9a64Rb7N3QcVf9wqzaMs23p -O igbo_test_dataset.jsonl

# Hausa
!gdown https://drive.google.com/uc?id=1-8eWLDk9lWAx-OSE5tjQES89prhcn2yJ -O hausa_test_dataset.jsonl

# English
!gdown https://drive.google.com/uc?id=1UDAxYEGOXRLMjEp9me3iAiuKKvUXwlwv -O english_test_dataset.jsonl

Downloading...
From: https://drive.google.com/uc?id=1UDAxYEGOXRLMjEp9me3iAiuKKvUXwlwv
To: /home/omotoso.abdulmatin4/english_test_dataset.jsonl
100%|███████████████████████████████████████| 47.4M/47.4M [00:00<00:00, 115MB/s]


In [None]:
# Convert every downloaded test set; each call writes `<language>_test_dataset_formatted.json`.
for language in ("yoruba", "igbo", "hausa", "english"):
    format_evaluation_jsonl(f"{language}_test_dataset.jsonl")

In [28]:
import time
import numpy as np
from tqdm import tqdm
from llama_index.core.evaluation import EmbeddingQAFinetuneDataset

# One [model key, MRR, seconds] row per model for the test set selected below.
results = []

# Test set scored by this run. The per-language result cells further down all read
# `results`, so presumably this name is changed and the cell re-run once per
# language — confirm before relying on the tables.
# Relative path: this is where `format_evaluation_jsonl` writes its output when
# given the relative `*_test_dataset.jsonl` names, so no machine-specific home
# directory is baked in.
file_name = "english_test_dataset_formatted.json"
# Loaded once up front: the dataset does not depend on the model being scored.
qa_dataset = EmbeddingQAFinetuneDataset.from_json(file_name)

for key, model_spec in embeddings_model_spec.items():
    print(key)
    print("Processing model : "+str(model_spec))

    if "OpenAI" in key:
        # `evaluate` takes its OpenAI branch when the model is the string "openai".
        tokenizer = None
        embed_model = "openai"
    else:
        tokenizer = AutoTokenizer.from_pretrained(model_spec['model_name'])
        embed_model = AutoModel.from_pretrained(model_spec['model_name'], **model_spec['kwargs'])

    # Only the evaluation itself is timed, not model or dataset loading.
    start_time_assessment = time.time()
    score = evaluate(
        qa_dataset,
        tokenizer,
        model_spec['model_name'],
        embed_model,
        model_spec['normalize'],
        model_spec['max_length'],
        model_spec['pooling_type']
    )
    duration_assessment = time.time() - start_time_assessment
    results.append([key, score, duration_assessment])


ML-E5-large
Processing model : {'model_name': 'intfloat/multilingual-e5-large', 'max_length': 512, 'pooling_type': 'mean', 'normalize': True, 'batch_size': 8, 'kwargs': {'device_map': 'cuda', 'torch_dtype': torch.float16}}


100%|██████████| 1998/1998 [00:31<00:00, 64.37it/s]


BGE-M3
Processing model : {'model_name': 'BAAI/bge-m3', 'max_length': 8192, 'pooling_type': 'cls', 'vector_type': 'multi-vector', 'normalize': True, 'batch_size': 8, 'kwargs': {'device_map': 'cuda', 'torch_dtype': torch.float16}}


100%|██████████| 1998/1998 [00:31<00:00, 64.01it/s]


LaBSE
Processing model : {'model_name': 'sentence-transformers/LaBSE', 'max_length': 256, 'pooling_type': 'cls', 'normalize': True, 'batch_size': 8, 'kwargs': {'device_map': 'cuda', 'torch_dtype': torch.float16}}


100%|██████████| 1998/1998 [00:16<00:00, 119.94it/s]


BGE-M3-yoruba-igbo-hausa-english-alldatatogether-Epochs-3
Processing model : {'model_name': 'bge-m3-yoruba-igbo-hausa-english-alldatatogether-3-epochs-e5-lr-0_1-warmup-128-batchsize-0_01-temperature-2-groupsize/bge-m3', 'max_length': 8192, 'pooling_type': 'cls', 'vector_type': 'multi-vector', 'normalize': True, 'batch_size': 8, 'kwargs': {'device_map': 'cuda', 'torch_dtype': torch.float16}}


100%|██████████| 1998/1998 [00:31<00:00, 63.85it/s]


OpenAI-text-embedding-3-large
Processing model : {'model_name': 'text-embedding-3-large', 'max_length': 8192, 'pooling_type': 'cls', 'normalize': True, 'batch_size': 1, 'kwargs': {}}


100%|██████████| 1998/1998 [14:01<00:00,  2.37it/s]  


In [23]:
import pandas as pd
# Tabulate the current contents of `results`. The name implies `results` holds
# the Yoruba run at this point, but that depends on which test set the evaluation
# cell last scored — TODO confirm before re-running.
df_yoruba = pd.DataFrame(results, columns = ["Embedding model", "MRR", "Duration"])
print(df_yoruba)

                                     Embedding model       MRR    Duration
0                                        ML-E5-large  0.676636   37.825116
1                                             BGE-M3  0.784685   40.458355
2                                              LaBSE  0.320123   20.648884
3  BGE-M3-yoruba-igbo-hausa-english-alldatatogeth...  0.920138   40.697949
4                      OpenAI-text-embedding-3-large  0.671475  864.519576


In [25]:
# Tabulate the current contents of `results`. The name implies `results` holds
# the Igbo run at this point, but that depends on which test set the evaluation
# cell last scored — TODO confirm before re-running.
df_igbo = pd.DataFrame(results, columns = ["Embedding model", "MRR", "Duration"])
print(df_igbo)

                                     Embedding model       MRR    Duration
0                                        ML-E5-large  0.679562   26.271745
1                                             BGE-M3  0.756658   30.497309
2                                              LaBSE  0.300105   14.305604
3  BGE-M3-yoruba-igbo-hausa-english-alldatatogeth...  0.863864   30.379917
4                      OpenAI-text-embedding-3-large  0.732764  635.219030


In [27]:
# Tabulate the current contents of `results`. The name implies `results` holds
# the Hausa run at this point, but that depends on which test set the evaluation
# cell last scored — TODO confirm before re-running.
df_hausa = pd.DataFrame(results, columns = ["Embedding model", "MRR", "Duration"])
print(df_hausa)

                                     Embedding model       MRR     Duration
0                                        ML-E5-large  0.699266    66.460244
1                                             BGE-M3  0.857545    76.097345
2                                              LaBSE  0.318834    37.391708
3  BGE-M3-yoruba-igbo-hausa-english-alldatatogeth...  0.923062    76.355321
4                      OpenAI-text-embedding-3-large  0.564999  1687.114425


In [29]:
# Tabulate the current contents of `results`. The name implies `results` holds
# the English run at this point, but that depends on which test set the evaluation
# cell last scored — TODO confirm before re-running.
df_english = pd.DataFrame(results, columns = ["Embedding model", "MRR", "Duration"])
print(df_english)

                                     Embedding model       MRR     Duration
0                                        ML-E5-large  0.352686    67.133896
1                                             BGE-M3  0.737788    80.722520
2                                              LaBSE  0.434927    37.643577
3  BGE-M3-yoruba-igbo-hausa-english-alldatatogeth...  0.861778    80.765280
4                      OpenAI-text-embedding-3-large  0.536428  1798.402251


In [30]:
# Tag each per-language result table (in place) with the language it covers.
language_frames = [
    ("Yoruba", df_yoruba),
    ("Igbo", df_igbo),
    ("Hausa", df_hausa),
    ("English", df_english),
]
for language_name, language_df in language_frames:
    language_df["Language"] = language_name

# Stack the four tables into one long frame.
df_all = pd.concat([language_df for _, language_df in language_frames], ignore_index=True)

# Macro-average: the unweighted mean of each model's MRR across languages.
mean_mrr_by_model = df_all.groupby("Embedding model")["MRR"].mean()
macro_mrr = mean_mrr_by_model.reset_index()
macro_mrr.columns = ["Embedding model", "Macroaverage MRR"]

# Wide view: one row per model, one column per language, plus the macro-average.
per_language = df_all.pivot_table(index="Embedding model", columns="Language", values="MRR")
per_language["Macroaverage MRR"] = macro_mrr.set_index("Embedding model")["Macroaverage MRR"]
pivot = per_language.reset_index()

# Plain-text rendering of the summary table.
print(pivot)

Language                                    Embedding model   English  \
0                                                    BGE-M3  0.737788   
1         BGE-M3-yoruba-igbo-hausa-english-alldatatogeth...  0.861778   
2                                                     LaBSE  0.434927   
3                                               ML-E5-large  0.352686   
4                             OpenAI-text-embedding-3-large  0.536428   

Language     Hausa      Igbo    Yoruba  Macroaverage MRR  
0         0.857545  0.756658  0.784685          0.784169  
1         0.923062  0.863864  0.920138          0.892211  
2         0.318834  0.300105  0.320123          0.343497  
3         0.699266  0.679562  0.676636          0.602038  
4         0.564999  0.732764  0.671475          0.626417  


In [31]:
# Bare expression: the notebook renders the summary table as the cell output.
pivot

Language,Embedding model,English,Hausa,Igbo,Yoruba,Macroaverage MRR
0,BGE-M3,0.737788,0.857545,0.756658,0.784685,0.784169
1,BGE-M3-yoruba-igbo-hausa-english-alldatatogeth...,0.861778,0.923062,0.863864,0.920138,0.892211
2,LaBSE,0.434927,0.318834,0.300105,0.320123,0.343497
3,ML-E5-large,0.352686,0.699266,0.679562,0.676636,0.602038
4,OpenAI-text-embedding-3-large,0.536428,0.564999,0.732764,0.671475,0.626417
