In [1]:
%%capture
!pip install datasets
!pip install FlagEmbedding[finetune]
!pip install jsonlines
!pip install accelerate

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-19.0.1-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting huggingface-hub>=0.24.0 (from datasets)
  Downloading huggingface_hub-0.30.2-py3-none-any.whl.metadata (13 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
Downloading dill-0.3.8-py3-none-any.whl (116 kB)
Downloading fsspec-2024.12.0-py3-none-any.whl (183 kB)
Downloading huggingface_hub-0.30.2-py3-none-any.whl (481 kB)
Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
Downlo

In [1]:
import torch

In [2]:
import pandas as pd
import numpy as np
from datasets import load_dataset, arrow_dataset
from sklearn.model_selection import train_test_split
#import jsonlines
import random

SEED = 42
# Seed both global generators: `random` (sampling / shuffling) and NumPy's legacy
# global RNG, so any code drawing from np.random is reproducible as well.
random.seed(SEED)
np.random.seed(SEED)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the "ibo" configuration of castorini/wura; the commented-out line is the same call for "yor".
# data = load_dataset("castorini/wura", "yor", level="document", verification_mode="no_checks", trust_remote_code=True)
data = load_dataset("castorini/wura", "ibo", level="document", verification_mode="no_checks", trust_remote_code=True)

In [5]:
def prepare_wura(dataset):
    """Filter a WURA split and return the surviving rows as a DataFrame.

    A row is kept when its headline has more than one word, its URL is longer
    than five characters, and its domain occurs more than ten times in
    `dataset` and is not listed in `invalid_domains`.

    Args:
        dataset: an `arrow_dataset.Dataset` with the features "headline",
            "content", "category" and "url".

    Returns:
        pandas.DataFrame with the columns "title", "url" (surrounding slashes
        stripped, then one trailing "/" appended), "text" and "category".
        When no row survives, the frame is empty and has no columns.

    Raises:
        ValueError: if `dataset` is not an `arrow_dataset.Dataset`, or if one
            of the required features is missing.
    """
    if not isinstance(dataset, arrow_dataset.Dataset):
        raise ValueError(f"The parameter `dataset` only accepts `arrow_dataset.Dataset` objects. Got {type(dataset)} instead.")

    expected_columns = {"headline", "content", "category", "url"}
    missing_columns = expected_columns.difference(set(dataset.features))
    if missing_columns:
        raise ValueError(f"The dataset must contain all of the following features: {expected_columns}. Missing features: {missing_columns}")

    # Parse every URL's domain once (reading only the "url" column) and reuse the
    # result for both the frequency count and the per-row filter below.
    domains = [extract_domain_name(url) for url in dataset["url"]]
    domain_counts = {}
    for domain in domains:
        domain_counts[domain] = domain_counts.get(domain, 0) + 1

    invalid_domains = {
        "jw.org" # Has really weird links, for example:  https://www.jw.org/yo/elerii-jehofa/kan-si-wa/venezuela/, https://www.jw.org/yo/elerii-jehofa/kan-si-wa/tonga/, https://www.jw.org/yo/elerii-jehofa/kan-si-wa/taiwan/ all have the title "Kan Si Wa"
    }

    def is_headline_valid(value):
        return len((value or " ").split()) > 1

    def is_url_valid(value):
        return len((value or " ").strip()) > 5

    def is_domain_valid(value):
        # If the domain does not appear enough times that is a sign that the site is not
        # committed to publishing in the language. So it is probably a weird url or the
        # English was translated using Google translate
        # e.g. https://downloadfacetime.com/facetime/facetime-for-ipad/
        return domain_counts[value] > 10 and value not in invalid_domains

    # NOTE(review): the original also defined a "more than 30 words of content" check
    # but never applied it. It is left out here so the output is unchanged -- confirm
    # whether short articles were meant to be dropped.
    data = []
    for row, domain in zip(dataset, domains):
        if not (is_headline_valid(row["headline"])
                and is_url_valid(row["url"])
                and is_domain_valid(domain)):
            continue

        data.append({
            "title": row["headline"],
            "url": row["url"].strip("/") + "/",
            "text": row["content"],
            "category": row["category"],
        })

    return pd.DataFrame(data)


def split_wura_validation_all_langs():
    """Split each language's WURA validation set into eval and test files.

    For "yor", "igbo" and "hau" (where "igbo" maps to the WURA config "ibo"),
    the validation split is filtered with `prepare_wura`, its "text"/"title"
    columns are renamed to "pos"/"query", and the rows are divided 60/40 into
    eval and test with `SEED`. Both parts are written to
    `<lang>_eval_dataset.jsonl` and `<lang>_test_dataset.jsonl`.

    Returns:
        dict mapping each language to {"eval": DataFrame, "test": DataFrame}.

    Raises:
        ValueError: if a loaded dataset has no (or an empty) validation split.
    """
    column_renames = {"text": "pos", "title": "query"}
    splits_by_lang = {}

    for lang in ("yor", "igbo", "hau"):
        wura_lang = lang if lang != "igbo" else "ibo"
        dataset = load_dataset("castorini/wura", wura_lang, level="document", trust_remote_code=True)

        validation_data = dataset.get("validation")
        if not validation_data:
            raise ValueError(f"Dataset {wura_lang} does not have a validation split. Only found {dataset.keys()} splits.")

        lang_df = prepare_wura(validation_data).rename(columns=column_renames)
        eval_df, test_df = train_test_split(lang_df, test_size=0.4, random_state=SEED, shuffle=True)

        eval_df.to_json(f"{lang}_eval_dataset.jsonl", orient="records", lines=True)
        test_df.to_json(f"{lang}_test_dataset.jsonl", orient="records", lines=True)

        splits_by_lang[lang] = {"eval": eval_df, "test": test_df}

    return splits_by_lang

In [6]:
# Domain referenced by the commented-out inspection snippets below.
domain = "bbc.com"
# df[df["domain_name"] == domain].url.tolist()
# df[df["domain_name"] == domain].head(15).pos.tolist()
# df[df["domain_name"] == domain].head(15).url.tolist()

In [7]:
from urllib.parse import urlparse


def extract_domain_name(url):
    """Return the network location of `url` without a leading "www.".

    Args:
        url: the URL to parse.

    Returns:
        The netloc as a string (e.g. "bbc.com" for "https://www.bbc.com/x"),
        or None when `urlparse` rejects the URL with a ValueError.
    """
    try:
        netloc = str(urlparse(url).netloc)
    except ValueError:
        return None

    # str.strip("www.") would remove any run of "w" and "." characters from BOTH
    # ends ("worldnews.com" -> "orldnews.com"), so drop the literal prefix instead.
    prefix = "www."
    if netloc.startswith(prefix):
        return netloc[len(prefix):]
    return netloc

In [9]:
def wura_remove_validation_rows(df, wura_ds):
    """Drop the rows of `df` whose URL also appears in `wura_ds`.

    URLs on both sides are normalised the same way (surrounding slashes
    stripped, one trailing "/" appended) before they are compared. Missing URLs
    in `df` become "" and are kept.

    Args:
        df: DataFrame with a "url" column.
        wura_ds: object whose ["url"] item is an iterable of URL strings;
            empty or None entries are ignored.

    Returns:
        A new DataFrame with normalised URLs, the matching rows removed and the
        index reset. `df` itself is not modified.
    """
    def normalize(url):
        return url.strip("/") + "/"

    # None / empty validation URLs have nothing to strip and cannot identify a row.
    wura_val_urls = {normalize(url) for url in wura_ds["url"] if url}

    # Column-level map instead of a row-wise DataFrame.apply: only "url" is touched.
    normalized_urls = df["url"].map(lambda url: "" if pd.isna(url) else normalize(url))
    df = df.assign(url=normalized_urls)

    return df[~df["url"].isin(wura_val_urls)].reset_index(drop=True)

def make_wura_df(wura_ds):
    """Turn WURA rows into a DataFrame, keeping only rows with a usable headline and URL.

    A row is kept when its headline has more than five words and its URL is
    longer than five characters. Kept rows get the columns "title",
    "sub_topic" (always None), "url" (surrounding slashes stripped, one
    trailing "/" appended), "text" and "category".
    """
    def has_long_headline(headline):
        return len((headline or " ").split()) > 5

    def has_usable_url(url):
        return len((url or " ").strip()) > 5

    records = [
        {
            "title": row["headline"],
            "sub_topic": None,
            "url": row["url"].strip("/") + "/",
            "text": row["content"],
            "category": row["category"],
        }
        for row in wura_ds
        if has_long_headline(row["headline"]) and has_usable_url(row["url"])
    ]
    return pd.DataFrame(records)


def align_with_wura(df, wura_data):
    """Merge the collected rows in `df` with the WURA train split.

    Steps:
      1. rows of `df` whose URL is in the WURA validation split are removed;
      2. WURA train rows (filtered by `prepare_wura`) whose URL is not already
         in `df` are appended;
      3. rows of `df` that WURA train also contains take WURA's category.

    Args:
        df: DataFrame of collected articles with a "url" column.
        wura_data: mapping with "validation" and "train" WURA splits.

    Returns:
        The combined DataFrame with a fresh 0..n-1 index.
    """
    df = wura_remove_validation_rows(df, wura_data["validation"])
    # Combined collected dataset with Wura train dataset
    wura_df = prepare_wura(wura_data["train"])

    seen_rows = wura_df.url.isin(set(df.url))
    new_wura_df = wura_df[~seen_rows]
    old_wura_df = wura_df[seen_rows]

    # ignore_index avoids the duplicate row labels a plain concat would leave behind.
    df = pd.concat([df, new_wura_df], ignore_index=True)

    # Extracting the category data available in Wura, so we don't miss out on that data.
    # The lookup is de-duplicated because mapping through a Series needs a unique index.
    category_by_url = old_wura_df.drop_duplicates("url").set_index("url")["category"]
    wura_category = df["url"].map(category_by_url)
    if "category" in df.columns:
        # Only fill in where WURA has a value: the appended WURA rows (and any
        # category the collected data already carried) must not be blanked out.
        df["category"] = wura_category.where(wura_category.notna(), df["category"])
    else:
        df["category"] = wura_category
    return df


def unify_datasources(dfs: list, wura_data):
    """Concatenate the collected frames, align them with WURA and drop incomplete rows.

    Each frame's column names are lower-cased and a "sub_topic" column (None)
    is added when absent. The frames are concatenated, passed through
    `align_with_wura`, and rows missing a "title" or "text" are dropped.

    Args:
        dfs: list of DataFrames; they are not modified.
        wura_data: WURA dataset handed on to `align_with_wura`.

    Returns:
        The unified DataFrame.
    """
    normalized_frames = []
    for frame in dfs:
        # rename()/assign() return new frames, so the caller's inputs keep their headers.
        frame = frame.rename(columns=str.lower)
        if "sub_topic" not in frame.columns:
            frame = frame.assign(sub_topic=None)
        normalized_frames.append(frame)

    combined = align_with_wura(pd.concat(normalized_frames), wura_data)

    # dropna for title and text columns
    return combined.dropna(subset=["title", "text"])


def make_yoruba_df():
    """Build the Yoruba frame from the three collected TSV files plus WURA ("yor").

    The VON file's "link" column and the Masakhanews file's "headline" column
    are renamed to "url" and "title" before everything is handed to
    `unify_datasources`, which removes rows that appear in WURA validation.
    """
    wura_data = load_dataset("castorini/wura", "yor", level="document", verification_mode="no_checks", trust_remote_code=True)

    alaroye_df = pd.read_csv('alaroye_mato_10k.tsv', delimiter="\t")
    von_df = pd.read_csv('von_mato_6k.tsv', delimiter="\t").rename(columns={'link': 'url'})
    masakhanews_df = pd.read_csv('masakhanews_1k.tsv', delimiter="\t").rename(columns={'headline': 'title'})

    return unify_datasources([alaroye_df, von_df, masakhanews_df], wura_data)


def make_igbo_df():
    """Build the Igbo frame from `igbo_mato_3k.tsv` plus WURA ("ibo").

    The file's "link" column is renamed to "url"; `unify_datasources` then
    removes rows that appear in WURA validation and merges in WURA train.
    """
    wura_data = load_dataset("castorini/wura", "ibo", level="document", verification_mode="no_checks", trust_remote_code=True)
    collected_df = pd.read_csv("igbo_mato_3k.tsv", delimiter="\t").rename(columns={"link": "url"})
    return unify_datasources([collected_df], wura_data)


def make_hausa_df():
    """Build the Hausa frame from `hausa_mato_81k.tsv` plus WURA ("hau").

    Rows sharing a "link" are de-duplicated, "link" is renamed to "url", and
    the result is handed to `unify_datasources`.
    """
    wura_data = load_dataset("castorini/wura", "hau", level="document", verification_mode="no_checks", trust_remote_code=True)

    # Duplicate links are dropped on purpose: negatives are currently sampled from the
    # other rows, so a repeated link could end up as a negative for its own copy.
    # This should be handled better later on.
    collected_df = (
        pd.read_csv("hausa_mato_81k.tsv", delimiter="\t")
        .drop_duplicates(["link"])
        .rename(columns={"link": "url"})
    )
    return unify_datasources([collected_df], wura_data)


def make_igbo_df_v0():
    """Load `igbo_mato_3k.tsv` without any WURA alignment.

    Rows missing a title or text are dropped, "link" is renamed to "url", and
    the columns "sub_topic" and "category" are set to None.
    """
    raw_df = pd.read_csv("igbo_mato_3k.tsv", delimiter="\t")
    has_title_and_text = raw_df.title.notna() & raw_df.text.notna()
    df = raw_df[has_title_and_text].rename(columns={"link": "url"})
    df[["sub_topic", "category"]] = None
    return df

def make_hausa_df_v0():
    """Load `hausa_mato_81k.tsv` without any WURA alignment.

    Rows missing a title or text are dropped, rows sharing a "link" are
    de-duplicated (first occurrence kept), "link" is renamed to "url", and a
    "category" column of None is added.
    """
    raw_df = pd.read_csv("hausa_mato_81k.tsv", delimiter="\t")
    complete_rows = raw_df[raw_df.title.notna() & raw_df.text.notna()]

    # Duplicate links are dropped on purpose: negatives are currently sampled from the
    # other rows, so a repeated link could end up as a negative for its own copy.
    # This should be handled better later on.
    df = complete_rows.drop_duplicates(["link"]).rename(columns={"link": "url"})
    df["category"] = None
    return df

In [10]:
# split_key = "train"

# domains = {extract_domain_name(row["url"]) for row in data[split_key]}

# data_split = data[split_key].add_column("domain", [extract_domain_name(row["url"]) for row in data[split_key]])

# # weird_domains = {"smartkidparenting.com", "transferservice-basel.ch"}

# is_valid_value = lambda value: len((value or " ").strip()) > 5

# titled_rows = [row for row in data_split if is_valid_value(row["headline"]) and is_valid_value(row["url"])]

# titled_domains = {}

# for row in titled_rows:
#     url = titled_domains.get(row["domain"], set())
#     url.add(row["url"])
#     titled_domains[row["domain"]] = url

# crawled = {"yoruba.von.gov.ng", "bbc.com", "alaroye.org"}
# crawled_complement = set(titled_domains.keys()).difference(crawled)
# eval_data = [row for row in data_split if row["domain"] in crawled_complement]

In [11]:
def make_dataset_v2(df, duplicate_rows=False, seed=42, num_negatives=7):
    """Write query/pos/neg training data for `df` and split it into train and eval files.

    Every row gets `num_negatives` negatives: the texts of other, randomly
    chosen rows. Rows with a sub_topic use it as the query instead of the
    title; with `duplicate_rows=True` such rows are instead kept twice, once
    with the title and once with the sub_topic as query.

    Files written to the working directory: `dataset.jsonl` (everything,
    shuffled), `train_dataset.jsonl` (90%) and `eval_dataset.jsonl` (10%).

    Args:
        df: DataFrame with "title", "text" and "sub_topic" columns. Not modified.
        duplicate_rows: see above.
        seed: seed for negative sampling, shuffling and the train/eval split.
        num_negatives: number of negatives drawn per row.

    Raises:
        ValueError: if `df` has too few rows to give every row
            `num_negatives` negatives from the other rows.
    """
    # A renamed copy with a fresh 0..n-1 index: the caller's frame stays untouched and
    # row labels coincide with the positions used for sampling.
    work = df.rename(columns={"text": "pos", "title": "query"}).reset_index(drop=True)

    row_count = len(work)
    if row_count <= num_negatives:
        raise ValueError(
            f"Need more than {num_negatives} rows to sample {num_negatives} negatives per row, got {row_count}."
        )

    rng = np.random.default_rng(seed)
    passages = work["pos"].tolist()
    negatives = []
    for position in range(row_count):
        # Draw among the other row_count - 1 positions, then shift every pick at or
        # after `position` up by one so the row's own passage can never be selected.
        picks = rng.choice(row_count - 1, size=num_negatives, replace=False)
        picks[picks >= position] += 1
        negatives.append([passages[i] for i in picks])
    work["neg"] = pd.Series(negatives, index=work.index)

    # Extracting subtopics and using them as a query (in duplicate rows if requested)
    has_subtopic = work["sub_topic"].notna()
    if duplicate_rows:
        subtopic_rows = work[has_subtopic].copy()
        subtopic_rows["query"] = subtopic_rows["sub_topic"]
        work = pd.concat([work, subtopic_rows])
    else:
        work.loc[has_subtopic, "query"] = work.loc[has_subtopic, "sub_topic"]

    # The BGE M3 expects a list of values
    work["pos"] = work["pos"].apply(lambda passage: [passage])
    work = work.loc[:, ["query", "pos", "neg"]]
    work = work.sample(frac=1, random_state=seed).reset_index(drop=True)
    work.to_json("dataset.jsonl", orient="records", lines=True)
    work.info()

    train_df, eval_df = train_test_split(work, test_size=0.1, random_state=seed, shuffle=True)
    train_df.to_json("train_dataset.jsonl", orient="records", lines=True)
    eval_df.to_json("eval_dataset.jsonl", orient="records", lines=True)


def make_dataset_v3(df, duplicate_rows=False, filename="train_dataset.jsonl", seed=42, num_negatives=7):
    """Write query/pos/neg training data for `df` to a single JSON-lines file.

    No train/eval split is made here, because eval and test datasets are
    currently taken from WURA. Every row gets `num_negatives` negatives: the
    texts of other, randomly chosen rows. Rows with a sub_topic use it as the
    query instead of the title; with `duplicate_rows=True` such rows are
    instead kept twice, once with the title and once with the sub_topic.

    Args:
        df: DataFrame with "title", "text" and "sub_topic" columns. Not modified.
        duplicate_rows: see above.
        filename: path of the JSON-lines file to write.
        seed: seed for negative sampling and for the final shuffle.
        num_negatives: number of negatives drawn per row.

    Raises:
        ValueError: if `df` has too few rows to give every row
            `num_negatives` negatives from the other rows.
    """
    # A renamed copy with a fresh 0..n-1 index: the caller's frame stays untouched and
    # row labels coincide with the positions used for sampling.
    work = df.rename(columns={"text": "pos", "title": "query"}).reset_index(drop=True)

    row_count = len(work)
    if row_count <= num_negatives:
        raise ValueError(
            f"Need more than {num_negatives} rows to sample {num_negatives} negatives per row, got {row_count}."
        )

    rng = np.random.default_rng(seed)
    passages = work["pos"].tolist()
    negatives = []
    for position in range(row_count):
        # Draw among the other row_count - 1 positions, then shift every pick at or
        # after `position` up by one so the row's own passage can never be selected.
        picks = rng.choice(row_count - 1, size=num_negatives, replace=False)
        picks[picks >= position] += 1
        negatives.append([passages[i] for i in picks])
    work["neg"] = pd.Series(negatives, index=work.index)

    # Extracting subtopics and using them as a query (in duplicate rows if requested)
    has_subtopic = work["sub_topic"].notna()
    if duplicate_rows:
        subtopic_rows = work[has_subtopic].copy()
        subtopic_rows["query"] = subtopic_rows["sub_topic"]
        work = pd.concat([work, subtopic_rows])
    else:
        work.loc[has_subtopic, "query"] = work.loc[has_subtopic, "sub_topic"]

    # The BGE M3 expects a list of values
    work["pos"] = work["pos"].apply(lambda passage: [passage])
    work = work.loc[:, ["query", "pos", "neg"]]
    work = work.sample(frac=1, random_state=seed).reset_index(drop=True)
    work.to_json(filename, orient="records", lines=True)

In [12]:
def make_dataset(seed=42, num_negatives=7):
    """Build the first-generation Yoruba dataset from the three collected TSV files.

    Each source is de-duplicated, stripped of rows with missing values and
    reduced to "query"/"pos" columns; every row then gets `num_negatives`
    negatives (the `pos` of other, randomly chosen rows). The shuffled result
    is written to `dataset.jsonl` and split 80/10/10 into
    `train_dataset.jsonl`, `test_dataset.jsonl` and `eval_dataset.jsonl`.

    Args:
        seed: seed for negative sampling, shuffling and the splits.
        num_negatives: number of negatives drawn per row.

    Raises:
        ValueError: if the combined data has too few rows to give every row
            `num_negatives` negatives from the other rows.
    """
    # masakhanews_1k.tsv is from Masakhanews
    df1 = pd.read_csv('masakhanews_1k.tsv', delimiter="\t").drop_duplicates(["headline", "text"])
    df1.dropna(inplace=True)
    df1.rename(columns={'headline': 'query', 'text': 'pos'}, inplace=True)
    df1.drop(columns=["category", "url"], inplace=True)
    df1["neg"] = None

    # alaroye_mato_10k.tsv is from AbdulMatin's crawl of Alaroye
    df2 = pd.read_csv('alaroye_mato_10k.tsv', delimiter="\t").drop_duplicates(["Url"])
    df2.dropna(inplace=True)
    df2.rename(columns={'Title': 'query', 'Text': 'pos'}, inplace=True)
    df2.drop(columns=["Url"], inplace=True)
    df2["neg"] = None

    # von_mato_6k.tsv is from AbdulMatin's crawl of VON
    df3 = pd.read_csv('von_mato_6k.tsv', delimiter="\t").drop_duplicates(["link"])
    df3.dropna(inplace=True)
    df3.rename(columns={'sub_topic': 'query', 'text': 'pos'}, inplace=True)
    df3.drop(columns=["title", "link"], inplace=True)
    df3["neg"] = None

    # ignore_index gives a clean 0..n-1 index; a plain concat would repeat each
    # source's own labels.
    df = pd.concat([df1, df2, df3], ignore_index=True)

    row_count = len(df)
    if row_count <= num_negatives:
        raise ValueError(
            f"Need more than {num_negatives} rows to sample {num_negatives} negatives per row, got {row_count}."
        )

    rng = np.random.default_rng(seed)
    passages = df["pos"].tolist()
    negatives = []
    for position in range(row_count):
        # Draw among the other row_count - 1 positions, then shift every pick at or
        # after `position` up by one so the row's own passage can never be selected.
        picks = rng.choice(row_count - 1, size=num_negatives, replace=False)
        picks[picks >= position] += 1
        negatives.append([passages[i] for i in picks])

    df["neg"] = pd.Series(negatives, index=df.index)
    df["pos"] = df["pos"].apply(lambda passage: [passage])
    df = df.sample(frac=1, random_state=seed).reset_index(drop=True)
    df.to_json("dataset.jsonl", orient="records", lines=True)
    df.info()

    train_df, test_df = train_test_split(df, test_size=0.2, random_state=seed, shuffle=True)
    train_df.to_json("train_dataset.jsonl", orient="records", lines=True)

    test_df, eval_df = train_test_split(test_df, test_size=0.5, random_state=seed, shuffle=True)
    eval_df.to_json("eval_dataset.jsonl", orient="records", lines=True)
    test_df.to_json("test_dataset.jsonl", orient="records", lines=True)

In [13]:
import shutil


def combine_wura_with_all_mato_igbo():
    """Add the Igbo data to Yoruba's train data, to see if it improves the Yoruba results.

    Writes the merged, shuffled records to `train_dataset.jsonl` in the working
    directory (along with the files `make_dataset_v2` produces).
    """
    # Imported locally: the notebook-level `import jsonlines` lives in a later cell, so
    # on a fresh kernel the name may not exist yet when this function is called.
    import jsonlines
    import random

    # Creates train_dataset.jsonl, dataset.jsonl and eval_dataset.jsonl. But dataset.jsonl is the important one.
    make_dataset_v2(make_igbo_df_v0())
    # Overwrite the train_dataset.jsonl
    combine_wura_train = "/content/drive/MyDrive/Side Projects/NaijEmbeddings/datasets/combined_wura/train_dataset.jsonl"
    shutil.copyfile(combine_wura_train, "train_dataset.jsonl")

    data = []
    for source_file in ("train_dataset.jsonl", "dataset.jsonl"):
        with jsonlines.open(source_file) as reader:
            data.extend(reader)

    # A private Random(42) produces the same order as random.seed(42) followed by
    # random.shuffle, without reseeding the module-wide generator as a side effect.
    random.Random(42).shuffle(data)

    with jsonlines.open("train_dataset.jsonl", "w") as writer:
        writer.write_all(data)


def combine_wura_with_all_mato_igbo_hausa():
    """Add the Igbo and Hausa data to Yoruba's train data, to see if it improves the Yoruba results.

    Writes the merged, shuffled records to `train_dataset.jsonl` in the working
    directory and copies that file to the `hausa_igbo_comwura` folder on Drive.
    """
    # Imported locally: the notebook-level `import jsonlines` lives in a later cell, so
    # on a fresh kernel the name may not exist yet when this function is called.
    import jsonlines
    import random

    # Creates train_dataset.jsonl, dataset.jsonl and eval_dataset.jsonl. But dataset.jsonl is the important one.
    df = pd.concat([make_igbo_df_v0(), make_hausa_df_v0()])
    make_dataset_v2(df)
    # Overwrite the train_dataset.jsonl
    combine_wura_train = "/content/drive/MyDrive/Side Projects/NaijEmbeddings/datasets/combined_wura/train_dataset.jsonl"
    shutil.copyfile(combine_wura_train, "train_dataset.jsonl")

    data = []
    for source_file in ("train_dataset.jsonl", "dataset.jsonl"):
        with jsonlines.open(source_file) as reader:
            data.extend(reader)

    # A private Random(42) produces the same order as random.seed(42) followed by
    # random.shuffle, without reseeding the module-wide generator as a side effect.
    random.Random(42).shuffle(data)

    with jsonlines.open("train_dataset.jsonl", "w") as writer:
        writer.write_all(data)

    hausa_igbo_comwura_train = "/content/drive/MyDrive/Side Projects/NaijEmbeddings/datasets/hausa_igbo_comwura/train_dataset.jsonl"
    shutil.copyfile("train_dataset.jsonl", hausa_igbo_comwura_train)


def make_incremental_igbo_hausa_eval_datasets():
    """Sample 2000-row eval sets from the Igbo and Hausa WURA train splits.

    For each of "ibo" and "hau", the train split is turned into a frame with
    `make_wura_df`, 2000 rows are sampled (global `random`, reseeded with
    `SEED`), "text"/"title" are renamed to "pos"/"query", and the rows are
    written to `<lang_id>_eval_dataset.jsonl`. Both files are then copied to
    their Drive folders.
    """
    def make_incremental(lang_id):
        wura = load_dataset("castorini/wura", lang_id, level="document", verification_mode="no_checks", trust_remote_code=True)
        train_frame = make_wura_df(wura["train"])

        random.seed(SEED)
        sampled_positions = random.sample(range(len(train_frame)), 2000)

        eval_frame = train_frame.iloc[sampled_positions].rename(columns={"text": "pos", "title": "query"})
        eval_frame.to_json(f"{lang_id}_eval_dataset.jsonl", orient="records", lines=True)

    for lang_id in ("ibo", "hau"):
        make_incremental(lang_id)

    drive_copies = (
        ("ibo_eval_dataset.jsonl", "/content/drive/MyDrive/Side Projects/NaijEmbeddings/datasets/igbo/eval_dataset.jsonl"),
        ("hau_eval_dataset.jsonl", "/content/drive/MyDrive/Side Projects/NaijEmbeddings/datasets/hausa/eval_dataset.jsonl"),
    )
    for local_file, drive_file in drive_copies:
        shutil.copyfile(local_file, drive_file)

In [15]:
import json
import uuid
from pathlib import Path
import jsonlines


def text_to_guid(text: str) -> str:
    """Return the UUID v5 of `text` in the DNS namespace, formatted as a string.

    The result is deterministic: the same text always yields the same GUID.
    """
    guid = uuid.uuid5(uuid.NAMESPACE_DNS, text)
    return str(guid)


def format_evaluation_jsonl(filepath):
    """Convert a query/pos JSON-lines file into a queries/corpus/relevant_docs JSON file.

    Each input record needs a "query" and a "pos" (a string, or a list whose
    first element is used). Queries and passages are keyed by `text_to_guid`
    of their text. The result is written next to the input as
    `<stem>_formatted.json`.

    Args:
        filepath: path of the JSON-lines file to convert.

    Raises:
        ValueError: if a record's "pos" is neither a string nor a list.
    """
    filepath = Path(filepath)
    dataset = {"queries": {}, "corpus": {}, "relevant_docs": {}, "mode": "text"}

    with jsonlines.open(filepath) as reader:
        for line in reader:
            raw_pos = line["pos"]
            if isinstance(raw_pos, str):
                pos = raw_pos
            elif isinstance(raw_pos, list):
                pos = raw_pos[0]
            else:
                raise ValueError(f"Unexpected type for 'pos': {type(line['pos'])}. Expected a list or string.")

            query_id = text_to_guid(line["query"])
            pos_id = text_to_guid(pos)
            dataset["queries"][query_id] = line["query"]
            dataset["corpus"][pos_id] = pos

            # The same query text can occur on several records; collect all of its
            # passages instead of letting the last record overwrite the earlier ones.
            relevant = dataset["relevant_docs"].setdefault(query_id, [])
            if pos_id not in relevant:
                relevant.append(pos_id)

    new_path = filepath.parent / (filepath.stem + "_formatted.json")
    with open(new_path, "w", encoding="utf-8") as f:
        json.dump(dataset, f, indent=4)

In [16]:
# df = split_wura_validation_all_langs()

# !cp hau_eval_dataset.jsonl /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/datasets/static_wura/hausa/eval_dataset.jsonl
# !cp hau_test_dataset.jsonl /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/datasets/static_wura/hausa/test_dataset.jsonl

# !cp igbo_eval_dataset.jsonl /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/datasets/static_wura/igbo/eval_dataset.jsonl
# !cp igbo_test_dataset.jsonl /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/datasets/static_wura/igbo/test_dataset.jsonl

# !cp yor_eval_dataset.jsonl /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/datasets/static_wura/yoruba/eval_dataset.jsonl
# !cp yor_test_dataset.jsonl /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/datasets/static_wura/yoruba/test_dataset.jsonl

In [18]:
# Build the Hausa training file. NOTE: make_hausa_df() reads hausa_mato_81k.tsv from the
# working directory; the recorded run of this cell stopped with FileNotFoundError for it.
# make_dataset_v3(make_yoruba_df(), filename="yor_train_dataset.jsonl")
# make_dataset_v3(make_igbo_df(), filename="igbo_train_dataset.jsonl")
make_dataset_v3(make_hausa_df(), filename="hausa_train_dataset.jsonl")
# format_evaluation_jsonl("eval_dataset.jsonl")

# OR


# Download dataset
# !gdown https://drive.google.com/uc?id=1xJ6EHSyaZeMtosQ7RF_R9OHJssuXl0Eq

!gdown https://drive.google.com/uc?id=1qR1n_kb5mtCfbAPitw3bffRKQtN0H-ZL


!gdown https://drive.google.com/uc?id=10RHg1qWjopgjo0Ns0TZ53zhhAmVuO6u4

Generating train split: 100%|██████████| 359881/359881 [00:19<00:00, 18891.78 examples/s]
Generating validation split: 100%|██████████| 39986/39986 [00:02<00:00, 19208.35 examples/s]


FileNotFoundError: [Errno 2] No such file or directory: 'hausa_mato_81k.tsv'

In [6]:
# Download the training data; in the recorded run it was saved as filtered_yoruba_train_dataset.jsonl.
!gdown https://drive.google.com/uc?id=1M1YTH2jYJ6zL8T_k4Icxe7RkwlBcBB9d

Downloading...
From (original): https://drive.google.com/uc?id=1M1YTH2jYJ6zL8T_k4Icxe7RkwlBcBB9d
From (redirected): https://drive.google.com/uc?id=1M1YTH2jYJ6zL8T_k4Icxe7RkwlBcBB9d&confirm=t&uuid=26d7e3f1-a605-4688-acea-03c9a7cc58d2
To: /home/omotoso.abdulmatin4/filtered_yoruba_train_dataset.jsonl
100%|█████████████████████████████████████████| 307M/307M [00:01<00:00, 187MB/s]


In [None]:
# Copy the three per-language training files from Drive into the working directory.
!cp /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/datasets/combine_wura_all_langs/hausa_train_dataset.jsonl .
!cp /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/datasets/combine_wura_all_langs/igbo_train_dataset.jsonl .
!cp /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/datasets/combine_wura_all_langs/yoruba_train_dataset.jsonl .

In [None]:
# Collect every record of the three per-language training files into one list,
# in file order: Yoruba first, then Hausa, then Igbo.
data = []
for source_file in ("yoruba_train_dataset.jsonl", "hausa_train_dataset.jsonl", "igbo_train_dataset.jsonl"):
    with jsonlines.open(source_file) as reader:
        data.extend(reader)

In [None]:
sizes = [10_000, 50_000, 100_000, 300_000]

# Write the first `size` records of `data` to "<size>_train_dataset.jsonl".
# NOTE(review): `data` was built by appending the Yoruba, Hausa and Igbo files one after
# another, so the smaller subsets presumably contain only the leading language(s) --
# confirm whether a shuffle across languages was intended before slicing.
for size in sizes:
    with jsonlines.open(f"{size}_train_dataset.jsonl", "w") as writer:
        writer.write_all(data[:size])

# Drive links for previously uploaded subsets, kept for reference. They are commented
# out because a bare `gdown ...` line is not valid Python and breaks the whole cell;
# prefix a line with `!` to actually download it.
# 10k
# gdown https://drive.google.com/uc?id=1qR1n_kb5mtCfbAPitw3bffRKQtN0H-ZL
# 50k
# gdown https://drive.google.com/uc?id=1-2UiPWc6Z0Qn0coN1yYIgOSciNFcEncB
# 100k
# gdown https://drive.google.com/uc?id=1-4WNTv69iQR528_lS7iKQxo24jkBnECr

In [None]:
# Copy the four size-limited training subsets to the combine_wura_all_langs folder on Drive.
!cp 10000_train_dataset.jsonl /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/datasets/combine_wura_all_langs/10000_train_dataset.jsonl
!cp 50000_train_dataset.jsonl /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/datasets/combine_wura_all_langs/50000_train_dataset.jsonl
!cp 100000_train_dataset.jsonl /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/datasets/combine_wura_all_langs/100000_train_dataset.jsonl
!cp 300000_train_dataset.jsonl /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/datasets/combine_wura_all_langs/300000_train_dataset.jsonl

In [None]:
!ls /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/combine_wura_all_langs/

In [None]:
# Number of records currently held in `data`.
len(data)

In [None]:
# !cp /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/datasets/original_datasets/hausa_mato_81k.tsv .
# !cp /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/datasets/original_datasets/igbo_mato_3k.tsv .
# !cp /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/datasets/original_datasets/alaroye_mato_10k.tsv .
# !cp /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/datasets/original_datasets/von_mato_6k.tsv .

In [None]:
# Only the Hausa eval file is copied into the working directory; the other transfers are disabled.
# !cp train_dataset.jsonl /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/datasets
# !cp eval_dataset.jsonl /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/datasets
# !cp train_dataset.jsonl /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/datasets/igbo_comwura/
!cp /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/datasets/hausa/eval_dataset.jsonl .
# !cp /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/datasets/hausa_igbo_comwura/train_dataset.jsonl .
# !cp /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/datasets/combined_wura/eval_dataset.jsonl .
# !cp /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/datasets/test_dataset.jsonl .

In [None]:
# Convert eval_dataset.jsonl; the result is written as eval_dataset_formatted.json next to it.
format_evaluation_jsonl("eval_dataset.jsonl")

In [7]:
# Fetch ds_stage0.json from the FlagEmbedding finetune examples; the training command passes it to --deepspeed.
!wget https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/refs/heads/master/examples/finetune/ds_stage0.json

--2025-04-18 16:53:03--  https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/refs/heads/master/examples/finetune/ds_stage0.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 963 [text/plain]
Saving to: ‘ds_stage0.json’


2025-04-18 16:53:03 (105 MB/s) - ‘ds_stage0.json’ saved [963/963]



In [8]:
torchrun --standalone --nproc_per_node 1 \
-m FlagEmbedding.finetune.embedder.encoder_only.m3 \
--model_name_or_path BAAI/bge-m3 \
--output_dir ./bge-m3 \
--cache_dir ./cache/model \
--cache_path ./cache/data \
--train_data ./filtered_yoruba_train_dataset.jsonl_train_dataset.jsonl \
--trust_remote_code True \
--train_group_size 2 \
--query_max_len 512 \
--passage_max_len 2048 \
--overwrite_output_dir \
--learning_rate 1e-5 \
--fp16 \
--dataloader_num_workers 12 \
--gradient_checkpointing \
--deepspeed ds_stage0.json \
--num_train_epochs 3 \
--per_device_train_batch_size 160 \
--dataloader_drop_last False \
--warmup_ratio 0.1 \
--report_to none \
--logging_steps 100 \
--save_steps 500 \
--temperature 0.01 \
--sentence_pooling_method cls \
--normalize_embeddings True \
--knowledge_distillation False \
--kd_loss_type m3_kd_loss \
--unified_finetuning False \
--use_self_distill False \
--fix_encoder False

SyntaxError: invalid syntax (1272319703.py, line 1)

In [3]:
# # Train a model, terminal command
import re

# Path of the training file passed to --train_data.
TRAIN_DATA = "/home/omotoso.abdulmatin4/filtered_yoruba_train_dataset.jsonl"

# One entry per flag (with its value, if any). Joining with single spaces yields the
# one-line shell command. The earlier triple-quoted string relied on backslash-newline
# pairs, which Python removes inside the literal itself, so the follow-up re.sub never
# matched anything and the command kept a leading newline.
train_args = [
    "torchrun --standalone --nproc_per_node 4",
    "-m FlagEmbedding.finetune.embedder.encoder_only.m3",
    "--model_name_or_path BAAI/bge-m3",
    "--output_dir ./bge-m3",
    "--cache_dir ./cache/model",
    "--cache_path ./cache/data",
    f"--train_data {TRAIN_DATA}",
    "--trust_remote_code True",
    "--train_group_size 2",
    "--query_max_len 512",
    "--passage_max_len 2048",
    "--overwrite_output_dir",
    "--learning_rate 1e-5",
    "--fp16",
    "--dataloader_num_workers 12",
    "--gradient_checkpointing",
    "--deepspeed ds_stage0.json",
    "--num_train_epochs 3",
    "--per_device_train_batch_size 8",
    "--dataloader_drop_last False",
    "--warmup_ratio 0.1",
    "--report_to none",
    "--logging_steps 100",
    "--save_steps 500",
    "--temperature 0.01",
    "--sentence_pooling_method cls",
    "--normalize_embeddings True",
    "--knowledge_distillation False",
    "--kd_loss_type m3_kd_loss",
    "--unified_finetuning False",
    "--use_self_distill False",
    "--fix_encoder False",
]
command = " ".join(train_args)

print(command)

# OR

# Download existing model weights
# !gdown https://drive.google.com/uc?id=1hC2nReprpHpCNWq9yergzGJLSHz_VKia
# !tar -xzvf bge-m3-5-epochs-unified.tar.gz

#gdown https://drive.google.com/uc?id=1-2UiPWc6Z0Qn0coN1yYIgOSciNFcEncB
#gdown https://drive.google.com/uc?id=1-4WNTv69iQR528_lS7iKQxo24jkBnECr


torchrun --standalone --nproc_per_node 4 -m FlagEmbedding.finetune.embedder.encoder_only.m3 --model_name_or_path BAAI/bge-m3 --output_dir ./bge-m3 --cache_dir ./cache/model --cache_path ./cache/data --train_data /home/omotoso.abdulmatin4/filtered_yoruba_train_dataset.jsonl --trust_remote_code True --train_group_size 2 --query_max_len 512 --passage_max_len 2048 --overwrite_output_dir --learning_rate 1e-5 --fp16 --dataloader_num_workers 12 --gradient_checkpointing --deepspeed ds_stage0.json --num_train_epochs 3 --per_device_train_batch_size 8 --dataloader_drop_last False --warmup_ratio 0.1 --report_to none --logging_steps 100 --save_steps 500 --temperature 0.01 --sentence_pooling_method cls --normalize_embeddings True --knowledge_distillation False --kd_loss_type m3_kd_loss --unified_finetuning False --use_self_distill False --fix_encoder False


In [None]:
# Run the shell command held in the Python variable `command`.
!{command}



In [1]:
# Print the installed torch and torchvision versions.
import torch
import torchvision
print(torch.__version__)
print(torchvision.__version__)


  from .autonotebook import tqdm as notebook_tqdm


2.1.0+cu118
0.16.0+cu118


In [None]:
# Run the training command, then archive and upload two sets of weights.
!{command}

# Archive of ./bge-m3/checkpoint-12500 (entries matching 'global_*' are left out), copied to Drive.
model_id = "bge-m3-hausaigbocomwura-2_95-epochs-e5-lr-0_1-warmup-32-batchsize-0_01-temperature-2-groupsize"
!tar --exclude='global_*' -czvf {model_id}.tar.gz ./bge-m3/checkpoint-12500
!cp {model_id}.tar.gz /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/experiments/model_weights/

# Archive of ./bge-m3 without its checkpoint-* directories, copied to Drive.
model_id = "bge-m3-hausaigbocomwura-3-epochs-e5-lr-0_1-warmup-32-batchsize-0_01-temperature-2-groupsize"
!tar --exclude='./bge-m3/checkpoint-*' -czvf {model_id}.tar.gz ./bge-m3
!cp {model_id}.tar.gz /content/drive/MyDrive/Side\ Projects/NaijEmbeddings/experiments/model_weights/