In [1]:
import os, re, random, itertools
from datasets import load_dataset, Dataset, concatenate_datasets
from huggingface_hub import HfApi

In [2]:
HF_TEAM  = "GingerBled"            # ← replace
HF_REPO_ID    = f"{HF_TEAM}/MNLP_M2_RAG_retriever_triples"
SEED          = 42
random.seed(SEED)

In [36]:
def clean(txt: str) -> str:
    """Collapse whitespace & strip."""
    return re.sub(r"\s+", " ", txt).strip()

def build_negatives(records):
    """Add 'neg' by sampling a positive passage from another triple."""
    positives = [r["pos"] for r in records]
    for r in records:
        neg = random.choice(positives)
        while neg == r["pos"]:
            neg = random.choice(positives)
        r["neg"] = neg
    return records

def add_negatives(records):
    pool = [r["pos"] for r in records]
    for r in records:
        neg = random.choice(pool)
        while neg == r["pos"]:
            neg = random.choice(pool)
        r["neg"] = neg
    return records


In [32]:
from bs4 import BeautifulSoup
from tqdm.auto import tqdm


def triples_sciq(k=12_500):
    ds = load_dataset("allenai/sciq", split="train").shuffle(seed=SEED)
    triples = []
    LETTERS = "ABCD"
    for ex in ds:
        choices = [
            ex["correct_answer"],
            ex["distractor1"],
            ex["distractor2"],
            ex["distractor3"],
        ]

        opts_block = "\n".join(
            f"({LETTERS[i]}) {c}" for i, c in enumerate(choices)
        )
        query_text = (
            f"{ex['question'].strip()}\n"
            f"{opts_block}\n"
            "### Answer:"
        )
        
        random.shuffle(choices)
        triples.append(
            {"query": clean(query_text),
             "pos":   clean(ex["support"]),      # evidence paragraph
             "source": "sciq"}
        )
    return triples

def triples_qasc(n=8000):
    ds = load_dataset("allenai/qasc", split="train").shuffle(seed=SEED).select(range(n))
    return [
        {"query": clean(ex["formatted_question"]),
         "pos":   clean(ex["combinedfact"]),
         "source": "qasc"}
        for ex in ds
    ]

def extract_html_span(html_doc, start_byte, end_byte):
    """Return a clean text span from raw HTML by byte offsets."""
    span_html = html_doc[start_byte:end_byte]
    return clean(BeautifulSoup(span_html, "lxml").get_text(" ", strip=True))

from datasets import load_dataset, Dataset
import re, random
from tqdm.auto import tqdm

SEED = 42
random.seed(SEED)
KW = re.compile(
    r"\b(math|physics|chem|biology|algorithm|computer|calculus|integral|electron|force|proof|code|compute)\b",
    re.I,
)
TAG_RE = re.compile(r"<[^>]+>")          # fast HTML stripper

def triples_nq(n=20_000):
    """
    Build ≤ n STEM triples from natural_questions quickly.
    """
    # 1) load fully (fits in RAM) and parallel filter on title + question
    ds = load_dataset("natural_questions", split="train")
    ds = ds.filter(
        lambda x: KW.search(
            f"{x['document']['title']} {x['question']['text']}"
        )
        is not None,
        num_proc=4,
    ).shuffle(seed=SEED)

    triples, bar = [], tqdm(total=n, desc="NQ triples")

    for ex in ds:
        la = ex["annotations"]["long_answer"][0]
        if la["start_byte"] == -1:
            continue
        html_doc = ex["document"]["html"]
        snippet = html_doc[la["start_byte"] : la["end_byte"]]
        passage = TAG_RE.sub(" ", snippet).strip()

        if len(passage) < 30:
            continue

        triples.append(
            {
                "query":  ex["question"]["text"].strip(),
                "pos":    passage,
                "source": "natural_questions",
            }
        )

        bar.update(1)
        if len(triples) >= n:
            break

    bar.update(n - bar.n); bar.close()
    return triples


In [15]:
triples = triples_sciq() + triples_qasc()

In [34]:
len(nq)

1334

In [33]:
print("Collecting triples ...")
nq = triples_nq()
triples = triples + nq


Collecting triples ...


Resolving data files:   0%|          | 0/287 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/287 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/287 [00:00<?, ?files/s]

train-00020-of-00287.parquet:   0%|          | 0.00/189M [00:00<?, ?B/s]

train-00021-of-00287.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

train-00022-of-00287.parquet:   0%|          | 0.00/200M [00:00<?, ?B/s]

train-00023-of-00287.parquet:   0%|          | 0.00/189M [00:00<?, ?B/s]

train-00024-of-00287.parquet:   0%|          | 0.00/193M [00:00<?, ?B/s]

train-00025-of-00287.parquet:   0%|          | 0.00/199M [00:00<?, ?B/s]

train-00026-of-00287.parquet:   0%|          | 0.00/197M [00:00<?, ?B/s]

train-00027-of-00287.parquet:   0%|          | 0.00/193M [00:00<?, ?B/s]

train-00028-of-00287.parquet:   0%|          | 0.00/189M [00:00<?, ?B/s]

train-00029-of-00287.parquet:   0%|          | 0.00/190M [00:00<?, ?B/s]

train-00030-of-00287.parquet:   0%|          | 0.00/189M [00:00<?, ?B/s]

train-00031-of-00287.parquet:   0%|          | 0.00/192M [00:00<?, ?B/s]

train-00032-of-00287.parquet:   0%|          | 0.00/192M [00:00<?, ?B/s]

train-00033-of-00287.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

train-00034-of-00287.parquet:   0%|          | 0.00/193M [00:00<?, ?B/s]

train-00035-of-00287.parquet:   0%|          | 0.00/189M [00:00<?, ?B/s]

train-00036-of-00287.parquet:   0%|          | 0.00/190M [00:00<?, ?B/s]

train-00037-of-00287.parquet:   0%|          | 0.00/192M [00:00<?, ?B/s]

train-00038-of-00287.parquet:   0%|          | 0.00/188M [00:00<?, ?B/s]

train-00039-of-00287.parquet:   0%|          | 0.00/191M [00:00<?, ?B/s]

train-00040-of-00287.parquet:   0%|          | 0.00/186M [00:00<?, ?B/s]

train-00041-of-00287.parquet:   0%|          | 0.00/191M [00:00<?, ?B/s]

train-00042-of-00287.parquet:   0%|          | 0.00/180M [00:00<?, ?B/s]

train-00043-of-00287.parquet:   0%|          | 0.00/192M [00:00<?, ?B/s]

train-00044-of-00287.parquet:   0%|          | 0.00/191M [00:00<?, ?B/s]

train-00045-of-00287.parquet:   0%|          | 0.00/182M [00:00<?, ?B/s]

train-00046-of-00287.parquet:   0%|          | 0.00/193M [00:00<?, ?B/s]

train-00047-of-00287.parquet:   0%|          | 0.00/190M [00:00<?, ?B/s]

train-00048-of-00287.parquet:   0%|          | 0.00/189M [00:00<?, ?B/s]

train-00049-of-00287.parquet:   0%|          | 0.00/202M [00:00<?, ?B/s]

train-00050-of-00287.parquet:   0%|          | 0.00/191M [00:00<?, ?B/s]

train-00051-of-00287.parquet:   0%|          | 0.00/201M [00:00<?, ?B/s]

train-00052-of-00287.parquet:   0%|          | 0.00/194M [00:00<?, ?B/s]

train-00053-of-00287.parquet:   0%|          | 0.00/198M [00:00<?, ?B/s]

train-00054-of-00287.parquet:   0%|          | 0.00/188M [00:00<?, ?B/s]

train-00055-of-00287.parquet:   0%|          | 0.00/185M [00:00<?, ?B/s]

train-00056-of-00287.parquet:   0%|          | 0.00/199M [00:00<?, ?B/s]

train-00057-of-00287.parquet:   0%|          | 0.00/193M [00:00<?, ?B/s]

train-00058-of-00287.parquet:   0%|          | 0.00/194M [00:00<?, ?B/s]

train-00059-of-00287.parquet:   0%|          | 0.00/198M [00:00<?, ?B/s]

train-00060-of-00287.parquet:   0%|          | 0.00/191M [00:00<?, ?B/s]

train-00061-of-00287.parquet:   0%|          | 0.00/187M [00:00<?, ?B/s]

train-00062-of-00287.parquet:   0%|          | 0.00/193M [00:00<?, ?B/s]

train-00063-of-00287.parquet:   0%|          | 0.00/191M [00:00<?, ?B/s]

train-00064-of-00287.parquet:   0%|          | 0.00/190M [00:00<?, ?B/s]

train-00065-of-00287.parquet:   0%|          | 0.00/196M [00:00<?, ?B/s]

train-00066-of-00287.parquet:   0%|          | 0.00/186M [00:00<?, ?B/s]

train-00067-of-00287.parquet:   0%|          | 0.00/199M [00:00<?, ?B/s]

train-00068-of-00287.parquet:   0%|          | 0.00/196M [00:00<?, ?B/s]

train-00069-of-00287.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

train-00070-of-00287.parquet:   0%|          | 0.00/194M [00:00<?, ?B/s]

train-00071-of-00287.parquet:   0%|          | 0.00/188M [00:00<?, ?B/s]

train-00072-of-00287.parquet:   0%|          | 0.00/194M [00:00<?, ?B/s]

train-00073-of-00287.parquet:   0%|          | 0.00/193M [00:00<?, ?B/s]

train-00074-of-00287.parquet:   0%|          | 0.00/200M [00:00<?, ?B/s]

train-00075-of-00287.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

train-00076-of-00287.parquet:   0%|          | 0.00/192M [00:00<?, ?B/s]

train-00077-of-00287.parquet:   0%|          | 0.00/198M [00:00<?, ?B/s]

train-00078-of-00287.parquet:   0%|          | 0.00/202M [00:00<?, ?B/s]

train-00079-of-00287.parquet:   0%|          | 0.00/194M [00:00<?, ?B/s]

train-00080-of-00287.parquet:   0%|          | 0.00/183M [00:00<?, ?B/s]

train-00081-of-00287.parquet:   0%|          | 0.00/196M [00:00<?, ?B/s]

train-00082-of-00287.parquet:   0%|          | 0.00/200M [00:00<?, ?B/s]

train-00083-of-00287.parquet:   0%|          | 0.00/193M [00:00<?, ?B/s]

train-00084-of-00287.parquet:   0%|          | 0.00/194M [00:00<?, ?B/s]

train-00085-of-00287.parquet:   0%|          | 0.00/194M [00:00<?, ?B/s]

train-00086-of-00287.parquet:   0%|          | 0.00/196M [00:00<?, ?B/s]

train-00087-of-00287.parquet:   0%|          | 0.00/189M [00:00<?, ?B/s]

train-00088-of-00287.parquet:   0%|          | 0.00/204M [00:00<?, ?B/s]

train-00089-of-00287.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

train-00090-of-00287.parquet:   0%|          | 0.00/197M [00:00<?, ?B/s]

train-00091-of-00287.parquet:   0%|          | 0.00/189M [00:00<?, ?B/s]

train-00092-of-00287.parquet:   0%|          | 0.00/193M [00:00<?, ?B/s]

train-00093-of-00287.parquet:   0%|          | 0.00/187M [00:00<?, ?B/s]

train-00094-of-00287.parquet:   0%|          | 0.00/188M [00:00<?, ?B/s]

train-00095-of-00287.parquet:   0%|          | 0.00/202M [00:00<?, ?B/s]

train-00096-of-00287.parquet:   0%|          | 0.00/187M [00:00<?, ?B/s]

train-00097-of-00287.parquet:   0%|          | 0.00/190M [00:00<?, ?B/s]

train-00098-of-00287.parquet:   0%|          | 0.00/187M [00:00<?, ?B/s]

train-00099-of-00287.parquet:   0%|          | 0.00/197M [00:00<?, ?B/s]

train-00100-of-00287.parquet:   0%|          | 0.00/189M [00:00<?, ?B/s]

train-00101-of-00287.parquet:   0%|          | 0.00/198M [00:00<?, ?B/s]

train-00102-of-00287.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

train-00103-of-00287.parquet:   0%|          | 0.00/194M [00:00<?, ?B/s]

train-00104-of-00287.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

train-00105-of-00287.parquet:   0%|          | 0.00/194M [00:00<?, ?B/s]

train-00106-of-00287.parquet:   0%|          | 0.00/190M [00:00<?, ?B/s]

train-00107-of-00287.parquet:   0%|          | 0.00/196M [00:00<?, ?B/s]

train-00108-of-00287.parquet:   0%|          | 0.00/188M [00:00<?, ?B/s]

train-00109-of-00287.parquet:   0%|          | 0.00/200M [00:00<?, ?B/s]

train-00110-of-00287.parquet:   0%|          | 0.00/192M [00:00<?, ?B/s]

train-00111-of-00287.parquet:   0%|          | 0.00/193M [00:00<?, ?B/s]

train-00112-of-00287.parquet:   0%|          | 0.00/197M [00:00<?, ?B/s]

train-00113-of-00287.parquet:   0%|          | 0.00/203M [00:00<?, ?B/s]

train-00114-of-00287.parquet:   0%|          | 0.00/193M [00:00<?, ?B/s]

train-00115-of-00287.parquet:   0%|          | 0.00/198M [00:00<?, ?B/s]

train-00116-of-00287.parquet:   0%|          | 0.00/197M [00:00<?, ?B/s]

train-00117-of-00287.parquet:   0%|          | 0.00/190M [00:00<?, ?B/s]

train-00118-of-00287.parquet:   0%|          | 0.00/196M [00:00<?, ?B/s]

train-00119-of-00287.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

train-00120-of-00287.parquet:   0%|          | 0.00/192M [00:00<?, ?B/s]

train-00121-of-00287.parquet:   0%|          | 0.00/197M [00:00<?, ?B/s]

train-00122-of-00287.parquet:   0%|          | 0.00/199M [00:00<?, ?B/s]

train-00123-of-00287.parquet:   0%|          | 0.00/199M [00:00<?, ?B/s]

train-00124-of-00287.parquet:   0%|          | 0.00/196M [00:00<?, ?B/s]

train-00125-of-00287.parquet:   0%|          | 0.00/193M [00:00<?, ?B/s]

train-00126-of-00287.parquet:   0%|          | 0.00/196M [00:00<?, ?B/s]

train-00127-of-00287.parquet:   0%|          | 0.00/194M [00:00<?, ?B/s]

train-00128-of-00287.parquet:   0%|          | 0.00/193M [00:00<?, ?B/s]

train-00129-of-00287.parquet:   0%|          | 0.00/192M [00:00<?, ?B/s]

train-00130-of-00287.parquet:   0%|          | 0.00/185M [00:00<?, ?B/s]

train-00131-of-00287.parquet:   0%|          | 0.00/200M [00:00<?, ?B/s]

train-00132-of-00287.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

train-00133-of-00287.parquet:   0%|          | 0.00/204M [00:00<?, ?B/s]

train-00134-of-00287.parquet:   0%|          | 0.00/197M [00:00<?, ?B/s]

train-00135-of-00287.parquet:   0%|          | 0.00/202M [00:00<?, ?B/s]

train-00136-of-00287.parquet:   0%|          | 0.00/191M [00:00<?, ?B/s]

train-00137-of-00287.parquet:   0%|          | 0.00/196M [00:00<?, ?B/s]

train-00138-of-00287.parquet:   0%|          | 0.00/192M [00:00<?, ?B/s]

train-00139-of-00287.parquet:   0%|          | 0.00/193M [00:00<?, ?B/s]

train-00140-of-00287.parquet:   0%|          | 0.00/193M [00:00<?, ?B/s]

train-00141-of-00287.parquet:   0%|          | 0.00/190M [00:00<?, ?B/s]

train-00142-of-00287.parquet:   0%|          | 0.00/192M [00:00<?, ?B/s]

train-00143-of-00287.parquet:   0%|          | 0.00/196M [00:00<?, ?B/s]

train-00144-of-00287.parquet:   0%|          | 0.00/192M [00:00<?, ?B/s]

train-00145-of-00287.parquet:   0%|          | 0.00/191M [00:00<?, ?B/s]

train-00146-of-00287.parquet:   0%|          | 0.00/191M [00:00<?, ?B/s]

train-00147-of-00287.parquet:   0%|          | 0.00/193M [00:00<?, ?B/s]

train-00148-of-00287.parquet:   0%|          | 0.00/198M [00:00<?, ?B/s]

train-00149-of-00287.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

train-00150-of-00287.parquet:   0%|          | 0.00/198M [00:00<?, ?B/s]

train-00151-of-00287.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

train-00152-of-00287.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

train-00153-of-00287.parquet:   0%|          | 0.00/193M [00:00<?, ?B/s]

train-00154-of-00287.parquet:   0%|          | 0.00/200M [00:00<?, ?B/s]

train-00155-of-00287.parquet:   0%|          | 0.00/186M [00:00<?, ?B/s]

train-00156-of-00287.parquet:   0%|          | 0.00/192M [00:00<?, ?B/s]

train-00157-of-00287.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

train-00158-of-00287.parquet:   0%|          | 0.00/198M [00:00<?, ?B/s]

train-00159-of-00287.parquet:   0%|          | 0.00/191M [00:00<?, ?B/s]

train-00160-of-00287.parquet:   0%|          | 0.00/192M [00:00<?, ?B/s]

train-00161-of-00287.parquet:   0%|          | 0.00/185M [00:00<?, ?B/s]

train-00162-of-00287.parquet:   0%|          | 0.00/197M [00:00<?, ?B/s]

train-00163-of-00287.parquet:   0%|          | 0.00/191M [00:00<?, ?B/s]

train-00164-of-00287.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

train-00165-of-00287.parquet:   0%|          | 0.00/189M [00:00<?, ?B/s]

train-00166-of-00287.parquet:   0%|          | 0.00/192M [00:00<?, ?B/s]

train-00167-of-00287.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

train-00168-of-00287.parquet:   0%|          | 0.00/188M [00:00<?, ?B/s]

train-00169-of-00287.parquet:   0%|          | 0.00/190M [00:00<?, ?B/s]

train-00170-of-00287.parquet:   0%|          | 0.00/193M [00:00<?, ?B/s]

train-00171-of-00287.parquet:   0%|          | 0.00/193M [00:00<?, ?B/s]

train-00172-of-00287.parquet:   0%|          | 0.00/191M [00:00<?, ?B/s]

train-00173-of-00287.parquet:   0%|          | 0.00/192M [00:00<?, ?B/s]

train-00174-of-00287.parquet:   0%|          | 0.00/199M [00:00<?, ?B/s]

train-00175-of-00287.parquet:   0%|          | 0.00/204M [00:00<?, ?B/s]

train-00176-of-00287.parquet:   0%|          | 0.00/197M [00:00<?, ?B/s]

train-00177-of-00287.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

train-00178-of-00287.parquet:   0%|          | 0.00/191M [00:00<?, ?B/s]

train-00179-of-00287.parquet:   0%|          | 0.00/200M [00:00<?, ?B/s]

train-00180-of-00287.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

train-00181-of-00287.parquet:   0%|          | 0.00/188M [00:00<?, ?B/s]

train-00182-of-00287.parquet:   0%|          | 0.00/188M [00:00<?, ?B/s]

train-00183-of-00287.parquet:   0%|          | 0.00/192M [00:00<?, ?B/s]

train-00184-of-00287.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

train-00185-of-00287.parquet:   0%|          | 0.00/201M [00:00<?, ?B/s]

train-00186-of-00287.parquet:   0%|          | 0.00/199M [00:00<?, ?B/s]

train-00187-of-00287.parquet:   0%|          | 0.00/192M [00:00<?, ?B/s]

train-00188-of-00287.parquet:   0%|          | 0.00/188M [00:00<?, ?B/s]

train-00189-of-00287.parquet:   0%|          | 0.00/198M [00:00<?, ?B/s]

train-00190-of-00287.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

train-00191-of-00287.parquet:   0%|          | 0.00/188M [00:00<?, ?B/s]

train-00192-of-00287.parquet:   0%|          | 0.00/194M [00:00<?, ?B/s]

train-00193-of-00287.parquet:   0%|          | 0.00/192M [00:00<?, ?B/s]

train-00194-of-00287.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

train-00195-of-00287.parquet:   0%|          | 0.00/186M [00:00<?, ?B/s]

train-00196-of-00287.parquet:   0%|          | 0.00/197M [00:00<?, ?B/s]

train-00197-of-00287.parquet:   0%|          | 0.00/187M [00:00<?, ?B/s]

train-00198-of-00287.parquet:   0%|          | 0.00/197M [00:00<?, ?B/s]

train-00199-of-00287.parquet:   0%|          | 0.00/188M [00:00<?, ?B/s]

train-00200-of-00287.parquet:   0%|          | 0.00/190M [00:00<?, ?B/s]

train-00201-of-00287.parquet:   0%|          | 0.00/189M [00:00<?, ?B/s]

train-00202-of-00287.parquet:   0%|          | 0.00/192M [00:00<?, ?B/s]

train-00203-of-00287.parquet:   0%|          | 0.00/190M [00:00<?, ?B/s]

train-00204-of-00287.parquet:   0%|          | 0.00/190M [00:00<?, ?B/s]

train-00205-of-00287.parquet:   0%|          | 0.00/190M [00:00<?, ?B/s]

train-00206-of-00287.parquet:   0%|          | 0.00/192M [00:00<?, ?B/s]

train-00207-of-00287.parquet:   0%|          | 0.00/198M [00:00<?, ?B/s]

train-00208-of-00287.parquet:   0%|          | 0.00/193M [00:00<?, ?B/s]

train-00209-of-00287.parquet:   0%|          | 0.00/186M [00:00<?, ?B/s]

train-00210-of-00287.parquet:   0%|          | 0.00/201M [00:00<?, ?B/s]

train-00211-of-00287.parquet:   0%|          | 0.00/209M [00:00<?, ?B/s]

train-00212-of-00287.parquet:   0%|          | 0.00/191M [00:00<?, ?B/s]

train-00213-of-00287.parquet:   0%|          | 0.00/194M [00:00<?, ?B/s]

train-00214-of-00287.parquet:   0%|          | 0.00/198M [00:00<?, ?B/s]

train-00215-of-00287.parquet:   0%|          | 0.00/197M [00:00<?, ?B/s]

train-00216-of-00287.parquet:   0%|          | 0.00/190M [00:00<?, ?B/s]

train-00217-of-00287.parquet:   0%|          | 0.00/185M [00:00<?, ?B/s]

train-00218-of-00287.parquet:   0%|          | 0.00/193M [00:00<?, ?B/s]

train-00219-of-00287.parquet:   0%|          | 0.00/199M [00:00<?, ?B/s]

train-00220-of-00287.parquet:   0%|          | 0.00/190M [00:00<?, ?B/s]

train-00221-of-00287.parquet:   0%|          | 0.00/184M [00:00<?, ?B/s]

train-00222-of-00287.parquet:   0%|          | 0.00/192M [00:00<?, ?B/s]

train-00223-of-00287.parquet:   0%|          | 0.00/194M [00:00<?, ?B/s]

train-00224-of-00287.parquet:   0%|          | 0.00/194M [00:00<?, ?B/s]

train-00225-of-00287.parquet:   0%|          | 0.00/193M [00:00<?, ?B/s]

train-00226-of-00287.parquet:   0%|          | 0.00/189M [00:00<?, ?B/s]

train-00227-of-00287.parquet:   0%|          | 0.00/197M [00:00<?, ?B/s]

train-00228-of-00287.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

train-00229-of-00287.parquet:   0%|          | 0.00/192M [00:00<?, ?B/s]

train-00230-of-00287.parquet:   0%|          | 0.00/196M [00:00<?, ?B/s]

train-00231-of-00287.parquet:   0%|          | 0.00/186M [00:00<?, ?B/s]

train-00232-of-00287.parquet:   0%|          | 0.00/188M [00:00<?, ?B/s]

train-00233-of-00287.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

train-00234-of-00287.parquet:   0%|          | 0.00/194M [00:00<?, ?B/s]

train-00235-of-00287.parquet:   0%|          | 0.00/188M [00:00<?, ?B/s]

train-00236-of-00287.parquet:   0%|          | 0.00/185M [00:00<?, ?B/s]

train-00237-of-00287.parquet:   0%|          | 0.00/196M [00:00<?, ?B/s]

train-00238-of-00287.parquet:   0%|          | 0.00/186M [00:00<?, ?B/s]

train-00239-of-00287.parquet:   0%|          | 0.00/187M [00:00<?, ?B/s]

train-00240-of-00287.parquet:   0%|          | 0.00/192M [00:00<?, ?B/s]

train-00241-of-00287.parquet:   0%|          | 0.00/194M [00:00<?, ?B/s]

train-00242-of-00287.parquet:   0%|          | 0.00/186M [00:00<?, ?B/s]

train-00243-of-00287.parquet:   0%|          | 0.00/186M [00:00<?, ?B/s]

train-00244-of-00287.parquet:   0%|          | 0.00/192M [00:00<?, ?B/s]

train-00245-of-00287.parquet:   0%|          | 0.00/188M [00:00<?, ?B/s]

train-00246-of-00287.parquet:   0%|          | 0.00/199M [00:00<?, ?B/s]

train-00247-of-00287.parquet:   0%|          | 0.00/189M [00:00<?, ?B/s]

train-00248-of-00287.parquet:   0%|          | 0.00/190M [00:00<?, ?B/s]

train-00249-of-00287.parquet:   0%|          | 0.00/202M [00:00<?, ?B/s]

train-00250-of-00287.parquet:   0%|          | 0.00/203M [00:00<?, ?B/s]

train-00251-of-00287.parquet:   0%|          | 0.00/188M [00:00<?, ?B/s]

train-00252-of-00287.parquet:   0%|          | 0.00/198M [00:00<?, ?B/s]

train-00253-of-00287.parquet:   0%|          | 0.00/196M [00:00<?, ?B/s]

train-00254-of-00287.parquet:   0%|          | 0.00/197M [00:00<?, ?B/s]

train-00255-of-00287.parquet:   0%|          | 0.00/191M [00:00<?, ?B/s]

train-00256-of-00287.parquet:   0%|          | 0.00/197M [00:00<?, ?B/s]

train-00257-of-00287.parquet:   0%|          | 0.00/198M [00:00<?, ?B/s]

train-00258-of-00287.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

train-00259-of-00287.parquet:   0%|          | 0.00/190M [00:00<?, ?B/s]

train-00260-of-00287.parquet:   0%|          | 0.00/200M [00:00<?, ?B/s]

train-00261-of-00287.parquet:   0%|          | 0.00/186M [00:00<?, ?B/s]

train-00262-of-00287.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

train-00263-of-00287.parquet:   0%|          | 0.00/190M [00:00<?, ?B/s]

train-00264-of-00287.parquet:   0%|          | 0.00/203M [00:00<?, ?B/s]

train-00265-of-00287.parquet:   0%|          | 0.00/189M [00:00<?, ?B/s]

train-00266-of-00287.parquet:   0%|          | 0.00/198M [00:00<?, ?B/s]

train-00267-of-00287.parquet:   0%|          | 0.00/194M [00:00<?, ?B/s]

train-00268-of-00287.parquet:   0%|          | 0.00/191M [00:00<?, ?B/s]

train-00269-of-00287.parquet:   0%|          | 0.00/199M [00:00<?, ?B/s]

train-00270-of-00287.parquet:   0%|          | 0.00/200M [00:00<?, ?B/s]

train-00271-of-00287.parquet:   0%|          | 0.00/194M [00:00<?, ?B/s]

train-00272-of-00287.parquet:   0%|          | 0.00/187M [00:00<?, ?B/s]

train-00273-of-00287.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

train-00274-of-00287.parquet:   0%|          | 0.00/197M [00:00<?, ?B/s]

train-00275-of-00287.parquet:   0%|          | 0.00/199M [00:00<?, ?B/s]

train-00276-of-00287.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

train-00277-of-00287.parquet:   0%|          | 0.00/190M [00:00<?, ?B/s]

train-00278-of-00287.parquet:   0%|          | 0.00/201M [00:00<?, ?B/s]

train-00279-of-00287.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

train-00280-of-00287.parquet:   0%|          | 0.00/191M [00:00<?, ?B/s]

train-00281-of-00287.parquet:   0%|          | 0.00/182M [00:00<?, ?B/s]

train-00282-of-00287.parquet:   0%|          | 0.00/189M [00:00<?, ?B/s]

train-00283-of-00287.parquet:   0%|          | 0.00/194M [00:00<?, ?B/s]

train-00284-of-00287.parquet:   0%|          | 0.00/194M [00:00<?, ?B/s]

train-00285-of-00287.parquet:   0%|          | 0.00/194M [00:00<?, ?B/s]

train-00286-of-00287.parquet:   0%|          | 0.00/193M [00:00<?, ?B/s]

validation-00000-of-00007.parquet:   0%|          | 0.00/193M [00:00<?, ?B/s]

validation-00001-of-00007.parquet:   0%|          | 0.00/185M [00:00<?, ?B/s]

validation-00002-of-00007.parquet:   0%|          | 0.00/189M [00:00<?, ?B/s]

validation-00003-of-00007.parquet:   0%|          | 0.00/190M [00:00<?, ?B/s]

validation-00004-of-00007.parquet:   0%|          | 0.00/196M [00:00<?, ?B/s]

validation-00005-of-00007.parquet:   0%|          | 0.00/190M [00:00<?, ?B/s]

validation-00006-of-00007.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/307373 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/7830 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/235 [00:00<?, ?it/s]

Filter (num_proc=4):   0%|          | 0/307373 [00:00<?, ? examples/s]

NQ triples:   0%|          | 0/20000 [00:00<?, ?it/s]

NameError: name 'add_negatives' is not defined

In [37]:
print(len(nq))

1334


In [38]:
triples = add_negatives(triples)
random.shuffle(triples)

In [39]:
ds = Dataset.from_list(triples)
print(ds, ds[0])

Dataset({
    features: ['query', 'pos', 'source', 'neg'],
    num_rows: 21013
}) {'query': "Why do bears hibernate? (A) use greater resources (B) one celled organisms (C) they are lazy (D) conserve energy (E) When it's cold (F) hard outer covering (G) hydration (H) more sleep", 'pos': 'bears hibernate to conserve energy', 'source': 'qasc', 'neg': 'Learned behaviors are adaptive because they are flexible. They can change if the environment changes.'}


In [41]:
os.environ["HF_TOKEN"] ='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'

In [43]:
print(f"Pushing dataset to the Hub at {HF_REPO_ID}")
api = HfApi()
if not api.repo_exists(HF_REPO_ID, repo_type="dataset"):
    api.create_repo(HF_REPO_ID, repo_type="dataset", private=False)

ds.push_to_hub(HF_REPO_ID, private=False)
print("✅  Push complete")

Pushing dataset to the Hub at GingerBled/MNLP_M2_RAG_retriever_triples


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/22 [00:00<?, ?ba/s]

✅  Push complete
