In [4]:
# install once:
# pip install datasets tqdm mwparserfromhell

from datasets import load_dataset
import re, mwparserfromhell, textwrap
from pathlib import Path
from tqdm.auto import tqdm

STEM_RE   = re.compile(r"\b(Mathematics|Physics|Computer[_ ]science|Chemistry|Engineering)\b", re.I)
OUT_DIR   = Path("rag_build/txt/wiki24")
OUT_DIR.mkdir(parents=True, exist_ok=True)

stream    = load_dataset("wikipedia", "20220301.en", split="train", streaming=True)
encoder   = str.maketrans({"’": "'", "–": "-", "“": '"', "”": '"'})

kept, bar = 0, tqdm(total=100000, desc="Wikipedia STEM pages")  # hard cap
with open(OUT_DIR / "stem_wiki.txt", "w", encoding="utf-8") as fh:
    for page in stream:
        if not STEM_RE.search(" ".join(page["categories"])):
            continue

        # basic cleanup: strip wiki markup → plain text
        text = mwparserfromhell.parse(page["text"]).strip_code()
        text = " ".join(text.split()).translate(encoder)

        if len(text) < 500:   # skip stubs
            continue

        fh.write(f"{page['title']}\n{text}\n\n")
        kept += 1
        bar.update(1)

        # stop at ~0.45 B tokens ≈ 250k articles avg 1800 tokens
        if kept >= 100000:
            break
bar.close()
print(f"Saved {kept} STEM pages → {OUT_DIR/'stem_wiki.txt'}")


Wikipedia STEM pages:   0%|          | 0/100000 [00:00<?, ?it/s]

Wikipedia STEM:   0%|          | 0/90000 [00:36<?, ?it/s]


KeyError: 'categories'

In [3]:
from datasets import load_dataset
import re, html, pathlib, tqdm

STEM_RE   = re.compile(r"^(mathematics|physics|computer_science|chemistry|engineering|"
                       r"electronics|statistics|astronomy|artificial_intelligence|"
                       r"machine_learning)", re.I)
MAX_ART   = 90_000
OUTFILE   = pathlib.Path("rag_build/txt/wiki/wiki_stem_2022.txt")
OUTFILE.parent.mkdir(parents=True, exist_ok=True)

# use the latest English config that exists on HF
ds = load_dataset("wikipedia", "20220301.en", streaming=True)

def is_stem(ex):
    cats = ex.get("categories", [])
    return any(STEM_RE.match(c) for c in cats)

bar, kept = tqdm.tqdm(total=MAX_ART, desc="Wikipedia STEM"), 0
with OUTFILE.open("w", encoding="utf-8") as fh:
    for ex in ds:
        if not is_stem(ex):
            continue
        text = html.unescape(ex["text"]).replace("\n\n", " ")
        fh.write(f"{ex['title']}\n\n{text}\n\n")
        kept += 1
        bar.update(1)
        if kept >= MAX_ART:
            break
bar.close()
print(f"✅  Saved {kept} articles to {OUTFILE}")


Wikipedia STEM:   0%|          | 0/90000 [00:00<?, ?it/s]

AttributeError: 'str' object has no attribute 'get'

In [12]:
#!/usr/bin/env python
"""
Collect a compact STEM slice from wikimedia/structured-wikipedia
-----------------------------------------------------------------
• Snapshot: en-20240201   (≈6 M English pages, JSON-LD)
• Keep a page if:
    1) any infobox name starts with a STEM string  OR
    2) any top-level section name matches STEM keywords   OR
    3) regex hits in the abstract text
• Stop after 100 k hits  (≈500 MB)
• Write "<title>\n\n<abstract>\n\n<body-paragraphs>\n\n" per page
  to  rag_build/txt/wiki/wiki_stem_struct_2024.txt
-----------------------------------------------------------------
Requires:   pip install datasets tqdm
"""

from datasets import load_dataset
from tqdm.auto import tqdm
import re, html, pathlib, json

# ───────── constants ────────────────────────────────────────────
SNAPSHOT      = "20240916.en"              # dataset config
MAX_PAGES     = 100_000
OUTFILE       = pathlib.Path("rag_build/txt/wiki/wiki_stem_struct_2024.txt")
OUTFILE.parent.mkdir(parents=True, exist_ok=True)

INFOBOX_KEYS  = (
    "infobox algorithm",
    "infobox planet",
    "infobox chemical",
    "infobox particle",
    "infobox scientist",
    "infobox physicist",
    "infobox mathematician",
    "infobox enzyme",
    "infobox computer",        # computer-related
    "infobox theorem",
    "infobox bridge",          # eng. structures
)

SECTION_RE    = re.compile(
    r"\b(mathematics|physics|computer|algorithm|chemistry|engineering|"
    r"calculus|algebra|proof|properties|applications|statistics|astronomy|AI|machinelearning|cryptography)\b", re.I
)

ABSTRACT_RE   = re.compile(
    r"\b(integral|derivative|matrix|quantum|electron|algorithm|n[- ]?p[- ]?complete|tensor|AI|gradient|cryptography)\b",
    re.I,
)

# ───────── helpers ──────────────────────────────────────────────
def has_stem_infobox(infoboxes):
    """
    True if any infobox name starts with a STEM prefix.
    Handles None values safely.
    """
    for box in infoboxes or []:
        if not isinstance(box, dict):
            continue
        name = (box.get("name") or "").lower()
        if name.startswith(INFOBOX_KEYS):
            return True
    return False

def has_stem_section(sections):
    """
    True if any top‑level section name matches the STEM regex.
    Handles None values safely.
    """
    for sec in sections or []:
        if not isinstance(sec, dict):
            continue
        if SECTION_RE.search((sec.get("name") or "")):
            return True
    return False

def extract_paragraphs(sections):
    """
    Recursively collect paragraph strings from the page's section tree.
    Handles None or missing 'has_parts'.
    """
    if not sections:
        return []

    stack, paras = list(sections), []
    while stack:
        node = stack.pop()
        if not isinstance(node, dict):
            continue
        if node.get("type") == "paragraph":
            paras.append(node.get("value", ""))
        # 'has_parts' may be None
        sub = node.get("has_parts") or []
        if isinstance(sub, list):
            stack.extend(sub)
    return paras

# ───────── stream & filter ──────────────────────────────────────
ds = load_dataset(
    "wikimedia/structured-wikipedia",
    SNAPSHOT,
    split="train",
    streaming=True,
)

bar, kept = tqdm(total=MAX_PAGES, desc="Structured-Wiki STEM"), 0
with OUTFILE.open("w", encoding="utf-8") as fh:
    for ex in ds:
        abstract_txt = ex.get("abstract") or ""        # turn None → ""
        if (
            has_stem_infobox(ex.get("infoboxes"))
            or has_stem_section(ex.get("sections"))
            or ABSTRACT_RE.search(abstract_txt)
        ):
            title = ex["name"].replace("_", " ")
            raw_paras = extract_paragraphs(ex.get("sections"))
            body      = "\n".join(p for p in raw_paras if isinstance(p, str) and p.strip())
            text_block = (
                f"{title}\n\n"
                f"{html.unescape(abstract_txt).strip()}\n\n"
                f"{html.unescape(body).strip()}\n\n"
            )
            fh.write(text_block)
            kept += 1
            bar.update(1)
            if kept >= MAX_PAGES:
                break

bar.close()
print(f"✅  Saved {kept} STEM articles → {OUTFILE}")


Structured-Wiki STEM:   0%|          | 0/100000 [00:00<?, ?it/s]

HfHubHTTPError: 429 Client Error: Too Many Requests for url: https://huggingface.co/datasets/wikimedia/structured-wikipedia/resolve/6dd690670ae7b807f31398bf63add1125c3bb25c/20240916.en/enwiki_namespace_0.zip