In [3]:
import hashlib, json, re, pathlib
from tqdm.auto import tqdm
from datasets import Dataset
import tiktoken

In [4]:
ENC = tiktoken.get_encoding("cl100k_base")
IN_FILE  = pathlib.Path("rag_build/clean/all_sources_norm.txt")
OUT_PATH = pathlib.Path("rag_build/chunks.parquet")

In [5]:
WINDOW  = 512
STEP    = 512 - 128  # overlap

In [6]:
rows, doc_id = [], 0
with IN_FILE.open("r", encoding="utf-8") as fh:
    for line in tqdm(fh, desc="chunking docs"):
        text = line.strip()
        if not text:
            continue

        # simple title heuristic: first 8 words before first '.' or '\n'
        title = " ".join(text.split()[:8])[:120]

        # detect source by prefix inserted earlier ("openstax" vs "wiki")
        source = "openstax" if "openstax" in title.lower() else "wikipedia"

        toks = ENC.encode(text)
        for i in range(0, len(toks), STEP):
            chunk_tokens = toks[i : i + WINDOW]
            if len(chunk_tokens) < 50:            # skip ultra-short tail
                continue
            chunk_text = ENC.decode(chunk_tokens)
            rows.append(
                {
                    "id": hashlib.sha1(chunk_text.encode()).hexdigest(),
                    "title": title,
                    "text": chunk_text,
                    "source": source,
                }
            )
        doc_id += 1

chunking docs: 0it [00:00, ?it/s]

In [7]:
print(f"Built {len(rows):,} chunks from {doc_id:,} documents")


Built 484,987 chunks from 1,010,924 documents


In [8]:
Dataset.from_list(rows).to_parquet(str(OUT_PATH))
print(f"Saved {OUT_PATH}")

Creating parquet from Arrow format:   0%|          | 0/485 [00:00<?, ?ba/s]

Saved rag_build/chunks.parquet


In [14]:
from datasets import load_dataset, Dataset, DatasetDict
import datasets, os, shutil, pathlib
from huggingface_hub import HfApi

api      = HfApi()
repo_id  = "GingerBled/RAG_corpus_docs"          # change if needed
private  = False   
os.environ["HF_TOKEN"] ='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'

# 1️⃣ create the repo only if it doesn’t exist
if not api.repo_exists(repo_id, repo_type="dataset"):
    api.create_repo(repo_id, repo_type="dataset", private=private, exist_ok=True)
    print("Repo created:", repo_id)
else:
    print("Repo already exists — skipping create.")

# 2️⃣ now push; this no longer triggers ‘create’ every run
ds = Dataset.from_parquet("rag_build/chunks.parquet")
ds.push_to_hub(repo_id, split="train", private=private)
print("✅ pushed dataset to", repo_id)

Repo created: GingerBled/RAG_corpus_docs


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/485 [00:00<?, ?ba/s]

✅ pushed dataset to GingerBled/RAG_corpus_docs
