In [25]:
from datasets import load_dataset, Dataset
import numpy as np
from tqdm import tqdm

In [26]:
# Load the "train" split of the pre-embedded STEM corpus from the Hugging Face Hub.
# Later cells read the "source", "embedding" and "text" fields of each row.
corpus_repo_id = "igzi/pile-stem-corpus-filtered-embedded"
corpus = load_dataset(corpus_repo_id, split="train")

In [27]:
# Upper bound on how many leading rows of `corpus` are kept for clustering.
subset_size = 250000
# Clamp to the corpus length so a corpus shorter than `subset_size` is used in
# full instead of failing on out-of-range indices in `select`.
subset = corpus.select(range(min(subset_size, len(corpus))))

In [28]:
# Partition the subset into three buckets keyed on each row's "source" URL.
# Every bucket keeps two parallel lists: the embedding and the text of each row,
# in the order the rows appear in `subset`.
embeddings_wiki, text_wiki = [], []
embeddings_stackexchange, text_stackexchange = [], []
embeddings_textbook, text_textbook = [], []

# Known source URL -> (embedding list, text list).
buckets_by_source = {
    "https://huggingface.co/datasets/milkshake721/2.1M-wiki-STEM": (
        embeddings_wiki,
        text_wiki,
    ),
    "https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_math_jsonl": (
        embeddings_stackexchange,
        text_stackexchange,
    ),
}
# Any row whose source is neither of the two URLs above is filed under "textbook".
fallback_bucket = (embeddings_textbook, text_textbook)

for row in subset:
    bucket_embeddings, bucket_texts = buckets_by_source.get(row["source"], fallback_bucket)
    bucket_embeddings.append(row["embedding"])
    bucket_texts.append(row["text"])

In [29]:
from sklearn.preprocessing import normalize

# Turn each bucket's list of embeddings into a 2-D array and rescale every row
# (one embedding per row) to unit L2 norm. `norm="l2"` and `axis=1` are the
# defaults of `normalize`, written out here for readability.
embeddings_wiki_norm = normalize(np.asarray(embeddings_wiki), norm="l2", axis=1)
embeddings_stackexchange_norm = normalize(
    np.asarray(embeddings_stackexchange), norm="l2", axis=1
)
embeddings_textbook_norm = normalize(np.asarray(embeddings_textbook), norm="l2", axis=1)

In [30]:
# Report how many rows landed in each source bucket, one bucket per line.
print(
    f"Wiki: {len(embeddings_wiki)}",
    f"Stackexchange: {len(embeddings_stackexchange)}",
    f"Textbook: {len(embeddings_textbook)}",
    sep="\n",
)

Wiki: 190534
Stackexchange: 49001
Textbook: 10465


In [None]:
from sklearn.cluster import KMeans
import numpy as np

# Number of clusters (and therefore merged documents) for the wiki bucket.
num_clusters_wiki = 750
# Fixed seed so the cluster assignment is reproducible across runs.
kmeans = KMeans(n_clusters=num_clusters_wiki, random_state=42, n_init="auto")
# Cluster the unit-length embeddings prepared earlier rather than the raw lists:
# with every row at norm 1, distances depend on direction only, not magnitude.
# Row order matches `text_wiki`, so label i belongs to text_wiki[i].
cluster_labels_wiki = kmeans.fit_predict(embeddings_wiki_norm)

In [None]:
# Number of clusters (and therefore merged documents) for the StackExchange bucket.
num_clusters_stackexchange = 200
# Fixed seed so the cluster assignment is reproducible across runs.
kmeans = KMeans(n_clusters=num_clusters_stackexchange, random_state=42, n_init="auto")
# Cluster the unit-length embeddings prepared earlier rather than the raw lists:
# with every row at norm 1, distances depend on direction only, not magnitude.
# Row order matches `text_stackexchange`, so label i belongs to text_stackexchange[i].
cluster_labels_stackexchange = kmeans.fit_predict(embeddings_stackexchange_norm)

In [None]:
# Number of clusters (and therefore merged documents) for the textbook bucket.
num_clusters_textbook = 50
# Fixed seed so the cluster assignment is reproducible across runs.
kmeans = KMeans(n_clusters=num_clusters_textbook, random_state=42, n_init="auto")
# Cluster the unit-length embeddings prepared earlier rather than the raw lists:
# with every row at norm 1, distances depend on direction only, not magnitude.
# Row order matches `text_textbook`, so label i belongs to text_textbook[i].
cluster_labels_textbook = kmeans.fit_predict(embeddings_textbook_norm)

In [None]:
# Literal string written in front of every chunk when a cluster is merged.
# NOTE(review): this looks like a regex ("#{1,6}") but is inserted verbatim —
# presumably the downstream splitter matches this exact text; confirm.
SEPARATOR = "\n#{1,6} "


def _merge_cluster_texts(texts, cluster_labels, num_clusters):
    """Concatenate the texts of each cluster into a single document.

    Args:
        texts: Sequence of strings, indexed by position.
        cluster_labels: NumPy array holding the cluster id of each entry of
            ``texts``, in the same order.
        num_clusters: Number of cluster ids; ids ``0 .. num_clusters - 1`` are
            processed in ascending order.

    Returns:
        A list with ``num_clusters`` strings. Entry ``k`` contains every text
        labelled ``k`` in its original order, each one preceded by
        ``SEPARATOR``. A cluster without members yields ``SEPARATOR`` alone.
    """
    merged_docs = []
    for cluster_id in range(num_clusters):
        # Positions of all texts assigned to this cluster.
        member_indices = np.where(cluster_labels == cluster_id)[0]
        # Leading separator + join puts the separator before every member text.
        merged_docs.append(
            SEPARATOR + SEPARATOR.join(texts[i] for i in member_indices)
        )
    return merged_docs


# One merged document per cluster, for each of the three source buckets.
merged_wiki_docs = _merge_cluster_texts(
    text_wiki, cluster_labels_wiki, num_clusters_wiki
)
merged_stackexchange_docs = _merge_cluster_texts(
    text_stackexchange, cluster_labels_stackexchange, num_clusters_stackexchange
)
merged_textbook_docs = _merge_cluster_texts(
    text_textbook, cluster_labels_textbook, num_clusters_textbook
)

In [None]:
from datasets import Dataset

# Pair each group of merged documents with the source URL recorded for it.
# Order is wiki, StackExchange, textbook.
docs_by_source = [
    (merged_wiki_docs, "https://huggingface.co/datasets/milkshake721/2.1M-wiki-STEM"),
    (
        merged_stackexchange_docs,
        "https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_math_jsonl",
    ),
    (merged_textbook_docs, "https://huggingface.co/datasets/izumi-lab/open-text-books"),
]

# Flatten into two parallel columns: one text and one source URL per merged doc.
all_merged_docs = []
all_sources = []
for docs, source_url in docs_by_source:
    all_merged_docs.extend(docs)
    all_sources.extend([source_url] * len(docs))

# Build the Hugging Face Dataset with "text" and "source" columns.
final_dataset = Dataset.from_dict({"text": all_merged_docs, "source": all_sources})

# Upload to the Hub. This runs unconditionally every time the cell executes.
final_dataset.push_to_hub("igzi/pile-stem-corpus-small-semantic")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/311 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/igzi/pile-stem-corpus-small-semantic/commit/12c7f41eaacbd7592c85264781a203efc83dc475', commit_message='Upload dataset', commit_description='', oid='12c7f41eaacbd7592c85264781a203efc83dc475', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/igzi/pile-stem-corpus-small-semantic', endpoint='https://huggingface.co', repo_type='dataset', repo_id='igzi/pile-stem-corpus-small-semantic'), pr_revision=None, pr_num=None)