In [1]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import numpy as np
from tqdm import tqdm
import faiss

In [None]:
# Load the "train" split of the retrieval corpus and of the MCQA question set
# via `datasets.load_dataset`. Later cells read the "text" field of each corpus
# row and the "dataset", "question" and "id" fields of each query row.
corpus = load_dataset("igzi/pile-stem-corpus", split="train")
queries = load_dataset("igzi/MNLP_M2_mcqa_dataset", split="train")

In [2]:
# Pull the raw text out of every corpus record, preserving corpus order.
corpus_chunks = [record["text"] for record in corpus]

In [3]:
# Step 2: Extract texts
# One question list per source benchmark; each record lands in at most one list,
# and records from any other source are skipped.
mmlu_queries, arc_queries, mathqa_queries, scienceqa_queries = [], [], [], []

for query in queries:
    if query["dataset"] == "kz919/mmlu-auxiliary-train-auto-labelled":
        mmlu_queries.append(query["question"])
        continue

    # ARC rows are kept only when their id contains "arc_easy_".
    if query["dataset"] == "allenai/ai2_arc" and "arc_easy_" in query["id"]:
        arc_queries.append(query["question"])
        continue

    if query["dataset"] == "derek-thomas/ScienceQA":
        scienceqa_queries.append(query["question"])
        continue

    if query["dataset"] == "allenai/math_qa":
        mathqa_queries.append(query["question"])

In [4]:
import random

# Seed the global RNG so the shuffles below follow a fixed sequence.
random.seed(42)

# Shuffle every pool in place. The order of the pools is kept as-is because it
# determines how the seeded random stream is consumed.
for query_pool in (mmlu_queries, arc_queries, scienceqa_queries, mathqa_queries):
    random.shuffle(query_pool)

# Sample specified number of queries from each
# (slicing simply yields fewer items when a pool is shorter than its quota).
final_queries = [
    *mmlu_queries[:300],
    *arc_queries[:300],
    *scienceqa_queries[:150],
    *mathqa_queries[:150],
]

print(f"Total combined queries: {len(final_queries)}")

Total combined queries: 900


In [5]:
# Instantiate the "BAAI/bge-small-en-v1.5" SentenceTransformer model.
# NOTE(review): a later cell rebinds `model` to a fresh instance of the same
# checkpoint and moves it to a torch device; no visible cell in between uses
# this instance — confirm whether this load is still needed.
model = SentenceTransformer("BAAI/bge-small-en-v1.5")

In [6]:
def heuristic_filter(chunks, tokenizer, min_tokens=50, max_tokens=400, min_unique_ratio=0.5):
    """Keep only the chunks that pass a token-length and a word-redundancy check.

    Args:
        chunks: Iterable of text strings to screen.
        tokenizer: Callable invoked as
            ``tokenizer(chunk, truncation=False, add_special_tokens=False)``
            whose result exposes the token ids under ``["input_ids"]``.
        min_tokens: Smallest accepted token count (inclusive).
        max_tokens: Largest accepted token count (inclusive).
        min_unique_ratio: Minimum accepted ratio of distinct
            whitespace-separated words to total words; chunks below it are
            treated as too repetitive.

    Returns:
        A list with the accepted chunks, in their original order.
    """
    def is_valid(chunk):
        # Truncation is disabled on purpose: only the full token count matters
        # here, the ids themselves are never fed to a model.
        tokenized = tokenizer(chunk, truncation=False, add_special_tokens=False)
        length = len(tokenized["input_ids"])
        if not (min_tokens <= length <= max_tokens):
            return False

        words = chunk.split()
        if not words:
            # Empty / whitespace-only text has no defined uniqueness ratio.
            # Reject it instead of dividing by zero (reachable when
            # min_tokens <= 0).
            return False

        # Share of distinct words; a low value signals heavy repetition.
        if len(set(words)) / len(words) < min_unique_ratio:
            return False
        return True

    return [chunk for chunk in chunks if is_valid(chunk)]

In [7]:
from transformers import AutoTokenizer

# Load the tokenizer for the same checkpoint name that the SentenceTransformer
# model uses, then keep only the corpus chunks that pass `heuristic_filter`
# with its default thresholds.
# The tokenizer may log a "sequence length is longer than the specified maximum"
# warning here (see this cell's recorded output); `heuristic_filter` only
# measures the token count of each chunk.
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5")
filtered_chunks = heuristic_filter(corpus_chunks, tokenizer)

Token indices sequence length is longer than the specified maximum sequence length for this model (589 > 512). Running this sequence through the model will result in indexing errors


In [8]:
# Set of the retained chunk texts, used for membership tests when the matching
# corpus rows are selected in the next cell.
filtered_chunks_set = set(filtered_chunks)

In [9]:
# Keep every corpus row (in corpus order) whose text is one of the chunks that
# passed the filter.
filtered_corpus = [row for row in corpus if row["text"] in filtered_chunks_set]

In [None]:
from datasets import Dataset, load_dataset

# Step 1: Create a Hugging Face Dataset object
# `filtered_corpus` is the list of corpus rows whose text passed the filter.
filtered_dataset = Dataset.from_list(filtered_corpus)

In [None]:
# Step 2: Save locally
# The target is the relative path "pile-stem-corpus-filtered", so the output
# location depends on the kernel's current working directory.
filtered_dataset.save_to_disk("pile-stem-corpus-filtered")

In [None]:
# Step 3: (Optional) Push to Hugging Face Hub
# Publishes under the repo id "igzi/pile-stem-corpus-filtered", the same id a
# later cell reads back with `load_dataset`.
# NOTE(review): presumably requires Hub write credentials for that repo — confirm.
filtered_dataset.push_to_hub("igzi/pile-stem-corpus-filtered")

In [3]:
import torch

# Prefer CUDA when PyTorch reports it as available; otherwise fall back to the
# CPU so this cell still runs on machines without a usable GPU (a hard-coded
# "cuda" device makes `model.to(device)` fail there).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the embedding model and move it to the selected device.
model = SentenceTransformer("BAAI/bge-small-en-v1.5")
model = model.to(device)

In [4]:
# Load the "train" split of "igzi/pile-stem-corpus-filtered", the repo id that
# the filtering cells above push the filtered corpus to.
filtered_data = load_dataset("igzi/pile-stem-corpus-filtered", split="train")