In [47]:
from datasets import load_dataset, Dataset
import numpy as np
from tqdm import tqdm

In [48]:
# Load the "train" split of the source dataset identified by this repo id.
corpus_repo_id = "igzi/pile-stem-corpus-filtered"
corpus = load_dataset(corpus_repo_id, split="train")

In [49]:
# Draw a reproducible random sample of the corpus: shuffle with a fixed seed,
# then keep the first `subset_size` rows of the shuffled order.
subset_size = 250000
# Cap the number of rows at the corpus size; selecting indices past the end
# of the dataset would raise instead of returning a shorter sample.
subset_rows = min(subset_size, len(corpus))
subset = corpus.shuffle(seed=42).select(range(subset_rows))

In [50]:
# Maximum number of chunks that merge_chunks() concatenates into one document.
MAX_CHUNKS_PER_DOC = 500
# String that merge_chunks() puts in front of every chunk (as a prefix for the
# first chunk and as the joiner between the following ones).
# NOTE(review): this looks like a regex for Markdown headings (1-6 '#'), but it
# is inserted into the text verbatim, not matched — confirm that whatever
# consumes the merged documents expects this exact literal.
SEPARATOR = "\n#{1,6} "

In [51]:
from collections import defaultdict

# Bucket the sampled texts by their "source" field, preserving the order in
# which rows are encountered within each bucket.
source_to_chunks = defaultdict(list)
for row in subset:
    bucket = source_to_chunks[row["source"]]
    bucket.append(row["text"])

In [52]:
def merge_chunks(chunks, max_chunks_per_doc=None, separator=None):
    """Yield merged documents built from consecutive groups of chunks.

    The chunks are taken in order, in groups of at most ``max_chunks_per_doc``.
    Each chunk is stripped of leading/trailing whitespace, and every chunk in
    a group (including the first one) is preceded by ``separator``.

    Args:
        chunks: Sequence of strings (must support ``len()`` and slicing).
        max_chunks_per_doc: Maximum number of chunks per merged document.
            Defaults to the module-level ``MAX_CHUNKS_PER_DOC``.
        separator: String inserted verbatim before every chunk. Defaults to
            the module-level ``SEPARATOR``.

    Yields:
        One merged string per group of chunks. Nothing is yielded for an
        empty ``chunks`` sequence.

    Raises:
        ValueError: If ``max_chunks_per_doc`` is not a positive number. As
            this is a generator, the error surfaces on first iteration.
    """
    # Fall back to the notebook-level configuration when not overridden.
    if max_chunks_per_doc is None:
        max_chunks_per_doc = MAX_CHUNKS_PER_DOC
    if separator is None:
        separator = SEPARATOR

    # A zero step would make range() fail with an unrelated message and a
    # negative one would silently produce no documents at all.
    if max_chunks_per_doc < 1:
        raise ValueError(
            f"max_chunks_per_doc must be a positive integer, got {max_chunks_per_doc!r}"
        )

    for start in range(0, len(chunks), max_chunks_per_doc):
        chunk_group = chunks[start:start + max_chunks_per_doc]
        cleaned = [chunk.strip() for chunk in chunk_group]
        # The separator is used both as the prefix of the first chunk and as
        # the joiner, so every chunk starts with the same marker.
        yield separator + separator.join(cleaned)

In [53]:
# Flatten every source's merged documents into one list of records, each
# carrying the source it came from alongside the merged text.
merged_documents = []
sources = []  # never filled in this cell; kept in case later cells reference it
for source, chunks in source_to_chunks.items():
    merged_documents.extend(
        {"source": source, "text": merged_doc}
        for merged_doc in merge_chunks(chunks)
    )

In [54]:
# Turn the list of {"source", "text"} records into a Dataset object.
merged_dataset = Dataset.from_list(merged_documents)

# Optional: keep a copy on local disk.
# NOTE(review): "corpu" in the directory name looks like a typo for "corpus";
# left unchanged in case something else reads from this exact path.
local_save_dir = "pile-stem-corpu-small"
merged_dataset.save_to_disk(local_save_dir)

Saving the dataset (0/1 shards):   0%|          | 0/502 [00:00<?, ? examples/s]

In [55]:
# Publish the merged dataset under this Hub repo id.
merged_repo_id = "igzi/pile-stem-corpus-small"
merged_dataset.push_to_hub(merged_repo_id)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/igzi/pile-stem-corpus-small/commit/7bc216ee046465153fcfb1f3fd446caf0580961d', commit_message='Upload dataset', commit_description='', oid='7bc216ee046465153fcfb1f3fd446caf0580961d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/igzi/pile-stem-corpus-small', endpoint='https://huggingface.co', repo_type='dataset', repo_id='igzi/pile-stem-corpus-small'), pr_revision=None, pr_num=None)

In [10]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from huggingface_hub import HfApi, HfFolder

# Which checkpoint to copy, and the repo it is re-published under.
source_model = "Qwen/Qwen3-0.6B-Base"
target_repo = "igzi/MNLP_M2_rag_model"

# Load the causal-LM first, then the tokenizer from the same checkpoint.
print(f"Loading model: {source_model}")
model = AutoModelForCausalLM.from_pretrained(source_model)
tokenizer = AutoTokenizer.from_pretrained(source_model)

# Push both artifacts to the target repo (model first, then tokenizer).
print(f"Pushing to your Hugging Face repo: {target_repo}")
for artifact in (model, tokenizer):
    artifact.push_to_hub(target_repo)

print("✅ Done.")

Loading model: Qwen/Qwen3-0.6B-Base


KeyboardInterrupt: 

In [None]:
from sentence_transformers import SentenceTransformer
from huggingface_hub import create_repo, upload_folder, HfApi
import shutil
import os

# Which encoder to copy, the repo it is re-published under, and the local
# folder used as a staging area for the upload.
source_model = "BAAI/bge-small-en-v1.5"
target_repo = "igzi/MNLP_M2_document_encoder"
local_dir = "./bge_encoder_temp"

# 1) Load the sentence encoder.
print(f"Loading model: {source_model}")
model = SentenceTransformer(source_model)

# 2) Export it into a fresh staging folder; anything left at that path from
#    an earlier run is removed first so old files are not uploaded with it.
staging_already_exists = os.path.exists(local_dir)
if staging_already_exists:
    shutil.rmtree(local_dir)
model.save(local_dir)

# 3) Make sure the target model repo exists (no error if it already does).
print(f"Creating repo (if not already present): {target_repo}")
create_repo(repo_id=target_repo, exist_ok=True, repo_type="model")

# 4) Upload the staging folder's contents to the repo.
#    The staging folder is not deleted afterwards.
print(f"Uploading to: {target_repo}")
upload_folder(folder_path=local_dir, repo_id=target_repo, repo_type="model")

print("✅ Upload complete.")
