In [40]:
from datasets import load_dataset, concatenate_datasets, Dataset, Features, Value, Sequence, ClassLabel
import pandas as pd
from collections import defaultdict
import random

In [41]:
# Load the OpenAssistant Conversations dataset
oasst1 = load_dataset("OpenAssistant/oasst1")

# Filter the training set to include only English-language entries
oasst1_eng = oasst1['train'].filter(lambda x: x.get("lang") == "en")

print(f"Number of English training examples in OpenAssistant Conversations dataset: {len(oasst1_eng)}")

Number of English training examples in OpenAssistant Conversations dataset: 39283


In [42]:
# Separate messages into: prompts (prompter role) and assistant replies
prompts_by_id = {}
assistant_replies_by_parent = defaultdict(list)

for ex in oasst1_eng:
    if ex["deleted"] or not ex["parent_id"]:
        continue
    if ex["role"] == "prompter":
        prompts_by_id[ex["message_id"]] = ex["text"]
    elif ex["role"] == "assistant" and isinstance(ex.get("rank"), int):
        assistant_replies_by_parent[ex["parent_id"]].append((ex["rank"], ex))

# Build DPO dataset: choose pairs with rank 0 (preferred) and rank 1 (second-best)
formatted_oasst = []

for parent_id, replies in assistant_replies_by_parent.items():
    if parent_id not in prompts_by_id:
        continue
    sorted_replies = sorted(replies, key=lambda x: x[0])
    if len(sorted_replies) >= 2:
        chosen = sorted_replies[0][1]
        rejected = sorted_replies[1][1]
        formatted_oasst.append({
            "id": f"oasst_{len(formatted_oasst)}",
            "dataset": "oasst",
            "prompt": prompts_by_id[parent_id].strip(),
            "chosen": chosen["text"].strip(),
            "rejected": rejected["text"].strip()
        })   

# Convert formatted_oasst to a HF Dataset
#randomly sample 5000 examples for demonstration
random.seed(42)
random.shuffle(formatted_oasst)
formatted_oasst = formatted_oasst[:5000] 
formatted_oasst_dataset = Dataset.from_list(formatted_oasst) 

In [43]:
formatted_oasst_dataset

Dataset({
    features: ['id', 'dataset', 'prompt', 'chosen', 'rejected'],
    num_rows: 4354
})

In [44]:
# List of selected STEM-related subreddits in Stanford's SHP dataset
subreddits = ["askscience", "askengineers", "askphysics", "askacademia"]

# Load and combine datasets
def load_shp_subset(subreddit):
    print(f"Loading SHP subset: {subreddit}")
    return load_dataset("stanfordnlp/SHP", data_dir=subreddit, split="train")

# 1. Formatter functions
def format_shp(example, idx):
    if example["labels"] == 1:
        chosen, rejected = example["human_ref_A"], example["human_ref_B"]
    else:
        chosen, rejected = example["human_ref_B"], example["human_ref_A"]
    return {
        "id": f"shp_{idx}",
        "dataset": "shp",
        "prompt": example["history"],
        "chosen": chosen.strip(),
        "rejected": rejected.strip()
    }

def format_shp_example(example):
    return {
        "id": example["id"],
        "source": "shp",
        "prompt": example["prompt"],
        "chosen": example["chosen"],
        "rejected": example["rejected"]
    }


# Load each and combine into one dataset
shp_datasets = [load_shp_subset(sub) for sub in subreddits]
shp = concatenate_datasets(shp_datasets)

# Optional: shuffle and preview
shp = shp.shuffle(seed=42)
print(f"✅ Combined SHP dataset size: {len(shp)}")
print(shp[0])

Loading SHP subset: askscience
Loading SHP subset: askengineers
Loading SHP subset: askphysics
Loading SHP subset: askacademia
✅ Combined SHP dataset size: 109226
{'post_id': 'jdu63w', 'domain': 'askacademia_train', 'upvote_ratio': 0.94, 'history': 'First year grad student feeling alone in the pandemic... Does it get better? Am I the only one? I (23F) graduated from undergrad in May and am currently attending grad school at the same institution. I\'m getting my MA in Spanish literature & language and pedagogy, expected to graduate Spring 2022. I also hold a TA position. Everything is operated over zoom this semester. I live in an apartment with my boyfriend that\'s walkable distance from campus.   The first month and a half of grad school has been pretty tough. I am absolutely swamped with work and have many days where I don\'t see the sun, spending the entire day inside. I do allow myself breaks, but at the end of the day, the workload is very overwhelming especially since I am both a

In [45]:
# Step 2: Sample from each: total 15k
shp_sample = shp.select(range(8000))
formatted_shp = shp_sample.map(format_shp, with_indices=True)
formatted_shp_cleaned = formatted_shp.map(format_shp_example, remove_columns=formatted_shp.column_names)

In [46]:
formatted_shp_cleaned

Dataset({
    features: ['id', 'prompt', 'chosen', 'rejected', 'source'],
    num_rows: 8000
})

In [47]:
# Load the dataset
tulu_raw = load_dataset("allenai/llama-3.1-tulu-3-8b-preference-mixture", split="train")

# Define scientific filter
def is_scientific(text):
    if not text:
        return False
    keywords = [
        "physics", "chemistry", "biology", "mathematics", "math", "medicine", "computer science",
        "algorithm", "neuron", "neural", "DNA", "protein", "experiment", "theorem", "hypothesis", "equation",
        "scientific", "research", "data", "model", "AI", "machine learning", "deep learning", "genetics",
        "cells", "quantum", "atomic", "engineering", "statistics", "proof", "calculus"
    ]
    return any(k in text.lower() for k in keywords)

# Filter examples that have both answers and are scientific
tulu_filtered = tulu_raw.filter(
    lambda x: x["chosen"] and x["rejected"] and is_scientific(x["prompt"])
)

# Limit to 8000 examples
tulu_limited = tulu_filtered.select(range(min(len(tulu_filtered), 8000)))

# Format fields and add source/id
tulu_dataset = Dataset.from_list([
    {
        "id": f"tulu_{i}",
        "source": "llama-3.1-tulu-3-8b",
        "prompt": ex["prompt"],
        "chosen": ex["chosen"][0],
        "rejected": ex["rejected"][0]
    }
    for i, ex in enumerate(tulu_limited)
])

# Preview
tulu_dataset

Dataset({
    features: ['id', 'source', 'prompt', 'chosen', 'rejected'],
    num_rows: 8000
})

In [48]:
def flatten_responses(dataset, source_name):
    return Dataset.from_list([
        {
            "id": ex["id"],
            "source": source_name,
            "prompt": ex["prompt"],
            "chosen": ex["chosen"]["content"] if isinstance(ex["chosen"], dict) else ex["chosen"],
            "rejected": ex["rejected"]["content"] if isinstance(ex["rejected"], dict) else ex["rejected"]
        }
        for ex in dataset
    ])

tulu_flat = flatten_responses(tulu_dataset, "tulu")

In [49]:
oasst_flat = flatten_responses(formatted_oasst_dataset, "oasst")

In [50]:
dpo_dataset = concatenate_datasets([tulu_flat, oasst_flat, formatted_shp_cleaned]).shuffle(seed=2025)
# Save locally
dpo_dataset.save_to_disk("dpo_dataset_mnlp")

print(f"✅ Final dataset size: {len(dpo_dataset)}")
print(dpo_dataset[0])

Saving the dataset (0/1 shards):   0%|          | 0/20354 [00:00<?, ? examples/s]

✅ Final dataset size: 20354
{'id': 'tulu_1859', 'source': 'tulu', 'prompt': 'Describe the multiple-step process of implementing electronic invoicing in retail businesses, including the necessary changes to infrastructure, technology, and personnel. Explain the potential benefits of electronic invoicing over traditional paper-based systems, taking into account factors such as improved efficiency, reduced costs, and enhanced data security. Additionally, provide an analysis of the economic and regulatory factors that affect the adoption of electronic invoicing, and discuss the challenges that retailers may face in transitioning to this new system. From Answer with one of the following options: a), b), c), d)', 'chosen': 'Describe the multiple-step process of implementing electronic invoicing in retail businesses, including the necessary changes to infrastructure, technology, and personnel. Explain the potential benefits of electronic invoicing over traditional paper-based systems, taking 

In [51]:
dpo_dataset
dpo_dataset_new = Dataset.from_list(dpo_dataset)

In [None]:
from huggingface_hub import login

TOKEN = "your_huggingface_token_here"
login(token=TOKEN)

In [54]:
for row in dpo_dataset_new:
    assert all(key in row for key in ["prompt", "chosen", "rejected"])
    assert isinstance(row["prompt"], str) and isinstance(row["chosen"], str) and isinstance(row["rejected"], str)

In [56]:
dpo_dataset_new.push_to_hub("GingerBled/MNLP_M2_DPO_dataset")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/21 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/GingerBled/MNLP_M2_DPO_dataset/commit/e8f7db2f6659390f335faa589986b769dfdb5b57', commit_message='Upload dataset', commit_description='', oid='e8f7db2f6659390f335faa589986b769dfdb5b57', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/GingerBled/MNLP_M2_DPO_dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='GingerBled/MNLP_M2_DPO_dataset'), pr_revision=None, pr_num=None)

In [58]:
from huggingface_hub import HfApi

api = HfApi()
api.delete_repo(repo_id="GingerBled/MNLP_M2_dpo_dataset", repo_type="dataset")

In [57]:
import json

with open("MNLP_M2_dpo_dataset.jsonl", "w") as f:
    for row in dpo_dataset_new:
        json.dump(row, f)
        f.write("\n")