In [4]:
from datasets import load_dataset
import json
import random
from itertools import islice

In [5]:

# Load the arXiv configuration in streaming mode so records are read lazily
# instead of downloading the whole corpus up front.
# NOTE(review): trust_remote_code=True runs the dataset repository's loading
# script locally -- keep it only while that repository is trusted.
dataset_stream = load_dataset(
    "armanc/scientific_papers", name="arxiv", trust_remote_code=True, streaming=True
)

# Sampling configuration (tune here rather than editing the calls below)
BUFFER_SIZE = 5000  # leading records pulled from the stream
SAMPLE_SIZE = 2500  # records kept for the fine-tuning set
SEED = 42           # fixed seed so the sampled subset is reproducible

# Materialise the first BUFFER_SIZE records of the train split; the buffer is
# larger than SAMPLE_SIZE so there is something to sample from.
buffered = list(islice(dataset_stream["train"], BUFFER_SIZE))

# Randomly select SAMPLE_SIZE records without replacement. A dedicated
# Random instance keeps the draw deterministic without touching the global
# RNG state; the min() guards against a stream shorter than SAMPLE_SIZE
# (random.sample raises ValueError when asked for more than the population).
_rng = random.Random(SEED)
subset = _rng.sample(buffered, min(SAMPLE_SIZE, len(buffered)))

# Container for the chat-style examples built in the formatting cell
formatted_data = []

In [9]:
import pandas as pd

# Load the sampled records into a DataFrame so they can be inspected as a table
df = pd.DataFrame(subset)
# Last expression of the cell: displays the frame's (rows, columns) tuple
df.shape

(2500, 3)

In [17]:
# Preview the first rows of the sampled records
df.head()

Unnamed: 0,article,abstract,section_names
0,weak decays of charmed and beautiful hadrons a...,@xmath0 decay is studied in the effective the...,introduction\nb @xmath9 decay matrix element\n...
1,light bridges ( lbs ) are bright structures in...,we report the discovery of supersonic downflo...,introduction\nobservations\nresults\nsummary a...
2,the spin-1/2 of a single electron trapped in a...,we study the two - qubit controlled - not gat...,introduction\nmodel\nstrong bias\ngeneral bias...
3,structural multi - factor _ economic capital _...,"a simple , yet reasonably accurate , analytic...",introduction\nbackground\nkiss model\nbenchmar...
4,"for tricritical phenomena , the highest dimens...",monte carlo simulations are used to investiga...,introduction\nbackground\nresults\nconclusions


In [12]:
MAX_ARTICLE_CHARS = 3000  # articles longer than this are truncated
SYSTEM_PROMPT = "You are a helpful research assistant that summarizes scientific papers."


def build_chat_example(article, abstract, max_chars=MAX_ARTICLE_CHARS):
    """Turn one (article, abstract) pair into a chat-style training example.

    Args:
        article: Full paper text used as the user prompt body.
        abstract: Reference summary used as the assistant reply.
        max_chars: Character cut-off for the article; longer texts are cut
            to this length and suffixed with "...".

    Returns:
        A dict with a single "messages" key holding the system, user and
        assistant messages, in that order.
    """
    # Optional trimming of input if it's too long
    if len(article) > max_chars:
        article = article[:max_chars] + "..."

    return {
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {
                "role": "user",
                "content": f"Summarize the following research paper:\n\n{article}",
            },
            {"role": "assistant", "content": abstract.strip()},
        ]
    }


# Rebuild the list from scratch on every run so that re-executing this cell
# never appends a second copy of each example to an existing list.
formatted_data = [
    build_chat_example(example["article"], example["abstract"])
    for example in subset
]


In [14]:
# Sanity check of the chat format; prints the complete prompt text, so the output is long
print(formatted_data[:2])  # Display first 2 formatted examples

[{'messages': [{'role': 'system', 'content': 'You are a helpful research assistant that summarizes scientific papers.'}, {'role': 'user', 'content': "Summarize the following research paper:\n\nweak decays of charmed and beautiful hadrons are quite favorable in particle physics because of their usage in determining fundamental parameters of the standard model and testing various theories and models . among these heavy hadron decays the semileptonic decays @xmath4 and @xmath5 have been observed experimentally .\nthese exclusive decays provide one of the main channels to determine the important ckm matrix element @xmath1 .\nthe difficulty in studying @xmath6 and @xmath5 decays mainly concerns the calculation of the relevant hadronic matrix elements of weak operators , or , equivalently , the corresponding form factors which contain nonperturbative contributions as well as perturbative ones and are beyond the power of pure qcd perturbation theory . up to present these form factors are usua

In [18]:
# Save to JSON for training
with open("scientific_summarization_chat.json", "w", encoding="utf-8") as f:
    for item in formatted_data:
        f.write(json.dumps(item) + "\n")

print("✅ Dataset saved as scientific_summarization_chat.json")


✅ Dataset saved as scientific_summarization_chat.json
