In [14]:
# Set your paths and options
PARAPHRASED_PATH = "finetune_train.jsonl"   # <-- change to your file (jsonl or json)
OUT_JSONL = "paraphrased_10_sample.jsonl"
OUT_JSON  = "paraphrased_10_sample.json"

SAMPLE_SIZE = 10
SEED = 42  # for reproducibility

In [15]:
import json, os, random
from typing import List, Dict, Any

def read_json_maybe_jsonl(path: str) -> List[Dict[str, Any]]:
    """Loads either a JSON array or JSONL file into a list of dicts."""
    if path.lower().endswith(".jsonl"):
        rows = []
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    rows.append(json.loads(line))
                except Exception:
                    # skip malformed lines quietly
                    pass
        return rows
    else:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        if isinstance(data, list):
            return data
        if isinstance(data, dict):
            # If someone saved as {"root":[...]} etc., grab the longest list value
            best = None
            for v in data.values():
                if isinstance(v, list) and (best is None or len(v) > len(best)):
                    best = v
            if best is not None:
                return best
        raise ValueError("Unsupported JSON structure (need JSON array or JSONL).")

def write_jsonl(path: str, rows: List[Dict[str, Any]]) -> None:
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        for r in rows:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")


In [16]:
# Load
data = read_json_maybe_jsonl(PARAPHRASED_PATH)
n = len(data)
print(f"Loaded {n:,} rows from {PARAPHRASED_PATH}")

# Sample (without replacement)
k = min(SAMPLE_SIZE, n)
random.seed(SEED)
sampled = random.sample(data, k) if k < n else data
print(f"Sampled {len(sampled):,} rows")

# Save JSONL
write_jsonl(OUT_JSONL, sampled)
print("Wrote JSONL ->", os.path.abspath(OUT_JSONL))

# Save JSON array (optional)
with open(OUT_JSON, "w", encoding="utf-8") as f:
    json.dump(sampled, f, ensure_ascii=False, indent=2)
print("Wrote JSON  ->", os.path.abspath(OUT_JSON))


Loaded 21,426 rows from finetune_train.jsonl
Sampled 10 rows
Wrote JSONL -> /home/user/paraphrased_10_sample.jsonl
Wrote JSON  -> /home/user/paraphrased_10_sample.json


In [17]:
# Inspect a couple of examples
for i in range(min(3, len(sampled))):
    ex = sampled[i]
    keys = list(ex.keys())
    print(f"[{i}] keys: {keys}")
    print("instruction:", (ex.get("instruction") or "")[:120])
    print("paraphrased_output:", (ex.get("paraphrased_output") or "")[:120])
    print("---")


[0] keys: ['messages']
instruction: 
paraphrased_output: 
---
[1] keys: ['messages']
instruction: 
paraphrased_output: 
---
[2] keys: ['messages']
instruction: 
paraphrased_output: 
---
