In [7]:
# --- File locations ---------------------------------------------------------
# Source: the paraphrased dataset, stored either as JSONL or as one JSON array.
PARAPHRASED_PATH = "paraphrased_pruned_owl-final.jsonl"
# Destination: the JSONL file written for OpenAI fine-tuning.
OUT_TRAIN_JSONL = "finetune_train.jsonl"

# --- Training target --------------------------------------------------------
# Row field used as the assistant answer. "paraphrased_output" is the usual
# choice; "original_output" is the alternative.
TARGET_FIELD = "paraphrased_output"

# --- Prompt templates (kept exactly as provided) ----------------------------
# Template for rows that have a non-empty input; expects {instruction} and {input}.
PROMPT_WITH_INPUT = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input}

### Response:"""

# Template for rows without an input; expects only {instruction}.
PROMPT_NO_INPUT = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Response:"""

In [8]:
import json
import os
import warnings
from typing import Any, Dict, List

from datasets import load_dataset

def read_json_maybe_jsonl(path: str) -> List[Dict[str, Any]]:
    """Load a JSONL file or a JSON document into a list of rows.

    The format is chosen from the file extension only: a path ending in
    ".jsonl" (case-insensitive) is parsed line by line; any other path is
    parsed as a single JSON document.

    Args:
        path: Location of the file; it is read as UTF-8.

    Returns:
        For JSONL, one entry per non-blank line that parses as JSON.
        For JSON, the top-level list, or, when the top level is an object
        such as {"root": [...]}, its longest list value (the first one wins
        on ties).

    Raises:
        ValueError: The JSON document is neither a list nor an object that
            holds at least one list value.

    Warns:
        UserWarning: Emitted once per call when malformed JSONL lines were
            skipped, reporting how many and their line numbers, so dropped
            rows are visible instead of silently disappearing.
    """
    if path.lower().endswith(".jsonl"):
        rows = []
        bad_line_numbers = []
        with open(path, "r", encoding="utf-8") as f:
            for line_number, line in enumerate(f, start=1):
                line = line.strip()
                if not line:
                    continue
                try:
                    rows.append(json.loads(line))
                except (ValueError, RecursionError):
                    # json.JSONDecodeError is a ValueError. Loading stays
                    # best-effort: remember the line and keep going.
                    bad_line_numbers.append(line_number)
        if bad_line_numbers:
            # Cap the listed line numbers so a badly broken file does not
            # produce an enormous message.
            shown = ", ".join(str(n) for n in bad_line_numbers[:10])
            more = ", ..." if len(bad_line_numbers) > 10 else ""
            warnings.warn(
                f"Skipped {len(bad_line_numbers):,} malformed JSONL line(s) in {path!r} "
                f"(line numbers: {shown}{more}).",
                stacklevel=2,
            )
        return rows

    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    if isinstance(data, list):
        return data
    if isinstance(data, dict):
        # If someone saved as {"root":[...]} etc., pick the longest list value.
        list_values = [v for v in data.values() if isinstance(v, list)]
        if list_values:
            # max() returns the first maximal element, so ties resolve to the
            # earliest key.
            return max(list_values, key=len)
    raise ValueError("Unsupported JSON structure (need JSON array or JSONL).")

def write_jsonl(path: str, rows: List[Dict[str, Any]]) -> None:
    """Serialize ``rows`` to ``path`` as JSONL, one JSON value per line.

    The parent directory is created when it is missing and an existing file
    is overwritten. Text is written as UTF-8 with non-ASCII characters kept
    as-is instead of being escaped.
    """
    parent_dir = os.path.dirname(path)
    if not parent_dir:
        # A bare filename has no directory component; use the current directory.
        parent_dir = "."
    os.makedirs(parent_dir, exist_ok=True)

    with open(path, "w", encoding="utf-8") as out_file:
        out_file.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in rows
        )


In [9]:
# Load the paraphrased rows from PARAPHRASED_PATH (JSONL or JSON) and report
# how many were read.
paraphrased = read_json_maybe_jsonl(PARAPHRASED_PATH)
print(f"Loaded paraphrased items: {len(paraphrased):,}")

# Load the Alpaca train split. Its rows are indexed by each paraphrased row's
# 'row_index' to look up the matching 'input' field when building prompts.
alpaca = load_dataset("tatsu-lab/alpaca", split="train")
print("Alpaca size:", len(alpaca))


Loaded paraphrased items: 21,426
Alpaca size: 52002


In [10]:
def build_user_prompt(instruction: str, input_text: str) -> str:
    """Render the user-side prompt for one example.

    Both arguments are stripped of surrounding whitespace; ``None`` and other
    falsy values count as empty text. A non-empty input selects
    ``PROMPT_WITH_INPUT``; otherwise ``PROMPT_NO_INPUT`` is used.
    """
    cleaned_instruction = instruction.strip() if instruction else ""
    cleaned_input = input_text.strip() if input_text else ""

    # Guard clause: with no usable input, the shorter template applies.
    if not cleaned_input:
        return PROMPT_NO_INPUT.format(instruction=cleaned_instruction)
    return PROMPT_WITH_INPUT.format(
        instruction=cleaned_instruction,
        input=cleaned_input,
    )

def to_openai_example(ex: Dict[str, Any]) -> Dict[str, Any]:
    """
    Convert one paraphrased row to OpenAI chat FT format:
      {"messages":[{"role":"user","content":<prompt>},
                   {"role":"assistant","content":<target>}]}

    The user prompt is built from the row's 'instruction' plus the Alpaca
    'input' found at the row's 'row_index'. When 'row_index' is missing, not
    an int, or out of range, the input is treated as empty. The assistant
    content is the stripped value of the TARGET_FIELD field.

    Returns an empty dict to mark the row as "skip" when the instruction or
    the target is missing or blank.
    """
    instruction = (ex.get("instruction") or "").strip()
    target = (ex.get(TARGET_FIELD) or "").strip()
    # The rendered prompt always contains the template's fixed text, so it can
    # never be blank; emptiness must be checked on the instruction itself.
    if not instruction or not target:
        return {}  # mark as skip

    ri = ex.get("row_index")
    # Look up the Alpaca 'input' by row_index (skip if out of bounds or missing)
    alp_input = ""
    if isinstance(ri, int) and 0 <= ri < len(alpaca):
        alp_input = alpaca[ri].get("input") or ""

    user_prompt = build_user_prompt(instruction, alp_input)

    return {
        "messages": [
            {"role": "user", "content": user_prompt},
            {"role": "assistant", "content": target},
        ]
    }

# Convert every paraphrased row. Rows rejected by to_openai_example come back
# as an empty dict; they are counted in `skipped` and left out of `formatted`.
formatted = []
skipped = 0
for ex in paraphrased:
    row = to_openai_example(ex)
    if not row:
        skipped += 1
        continue
    formatted.append(row)

print(f"Prepared {len(formatted):,} examples; skipped {skipped:,}.")
# Preview the third example when there is one, otherwise the last available.
# Indexing formatted[2] unconditionally raises IndexError when fewer than
# three examples were prepared.
if formatted:
    print(
        "Sample formatted example:\n",
        json.dumps(formatted[min(2, len(formatted) - 1)], ensure_ascii=False, indent=2)[:700],
    )
else:
    print("No examples were prepared; nothing to preview.")


Prepared 21,426 examples; skipped 0.
Sample formatted example:
 {
  "messages": [
    {
      "role": "user",
      "content": "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nEvaluate this sentence for spelling and grammar mistakes\n\n### Input:\nHe finnished his meal and left the resturant\n\n### Response:"
    },
    {
      "role": "assistant",
      "content": "He completed his food and departed from the eatery."
    }
  ]
}


In [11]:
# Persist the prepared chat examples to OUT_TRAIN_JSONL, one JSON object per line.
write_jsonl(OUT_TRAIN_JSONL, formatted)
# Report the resolved absolute path and the number of examples written.
print(f"Wrote fine-tune JSONL → {os.path.abspath(OUT_TRAIN_JSONL)}  ({len(formatted):,} lines)")


Wrote fine-tune JSONL → /home/user/finetune_train.jsonl  (21,426 lines)
