In [11]:
# convert_to_style_format.py
# Turns a plain-text article into one [STYLE_SAMPLE] block per sentence.
input_file = "my_article.txt"        # article to split into sentences
output_file = "my_style_raw.txt"     # destination for the formatted samples

with open(input_file, "r", encoding="utf-8") as f:
    text = f.read()

# Naive sentence split: every "." is treated as a sentence boundary.
# Fragments of five characters or fewer (after trimming) are discarded.
sentences = []
for fragment in text.split("."):
    fragment = fragment.strip()
    if len(fragment) > 5:
        sentences.append(fragment)
print(len(sentences))

# Emit each sentence under its own marker, restoring the "." removed by the
# split; print() supplies the final newline, giving a blank line between blocks.
with open(output_file, "w", encoding="utf-8") as f:
    for s in sentences:
        print(f"[STYLE_SAMPLE]\n{s}.\n", file=f)

print(f"✅ Done! {len(sentences)} style samples saved to '{output_file}'.")


75
✅ Done! 75 style samples saved to 'my_style_raw.txt'.


In [18]:
"""
build_style_dataset.py

Expect two input files (both using blocks like):
[STYLE_SAMPLE]
Some text here.

[STYLE_SAMPLE]
Another text here.

One file should contain the "GPT-style" (plain) lines, the other file should
contain your "my-style" lines. They are paired in order (1st -> 1st, 2nd -> 2nd).

Output: style_dataset.jsonl (one JSON object per line with the keys
instruction, input, output).
When 0.0 < val_frac < 0.5, a train/val split (train.jsonl, val.jsonl) is also written.
"""

import json, re, os, random
from pathlib import Path

# ---------- SETTINGS (edit if needed) ----------
# Module-level configuration read by read_style_blocks() and main().
gpt_file = "gimini_style.txt"        # file with the generic (plain) rephrasings; becomes each record's "input"
my_file  = "my_style_raw.txt"         # file with your style samples (targets); becomes each record's "output"
out_file = "style_dataset.jsonl"  # final dataset: every record, shuffled
train_file = "train.jsonl"  # training part of the split (written only when a split is made)
val_file = "val.jsonl"  # validation part of the split (written only when a split is made)
val_frac = 0.10                   # fraction to reserve for validation; a split is made only when 0.0 < val_frac < 0.5 (both bounds exclusive)
min_len_chars = 10                # minimum characters to accept a sample (measured after cleanup)
shuffle_seed = 42  # seed for the random.Random instance that shuffles the records
# ------------------------------------------------

def read_style_blocks(path, min_len=None):
    """Return the cleaned text blocks found under ``[STYLE_SAMPLE]`` markers.

    The file is read as UTF-8 (a leading BOM is dropped) and split on the
    marker, which is matched case-insensitively.  Each block is stripped,
    leading separator punctuation (``:``, dashes, ``.``) is removed, runs of
    whitespace are collapsed to a single space, and a ``.`` is appended when
    the block does not already end in ``.``, ``!`` or ``?``.

    Args:
        path: Path of the text file to read.
        min_len: Minimum length in characters (counted after cleanup) a block
            must have to be kept.  ``None`` (the default) falls back to the
            module-level ``min_len_chars`` setting.

    Returns:
        list[str]: The cleaned samples, in file order.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"{path} not found")
    if min_len is None:
        min_len = min_len_chars
    text = Path(path).read_text(encoding="utf-8").lstrip("\ufeff")
    samples = []
    # split on [STYLE_SAMPLE] (case-insensitive)
    for part in re.split(r'(?i)\[STYLE_SAMPLE\]', text):
        # remove leading punctuation/newlines and surrounding whitespace
        s = re.sub(r'^\s*[:\-–—\.\n]+', '', part.strip()).strip()
        s = re.sub(r'\s+', ' ', s)  # collapse whitespace
        if not s:
            # Nothing but whitespace/separators: skip it, otherwise the
            # punctuation step below would turn it into a bogus "." sample.
            continue
        # ensure sentence ends with punctuation for safety
        if not re.search(r'[.!?]$', s):
            s += '.'
        if len(s) >= min_len:
            samples.append(s)
    return samples

def build_pairs(gpt_samples, my_samples):
    """Pair the two sample lists position by position (1st with 1st, ...).

    A warning is printed when the lists differ in length; the surplus items
    of the longer list are then left unpaired.

    Returns:
        list[tuple]: ``(gpt_sample, my_sample)`` tuples, as many as the
        shorter list has items.
    """
    gpt_count, my_count = len(gpt_samples), len(my_samples)
    if gpt_count != my_count:
        print(f"Warning: counts differ — GPT samples: {gpt_count}, My samples: {my_count}")
    # zip() stops at the shorter input, i.e. it pairs up to the min length.
    return list(zip(gpt_samples, my_samples))

def write_jsonl(records, path):
    """Write ``records`` to ``path`` as JSON Lines (UTF-8, one object per line).

    The file is created or overwritten.  Non-ASCII characters are written
    as-is rather than as ``\\uXXXX`` escapes.
    """
    with open(path, "w", encoding="utf-8") as out:
        out.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in records
        )

def main():
    """Build the paired style-transfer dataset and write it to disk.

    Reads the GPT-style and my-style sample files named by the module-level
    settings, pairs them by position, wraps every pair in an
    instruction/input/output record, shuffles the records with a fixed seed
    and writes:

    * ``out_file`` - all records (always);
    * ``train_file`` / ``val_file`` - a split, only when
      ``0.0 < val_frac < 0.5`` and there are at least two records, so that
      neither part of the split is empty.

    Progress messages and the first three records are printed as a sanity
    check.  Returns ``None``; nothing is written when there are no pairs.
    """
    print("Reading GPT-style samples from:", gpt_file)
    gpt_samples = read_style_blocks(gpt_file)
    print("Reading my-style samples from:", my_file)
    my_samples = read_style_blocks(my_file)
    print("GPT samples found:", len(gpt_samples))
    print("My style samples found:", len(my_samples))

    pairs = build_pairs(gpt_samples, my_samples)
    print("Paired samples:", len(pairs))
    if not pairs:
        print("No pairs to write. Exiting.")
        return

    # Build records. The instruction can be tuned; keep it concise and explicit.
    # (Alternative layout: {"prompt": "Rewrite in my style: ...", "completion": ...})
    records = [
        {
            "instruction": "Rewrite the following text in Goutam's writing style.",
            "input": gpt_text,
            "output": my_text,
        }
        for gpt_text, my_text in pairs
    ]

    # Shuffle deterministically, then split.
    random.Random(shuffle_seed).shuffle(records)

    # Track whether *this run* wrote the split, so the summary below cannot be
    # fooled by train/val files left over from an earlier run.
    wrote_split = False
    if not 0.0 < val_frac < 0.5:
        print(f"No validation split requested (val_frac={val_frac}). Writing full dataset.")
    elif len(records) < 2:
        # A single record would all go to validation and leave train empty.
        print("Only one record available; skipping train/val split. Writing full dataset.")
    else:
        n_val = max(1, int(len(records) * val_frac))
        val = records[:n_val]
        train = records[n_val:]
        print(f"Writing train ({len(train)}) and val ({len(val)}) files.")
        write_jsonl(train, train_file)
        write_jsonl(val, val_file)
        wrote_split = True

    # write main dataset file (shuffled)
    write_jsonl(records, out_file)
    print("Wrote dataset to:", out_file)
    if wrote_split:
        print("Wrote train to:", train_file, "val to:", val_file)

    # show first 3 records as sanity check
    print("\nEXAMPLE records:")
    for r in records[:3]:
        print("---")
        print("IN :", r['input'])
        print("OUT:", r['output'][:120], "..." if len(r['output'])>120 else "")

# Entry-point guard: run the pipeline when this code is executed directly
# (as a script or a notebook cell), but not when it is imported as a module.
if __name__ == "__main__":
    main()


Reading GPT-style samples from: gimini_style.txt
Reading my-style samples from: my_style_raw.txt
GPT samples found: 75
My style samples found: 75
Paired samples: 75
Writing train (68) and val (7) files.
Wrote dataset to: style_dataset.jsonl
Wrote train to: train.jsonl val to: val.jsonl

EXAMPLE records:
---
IN : The burgeoning landscape of AI tools presents an evolving challenge for educators and students alike, with definitive guidelines for their use still taking shape. Despite this fluidity, a broad consensus among professors underscores that reliance on these technologies without cultivating original thought or profound understanding ultimately undermines sustainable success.
OUT: As these tools are still emerging and it is tough to deterministically say what is the correct way to use them, however, ...
---
IN : Alok Takkar's proposal, while acknowledged as an intriguing concept, faces an admitted challenge: students could still circumvent the measure by memorizing LLM-generated ou

In [20]:
import json

# Convert the instruction/input/output JSONL dataset into Phi-3 chat-style text.
# Input and output files
input_file = "style_dataset.jsonl"   # your JSONL input
output_file = "phi3_finetune_format.txt"    # Phi-3 compatible text format

with open(input_file, "r", encoding="utf-8") as f_in, open(output_file, "w", encoding="utf-8") as f_out:
    for line in f_in:
        # Blank lines (trailing newline, hand edits) are not valid JSON and
        # would make json.loads raise; skip them instead of aborting.
        if not line.strip():
            continue
        example = json.loads(line)
        # `or ""` also covers keys that are present but null in the JSON.
        instruction = (example.get("instruction") or "").strip()
        inp = (example.get("input") or "").strip()
        output = (example.get("output") or "").strip()

        # Merge instruction + input as user message
        user_message = instruction
        if inp:
            user_message += "\n" + inp

        # Write in Phi-3 chat style; the extra "\n" leaves a blank line
        # between consecutive examples.
        formatted = (
            f"<|user|>\n{user_message}\n<|end|>\n"
            f"<|assistant|>\n{output}\n<|end|>\n"
        )
        f_out.write(formatted + "\n")
