# Haithm Style Fine-Tuning (Qwen 3B QLoRA)

This notebook fine-tunes the Qwen 2.5 3B Instruct model on Haithm's personal writing style using QLoRA.

## 1. Setup
**Before running:**
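1. Upload your dataset files (`dataset_haithm_style_natural_v2.jsonl`, `dataset_haithm_style_prompts.jsonl`, `dataset_haithm_style_persona_v2.jsonl`, `dataset_haithm_style_cognitive_v2.jsonl`, `dataset_haithm_v3_cognitive_map.jsonl`) to the Colab runtime (drag and drop them into the file sidebar).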
2. Enable GPU Runtime: `Runtime` -> `Change runtime type` -> `T4 GPU` (or better).

In [None]:
# Install/upgrade the training stack inside the notebook runtime (quiet mode).
# NOTE(review): versions are unpinned and `-U` pulls the latest releases. The
# SFTTrainer keyword arguments used in Section 3 (`tokenizer`,
# `dataset_text_field`, `max_seq_length`, `packing`) may not be accepted by
# newer TRL releases - consider pinning known-good versions. TODO confirm.
!pip install -q -U torch transformers peft datasets bitsandbytes trl

## 2. Load Model & Dataset

In [None]:
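# [ANTIGRAVITY] EXPORT USED DATASET (AUDIT CELL)
# Run this AFTER `trainer` has been defined and BEFORE `trainer.train()` is called.
# NOTE: this cell appears before the cell that creates `trainer`, and that cell also
# calls `trainer.train()`. Construct the trainer first (without training), then run this cell.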

import os
import json
import hashlib

# Identifier of this training run; it names the audit output directory and is
# recorded in the manifest.
RUN_ID = "hs-20251213-v2.5"
# Directory that receives the exported datasets and the manifest for this run.
AUDIT_DIR = f"/content/run_audit/{RUN_ID}"
# Create the directory up front; exist_ok=True makes re-running the cell safe.
os.makedirs(AUDIT_DIR, exist_ok=True)

def get_content_hash(text):
    """Return the SHA-256 hex digest of ``str(text)`` encoded as UTF-8."""
    hasher = hashlib.sha256()
    hasher.update(str(text).encode("utf-8"))
    return hasher.hexdigest()

def get_file_hash(filepath):
    """Hash a file on disk and report its size.

    Returns a ``(sha256_hex, byte_count, line_count)`` tuple computed from the
    raw bytes of the file. When ``filepath`` does not exist, the sentinel
    ``("MISSING", 0, 0)`` is returned instead of raising.
    """
    if os.path.exists(filepath):
        with open(filepath, "rb") as handle:
            payload = handle.read()
        digest = hashlib.sha256(payload).hexdigest()
        return digest, len(payload), len(payload.splitlines())
    return "MISSING", 0, 0

def scan_artifacts(text):
    """Return the known artifact markers that occur in ``text``.

    The result is a list of the matching marker strings, in the order the
    markers are declared below; it is empty when none of them is present.
    """
    markers = ("\ue200filecite", "turn0file", "turn1file", "[STATE:")
    return [marker for marker in markers if marker in text]

def export_hf_dataset(hf_ds, filename, max_logged_warnings=5):
    """Write every row of ``hf_ds`` to ``AUDIT_DIR/filename`` as JSON Lines.

    Each exported row receives a stable ``example_id`` (the first 16 hex
    characters of the SHA-256 of its text content) when it does not already
    carry one, and is scanned for known artifact markers.

    Args:
        hf_ds: Iterable of dict rows (e.g. ``trainer.train_dataset``). A falsy
            value (``None`` or an empty dataset) is skipped.
        filename: Name of the output file inside ``AUDIT_DIR``.
        max_logged_warnings: Maximum number of per-row artifact warnings to
            print. Every affected row is still counted in the summary.

    Returns:
        ``None`` when the dataset was skipped, otherwise a dict with the keys
        ``filename``, ``count``, ``sha256``, ``bytes`` and ``artifacts_found``
        describing the exported file.
    """
    if not hf_ds:
        print(f"Skipping {filename} (None)")
        return None

    out_path = os.path.join(AUDIT_DIR, filename)
    count = 0
    artifact_count = 0

    with open(out_path, "w", encoding="utf-8") as f:
        for i, item in enumerate(hf_ds):
            # Pick the text used for the canonical ID: prefer 'text', then
            # 'output', otherwise fall back to the repr of the whole row.
            # Always coerce to str so a None / list value cannot break the
            # substring scan below.
            if "text" in item:
                content_str = str(item["text"])
            elif "output" in item:
                content_str = str(item["output"])
            else:
                content_str = str(item)

            # Export a shallow copy so adding the ID never mutates the row
            # object handed out by the dataset.
            row = dict(item)
            if "example_id" not in row:
                row["example_id"] = get_content_hash(content_str)[:16]

            # Artifact scan. The log throttle is based on the number of
            # warnings emitted so far, not on the row index, so artifacts that
            # first appear late in the dataset are still reported.
            found_arts = scan_artifacts(content_str)
            if found_arts:
                artifact_count += 1
                if artifact_count <= max_logged_warnings:
                    print(f"[Audit Warning] Artifacts {found_arts} found in {filename} row {i}")

            f.write(json.dumps(row, ensure_ascii=False) + "\n")
            count += 1

    # Hash the exported file so the manifest can pin exactly what was written.
    file_hash, bytes_count, _ = get_file_hash(out_path)
    print(f"Exported {filename}: {count} rows, Hash: {file_hash[:8]}..., Artifacts found: {artifact_count}")
    return {"filename": filename, "count": count, "sha256": file_hash, "bytes": bytes_count, "artifacts_found": artifact_count}

print("--- Starting Audit Export ---")

# 1. Source manifest: fingerprint every raw input file. Files that are not
#    present are recorded with the "MISSING" sentinel from get_file_hash.
source_files = [
    "dataset_haithm_style_natural_v2.jsonl",
    "dataset_haithm_style_prompts.jsonl",
    "dataset_haithm_style_persona_v2.jsonl",
    "dataset_haithm_style_cognitive_v2.jsonl",
    "dataset_haithm_v3_cognitive_map.jsonl"
]
source_manifest = []
for source_name in source_files:
    digest, size_bytes, line_total = get_file_hash(source_name)
    source_manifest.append(
        {"filename": source_name, "sha256": digest, "bytes": size_bytes, "lines": line_total}
    )

# 2. Export the datasets the trainer actually holds (eval may be None).
train_meta = export_hf_dataset(trainer.train_dataset, "train_used.jsonl")
eval_meta = export_hf_dataset(trainer.eval_dataset, "val_used.jsonl")

# 3. Final manifest: keep only the exports that produced metadata.
exported_meta = [meta for meta in (train_meta, eval_meta) if meta]
manifest_used = {
    "run_id": RUN_ID,
    "source_files": source_manifest,
    "exported_datasets": exported_meta,
}

manifest_path = os.path.join(AUDIT_DIR, "manifest_used.json")
with open(manifest_path, "w") as f:
    json.dump(manifest_used, f, indent=4)

print(f"Audit Complete. Manifest saved to {AUDIT_DIR}/manifest_used.json")
print(json.dumps(manifest_used, indent=2))

In [None]:
import torch
from datasets import load_dataset, concatenate_datasets
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer

# Config
model_name = "Qwen/Qwen2.5-3B-Instruct"
new_model = "qwen-3b-haithm-style-lora"

# Load Datasets (Weighted Mixing)
print("Loading datasets with V2.5 weights...")


def _load_jsonl(path):
    """Load one local JSON Lines file as a single `train` split."""
    return load_dataset("json", data_files={"train": path}, split="train")


# 1. Load raw files
ds_natural = _load_jsonl("dataset_haithm_style_natural_v2.jsonl")
ds_prompts = _load_jsonl("dataset_haithm_style_prompts.jsonl")
ds_persona = _load_jsonl("dataset_haithm_style_persona_v2.jsonl")
ds_cognitive = _load_jsonl("dataset_haithm_style_cognitive_v2.jsonl")
ds_map = _load_jsonl("dataset_haithm_v3_cognitive_map.jsonl")

# 2. Apply Weights (Concatenation)
# A weight of N means the dataset is repeated N times in the merged set.
# Weights: Natural=1, Prompts=3, Persona=6, Cognitive=6, Map=50
dataset_weights = [
    (ds_natural, 1),
    (ds_prompts, 3),
    (ds_persona, 6),
    (ds_cognitive, 6),
    (ds_map, 50),
]
datasets_to_merge = [ds for ds, weight in dataset_weights for _ in range(weight)]

dataset = concatenate_datasets(datasets_to_merge)
dataset = dataset.shuffle(seed=42)  # Fixed seed: interleave the sources reproducibly

print(f"Merged datasets. Total examples: {len(dataset)}")

# 4-bit Quantization (NF4 weights, fp16 compute, no double quantization)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
)

# Load Base Model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)
model.config.use_cache = False  # KV cache is not needed while training
model.config.pretraining_tp = 1

# Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Only fall back to EOS when the tokenizer ships no pad token of its own.
# Forcing pad == eos makes the language-modeling collator mask EOS out of the
# labels together with the padding, so the model is never trained to stop.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

## 3. Training (QLoRA)

In [None]:
# LoRA Config: rank-64 adapters on every attention and MLP projection.
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
)

# Training Params
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,  # -1: length of the run is governed by num_train_epochs
    warmup_ratio=0.03,
    group_by_length=True,
    # The plain "constant" schedule ignores warmup_ratio entirely;
    # "constant_with_warmup" applies the 3% warmup and then holds the LR.
    lr_scheduler_type="constant_with_warmup",
)

# Trainer: supervised fine-tuning on the raw `output` field of each example.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="output",
    max_seq_length=2048,
    tokenizer=tokenizer,
    args=training_args,
    packing=False,
)

# NOTE: the audit export cell is meant to run after `trainer` exists and
# before this call.
trainer.train()

## 4. Save Adapter

In [None]:
import locale
# Make the preferred-encoding lookup always report UTF-8.
# NOTE(review): presumably a workaround so the `!zip` shell command below does
# not fail under a non-UTF-8 locale in Colab - confirm it is still needed.
locale.getpreferredencoding = lambda: "UTF-8"

# Save the trained model and the tokenizer into the same `new_model`
# directory.
trainer.model.save_pretrained(new_model)
tokenizer.save_pretrained(new_model)

print(f"Model saved to {new_model}")

# Zip for download
# IPython shell magic: `{new_model}` is interpolated from the Python variable.
!zip -r {new_model}.zip {new_model}
print("Download the zip file from the file browser.")