In [None]:
from datasets import load_dataset

# Load the `llm-uncertainty-head/train_akimbio_mistral` dataset by its
# repository id; later cells read its "train" and "eval" splits.
dataset = load_dataset("llm-uncertainty-head/train_akimbio_mistral")



In [None]:
from transformers import AutoTokenizer

# Tokenizer used to turn the stored token ids back into text.
# NOTE(review): presumably this must be the tokenizer of the model that
# produced the dataset's `input_ids` -- confirm against the dataset card.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")

# Sanity check: detokenize the first training example and eyeball the text.
# (The previous hard-coded `input_ids` sample list was never used and has
# been removed; the decode always read from the dataset.)
first_example = dataset["train"][0]
decoded_text = tokenizer.decode(first_example["input_ids"], skip_special_tokens=True)

print(decoded_text)


In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer
from tqdm import tqdm
import json

# Tokenizer used to turn the stored token ids back into text.
tokenizer = AutoTokenizer.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.2"
)

# Dataset splits: "train" is exported as the training file, "eval" as the dev file.
dataset = load_dataset("llm-uncertainty-head/train_akimbio_mistral")
train_data, dev_data = dataset["train"], dataset["eval"]

def decode_input_ids(input_ids):
    """Decode a sequence of token ids to text with the module-level tokenizer.

    Special tokens are dropped from the decoded string.
    """
    text = tokenizer.decode(input_ids, skip_special_tokens=True)
    return text

def format_claims(claims):
    """Serialize claims as one ``<claim text> ||| <id>,<id>,...`` line each.

    Args:
        claims: iterable of mappings. Entries missing either ``claim_text``
            or ``aligned_token_ids`` are skipped.

    Returns:
        The formatted lines joined with newlines ("" when nothing qualifies).
        Claim text is stripped and its newlines are replaced by spaces, so
        every claim occupies exactly one line.
    """
    required_keys = ("claim_text", "aligned_token_ids")
    lines = [
        "{} ||| {}".format(
            entry["claim_text"].strip().replace("\n", " "),
            ",".join(map(str, entry["aligned_token_ids"])),
        )
        for entry in claims
        if all(key in entry for key in required_keys)
    ]
    return "\n".join(lines)

def process_dataset(split_data):
    """Convert every example of a split into an input/target text pair.

    For each example, ``input_ids`` is decoded to ``input_text`` and
    ``claims`` is serialized to ``target_text``. Progress is shown with tqdm.

    Returns:
        A list of ``{"input_text": ..., "target_text": ...}`` dicts, in the
        same order as ``split_data``.
    """
    return [
        {
            "input_text": decode_input_ids(record["input_ids"]),
            "target_text": format_claims(record["claims"]),
        }
        for record in tqdm(split_data, desc="Processing")
    ]

# Process both splits into input/target text pairs.
train_processed = process_dataset(train_data)
dev_processed = process_dataset(dev_data)

# Save as JSON Lines (one object per line). The encoding is pinned to UTF-8
# because ensure_ascii=False writes non-ASCII characters verbatim; relying on
# the platform default encoding can raise UnicodeEncodeError or produce files
# that do not read back correctly.
for path, rows in (
    ("train_claims.jsonl", train_processed),
    ("dev_claims.jsonl", dev_processed),
):
    with open(path, "w", encoding="utf-8") as f:
        for ex in rows:
            f.write(json.dumps(ex, ensure_ascii=False) + "\n")

print("✅ Done. Files saved: train_claims.jsonl, dev_claims.jsonl")


In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Restrict to GPU 0 only

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
import torch
from transformers import BitsAndBytesConfig

# --- Configuration ---
model_name = "Qwen/Qwen3-0.6B"
output_dir = "./qwen3-06b-qlora-claims"
# JSONL files keyed by the split name they are loaded under.
dataset_path = dict(
    train="train_claims.jsonl",
    validation="dev_claims.jsonl",
)

# --- Load dataset ---
dataset = load_dataset("json", data_files=dataset_path)

# --- Load tokenizer ---
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
)
# Pad with the end-of-sequence token.
tokenizer.pad_token = tokenizer.eos_token

# --- Preprocessing ---
def preprocess(example, max_length=1024, tok=None):
    """Build one fixed-length causal-LM training example (prompt + target).

    A causal LM is trained on a single token sequence whose ``labels`` line up
    position-by-position with ``input_ids``. The previous version tokenized
    and padded the prompt and the target separately, so the labels did not
    correspond to the inputs and the model never saw the target text. Here
    the sequence is ``prompt + target + EOS``; the loss is restricted to the
    target part by setting prompt and padding positions to -100.

    Args:
        example: mapping with ``input_text`` (prompt) and ``target_text``.
        max_length: length every returned list is truncated/padded to.
        tok: tokenizer to use; defaults to the module-level ``tokenizer``.

    Returns:
        Dict with ``input_ids``, ``attention_mask`` and ``labels``, each a
        list of exactly ``max_length`` ints.
    """
    tok = tokenizer if tok is None else tok

    # The prompt is tokenized with the tokenizer's defaults, the same way the
    # inference cells tokenize it. The target is a continuation, so it gets no
    # extra special tokens apart from the explicit EOS that teaches the model
    # where to stop.
    prompt_ids = list(tok(example["input_text"])["input_ids"])
    target_ids = list(tok(example["target_text"], add_special_tokens=False)["input_ids"])
    if tok.eos_token_id is not None:
        target_ids.append(tok.eos_token_id)

    # Right-truncate the concatenation. An example whose prompt alone fills
    # max_length ends up with no supervised tokens (all labels -100).
    input_ids = (prompt_ids + target_ids)[:max_length]
    labels = ([-100] * len(prompt_ids) + target_ids)[:max_length]
    attention_mask = [1] * len(input_ids)

    # Mask padding by position rather than by comparing against pad_token_id:
    # the pad token may be the EOS token, and the real EOS must stay supervised.
    pad_id = tok.pad_token_id
    if pad_id is None:
        pad_id = tok.eos_token_id if tok.eos_token_id is not None else 0
    n_pad = max_length - len(input_ids)
    input_ids += [pad_id] * n_pad
    attention_mask += [0] * n_pad
    labels += [-100] * n_pad

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

# Tokenize every split, dropping the original columns of the train split.
source_columns = dataset["train"].column_names
tokenized_datasets = dataset.map(
    preprocess,
    remove_columns=source_columns,
)

# --- Force single GPU usage ---
torch.cuda.set_device(0)
# An empty-string key maps the whole model to device 0.
device_map = {"": 0}

# --- Load model with QLoRA config ---
# 4-bit NF4 quantization with double quantization; compute dtype is bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the quantized base model, then prepare it for k-bit training.
model = prepare_model_for_kbit_training(
    AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map=device_map,
        trust_remote_code=True,
    )
)

# LoRA adapter settings; adapters are attached to the listed projection modules.
lora_target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"]
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=lora_target_modules,
)

model = get_peft_model(model, lora_config)

# --- Training setup ---
# Derive the mixed-precision mode from the dtype the 4-bit layers compute in,
# instead of hard-coding fp16 next to a bfloat16 quantization config: use bf16
# when the config asks for bfloat16 and the GPU supports it, otherwise fall
# back to fp16 (the previous behaviour).
use_bf16 = (
    bnb_config.bnb_4bit_compute_dtype == torch.bfloat16
    and torch.cuda.is_bf16_supported()
)

training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,  # 4 x 2 = 8 examples per optimizer step
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=20,
    learning_rate=2e-4,
    bf16=use_bf16,
    fp16=not use_bf16,
    optim="paged_adamw_8bit",
    remove_unused_columns=False,
    report_to="none",
    save_total_limit=2,
)

# NOTE(review): newer transformers releases rename the `tokenizer` argument to
# `processing_class` -- confirm against the installed version before changing.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer
)

# --- Train ---
trainer.train()

# --- Save final adapter ---
# The inference cells load from this exact directory.
final_dir = output_dir + "/final"
model.save_pretrained(final_dir)
tokenizer.save_pretrained(final_dir)
print("✅ Training complete. Model saved to:", final_dir)


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from tqdm import tqdm

# --- Load model and tokenizer ---
# Directory the training cell saved the final adapter and tokenizer to.
model_dir = "./qwen3-06b-qlora-claims/final"
tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_dir,
    trust_remote_code=True,
    device_map="auto",
)

# --- Load validation set ---
dev_files = {"validation": "dev_claims.jsonl"}
dataset = load_dataset("json", data_files=dev_files)["validation"]

# --- Generate predictions ---
def generate_claims(example, max_new_tokens=256):
    """Greedily generate the claims for one example and return only the new text.

    Args:
        example: mapping with an ``input_text`` prompt.
        max_new_tokens: upper bound on the number of generated tokens.

    Returns:
        The decoded continuation, without the prompt and stripped of
        surrounding whitespace. Previously the prompt was returned as part of
        the prediction, so ``predicted_text`` repeated the whole input.
    """
    inputs = tokenizer(
        example["input_text"],
        return_tensors="pt",
        truncation=True,
        max_length=512,
    ).to(model.device)

    with torch.no_grad():
        output = model.generate(
            **inputs,  # passes the attention mask along with the ids
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
        )

    # The output sequence starts with the prompt tokens. Slicing by the prompt
    # length also works when the prompt was truncated.
    prompt_len = inputs["input_ids"].shape[1]
    decoded = tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True)
    return decoded.strip()

# --- Run on all dev examples ---
predictions = []
for example in tqdm(dataset):
    pred = generate_claims(example)
    predictions.append({
        "input_text": example["input_text"],
        "target_text": example["target_text"],
        "predicted_text": pred
    })

# --- Save results (optional) ---
import json
# UTF-8 is pinned because ensure_ascii=False writes non-ASCII characters
# verbatim; the platform default encoding may not be able to represent them.
with open("dev_predictions.jsonl", "w", encoding="utf-8") as f:
    for item in predictions:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print("✅ Inference complete. Saved to dev_predictions.jsonl")


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the tokenizer and model from the directory the training cell saved to,
# and switch the model to evaluation mode.
model_dir = "./qwen3-06b-qlora-claims/final"
tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", trust_remote_code=True)
model.eval()

# Single hand-picked prompt: an "[INST] question [/INST] reply" string.
# Note that the reply text ends mid-sentence ("... published in 197").
input_text = "[INST] Tell me a bio of Alice Walker. [/INST] Alice Walker (born February 9, 1944) is an American novelist, short story writer, poet, and activist. She is best known for her novel \"The Color Purple\" (1982), which gained critical and commercial success and was adapted into a successful film in 1985. The novel explores the experiences of African American women in the rural South and has become a classic of contemporary literature.\n\nWalker was born in Eatonton, Georgia, and grew up in a family of sharecroppers. She attended Spelman College in Atlanta, where she studied art and writing. After graduating, she worked as a social worker and a teacher before moving to Mississippi to join the Civil Rights Movement. She became involved with the Student Nonviolent Coordinating Committee (SNCC) and the Black Panther Party.\n\nWalker's writing career began in the late 1960s, and she published her first collection of poems, \"Once,\" in 1966. Her first novel, \"The Third Life of Grange Copeland,\" was published in 197"

# Tokenize the prompt (no truncation here) and move the tensors to the
# model's device.
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

# Generate output
with torch.no_grad():
    output = model.generate(
        **inputs,
        max_new_tokens=256,
        do_sample=False,           # greedy decoding for determinism
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

# Decode only the newly generated tokens. The output sequence starts with the
# prompt tokens, so slicing by the prompt length removes the prompt reliably;
# the previous `decoded.startswith(input_text)` check silently left the prompt
# in place whenever the decoded text did not reproduce the input exactly.
prompt_len = inputs["input_ids"].shape[1]
decoded = tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True).strip()

# Print result
print("📥 Input:")
print(input_text)
print("\n📤 Generated Claims:")
print(decoded)


In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from tqdm import tqdm
import json

# Load test set
# NOTE(review): assumes the dataset exposes a "test" split -- confirm.
test_set = load_dataset("llm-uncertainty-head/train_akimbio_mistral")["test"]

# Load fine-tuned model
model_dir = "./qwen3-06b-qlora-claims/final"
tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_dir,
    trust_remote_code=True,
    device_map="auto",
)
model.eval()

# Inference function
def generate_claims(question, reply, max_input_tokens=512, max_new_tokens=256):
    """Greedily generate the claims for one question/reply pair.

    The prompt is ``"[INST] {question} [/INST] {reply}"``, truncated to
    ``max_input_tokens`` tokens.

    Args:
        question: the question text placed between the [INST] markers.
        reply: the reply whose claims should be extracted.
        max_input_tokens: truncation length of the tokenized prompt.
        max_new_tokens: upper bound on the number of generated tokens.

    Returns:
        The decoded continuation only, stripped of surrounding whitespace.
    """
    model_input = f"[INST] {question} [/INST] {reply}"
    inputs = tokenizer(
        model_input,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens,
    ).to(model.device)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Drop the prompt by token count. The previous string check
    # `decoded.startswith(model_input)` failed whenever the prompt had been
    # truncated, leaving the (truncated) prompt inside the prediction.
    prompt_len = inputs["input_ids"].shape[1]
    decoded = tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True)
    return decoded.strip()

# Generate and save predictions
predictions = []
for row in tqdm(test_set):
    record = {"question": row["question"], "reply": row["reply"]}
    record["predicted_claims"] = generate_claims(record["question"], record["reply"])
    predictions.append(record)

# Save to JSONL: one JSON object per line, UTF-8 encoded.
with open("qwen3_test_predictions.jsonl", "w", encoding="utf-8") as f:
    f.writelines(
        json.dumps(record, ensure_ascii=False) + "\n" for record in predictions
    )

print("✅ Inference complete. Saved to qwen3_test_predictions.jsonl")
