### Process dataset

In [None]:
import json

# Load original JSON file
with open("data/sampled_arxiv.json", "r", encoding="utf-8") as f:
    raw_data = json.load(f)

# Prepare clean samples
cleaned_data = []
for entry in raw_data:
    title = entry.get("title", "").replace("\n ", "").strip()
    abstract = entry.get("abstract", "").replace("\n", " ").strip()
    
    if title and abstract:
        cleaned_data.append({
            "instruction": "Write an academic paragraph given the title.",
            "input": title,
            "output": abstract
        })

# Save to train.jsonl
with open("data/mistral_train.jsonl", "w", encoding="utf-8") as f:
    for item in cleaned_data:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

### Prepare dataset

In [2]:
from transformers import AutoTokenizer
from datasets import load_dataset

# === Load tokenizer ===
model_name = "mistralai/Mistral-7B-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

# === Load and tokenize dataset ===
dataset = load_dataset("json", data_files="../data/mistral_train.jsonl", split="train")

def format_prompt(example):
    return f"""### Instruction:
{example['instruction']}

### Input:
{example['input']}

### Response:
{example['output']}"""

def tokenize(example):
    prompt = format_prompt(example)
    tokenized = tokenizer(prompt, truncation=True, padding="max_length", max_length=512)
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

dataset = dataset.map(tokenize, batched=False)
dataset.save_to_disk("../data/tokenized_mistral")


Loading checkpoint shards: 100%|██████████| 2/2 [00:07<00:00,  3.61s/it]
Generating train split: 500000 examples [00:00, 1045992.92 examples/s]
Map: 100%|██████████| 500000/500000 [04:00<00:00, 2080.40 examples/s]
Saving the dataset (8/8 shards): 100%|██████████| 500000/500000 [00:01<00:00, 272625.18 examples/s]


### Training

In [1]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq, BitsAndBytesConfig, AutoTokenizer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_from_disk
import torch

model_name = "mistralai/Mistral-7B-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

# === QLoRA: Quantized loading config ===
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# === Load base model in 4-bit ===
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
    trust_remote_code=True
)

# === Prepare model for k-bit training ===
model = prepare_model_for_kbit_training(model)

# === LoRA config ===
lora_config = LoraConfig(
    r=64,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)

dataset = load_from_disk("../data/tokenized_mistral")
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# === Training config ===
training_args = TrainingArguments(
    output_dir="output",
    per_device_train_batch_size=8,
    num_train_epochs=1,
    learning_rate=2e-4,
    logging_dir="logs",
    logging_steps=20,
    save_strategy="steps",
    save_steps=600,
    fp16=True,
    bf16=False,
    save_total_limit=2,
    report_to="none"
)

# === Train! ===
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

trainer.train(resume_from_checkpoint="output/checkpoint-49800")


  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:07<00:00,  3.65s/it]
  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
49820,0.9342
49840,0.8644
49860,0.8586
49880,0.8298
49900,0.8584
49920,0.8122
49940,0.8838
49960,0.9139
49980,0.8761
50000,0.8705


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


TrainOutput(global_step=62500, training_loss=0.17853864431762695, metrics={'train_runtime': 44051.5995, 'train_samples_per_second': 11.35, 'train_steps_per_second': 1.419, 'total_flos': 1.1005725769728e+19, 'train_loss': 0.17853864431762695, 'epoch': 1.0})