In [None]:
import os

import torch
from datasets import load_dataset
# NOTE(review): prepare_model_for_int8_training is not used in the cells visible
# here; newer PEFT releases appear to expose prepare_model_for_kbit_training
# instead — confirm against the installed PEFT version.
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq
)
from trl import SFTConfig, SFTTrainer

In [None]:
# --- Hub identifiers -------------------------------------------------------
# Auth token is read from the environment; os.getenv yields None when unset.
HF_TOKEN = os.getenv("HF_TOKEN")
BASE_MODEL = "GingerBled/qwen3-0.6B-FullFineTune"
RAG_DS = "GingerBled/MNLP_M2_mcqa_with_context"
OUT_REPO = "GingerBled/MNLP_M2_rag_model"

# --- Local output ----------------------------------------------------------
OUTPUT_DIR = "rag_lora_ft"

# --- Training hyper-parameters ---------------------------------------------
LR = 2e-4
MICRO_BATCH = 4   # per-device batch size, kept small to save memory
GRAD_ACC = 8      # MICRO_BATCH * GRAD_ACC = 32 samples per optimizer step

# --- Sequence budget -------------------------------------------------------
TOP_K = 5
# Approximate length cap: 512 plus 512 for each of the TOP_K entries (= 3072).
MAX_LEN = 512 + TOP_K * 512

In [None]:
# Fetch the "train" split of the dataset named by RAG_DS from the Hub.
ds = load_dataset(path=RAG_DS, split="train")

In [None]:
# Load the causal-LM checkpoint that will be fine-tuned. BASE_MODEL is the id
# defined in the config cell and is the same id the tokenizer is loaded from.
model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        device_map         = "auto",
        torch_dtype        = 'auto',   # or torch.float16
        trust_remote_code  = True)

# Recompute activations during the backward pass instead of storing them,
# trading extra compute for lower memory use.
model.gradient_checkpointing_enable()
# NOTE(review): this only sets an attribute on the already-loaded config.
# transformers documents `attn_implementation="flash_attention_2"` (passed to
# from_pretrained) as the way to select FlashAttention-2 — confirm this line
# has the intended effect.
model.config.use_flash_attention_2 = True

In [None]:
# Tokenizer for the same checkpoint as the model. `token` is the current
# keyword for Hub authentication; `use_auth_token` is deprecated.
tok = AutoTokenizer.from_pretrained(BASE_MODEL, token=HF_TOKEN)
# Order matters: pad_token is taken from the EOS string the checkpoint ships
# with, and only afterwards is EOS replaced by "<|im_end|>".
tok.pad_token = tok.eos_token
tok.eos_token = "<|im_end|>"
tok.padding_side = "left"

In [None]:
# LoRA adapter settings.
# `target_modules` must be given exactly once (a repeated keyword is a
# SyntaxError). The explicit attention-projection list is kept here.
# NOTE(review): the alternative value seen in this cell was "all-linear" —
# confirm which of the two was intended.
peft_cfg = LoraConfig(
        r=16, lora_alpha=32, lora_dropout=0.05,
        bias="lora_only", task_type="CAUSAL_LM",
        target_modules=["q_proj","v_proj","k_proj","o_proj"]
        )

# Supervised fine-tuning arguments. The sequence cap is MAX_LEN from the
# config cell (no MAX_SEQ constant exists in this notebook).
sft_cfg = SFTConfig(
        max_seq_length              = MAX_LEN,
        packing                     = True,      # DO NOT forget!
        per_device_train_batch_size = MICRO_BATCH,
        gradient_accumulation_steps = GRAD_ACC,
        learning_rate               = LR,
        output_dir                  = OUTPUT_DIR,
        num_train_epochs = 1
)

In [None]:
# Build the SFT trainer around the loaded model; passing `peft_config` lets the
# trainer attach the LoRA adapters. The training data is `ds`, the split loaded
# in the dataset cell.
# NOTE(review): `tok` is not handed to the trainer, so the pad/eos overrides
# made on it are not visible to this call — confirm that is intended.
trainer = SFTTrainer(
        model=model,
        train_dataset=ds,
        peft_config=peft_cfg,
        args=sft_cfg)

torch.cuda.empty_cache()        # release cached CUDA memory before training
trainer.train()

In [None]:
from huggingface_hub import HfApi

# Hub client; the upload cell further down reuses this `api` object.
api = HfApi()

# Write the trainer's model and the tokenizer side by side into "SFT_model".
print("Saving model...")
trainer.save_model("SFT_model")
tok.save_pretrained("SFT_model")

In [None]:
# Folder that receives the merged checkpoint; the upload cell reads it too.
FULL_DIR = "rag_lora_merged"

# merge_and_unload() returns a plain transformers model (per the original note).
merged = trainer.model.merge_and_unload()

# Save weights in safetensors format, with the tokenizer files alongside.
merged.save_pretrained(save_directory=FULL_DIR, safe_serialization=True)
tok.save_pretrained(save_directory=FULL_DIR)

In [None]:
# Destination on the Hub, written as "<ORG>/<REPO>".
ORG = "GingerBled"                        # <-- change
REPO = "qwen3-0.6B-rag_generator_LoRA"    # final repo name
FULL_ID = "/".join((ORG, REPO))

# Create a public model repo; exist_ok=True means an existing repo is not an error.
api.create_repo(repo_id=FULL_ID, repo_type="model", private=False, exist_ok=True)

# Push the contents of FULL_DIR to the repo root, keeping the folder layout.
api.upload_folder(
    repo_id=FULL_ID,
    repo_type="model",
    folder_path=FULL_DIR,
    path_in_repo=".",
    commit_message="Add final LoRA checkpoint",
)
