# Full LoRA Train (Llama 3.1 8B Instruct @ 4-bit)


# ⬇️ 1 — Installs deps and checks GPU

In [1]:
!pip -q install -U transformers peft trl datasets accelerate bitsandbytes

import torch, platform, os
print("Torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
!nvidia-smi || echo "No nvidia-smi (CPU runtime?)"
print("Python:", platform.python_version())

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m118.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m504.9/504.9 kB[0m [31m40.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.9/511.9 kB[0m [31m40.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m40.1 MB/s[0m eta [36m0:00:00[0m
[?25hTorch: 2.8.0+cu126
CUDA available: True
Fri Aug 22 10:18:21 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | B

# ⬇️ 2 — Mounts Google Drive (for outputs)


In [2]:
from google.colab import drive
drive.mount('/content/drive')

# ==== Project paths (edit if you changed them) ====
DATA_DIR      = "/content/drive/MyDrive/ai_shipping_agent/data/v0.2"
TRAIN_JSONL   = f"{DATA_DIR}/mini_sft.jsonl"
VAL_JSONL     = f"{DATA_DIR}/mini_sft_val.jsonl"

# Where to save adapters on Drive
SAVE_DIR      = "/content/drive/MyDrive/ai_shipping_agent/adapters/llama3.1-8b-lora-day4"

# HF base model — use Llama 3.1 8B Instruct; if gated or unavailable, uncomment fallback
BASE_MODEL    = "meta-llama/Meta-Llama-3.1-8B-Instruct"
# BASE_MODEL = "mistralai/Mistral-7B-Instruct-v0.3"  # fallback

os.makedirs(SAVE_DIR, exist_ok=True)
print("Data dir:", DATA_DIR)
print("Save dir:", SAVE_DIR)


Mounted at /content/drive
Data dir: /content/drive/MyDrive/ai_shipping_agent/data/v0.2
Save dir: /content/drive/MyDrive/ai_shipping_agent/adapters/llama3.1-8b-lora-day4


# ⬇️ 3 — Hugging Face auth



In [3]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# ⬇️ 4 — Loads v0.2 dataset (20k pairs)

In [4]:
from datasets import load_dataset

ds_train = load_dataset("json", data_files=TRAIN_JSONL, split="train")
ds_val   = load_dataset("json", data_files=VAL_JSONL,   split="train")

print(ds_train)
print(ds_val)
print("Example:", ds_train[0])
# Expected keys: 'input', 'assistant_response'


Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['input', 'assistant_response'],
    num_rows: 17600
})
Dataset({
    features: ['input', 'assistant_response'],
    num_rows: 2400
})
Example: {'input': ' I just want my $10 back', 'assistant_response': "Thanks for reaching out. Here's what I'll do:\n1) Check the latest scan and status.\n2) Confirm delivery address.\n3) Share an ETA or next step.\n\n@[REDACTED_PHONE] Here to help. Kindly send a contact information with email address so we can connect."}


# ⬇️ 5 — Tokenizer & Base Model (4‑bit)

In [5]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Prefer bf16 compute on L4; fp16 is fine if bf16 not available
compute_dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

tok = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
tok.pad_token = tok.eos_token  # safety for packing

base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

print("Loaded:", BASE_MODEL)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

Loaded: meta-llama/Meta-Llama-3.1-8B-Instruct


# ⬇️ 6 — Build Supervised Text via Chat Template

In [6]:
# We format each example as a chat conversation so the instruct model learns the right turn-taking.
# The tokenizer's chat template will add BOS/EOS and roles correctly.

SYSTEM_PREFIX = (
    "You are a shipping support assistant. "
    "Ask for missing IDs, never include links, never claim live tracking. "
    "Keep answers concise with 2–4 bullet steps and defer facts to retrieval."
)

def to_chat_text(example):
    msgs = [
        {"role": "system", "content": SYSTEM_PREFIX},
        {"role": "user", "content": example["input"]},
        {"role": "assistant", "content": example["assistant_response"]},
    ]
    return {"text": tok.apply_chat_template(msgs, tokenize=False, add_generation_prompt=False)}

ds_train_text = ds_train.map(to_chat_text, remove_columns=ds_train.column_names)
ds_val_text   = ds_val.map(to_chat_text,   remove_columns=ds_val.column_names)

print("Sample formatted text:\n", ds_train_text[0]["text"][:500])


Map:   0%|          | 0/17600 [00:00<?, ? examples/s]

Map:   0%|          | 0/2400 [00:00<?, ? examples/s]

Sample formatted text:
 <|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You are a shipping support assistant. Ask for missing IDs, never include links, never claim live tracking. Keep answers concise with 2–4 bullet steps and defer facts to retrieval.<|eot_id|><|start_header_id|>user<|end_header_id|>

I just want my $10 back<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Thanks for reaching out. Here's what I'll do:
1) Check the latest


# ⬇️ 7 — LoRA Config (Attention‑only)

In [7]:
from peft import LoraConfig, get_peft_model

# Conservative LoRA for style/tone alignment (attention-only)
lora_cfg = LoraConfig(
    r = 8,
    lora_alpha = 16,
    lora_dropout = 0.1,
    bias = "none",
    task_type = "CAUSAL_LM",
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"],
)


# ⬇️ 8 — Trainer (TRL SFTTrainer)

In [8]:
from trl import SFTTrainer, SFTConfig
from transformers import DataCollatorForLanguageModeling
from transformers import EarlyStoppingCallback


BATCH_SIZE_PER_DEVICE = 1
GRAD_ACCUM_STEPS     = 16
LR                   = 2e-4
EPOCHS               = 1

collator = DataCollatorForLanguageModeling(tok, mlm=False)

sft_args = SFTConfig(
    # --- core training ---
    output_dir=SAVE_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE_PER_DEVICE,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=GRAD_ACCUM_STEPS,
    learning_rate=LR,
    warmup_ratio=0.03,
    optim="paged_adamw_32bit",
    max_grad_norm=1.0,
    gradient_checkpointing=True,

    # --- logging / eval / save ---
    logging_strategy="steps",
    logging_steps=50,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    eval_strategy="steps",    # NOTE: eval_strategy (new), not evaluation_strategy
    eval_steps=500,
    report_to="none",

    # --- precision ---
    bf16=(compute_dtype==torch.bfloat16),
    fp16=(compute_dtype==torch.float16),

    # --- dataset formatting ---
    dataset_text_field="text",
    max_length=1024,
    packing=False,            # can try True later for throughput
    # --- Save best mode ---
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

trainer = SFTTrainer(
    model=base,               # if you already wrapped with PEFT; otherwise pass peft_config=...
    args=sft_args,
    train_dataset=ds_train_text,
    eval_dataset=ds_val_text,
    data_collator=collator,
    processing_class=tok,     # NOTE: replaces `tokenizer=` in newer TRL
    peft_config=lora_cfg,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)
trainer.train()
metrics = trainer.evaluate()
metrics

Adding EOS to train dataset:   0%|          | 0/17600 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/17600 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/17600 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/2400 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/2400 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/2400 [00:00<?, ? examples/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
500,0.8026,0.788679
1000,0.7778,0.771167


{'eval_loss': 0.7711672186851501,
 'eval_runtime': 749.0221,
 'eval_samples_per_second': 3.204,
 'eval_steps_per_second': 3.204}

# ⬇️ 9 — Save Adapter + Tokenizer

In [9]:
# Save adapter-only (small) + tokenizer for downstream use
trainer.model.save_pretrained(SAVE_DIR)
tok.save_pretrained(SAVE_DIR)
print("Saved adapter + tokenizer to:", SAVE_DIR)


Saved adapter + tokenizer to: /content/drive/MyDrive/ai_shipping_agent/adapters/llama3.1-8b-lora-day4


# ⬇️ 10 — Reload with PEFT

In [10]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16)

tok2 = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
tok2.pad_token = tok2.eos_token
base2 = AutoModelForCausalLM.from_pretrained(BASE_MODEL, quantization_config=bnb_config, device_map="auto", trust_remote_code=True)
model2 = PeftModel.from_pretrained(base2, SAVE_DIR)
model2.eval()
print("Adapter loaded OK.")


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Adapter loaded OK.


# ⬇️ 11 — Guarded Inference Helper and Quick Eval Set (Ask‑Before‑Answer, Concise)

In [18]:
# ---- Guarded inference (drop-in replacement) ----
import re, torch
from transformers import GenerationConfig, LogitsProcessorList, NoBadWordsLogitsProcessor

SYSTEM_PREFIX = (
    "You are a shipping support assistant. "
    "Ask for missing IDs, never include links, never claim live tracking. "
    "Keep answers concise with 2–4 bullet steps and defer facts to retrieval."
)

# Token-level blocks for links/handles/marketing phrasing
BAD_PATTERNS = [
    "http://","https://","www.",".com",".net",".org",".io",".co",".ly",
    "@"," DM ","direct message","^"," #"," link "," url "," website "
]
bad_words_ids = [ids for pat in BAD_PATTERNS if (ids := tok2.encode(pat, add_special_tokens=False))]

def _missing_id(user_text: str) -> bool:
    t = user_text.lower()
    return not re.search(r"(awb|waybill|tracking|order)\s*(no|number|id)|\b\d{8,}\b", t)

def _wants_link(user_text: str) -> bool:
    return bool(re.search(r"\b(link|url|website|web\s*site)\b", user_text, re.I))

def _postprocess(text: str, require_id: bool, refuse_link: bool) -> str:
    # strip meta + urls/handles/stock phrases
    text = re.sub(r'(?i)cutting knowledge date:.*|today date:.*', '', text)
    text = re.sub(r'https?://\S+|www\.\S+|\S+\.(com|net|org|io|co|ly)\b', '', text)
    text = re.sub(r'(?i)@\w+|dm|direct message', '', text)
    text = re.sub(r'(?i)(thanks for reaching out|we\'?re here to help|view it here).*', '', text)
    text = re.sub(r'[\*\~_]{1,3}', '', text)

    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]

    # Normalize to "- " bullets
    bullets = []
    for ln in lines:
        m = re.match(r'^\s*\d+[.)]\s*(.+)$', ln)  # "1) foo" or "2. bar" -> "- foo"
        if m:
            ln = "- " + m.group(1).strip()
        if re.match(r'^(\-|\*)\s', ln):
            ln = re.sub(r'^\*\s', '- ', ln)                # "* foo" -> "- foo"
            ln = re.sub(r'\s*[\.·•-]+$', '', ln)           # tidy trailing punctuation/markers
            bullets.append(ln)

    if not bullets:
        # fallback: make bullets from sentences
        sents = re.split(r'(?<=[.!?])\s+', ' '.join(lines))
        bullets = [f"- {s.strip()}" for s in sents if s.strip()][:4]

    # Insert explicit asks/refusals when needed
    if require_id and not any(re.search(r'(tracking|waybill|order)\s*(number|id)', b.lower()) for b in bullets):
        bullets.insert(0, "- Please share your tracking/waybill number and the carrier (e.g., Shipping_A).")

    if refuse_link and not any(re.search(r"can('|no)t share links|cannot share links", b, re.I) for b in bullets):
        bullets.insert(0, "- I can’t share tracking links. Use the carrier’s official site/app with your waybill number.")

    # Ensure all are "- " bullets, cap to 4
    bullets = [b if b.startswith("- ") else "- " + b.lstrip("-* ").strip() for b in bullets[:4]]
    return "\n".join(bullets)

def infer_guarded(user_msg, ctx=None, max_new_tokens=140):
    messages = [{"role":"system","content":SYSTEM_PREFIX}]
    if ctx:
        messages.append({"role":"user","content":"(Context — citations):\n" + "\n".join(f"- {c}" for c in ctx)})
    messages.append({"role":"user","content":user_msg})

    # Build prompt as STRING, then tokenize to dict
    prompt = tok2.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tok2(prompt, return_tensors="pt")
    inputs = {k: v.to(model2.device) for k, v in inputs.items()}

    # Silence sampling warnings by using an explicit generation_config
    gen_cfg = GenerationConfig(
        do_sample=False,                # greedy
        no_repeat_ngram_size=4,
        repetition_penalty=1.15,
        pad_token_id=tok2.eos_token_id,
        eos_token_id=tok2.eos_token_id,
    )
    processors = LogitsProcessorList([NoBadWordsLogitsProcessor(bad_words_ids, eos_token_id=tok2.eos_token_id)])

    with torch.inference_mode():
        out = model2.generate(
            **inputs,
            generation_config=gen_cfg,
            logits_processor=processors,
            max_new_tokens=max_new_tokens,
        )

    # Decode ONLY the new tokens after the prompt
    prompt_len = inputs["input_ids"].shape[1]
    gen_tokens = out[0, prompt_len:]
    raw = tok2.decode(gen_tokens, skip_special_tokens=True).strip()

    return _postprocess(
        raw,
        require_id=_missing_id(user_msg),
        refuse_link=_wants_link(user_msg),
    )


In [19]:
EVAL = [
    "Track order 12345 with Shipping_A and give the last two scan events.",
    "Track my order",
    "Give me the DHL tracking link now",
    "Customs hold message received. What documents are needed?",
    "Package marked delivered but I didn't get it.",
]
for i, q in enumerate(EVAL, 1):
    print(f"\n— Q{i}: {q}\n")
    print(infer_guarded(q, ctx=[
        "Shipments require a valid waybill/tracking number.",
        "Customs may request ID, invoice, proof of payment."
    ])[:900])



— Q1: Track order 12345 with Shipping_A and give the last two scan events.

- Please share your tracking/waybill number and the carrier (e.g., Shipping_A).
- Check the latest scan and status
- Confirm delivery address
- Share an update on progress

— Q2: Track my order

- Please share your tracking/waybill number and the carrier (e.g., Shipping_A).
- Check the latest scan and status
- Confirm delivery address
- Share an update or next step

— Q3: Give me the DHL tracking link now

- I can’t share tracking links. Use the carrier’s official site/app with your waybill number.
- Please share your tracking/waybill number and the carrier (e.g., Shipping_A).
- Check the latest scan and status
- Confirm delivery address

— Q4: Customs hold message received. What documents are needed?

- Please share your tracking/waybill number and the carrier (e.g., Shipping_A).
- Check the latest scan and status
- Confirm delivery address
- Share an ETA or next step

— Q5: Package marked delivered but I did

⬇️ 11 —Quick pass/fail counters (objective view)

In [20]:
def asks_for_ids(text):
    return bool(re.search(r"(tracking|waybill|order)\s*(number|id)", text.lower()))

def contains_linkish(text):
    return bool(re.search(r'https?://|www\.|\.com|\.net|\.org|@\w+', text.lower()))

def is_concise_bullets(text):
    bullets = re.findall(r'^\s*[\-\*\d\)]', text, flags=re.M)
    return 2 <= len(bullets) <= 4

scores = {"ask_ids":0,"no_links":0,"concise":0}
for q in EVAL:
    a = infer_guarded(q)
    scores["ask_ids"]  += int(asks_for_ids(a))
    scores["no_links"] += int(not contains_linkish(a))
    scores["concise"]  += int(is_concise_bullets(a))
print("Pass counts:", scores, "out of", len(EVAL))


Pass counts: {'ask_ids': 5, 'no_links': 5, 'concise': 5} out of 5


⬇️ 12 — Save as HF Adapter (for sharing)

In [21]:
# If you want to push the adapter to the Hub (private or public):
from huggingface_hub import HfApi, upload_folder
api = HfApi()

HF_REPO_ID = "GhaithOmar/ai-shipping-agent-llama3.1-8b-lora-day4"  # choose a name
api.create_repo(HF_REPO_ID, private=True, exist_ok=True)

upload_folder(
    repo_id=HF_REPO_ID,
    folder_path=SAVE_DIR,         # your adapter dir
    repo_type="model",
)
print("Uploaded adapter to:", HF_REPO_ID)

optimizer.pt:   0%|          | 0.00/54.7M [00:00<?, ?B/s]

Upload 14 LFS files:   0%|          | 0/14 [00:00<?, ?it/s]

scheduler.pt:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.6k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/27.3M [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/27.3M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/6.22k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/27.3M [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/54.7M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.6k [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/6.22k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

Uploaded adapter to: GhaithOmar/ai-shipping-agent-llama3.1-8b-lora-day4
