In [23]:
import os
import re
import ast
import json
import pathlib
import glob
import pandas as pd
from datasets import Dataset, DatasetDict, load_dataset
from typing import List, Dict, Any, Tuple

# trial = "state_dev"
# trial = "state_train_all"

# trial = "stateact-no-thoughts_train_all"
# trial = "state-no-thoughts_rag_train_all"

# trial = "react_train"


# trial = "state_rag_selfgenerated_train2/"
trial = "react_rag_selfgenerated_train2/"

# Configuration: adjust these paths as needed

METADATA_CSV = f"game_logs/{trial}/webshop_scores.csv"
METADATA_DIR = f"game_logs/{trial}_metadata/"
PROMPT_BASE_DIR = f"game_logs/{trial}/"
os.environ["CUDA_VISIBLE_DEVICES"] = "0" # Something to include to fix unsloth on jupyter
os.environ["TRITON_DISABLE"] = "1"

In [24]:
# Load the metadata csv
metadata = pd.read_csv("/workspace/data/webshop/WebShop_metadata.csv")

In [25]:
metadata

Unnamed: 0,instruction,category,product_category
0,"find me sulfate free, paraben free shampoo & c...",beauty,Beauty & Personal Care › Hair Care › Shampoo &...
1,find me non slip shave & hair removal with sta...,beauty,Beauty & Personal Care › Shave & Hair Removal ...
2,find me women's dresses with polyester spandex...,fashion,"Clothing, Shoes & Jewelry › Women › Clothing ›..."
3,"find me space saving, easy assemble living roo...",garden,Home & Kitchen › Furniture › Dining Room Furni...
4,"find me long lasting, easy apply, cruelty free...",beauty,Beauty & Personal Care › Makeup › Face › Blush
...,...,...,...
529102,find me teeth whitening toothpaste for sensiti...,beauty,Beauty & Personal Care › Oral Care › Toothpaste
529103,find me machine wash men's tank tops with poly...,fashion,"Clothing, Shoes & Jewelry › Novelty & More › C..."
529104,find me high quality makeup brushes & tools fo...,beauty,Beauty & Personal Care › Tools & Accessories ›...
529105,find me super soft throw blankets with printin...,garden,Home & Kitchen › Bedding › Blankets & Throws ›...


In [26]:
df_expert = pd.read_csv("/workspace/data/webshop/webshop_human_index.csv")

df_train = pd.read_csv(METADATA_CSV)

In [27]:
# Load the eto dataset containing gold GPT trajectories
with open("/workspace/data/webshop/webshop_sft.json", "r") as f:
    eto = json.load(f)
    
instructions = []
categories = []
for row in eto:
    instr = row["conversations"][2]["value"][33:-13]
    instructions.append(instr)
    # break

df_eto = pd.DataFrame(eto)
df_eto["instruction"] = instructions

In [28]:
df_expert

Unnamed: 0,instruction,reward,file,category,product_category
0,i'm looking for a 4-tier shelving unit and tv ...,0.600000,"{'goal': {'asin': 'B09N382RP7', 'category': 'g...",garden,Home & Kitchen › Furniture › Living Room Furni...
1,i'm looking for 1.5 feet (10 pack) high speed ...,1.000000,"{'goal': {'asin': 'B01HJWDQWA', 'category': 'e...",electronics,Electronics › Accessories & Supplies › Audio &...
2,i would like to buy a heavy duty with a rocket...,1.000000,"{'goal': {'asin': 'B08TVG2339', 'category': 'g...",garden,Tools & Home Improvement › Electrical › Wall P...
3,i want to find a tv stand made of solid wood f...,0.666667,"{'goal': {'asin': 'B09LTRP7HW', 'category': 'g...",garden,Home & Kitchen › Furniture › Living Room Furni...
4,i need some skin care tools for dark circles w...,1.000000,"{'goal': {'asin': 'B09FXG949K', 'category': 'b...",beauty,
...,...,...,...,...,...
1638,"i need birthday candles for my birthday cake, ...",0.500000,"{'goal': {'asin': 'B07RVDW2Z2', 'category': 'g...",grocery,Home & Kitchen › Home Décor Products › Candles...
1639,i am looking for a teal color stainlesss steel...,1.000000,"{'goal': {'asin': 'B092J891H7', 'category': 'e...",electronics,Electronics › Wearable Technology › Arm & Wris...
1640,i need a pair of pink loafers for teen girls. ...,0.750000,"{'goal': {'asin': 'B096W49HCJ', 'category': 'f...",fashion,"Clothing, Shoes & Jewelry › Women › Shoes › Lo..."
1641,i am looking for a hair salon capacity spray b...,0.500000,"{'goal': {'asin': 'B08GCNBFX1', 'category': 'b...",beauty,Beauty & Personal Care › Hair Care › Hair Colo...


In [29]:
df_eto

Unnamed: 0,id,conversations,reward,source,instruction
0,0,"[{'from': 'human', 'value': 'You are web shopp...",1.00,human,i need a long clip-in hair extension which is ...
1,3,"[{'from': 'human', 'value': 'You are web shopp...",0.75,human,i need gluten free vegetarian smoked peppered ...
2,4,"[{'from': 'human', 'value': 'You are web shopp...",1.00,human,find me a high speed dual style package with ...
3,8,"[{'from': 'human', 'value': 'You are web shopp...",0.75,human,i want to find xx-large black workout sweatpan...
4,9,"[{'from': 'human', 'value': 'You are web shopp...",1.00,human,i am looking for a grey sectional sofa for my ...
...,...,...,...,...,...
1819,12036,"[{'from': 'human', 'value': 'You are web shopp...",1.00,gpt,i'm looking for a telescope with high power fo...
1820,12039,"[{'from': 'human', 'value': 'You are web shopp...",1.00,gpt,i am looking for a 3 vanity lights with with ...
1821,12050,"[{'from': 'human', 'value': 'You are web shopp...",1.00,gpt,"i need some grey living room pillow covers, an..."
1822,12077,"[{'from': 'human', 'value': 'You are web shopp...",1.00,gpt,i needed a 5-case gluten free multigrain table...


In [30]:
GOAL_RE   = re.compile(r"(?im)^\s*Goal\s*:\s*(.+?)\s*$")
SCORE_RE  = re.compile(r"Your score.*?:\s*([0-9]+(?:\.[0-9]+)?)", re.I)
def parse_self_traces(conversations: List[Dict[str,str]]) -> Tuple[List[Dict], Dict]:
    """
    conversations: [{'role':'system'|'user'|'assistant','content':str}, ...]
    Returns: (events, meta)  where events follow the unified schema used by score_webshop(...)
    """
    events: List[Dict[str,Any]] = []
    instruction = None
    reward_val  = None

    pending = None        # (type,arg) from assistant Action until we see the next user observation
    first_total = None

    for msg in conversations:
        role = msg.get("role","")
        text = str(msg.get("content",""))

        # capture Goal once (it's repeated each assistant turn)
        if instruction is None:
            mg = GOAL_RE.search(text)
            if mg: instruction = mg.group(1).strip()

        # assistant proposes an Action inside <turn> blocks
        if role == "assistant":
            m = ACTION_RE.search(text)
            if m:
                pending = normalize_action(m.group(1), m.group(2))
            continue

        # user returns the observation (and sometimes final score)
        if role == "user":
            # if this is the obs following an assistant Action, commit the event
            if pending:
                ev_type, ev_arg = pending
                ev = {"type": ev_type}
                if ev_type == "search":    ev["query"] = ev_arg
                elif ev_type == "open_item": ev["asin"] = ev_arg
                elif ev_type == "subpage": ev["which"] = ev_arg
                # attach total results if present on this observation
                if ev_type == "search":
                    mt = TOTRES_RE.search(text)
                    if mt and first_total is None:
                        first_total = int(mt.group(1))
                        ev["total_results"] = first_total
                events.append(ev)
                pending = None

            # optional: read reward
            ms = SCORE_RE.search(text)
            if ms:
                try: reward_val = float(ms.group(1))
                except: pass

    # constraint heuristic from instruction text
    constraints = 0
    instr_l = norm_lower(instruction or "")
    if re.search(r"\b(under|less than|lower than|at most|<=|\$)\b|\d+\.\d{2}", instr_l): constraints += 1
    if re.search(r"\b(black|white|red|blue|green|gold|rose gold|silver|beige|pink)\b", instr_l): constraints += 1
    if re.search(r"\b(\d+\s*(inch|in|oz|pack|pcs|piece|cm|mm))\b", instr_l): constraints += 1
    if re.search(r"\b(natural|lightweight|easy to carry|waterproof|wireless|noise cancelling|carbon fiber|4-tier)\b", instr_l): constraints += 1

    meta = {
        "instruction": instruction or "",
        "constraints_count": constraints,
        "reward": reward_val,
    }
    return events, meta

In [31]:
INSTR_RE = re.compile(
    r"(?is)\bInstruction\s*:\s*"
    r"(?:\[SEP\]\s*)?"             # optional opening [SEP]
    r"(.*?)"                       # capture the instruction text (non-greedy)
    r"(?:"                         # stop at one of:
    r"\s*\[SEP\]"                  #   closing [SEP]
    r"|\n\[[^\]]+\]"               #   a bracketed control like [Search], [Back to Search]
    r"|$"                          #   or end of string
    r")"
)

def _norm_space(s: str) -> str:
    return re.sub(r"\s+", " ", (s or "")).strip()

def _norm_instr(s: str) -> str:
    return _norm_space((s or "").lower())

def parse_run_trace(path: str) -> tuple[str, list[tuple[str,str]], str]:
    msgs = json.load(open(path))
    run_id = pathlib.Path(path).stem
    # if you inserted meta with {"role":"meta","content":{"id":...}}, prefer that:
    if msgs and msgs[0].get("role") == "meta":
        rid = msgs[0]["content"].get("id")
        if rid:
            run_id = str(rid)

    instruction = None
    pending = None
    seq: list[tuple[str,str]] = []
    for m in msgs:
        role = m.get("role","")
        text = str(m.get("content",""))
        if role == "user":
            if instruction is None:
                mi = INSTR_RE.search(text)
                if mi:
                    instruction = mi.group(1).strip()
            if pending:
                seq.append(pending)
                pending = None
    return _norm_instr(instruction or ""), seq, run_id[6:], msgs

In [32]:
import re, math, json
from typing import List, Dict, Any, Tuple

# ---------- Shared helpers ----------
ASIN_RE = re.compile(r"\bB0[A-Z0-9]{8}\b", re.I)
TOTRES_RE = re.compile(r"Total results:\s*(\d+)", re.I)
INSTR_RE  = re.compile(r"(?is)\bInstruction\s*:\s*(?:\[SEP\]\s*)?(.*?)(?:\s*\[SEP\]|\n\[[^\]]+\]|$)")
THOUGHT_RE = re.compile(r"(?ims)^\s*Thought:\s*(.*?)(?=\n\s*Action:|$)")
ACTION_RE  = re.compile(r"(?im)^\s*Action:\s*(search|click|choose)\[(.+?)\]\s*$")
def norm_space(s:str)->str: return re.sub(r"\s+"," ", (s or "")).strip()
def norm_lower(s:str)->str: return norm_space(s).lower()

def normalize_action(verb: str, arg: str) -> Tuple[str, str]:
    v = (verb or "").lower().strip()
    if v == "choose": v = "click"
    a = (arg or "").strip()
    # map common controls
    low = a.lower()
    if v == "click":
        if low in {"buy now", "buy"}: return ("buy","")
        if low in {"next >", "next>"}: return ("nav","next")
        if low in {"< prev", "prev"}:  return ("nav","prev")
        if low in {"back to search", "back"}: return ("nav","back")
        if low in {"description", "features", "reviews"}: return ("subpage", low)
        # ASIN?
        m = ASIN_RE.search(a)
        if m: return ("open_item", m.group(0).upper())
        # option or other button
        return ("option", norm_lower(a))
    elif v == "search":
        return ("search", norm_lower(a))
    return (v, norm_lower(a))

# ---------- Parser A: structured (human/expert or scripted) ----------
def parse_structured(traj: Dict[str, Any]) -> Tuple[List[Dict], Dict]:
    """
    Returns (events, meta)
    events: [{type, query/asin/which, total_results?}, ...]
    meta:   {'instruction': str, 'constraints_count': int}
    """
    goal = traj.get("goal", {}) or {}
    instr = goal.get("instruction_text", "")
    constraints = 0
    if goal.get("price_upper") is not None: constraints += 1
    constraints += len(goal.get("attributes", []) or [])
    constraints += len(goal.get("goal_options", []) or [])

    events, first_total = [], None
    last_opts = {}

    for st in traj.get("steps", []):
        a = st.get("action", {}) or {}
        typ = a.get("type","")
        # search
        if typ == "search":
            q = a.get("args",{}).get("keywords") or []
            q = " ".join(map(str,q))
            ev = {"type":"search","query":norm_lower(q)}
            # if candidates list is provided for this step/state
            cand = (st.get("state") or {}).get("candidates")
            if isinstance(cand, list) and cand and first_total is None:
                first_total = len(cand)
                ev["total_results"] = first_total
            events.append(ev)
        # item open (from search or click)
        elif typ == "open_item":
            # sometimes ASIN not given, but state updated elsewhere
            # we still count an open
            events.append({"type":"open_item"})
        # buy
        elif typ == "buy":
            events.append({"type":"buy"})
        # option detection: when state.options grows
        opts = (st.get("state") or {}).get("options") or {}
        if len(opts) > len(last_opts):
            events.append({"type": "option"})
        last_opts = opts

    meta = {"instruction": instr, "constraints_count": constraints}
    return events, meta

# ---------- Parser B: HF-style dialogue (Action/Observation) ----------
def parse_hf_dialog(conversations: List[Dict[str, str]]) -> Tuple[List[Dict], Dict]:
    """
    Returns (events, meta). meta.constraints_count is a heuristic.
    """
    instruction = None
    events, pending = [], None
    first_total = None

    for msg in conversations:
        who = msg.get("from","")
        text = str(msg.get("value",""))

        if who == "gpt":
            m = ACTION_RE.search(text)
            if m:
                pending = normalize_action(m.group(1), m.group(2))
        elif who == "human":
            if instruction is None:
                mi = INSTR_RE.search(text)
                if mi: instruction = mi.group(1).strip()
            if pending:
                ev_type, ev_arg = pending
                ev = {"type":ev_type}
                if ev_type == "search": ev["query"] = ev_arg
                if ev_type == "open_item": ev["asin"] = ev_arg
                if ev_type == "subpage":   ev["which"] = ev_arg
                # parse total results from this observation (if present)
                if ev_type == "search":
                    m = TOTRES_RE.search(text)
                    if m and first_total is None:
                        first_total = int(m.group(1))
                        ev["total_results"] = first_total
                events.append(ev)
                pending = None

    # crude constraint heuristic from instruction
    constraints = 0
    instr_l = norm_lower(instruction or "")
    if re.search(r"\b(under|less than|lower than|at most|<=|\$)\b|\d+\.\d{2}", instr_l): constraints += 1  # price-ish
    # count a few attribute-y tokens (color/size/length)
    if re.search(r"\b(black|white|red|blue|green|gold|rose gold|silver|beige|pink)\b", instr_l): constraints += 1
    if re.search(r"\b(\d+\s*(inch|in|oz|pack|pcs|piece|cm|mm))\b", instr_l): constraints += 1
    if re.search(r"\b(natural|lightweight|easy to carry|waterproof|wireless|noise cancelling|4-tier)\b", instr_l): constraints += 1

    meta = {"instruction": instruction or "", "constraints_count": constraints}
    return events, meta

# ---------- Difficulty scorer ----------
W_WS = {
    "search": 1.0,
    "reformulate": 1.2,
    "open_item": 0.8,
    "paginate": 0.8,      # next/prev/back
    "subpage": 0.5,
    "option": 1.4,
    "unique_items": 0.5,
    "steps": 0.2,
    "total_results_log": 0.4,
    "constraints": 0.8,
}

def score_webshop(events: List[Dict], meta: Dict) -> Dict[str, Any]:
    n_search = 0
    queries = []
    n_open = 0
    n_sub = 0
    n_opt = 0
    n_pag = 0
    n_buy = 0
    asins = set()
    first_total = None

    for ev in events:
        t = ev.get("type")
        if t == "search":
            n_search += 1
            q = ev.get("query","")
            if q: queries.append(q)
            if first_total is None and "total_results" in ev:
                first_total = int(ev["total_results"])
        elif t == "open_item":
            n_open += 1
            if ev.get("asin"): asins.add(ev["asin"])
        elif t == "subpage":
            n_sub += 1
        elif t == "option":
            n_opt += 1
        elif t == "nav":
            n_pag += 1
        elif t == "buy":
            n_buy += 1

    # queries that are genuinely different (reformulations)
    distinct = []
    for q in queries:
        if not distinct or q != distinct[-1]:
            distinct.append(q)
    n_reform = max(0, len(distinct) - 1)

    # build the score
    score = (
        W_WS["search"] * n_search +
        W_WS["reformulate"] * n_reform +
        W_WS["open_item"] * n_open +
        W_WS["paginate"] * n_pag +
        W_WS["subpage"] * n_sub +
        W_WS["option"] * n_opt +
        W_WS["unique_items"] * len(asins) +
        W_WS["steps"] * len(events) +
        W_WS["total_results_log"] * math.log1p(first_total or 10) +
        W_WS["constraints"] * int(meta.get("constraints_count", 0))
    )
    # print(n_search ,n_reform , 
    #       n_open,n_pag ,n_sub, n_opt, len(asins), len(events), math.log1p(first_total or 10),int(meta.get("constraints_count", 0)))
    # print(score)
    return {
        "score": round(float(score), 3),
        "n_search": n_search,
        "n_reformulate": n_reform,
        "n_open_item": n_open,
        "n_subpage": n_sub,
        "n_option": n_opt,
        "n_paginate": n_pag,
        "n_unique_items": len(asins),
        "n_steps": len(events),
        "total_results_first": first_total,
        "constraints_count": int(meta.get("constraints_count", 0)),
        "bought": bool(n_buy),
    }


In [33]:
train_metadata = pd.read_csv("data/train_metadata.csv", index_col=0)

In [34]:
import re

# df_train = df_train[df_train.success == 1]
# df_train.reset_index(inplace=True, drop=True)
df_train["expert_difficulty"] = None
df_train["category"] = None

for idx, row in df_train.iterrows():
    file = row.trace_file
    match = re.search(r'_(\d+)\.json$', file)
    number = int(match.group(1))
    # if row.success == 1:
    file_path = PROMPT_BASE_DIR+file
    category = train_metadata["category"][number]
    
    instr, seq, json_id, msgs = parse_run_trace(file_path)
    
    # category = METADATA_DIR+file+"l"
    # with open(category) as f:
    #     json_list = list(f)
    #     category = ast.literal_eval(json_list[0])
    # category = category["category"]
    
    if category == "grocery":
        category = "food"
    if category == "garden":
        category = "furniture"
    df_train.at[idx, "category"] = category
    events, meta = parse_self_traces(msgs)
    difficulty = score_webshop(events, meta)['score'] 
    df_train.at[idx, 'expert_difficulty'] = difficulty

  category = train_metadata["category"][number]


In [35]:
df_expert.instruction[99]

'i need caxxa amber glass fine mist spray bottles, size 12 refillable containers, and price lower than 50.00 dollars'

In [36]:
df_train.category.unique()

array(['beauty', 'fashion', 'furniture', 'electronics', 'food'],
      dtype=object)

In [37]:
df_train.category.value_counts()

category
beauty         262
fashion        251
food           239
furniture      229
electronics    219
Name: count, dtype: int64

In [38]:
import ast

df_train["expert_traces"] = "training_set"
df_train["orig_trace"] = None


for idx, file in enumerate(df_train.trace_file):
    file_path = PROMPT_BASE_DIR+file
    instr, seq, json_id, msgs = parse_run_trace(file_path)
    df_train.at[idx, "orig_trace"] = msgs
    
    train_reward = df_train.iloc[idx].total_reward 
    mask = df_expert["instruction"].eq(instr)
    mask2 = df_eto["instruction"].eq(instr)
    reward = 0
    reward2 = 0
    if mask.any():
        i = df_expert.index[mask][0]          # index label
        # print("found in webshop experts:", instr, "at index", i)
        expert_trace = df_expert.at[i, "file"]  # safer than .values[0]
        reward = df_expert.at[i, "reward"]
        # if df_train.iloc[idx].total_reward <= df_expert.at[i, "reward"]:
        #     # print('better')
        #     df_train["expert_trace"] = expert_trace

    if mask2.any():
        i = df_eto.index[mask2][0]          # index label
        # print("found in webshop experts:", instr, "at index", i)
        # print(i, df)
        expert_trace2 = df_eto.at[i, "conversations"]  # safer than .values[0]
        reward2 = df_eto.at[i, "reward"]

    # print(reward)
    if train_reward < reward:
        # print(train_reward, reward)
        traj = ast.literal_eval(expert_trace)
        events, meta = parse_structured(traj)
        difficulty = score_webshop(events, meta)['score'] 
        df_train.at[idx, 'expert_difficulty'] = difficulty
        df_train.at[idx, 'expert_traces'] = "Human Expert" 
        # break
    elif train_reward < reward2:
        # print(train_reward, reward2)
        events, meta = parse_hf_dialog(expert_trace2)
        difficulty2 = score_webshop(events, meta)['score']        
        df_train.at[idx, 'expert_difficulty'] = difficulty2
        df_train.at[idx, 'expert_traces'] = "ETO"
        
# df_train["expert_trace"] = expert_traces
# df_train["expert_difficulty"] = expert_difficulty


In [39]:
len(df_train.dropna())

1200

In [40]:
df_train[df_train.success==1.0].expert_traces.value_counts()

expert_traces
training_set    718
Human Expert     27
ETO              17
Name: count, dtype: int64

In [41]:
df_train[df_train.total_reward==0].expert_difficulty.mean()

np.float64(17.820634703196344)

In [42]:
df_train[df_train.total_reward==1].category.value_counts()

category
beauty         87
fashion        81
electronics    69
food           62
furniture      53
Name: count, dtype: int64

In [43]:
df_train[df_train.total_reward==1].expert_difficulty.mean()

np.float64(6.453681818181809)

In [44]:
# df_train.to_csv("data/react_train_set_difficulty_scoring.csv", index=False)


# df_train.to_csv("data/state_train_difficulty_selfgenerated.csv", index=False)
df_train.to_csv("data/react_train_difficulty_selfgenerated.csv", index=False)


In [1]:
import os
import re
import ast
import json
import pathlib
import glob
import pandas as pd
from datasets import Dataset, DatasetDict, load_dataset
from typing import List, Dict, Any, Tuple

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def remove_few_shot_examples(prompt: str) -> str:
    """
    Keeps the first line ('Interact with a household...') and removes the few-shot examples
    between 'Here are 2 examples:' and the last 'Your task is to:'.
    """
    # Extract the first line (system instruction)
    first_line, *rest = prompt.split("\n", 1)
    rest_text = rest[0] if rest else ""

    #########################################

    # Split by all occurrences of 'Your task is to:'
    parts = rest_text.split("\nWebShop \nInstruction:  \n")
    if len(parts) < 2:
        return prompt  # Nothing to strip

    # Keep only the last task and everything that follows
    cleaned = first_line.strip() + "\nWebShop \nInstruction:  \n" + parts[-1]

    return cleaned.strip()

In [3]:
def extract_whole_trace(example):
    trace = ast.literal_eval(example.orig_trace)
    # print(trace[1])
    # Prompt = system + initial user
    sys_msg  = next(m["content"] for m in trace if m["role"]=="system")
    usr_msg  = next(m["content"] for m in trace if m["role"]=="user")

    # print(usr_msg)
    prompt   = sys_msg + "\n" + usr_msg + "\n"
    # Completion = *everything else*, in order
    rest     = trace[2:]
    completion = "".join(m["content"] + "\n" for m in rest)
    # add end‐of‐sequence token if needed
    if not completion.endswith(""):
        completion += ""
        
    match = re.findall("Nothing happens.", completion)
    # if len(match) > 0:
    #     print(len(match))
    
    return {"prompt": prompt, "completion": completion, "nothing_occ":len(match)}

In [5]:
df = pd.read_csv("data/rag_train_set_difficulty_scoring.csv") #RAG DATASET
df.expert_traces.value_counts()

expert_traces
training_set    1098
Human Expert      70
ETO               32
Name: count, dtype: int64

In [4]:
# -*- coding: utf-8 -*-
import ast, json, re, pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer

# ------------------ 0) Config ------------------
# INPUT_CSV      = "data/train_set_difficulty_scoring.csv"
# INPUT_CSV      = "data/react_train_set_difficulty_scoring.csv"
INPUT_CSV = "data/state_train_difficulty_selfgenerated.csv" #WITH RAG!!
# INPUT_CSV = "data/react_train_difficulty_selfgenerated.csv" #WITH RAG!!

# MODEL_NAME     = "Qwen/Qwen2.5-14B-Instruct"
MODEL_NAME     = "Qwen/Qwen2.5-7B-Instruct"

USE_HANDPICKED = False
# KEEP_MODE options:
#   "first_assistant_only"
#   "last_assistant_only"
#   "every_user_assistant_pair"       # prompt = system + that single user only
#   "every_pair_with_history"         # NEW: prompt = full history up to each assistant
#   "full_episode_completion"
#   "midpoint_next_step"
#   "last_step_with_history"
KEEP_MODE      = "full_episode_completion"

MIDPOINT = "auto"   # for midpoint mode
TASK_SENTINEL  = "Here is the task"

# ------------------ 1) Load & basic filtering ------------------
df = pd.read_csv(INPUT_CSV)
df = df[df["success"] == 1.0]
df = df[df["total_reward"] >= 1].reset_index(drop=True)

# ------------------ 2) Helpers ------------------
def _json_or_ast_load(s: str):
    if isinstance(s, (list, dict)): return s
    s = s.strip()
    try: return json.loads(s)
    except Exception: return ast.literal_eval(s)

def strip_fewshot_from_system(sys_text: str) -> str:
    if not sys_text: return sys_text
    if TASK_SENTINEL in sys_text:
        head = sys_text.split("\n", 1)[0]
        tail = sys_text.split(TASK_SENTINEL, 1)[1]
        return (head + "\n\n" + TASK_SENTINEL + tail).strip()
    return sys_text.strip()

def apply_chat_prompt(messages):
    # Qwen chat template; ends right before assistant generation
    return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

def build_prompt_from_history(trace, upto_exclusive_idx: int):
    """Prompt = system + full (user/assistant) history strictly before upto_exclusive_idx."""
    sys_msg = next((m.get("content","") for m in trace if m.get("role")=="system"), "")
    sys_msg = strip_fewshot_from_system(sys_msg)
    msgs = []
    if sys_msg:
        msgs.append({"role": "system", "content": sys_msg})
    for i in range(upto_exclusive_idx):
        role = trace[i].get("role")
        if role in ("user", "assistant"):
            content = (trace[i].get("content") or "").strip()
            if content:
                msgs.append({"role": role, "content": content})
    return apply_chat_prompt(msgs)

def user_assistant_pairs(trace):
    """List of (user_idx, next_assistant_idx) pairs in chronological order."""
    pairs, n = [], len(trace)
    for i, m in enumerate(trace):
        if m.get("role") != "user": continue
        j = next((k for k in range(i+1, n) if trace[k].get("role")=="assistant"), None)
        if j is not None: pairs.append((i, j))
    return pairs

def split_prompt_completion_from_trace(trace, keep_mode="first_assistant_only"):
    """
    Returns list[{"prompt": str, "completion": str}]
    """
    sys_msg = next((m.get("content","") for m in trace if m.get("role")=="system"), "")
    sys_msg = strip_fewshot_from_system(sys_msg)

    # ---------- NEW: every_pair_with_history ----------
    if keep_mode == "every_pair_with_history":
        pairs = user_assistant_pairs(trace)
        out = []
        for ui, ai in pairs:
            # prompt: full history up to the assistant (exclusive)
            prompt = build_prompt_from_history(trace, upto_exclusive_idx=ai)
            asst = (trace[ai].get("content") or "").strip()
            if not asst: continue
            out.append({"prompt": prompt, "completion": asst})
        return out

    # existing: every_user_assistant_pair (prompt only that user, no earlier turns)
    if keep_mode == "every_user_assistant_pair":
        pairs = user_assistant_pairs(trace)
        out = []
        for ui, ai in pairs:
            usr = (trace[ui].get("content") or "").strip()
            asst = (trace[ai].get("content") or "").strip()
            if not usr or not asst: continue
            msgs = []
            if sys_msg: msgs.append({"role": "system", "content": sys_msg})
            msgs.append({"role": "user", "content": usr})
            prompt = apply_chat_prompt(msgs)
            out.append({"prompt": prompt, "completion": asst})
        return out

    # ----- single-sample modes below -----
    pairs = user_assistant_pairs(trace)
    if not pairs: return []

    try:
        first_user_idx = next(i for i, m in enumerate(trace) if m.get("role")=="user")
    except StopIteration:
        return []

    if keep_mode == "first_assistant_only":
        usr_msg = (trace[first_user_idx].get("content") or "").strip()
        if not usr_msg: return []
        prompt = apply_chat_prompt(([{"role":"system","content":sys_msg}] if sys_msg else []) +
                                   [{"role":"user","content":usr_msg}])
        remainder = trace[first_user_idx+1:]
        first_asst = next((m.get("content","").rstrip() for m in remainder if m.get("role")=="assistant"), "")
        return [{"prompt": prompt, "completion": first_asst or ""}]

    if keep_mode == "last_assistant_only":
        ui, ai = pairs[-1]
        usr = (trace[ui].get("content") or "").strip()
        asst = (trace[ai].get("content") or "").strip()
        if not usr or not asst: return []
        prompt = apply_chat_prompt(([{"role":"system","content":sys_msg}] if sys_msg else []) +
                                   [{"role":"user","content":usr}])
        return [{"prompt": prompt, "completion": asst}]

    if keep_mode == "last_step_with_history":
        ui, ai = pairs[-1]
        prompt = build_prompt_from_history(trace, upto_exclusive_idx=ai)
        asst = (trace[ai].get("content") or "").strip()
        if not asst: return []
        return [{"prompt": prompt, "completion": asst}]

    if keep_mode == "midpoint_next_step":
        idx = len(pairs)//2 if MIDPOINT == "auto" else int(MIDPOINT)
        idx = max(0, min(idx, len(pairs)-1))
        ui, ai = pairs[idx]
        prompt = build_prompt_from_history(trace, upto_exclusive_idx=ai)
        asst = (trace[ai].get("content") or "").strip()
        if not asst: return []
        return [{"prompt": prompt, "completion": asst}]

    # Fallback: full_episode_completion after first user
    usr_msg = (trace[first_user_idx].get("content") or "").strip()
    if not usr_msg: return []
    prompt = apply_chat_prompt(([{"role":"system","content":sys_msg}] if sys_msg else []) +
                               [{"role":"user","content":usr_msg}])
    remainder = trace[first_user_idx+1:]
    parts = []
    for m in remainder:
        role = m.get("role", "assistant")
        content = (m.get("content") or "").rstrip()
        if content: parts.append(f"{role}: {content}")
    return [{"prompt": prompt, "completion": "\n".join(parts).strip()}]

def calc_successes(df_in):
    if "category" in df_in.columns and "total_reward" in df_in.columns:
        grouped = df_in.groupby("category")["total_reward"]
        success_stats = grouped.agg(
            reward_average=lambda x: x.mean(),
            count="count"
        ).round(2)
        print("Average Reward and Count by category:")
        print(success_stats.to_string())

# ------------------ 3) Tokenizer ------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
try: tokenizer.truncation_side = "left"
except: pass
tokenizer.padding_side = "right"

# ------------------ 4) Build prompt/completion(s) ------------------
def row_to_records(row):
    trace = _json_or_ast_load(row.get("orig_trace", "[]"))
    if not isinstance(trace, list) or not trace: return []
    samples = split_prompt_completion_from_trace(trace, keep_mode=KEEP_MODE)
    recs = []
    for s in samples:
        prompt = s.get("prompt") or ""
        completion = s.get("completion") or ""
        back_click = "click[Back to Search]"
        # if (not completion) or ("Invalid action!" in completion) or (back_click in completion):
        if (not completion) or ("Invalid action!" in completion):
            continue
        recs.append({"prompt": prompt, "completion": completion})
    return recs

all_records = []
for _, row in df.iterrows():
    all_records.extend(row_to_records(row))

print("\n" + "*"*50)
print(f"Raw generated samples: {len(all_records)}")
print("*"*50 + "\n")

df_out = pd.DataFrame.from_records(all_records)

# ------------------ 5) Optional handpicked top-20 per category ------------------
if USE_HANDPICKED:
    raise NotImplementedError("Handpicked selection should be applied BEFORE expansion.")

print(f"Final usable examples: {len(df_out)}")
calc_successes(df)  # episode-level stats

# ------------------ 6) Build HF dataset ------------------
dataset = Dataset.from_pandas(df_out[["prompt", "completion"]].copy())
hf_ds = dataset



**************************************************
Raw generated samples: 380
**************************************************

Final usable examples: 380
Average Reward and Count by category:
             reward_average  count
category                          
beauty                  1.0    110
electronics             1.0     74
fashion                 1.0    115
food                    1.0     77
furniture               1.0     72


In [5]:
df_out.prompt[0]

'<|im_start|>system\nYou are a shopping assistant. Buy the closest item to the instruction provided. \n\nHere is the task.<|im_end|>\n<|im_start|>user\nWebShop \nInstruction:  \ni am interested in a high quality brush set, and price lower than 30.00 dollars \n[Search]<|im_end|>\n<|im_start|>assistant\n'

In [6]:
df_out.completion[0]

'assistant:  <turn>\nGoal: Buy a high quality brush set, and price lower than 30.00 dollars\nCurrent Location: Search Home Page\nCurrent Selection: None\nAction: search[high quality brush set <30]\nuser: \n[Back to Search] \nPage 1 (Total results: 50) \n[Next >] \n[B07HMRCPFQ] \nClearance! Lmtime High-End 18 pcs Makeup Brush Set tools Make-up Toiletry Kit Wool Make Up Brush Set (Purple) \n$2.99 \n[B07PWW6PKQ] \nBEILI Makeup Brushes 30Pcs Professional Makeup Brush Set Premium Pink Vegan Synthetic Kabuki Foundation Blending Brush Face Powder Blush Concealers Eye Shadows Make Up Brush Set (White/Rose Gold) \n$42.99 \n[B01G6AUR32] \nvela.yue Precise Crease Brush Eyes Tapered Blending Contour Makeup Tool \n$5.99\nassistant:  <turn>\nGoal: Buy a high quality brush set, and price lower than 30.00 dollars\nCurrent Location: Search Results Page, page 1\nCurrent Selection: None\nAction: click[B07HMRCPFQ]\nuser: \n[Back to Search] \n[< Prev] \ncolor [black][pink][purple]\nClearance! Lmtime High-E

In [26]:
# -*- coding: utf-8 -*-
import ast, json, re, pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer

# ------------------ 0) Config ------------------
INPUT_CSV      = "data/rag_train_set_difficulty_scoring.csv"
MODEL_NAME     = "Qwen/Qwen2.5-14B-Instruct"

USE_HANDPICKED = False
TASK_SENTINEL  = "Here is the task"

# ------------------ 1) Load & basic filtering ------------------
df = pd.read_csv(INPUT_CSV)
# Keep your filter if you want only good-ish runs; for STeP-style you can choose to relax this.
df = df[df["success"] == 1.0]
df = df[df["total_reward"] >= 1].reset_index(drop=True)

# ------------------ 2) Helpers ------------------
def _json_or_ast_load(s: str):
    if isinstance(s, (list, dict)): return s
    s = s.strip()
    try: return json.loads(s)
    except Exception: return ast.literal_eval(s)

def strip_fewshot_from_system(sys_text: str) -> str:
    if not sys_text: return sys_text
    if TASK_SENTINEL in sys_text:
        head = sys_text.split("\n", 1)[0]
        tail = sys_text.split(TASK_SENTINEL, 1)[1]
        return (head + "\n\n" + TASK_SENTINEL + tail).strip()
    return sys_text.strip()

# Simple heuristic for “incorrect” assistant steps (customize as you like)
def is_incorrect_assistant(content: str) -> bool:
    if not content: return True
    if "Invalid action!" in content: return True
    if "click[Back to Search]" in content: return True
    return False

# ------------------ 3) Tokenizer (Qwen chat) ------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
try: tokenizer.truncation_side = "left"
except: pass
tokenizer.padding_side = "right"

# ------------------ 4) Build ONE SAMPLE PER TRAJECTORY ------------------
# Emits:
#   text            -> full episode with "<|im_start|>role ... <|im_end|>"
#   assistant_spans -> list of {"start_char","end_char","correct":bool,"idx":int}
#
# NOTE: we *manually* compose Qwen-style blocks so we can track exact char offsets.
# This matches Qwen’s standard chat markers and works well for span bookkeeping.
def build_full_episode_with_spans(trace):
    # 1) Normalize + optional system trimming
    sys_msg = next((m.get("content","") for m in trace if m.get("role")=="system"), "")
    sys_msg = strip_fewshot_from_system(sys_msg)

    # 2) Rebuild the full message list in order, including system first (if present)
    msgs = []
    if sys_msg:
        msgs.append({"role": "system", "content": sys_msg})
    # keep original order of user/assistant turns
    for m in trace:
        role = m.get("role")
        if role not in ("user", "assistant"): 
            continue
        content = (m.get("content") or "").strip()
        msgs.append({"role": role, "content": content})

    # 3) Serialize with Qwen markers and record assistant spans
    text_parts = []
    assistant_spans = []
    cursor = 0
    assistant_idx = 0

    for m in msgs:
        role = m["role"]
        content = m["content"]

        # Qwen-style block
        prefix = f"<|im_start|>{role}\n"
        body   = content
        suffix = "<|im_end|>\n"

        block = prefix + body + suffix
        # span bookkeeping for assistant
        if role == "assistant":
            start_char = cursor + len(prefix)
            end_char   = start_char + len(body)
            assistant_spans.append({
                "start_char": start_char,
                "end_char":   end_char,
                "correct":    (not is_incorrect_assistant(content)),
                "idx":        assistant_idx,   # 0-based assistant turn index
            })
            assistant_idx += 1

        text_parts.append(block)
        cursor += len(block)

    full_text = "".join(text_parts)

    # (Optional) Add a final generation cue if you’d like the model to continue after the last block:
    # full_text += "<|im_start|>assistant\n"   # <-- only if you want to *generate* one more step

    return full_text, assistant_spans

def row_to_full_episode(row):
    trace = _json_or_ast_load(row.get("orig_trace", "[]"))
    if not isinstance(trace, list) or not trace:
        return None, None
    text, spans = build_full_episode_with_spans(trace)
    return text, spans

# Build dataset rows
records = []
for _, row in df.iterrows():
    text, spans = row_to_full_episode(row)
    if not text: 
        continue
    # Keep ALL steps (including incorrect) — do NOT filter them out.
    # You’ll decide masking in your collator later.
    records.append({
        "text": text,
        "assistant_spans": json.dumps(spans, ensure_ascii=False),
        # (Optional) carry episode-level metadata you might want later:
        # "category": row.get("category", None),
        # "total_reward": float(row.get("total_reward", 0)),
        # "success": int(row.get("success", 0)),
    })

df_out = pd.DataFrame.from_records(records)
print(f"Built trajectories: {len(df_out)}")

# ------------------ 5) Optional handpicked top-20 per category ------------------
if USE_HANDPICKED:
    raise NotImplementedError("Apply handpicked filtering BEFORE constructing full trajectories.")

# ------------------ 6) HF dataset ------------------
# One item per trajectory, with full text + per-assistant spans
dataset = Dataset.from_pandas(df_out[["text", "assistant_spans"]].copy())
hf_ds = dataset

# Quick peek
print(hf_ds[0]["text"][:500])
print(hf_ds[0]["assistant_spans"])


Built trajectories: 444
<|im_start|>system
You are a shopping assistant. Buy the closest item to the instruction provided. 

Here is the task.<|im_end|>
<|im_start|>assistant
reset<|im_end|>
<|im_start|>user
WebShop 
Instruction:  
i am interested in a high quality brush set, and price lower than 50.00 dollars 
[Search]<|im_end|>
<|im_start|>assistant
<turn>
Goal: Buy a high quality brush set, and price lower than 50.00 dollars
Current Location: Search Home Page
Current Selection: None
Action: search[high quality brush
[{"start_char": 151, "end_char": 156, "correct": true, "idx": 0}, {"start_char": 330, "end_char": 505, "correct": true, "idx": 1}, {"start_char": 1103, "end_char": 1276, "correct": true, "idx": 2}, {"start_char": 1607, "end_char": 1770, "correct": true, "idx": 3}]


In [30]:
dataset

Dataset({
    features: ['prompt', 'completion'],
    num_rows: 403
})

In [36]:
df = pd.read_csv("data/train_set_difficulty_scoring.csv") #VANILLA DATASET
# df = pd.read_csv("data/react_train_set_difficulty_scoring.csv") #VANILLA DATASET
# df = pd.read_csv("data/rag_train_set_difficulty_scoring.csv") #RAG DATASET

use_handpicked = False
# use_handpicked = True

df = df[df.success==1.0]
df = df[df.total_reward>=1.0]
# df = df[df.num_of_steps<=9]
# df = df[df.num_of_steps>5]

df.reset_index(inplace=True, drop=True)

def calc_successes(df):
    # Group by env_type
    grouped = df.groupby("category")["total_reward"]
    
    # Compute success rate and count
    success_stats = grouped.agg(
        reward_average=lambda x: x.mean(),
        count="count"
    ).round(2)
    
    # Print
    print("Average Reward and Count by category:")
    print(success_stats.to_string())


#  Invalid action!
def extract_whole_trace(example):
    trace = ast.literal_eval(example.orig_trace)

    out_lines = []
    for m in trace:
        role = m["role"]
        content = m["content"].strip()
        if role == "system":
            out_lines.append(f"system: {content}")
        elif role == "user":
            out_lines.append(f"user: {content}")
        elif role == "assistant":
            out_lines.append(f"assistant: {content}")
        else:
            out_lines.append(f"{role}: {content}")  # fallback

    # First line(s) = prompt
    sys_msg = next(m["content"] for m in trace if m["role"]=="system")
    usr_msg = next(m["content"] for m in trace if m["role"]=="user")
    prompt  = f"{sys_msg.strip()}\n{usr_msg.strip()}\n"

    # The rest is the completion (assistant + further user turns)
    rest     = trace[2:]
    completion = "".join(
        (f"{m['role']}: {m['content'].strip()}\n") for m in rest
        # (f"{m['content'].strip()}\n") for m in rest
    )
    # completion = "".join(m["content"] + "\n" for m in rest)

    if not completion.endswith(""):
        completion += ""

    match = re.findall("Invalid action!", completion)
    # print(match)
    
    return {
        "prompt": prompt,
        "completion": completion,
        "nothing_occ": len(match)
    }
    
# Apply to your dataframe
df[["prompt", "completion", "nothing_occ"]] = df.apply(
    lambda row: pd.Series(extract_whole_trace(row)), axis=1
)

removed_examples = len(df[df.nothing_occ != 0])
df = df[df.nothing_occ == 0]
df.reset_index(drop=True, inplace=True)

print("\n"+"*"*50 + f"\nTotal removed examples (with Invalid Action!): {removed_examples}\n" + "*"*50)

if use_handpicked:
    # df = df[df.success==1].copy()
    df = df[df.total_reward >= 1.0]
    df['expert_difficulty'] = pd.to_numeric(df['expert_difficulty'], errors='coerce')
    df = df.dropna(subset=['expert_difficulty'])
    df.reset_index(inplace=True, drop=True)
    
    df = (
        df.sort_values(['category', 'expert_difficulty'], ascending=[True, False])
          .groupby('category', as_index=False, group_keys=False)
          .head(20)
    )
    df.reset_index(inplace=True, drop=True)
print(len(df))
# df[["prompt", "completion", "nothing_occ"]] = df.apply(lambda row: pd.Series(extract_whole_trace(row)), axis=1)

calc_successes(df)


**************************************************
Total removed examples (with Invalid Action!): 40
**************************************************
403
Average Reward and Count by category:
             reward_average  count
category                          
beauty                  1.0    102
electronics             1.0     70
fashion                 1.0     87
food                    1.0     69
furniture               1.0     75


In [37]:
df.prompt[0]



In [38]:
# Apply to entire DataFrame
df["prompt"] = df["prompt"].apply(remove_few_shot_examples)

In [8]:
df.prompt[0]

AttributeError: 'DataFrame' object has no attribute 'prompt'

In [40]:
# dataset = Dataset.from_pandas(df)
dataset = Dataset.from_pandas(df.drop(columns=["success", "total_reward"]))
dataset

Dataset({
    features: ['trace_file', 'num_of_steps', 'done', 'error', 'early_stop', 'total_prompt_token', 'total_in_token_accumulated', 'total_in_token_message_accumulated', 'total_out_token_accumulated', 'total_history_token', 'total_hint_prompt_token', 'total_hint_completion_token', 'expert_difficulty', 'category', 'expert_traces', 'orig_trace', 'prompt', 'completion', 'nothing_occ'],
    num_rows: 403
})

In [69]:
df.completion[0]

"user: WebShop \nInstruction:  \ni need a quick drying running shorts with drawstring closure. it should be light grayish blue in color, and price lower than 60.00 dollars \n[Search]\nassistant: <turn>\nGoal: Buy quick drying running shorts with drawstring closure, light grayish blue in color, and price lower than 60.00 dollars\nCurrent Location: Search Home Page\nCurrent Selection: None\nAction: search[quick drying running shorts light grayish blue]\nuser: [Back to Search] \nPage 1 (Total results: 50) \n[Next >] \n[B08DKM7Y6G] \nCRZ YOGA Women's Mid-Rise Quick Dry Workout Running Shorts Loose Drawstring Athletic Gym Shorts with Zip Pocket -3 Inches \n$28.0 \n[B08DR3PS84] \nCRZ YOGA Women Mid-Rise Workout Running Shorts with Liner 2 in 1 Athletic Sports Shorts with Zip Pocket- 3 inches \n$28.0 \n[B07D6ZRG7B] \nCRZ YOGA Quick-Dry Loose Running Shorts Mid Waist Sports Workout Shorts for Women Gym Athletic Shorts with Pocket - 2.5'' \n$28.0\nassistant: <turn>\nGoal: Buy quick drying runni

In [62]:
print("Number of Steps Statistics\n***************************")
print(f"""Min {df.num_of_steps.min()}
25th {df.num_of_steps.quantile(.25)}
Median {df.num_of_steps.median()}
Mean {df.num_of_steps.mean()}
75th {df.num_of_steps.quantile(.75)}
Max {df.num_of_steps.max()}
""")
# df.num_of_steps.median()

Number of Steps Statistics
***************************
Min 4
25th 4.0
Median 5.0
Mean 5.142493638676845
75th 6.0
Max 9



In [37]:
# 6. (Optional) Split into train/validation
# Here we do an 90/10 split; adjust 'seed' for reproducibility
dataset = dataset.train_test_split(test_size=0.1, seed=42)

train_ds = dataset["train"]
val_ds   = dataset["test"]


In [2]:
dataset

Dataset({
    features: ['prompt', 'completion'],
    num_rows: 1937
})

In [7]:
from unsloth import FastLanguageModel
from unsloth.chat_templates import train_on_responses_only

def get_model_and_tokenizer(
        model_name="Qwen/Qwen2.5-14B-Instruct", 
        max_seq_length=1024,
        load_in_4bit=True, 
        gpu_memory_utilization=0.99,
        lora_rank=16,
        random_state=1234,
        get_peft=True,
        # lora_alpha=64
    ):
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = model_name ,
        max_seq_length = max_seq_length,
        load_in_4bit = load_in_4bit, # False for LoRA 16bit
        load_in_8bit = False,    # A bit more accurate, uses 2x memory
        full_finetuning = False, # We have full finetuning now!
        fast_inference = False, # Enable vLLM fast inference
        max_lora_rank = lora_rank,
        gpu_memory_utilization = gpu_memory_utilization, # Reduce if out of memory
        # device_map="cuda" ,               # or {"": 0} to pin to GPU 0
        device_map="auto",               # pin everything to GPU 0
        # trust_remote_code=True,
    )

    if get_peft:
        model = FastLanguageModel.get_peft_model(
            model,
            r = lora_rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
            target_modules = [
                "q_proj", "k_proj", "v_proj", "o_proj",
                "gate_proj", "up_proj", "down_proj",
            ], # Remove QKVO if out of memory
            # lora_alpha = lora_rank,
            lora_alpha = lora_rank*2,
            
            lora_dropout = 0.2,                    # add dropout (try 0.1–0.2)
            
            use_gradient_checkpointing = "unsloth", # Enable long context finetuning
            random_state = random_state,
        )
    
    return model, tokenizer


Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 08-31 22:09:22 [__init__.py:235] Automatically detected platform cuda.


In [8]:
from trl import SFTTrainer, SFTConfig
from transformers import DataCollatorForSeq2Seq

def get_sft_trainer(model, tokenizer, dataset, eval_dataset=None, training_steps=10, collator=None, type_field="text"):
    training_args = SFTConfig(
        dataset_text_field = type_field,
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4, # Use GA to mimic batch size!
        # warmup_steps = 5,
        # warmup_steps = 10,
        warmup_ratio = 0.1,
        
        num_train_epochs = 1, # Set this for 1 full training run.
        # max_steps = training_steps,

        label_smoothing_factor=0.1,
        learning_rate = 2e-4, # Reduce to 2e-5 for long training runs
        # learning_rate = 2e-5, # Reduce to 2e-5 for long training runs
        logging_steps = 1,
        optim = "adamw_8bit",
        
        # weight_decay = 0.01,
        weight_decay = 0.05, #increase to maybe improve
        
        lr_scheduler_type = "linear",
        # lr_scheduler_type = "cosine",
        seed = 42,
        
        # eval_strategy="steps",
        # eval_steps=3,                      # <-- frequency
        
        # report_to = "none", # Use this for WandB etc
    )
    
    trainer = SFTTrainer(
        model = model,
        tokenizer = tokenizer,
        train_dataset = dataset,
        eval_dataset = eval_dataset, # Can set up evaluation!
        args = training_args,
        data_collator = collator,
    )
    return trainer

In [19]:
import torch
import gc

try:
    del model
    del tokenizer
except:
    pass

torch.cuda.empty_cache()
gc.collect()


294

In [9]:
# model, tokenizer = get_model_and_tokenizer(max_seq_length=16000)
# model, tokenizer = get_model_and_tokenizer()
model, tokenizer = get_model_and_tokenizer(model_name="Qwen/Qwen2.5-7B-Instruct")


==((====))==  Unsloth 2025.8.1: Fast Qwen2 patching. Transformers: 4.55.0. vLLM: 0.10.0.
   \\   /|    NVIDIA A100-SXM4-80GB. Num GPUs = 1. Max memory: 79.254 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 8.0. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.31. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.18it/s]
Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.2.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.8.1 patched 28 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


In [10]:
# Prepare HF datasets
def merge_prompt_completion(example):
    # Make sure your completion already ends with EOS
    return {"text": example["prompt"] + example["completion"]}

try:
    hf_ds = dataset.map(
    merge_prompt_completion,
    remove_columns=[c for c in dataset["train"].column_names if c not in ("text",)],
    )
except:
    hf_ds = dataset.map(
        merge_prompt_completion,
        remove_columns=[c for c in dataset.column_names if c not in ("text",)],
    )



collator = None

Map: 100%|██████████| 380/380 [00:00<00:00, 27867.18 examples/s]


In [12]:
dataset

Dataset({
    features: ['prompt', 'completion'],
    num_rows: 341
})

In [16]:
# hf_ds['text']

In [16]:
hf_ds["text"][0]

"<|im_start|>system\nYou are a shopping assistant. Buy the closest item to the instruction provided. \n\nHere is the task.<|im_end|>\n<|im_start|>user\nWebShop \nInstruction:  \ni am interested in a high quality brush set, and price lower than 30.00 dollars \n[Search]<|im_end|>\n<|im_start|>assistant\nassistant:  <turn>\nAction: search[high quality brush set < $30]\nuser: \n[Back to Search] \nPage 1 (Total results: 50) \n[Next >] \n[B07HMRCPFQ] \nClearance! Lmtime High-End 18 pcs Makeup Brush Set tools Make-up Toiletry Kit Wool Make Up Brush Set (Purple) \n$2.99 \n[B07PWW6PKQ] \nBEILI Makeup Brushes 30Pcs Professional Makeup Brush Set Premium Pink Vegan Synthetic Kabuki Foundation Blending Brush Face Powder Blush Concealers Eye Shadows Make Up Brush Set (White/Rose Gold) \n$42.99 \n[B01G6AUR32] \nvela.yue Precise Crease Brush Eyes Tapered Blending Contour Makeup Tool \n$5.99\nassistant:  \n<turn>\nAction: think[The only brush set under 30 dollars is B07HMRCPFQ. Let's check it out.]\nus

In [18]:
from unsloth.chat_templates import train_on_responses_only

# hf_ds = dataset

print("\n\n================\TRAINING SFT")

collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True)

# sft_trainer = get_sft_trainer(model, tokenizer, hf_ds, training_steps=15, collator=collator, type_field="text")
sft_trainer = get_sft_trainer(model, tokenizer, hf_ds, training_steps=15, collator=collator, type_field="text")

sft_trainer = train_on_responses_only(
    sft_trainer,
    instruction_part = "<|im_start|>user",
    response_part    = "<|im_start|>assistant",
    force_match      = True,
)





Unsloth: Tokenizing ["text"] (num_proc=2): 100%|██████████| 341/341 [00:00<00:00, 351.49 examples/s]
Map (num_proc=255): 100%|██████████| 341/341 [00:03<00:00, 86.23 examples/s] 


In [11]:
print("\n\n================\TRAINING SFT")

try:
    sft_trainer = get_sft_trainer(model, tokenizer, hf_ds["train"], eval_dataset=hf_ds["test"], training_steps=15, collator=collator)
except:
    sft_trainer = get_sft_trainer(model, tokenizer, hf_ds, training_steps=15, collator=collator)





Unsloth: Tokenizing ["text"] (num_proc=2): 100%|██████████| 380/380 [00:01<00:00, 374.19 examples/s]


In [12]:
sft_trainer.train()
print("\n\n\n\nDone.")

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 380 | Num Epochs = 1 | Total steps = 48
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 40,370,176 of 7,655,986,688 (0.53% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,19.5971
2,18.9335
3,18.9509
4,18.3462
5,16.3221
6,14.829
7,14.0591
8,13.4077
9,13.3268
10,12.8114






Done.


In [13]:
# wherever you want to save:

# out_dir = "models/14B_vanilla_all"
# out_dir = "models/14B_vanilla_20handpicked"

# out_dir = "models/14B_rag_all"
# out_dir = "models/14B_rag_20handpicked"

# out_dir = "models/7B_vanilla_all"
# out_dir = "models/7B_vanilla_20handpicked"

# out_dir = "models/7B_rag_all"
# out_dir = "models/7B_rag_20handpicked"


# out_dir = "models/14B_vanilla_all"
# out_dir = "models/14B_vanilla_20handpicked"

# out_dir = "models/14B_vanilla_all_smoothing"



# out_dir = "models/14B_state_rag_self_smoothing"
# out_dir = "models/14B_react_rag_self_smoothing"



# out_dir = "models/7B_react_self_smoothing"
# out_dir = "models/7B_react_rag_self_smoothing"

out_dir = "models/7B_state_rag_self_smoothing"


# write model + tokenizer
model.save_pretrained(out_dir)
tokenizer.save_pretrained(out_dir)
sft_trainer.save_model(out_dir)               # saves model, config, trainer state

In [22]:
# model.save_lora("qwen3b-sft")

In [None]:
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

base_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-14B-Instruct", trust_remote_code=True)
tokenizer  = AutoTokenizer.from_pretrained(out_dir, trust_remote_code=True)

# adapter_config is picked up automatically from output_dir
model = PeftModel.from_pretrained(base_model, out_dir)


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [3]:
from pathlib import Path, PurePath
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer


lora_path = Path("/rds/general/user/hai24/home/StateAct/alfworld_runs/models/14B_full_hard-abovemean_nofewshot_fixed")

# lora_path = Path("/rds/general/user/hai24/home/StateAct/alfworld_runs/models/14B_full_15steps_nofewshot_fixed_sanity")

assert lora_path.exists(), f"{lora_path} missing!"

base_id = "Qwen/Qwen2.5-14B-Instruct"     # ← swap to the correct base
model = AutoModelForCausalLM.from_pretrained(base_id,
                                             device_map="auto",
                                             torch_dtype="auto")
model = PeftModel.from_pretrained(model, lora_path, torch_dtype="auto")
print("Loaded OK ✓")


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


Loaded OK ✓
