In [None]:
# ============================================================
# 0. Setup
# ============================================================
!pip install -q pandas numpy sentence-transformers faiss-cpu scikit-learn matplotlib transformers accelerate peft bitsandbytes

import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sentence_transformers import SentenceTransformer
import faiss

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_log_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# For reproducibility: seed every RNG this notebook draws from.
# NumPy covers the numpy-based code; torch covers the sampled LLM decoding
# (`do_sample=True` in qwen_generate), which would otherwise differ run to run.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)

# ============================================================
# 1. Data loading and preprocessing (candidate pool)
# ============================================================

DATA_PATH = "marketing_sample_for_ebay_com-ebay_com_product__20210101_20210331__30k_data.csv"

# Read with the python parser; rows it cannot parse are skipped rather than
# aborting the load.
df_raw = pd.read_csv(DATA_PATH, sep=",", engine="python", on_bad_lines="skip")

# Quick look at what was loaded.
print("Raw shape:", df_raw.shape)
print(df_raw.head(2))
print(df_raw.columns.tolist())

# Work on a copy whose headers are normalized: trimmed, lower-cased, and with
# spaces turned into underscores (e.g. "Model Name" -> "model_name").
df = df_raw.copy()
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_", regex=False)

# ---------- helpers ----------
def parse_price(x):
    """Convert a raw price cell to a float.

    Dollar signs and comma separators are removed before conversion.
    Missing values and strings that are not a valid float yield ``np.nan``.
    """
    if pd.isna(x):
        return np.nan

    cleaned = str(x)
    for junk in ("$", ","):
        cleaned = cleaned.replace(junk, "")

    try:
        value = float(cleaned.strip())
    except ValueError:
        value = np.nan
    return value

def safe_str(x):
    """Return ``str(x)``, or an empty string when ``x`` is missing (NaN/None)."""
    if pd.isna(x):
        return ""
    return str(x)

# Descriptive listing fields that are concatenated into the text used for
# embedding, in the order they appear in the final string.
TEXT_FIELDS = [
    "Title",
    "Manufacturer",
    "Model Name",
    "Internal Memory",
    "Screen Size",
    "Carrier",
    "Color Category",
    "Specifications",
    "Model Num",
]


def _row_field(row, column):
    """Fetch ``column`` from ``row`` as a string, tolerating both header styles.

    The loader renames the DataFrame headers to lower-case with underscores,
    so that form is tried first; the original header spelling is the fallback.
    Missing keys and missing values both come back as an empty string.
    """
    value = row.get(column.strip().lower().replace(" ", "_"))
    if value is None:
        value = row.get(column, "")
    return safe_str(value)


def build_text(row):
    """Join the non-empty descriptive fields of one listing with single spaces.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row) supporting ``.get``.

    Returns
    -------
    str
        The concatenated text; empty when none of the fields has a value.
    """
    parts = [_row_field(row, column) for column in TEXT_FIELDS]
    return " ".join([p for p in parts if p])


def clean_text(txt):
    """Lower-case ``txt``, replace punctuation with spaces, collapse whitespace."""
    txt = str(txt).lower()
    txt = re.sub(r"[^\w\s]", " ", txt)
    txt = re.sub(r"\s+", " ", txt)
    return txt.strip()


def parse_rating(x):
    """Convert a rating cell such as ``"98.5%"`` to a float.

    Missing values and strings that are not a valid float (after removing
    ``%``) yield ``np.nan``.
    """
    if pd.isna(x):
        return np.nan
    s = str(x).strip().replace("%", "")
    try:
        return float(s)
    except ValueError:
        return np.nan


def _numeric_column(frame, column):
    """Return ``frame[column]`` coerced to numbers, or all-NaN if it is absent."""
    if column in frame.columns:
        return pd.to_numeric(frame[column], errors="coerce")
    return pd.Series(np.nan, index=frame.index, dtype="float64")


# NOTE: every column below is addressed by its normalized name because the
# loader rewrites df.columns (strip / lower / spaces -> underscores).

# ---------- numeric target ----------
df["price_num"] = df["price"].apply(parse_price)
# .copy() so the column assignments below write to an independent frame
# instead of a filtered view of the previous one.
df = df[df["price_num"].notnull() & (df["price_num"] > 0)].copy()
print("After price filter:", df.shape)

# ---------- text fields ----------
df["raw_text"] = df.apply(build_text, axis=1)
df["text_clean"] = df["raw_text"].apply(clean_text)
print(df[["raw_text", "text_clean"]].head())

# ---------- extra numeric features ----------
if "seller_rating" in df.columns:
    df["seller_rating_num"] = df["seller_rating"].apply(parse_rating)
else:
    df["seller_rating_num"] = np.nan
df["seller_num_reviews_num"] = _numeric_column(df, "seller_num_of_reviews")
df["num_reviews_num"] = _numeric_column(df, "num_of_reviews")
df["num_ratings_num"] = _numeric_column(df, "number_of_ratings")
df["num_avg_rating_num"] = _numeric_column(df, "average_rating")

# ---------- unique id ----------
# "Uniq Id" is already called "uniq_id" after header normalization.
if "uniq_id" in df.columns:
    df["uniq_id"] = df["uniq_id"].astype(str)
else:
    df["uniq_id"] = df.index.astype(str)

feature_cols = [
    "uniq_id",
    "price_num",
    "text_clean",
    "seller_rating_num",
    "seller_num_reviews_num",
    "num_reviews_num",
    "num_ratings_num",
    "num_avg_rating_num",
]

df_final = df[feature_cols].reset_index(drop=True)
print("Final usable data:", df_final.shape)
df_final.head()

# ============================================================
# 2. Train / test split (candidate pool -> train, held‑out -> test)
# ============================================================

# 80/20 split with a fixed seed so the partition is repeatable.
train_df, test_df = train_test_split(df_final, test_size=0.2, random_state=RANDOM_STATE)
print("Train:", train_df.shape, "Test:", test_df.shape)

# Persist both partitions (without the index column).
for _split_frame, _split_path in (
    (train_df, "ebay_train_clean.csv"),
    (test_df, "ebay_test_clean.csv"),
):
    _split_frame.to_csv(_split_path, index=False)

# ============================================================
# 3. Text embeddings + FAISS index (similar product retrieval)
# ============================================================

EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embed_model = SentenceTransformer(EMBED_MODEL_NAME)

train_texts = train_df["text_clean"].tolist()
test_texts  = test_df["text_clean"].tolist()

# Both splits are encoded with the same settings; embeddings are normalized.
_ENCODE_OPTIONS = {
    "batch_size": 64,
    "show_progress_bar": True,
    "convert_to_numpy": True,
    "normalize_embeddings": True,
}
train_embeddings = embed_model.encode(train_texts, **_ENCODE_OPTIONS)
test_embeddings = embed_model.encode(test_texts, **_ENCODE_OPTIONS)

print("Train embeddings:", train_embeddings.shape)
print("Test embeddings:", test_embeddings.shape)

# Flat L2 index over the training embeddings only; row i of the index
# corresponds to row i (positional) of train_df.
dim = train_embeddings.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(train_embeddings.astype("float32"))
print("FAISS index size:", index.ntotal)

def retrieve_similar_for_text(query_text, top_k=5):
    """Return the training listings most similar to ``query_text``.

    The query is embedded with ``embed_model`` (normalized, float32) and
    searched against the FAISS ``index`` built from the training embeddings.

    Parameters
    ----------
    query_text : str
        Text to search for.
    top_k : int
        Maximum number of neighbours to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``uniq_id``, ``text_clean``, ``price_num`` with a fresh
        0..n-1 index, ordered as returned by the index. May contain fewer
        than ``top_k`` rows when the index holds fewer vectors.
    """
    emb = embed_model.encode(
        [query_text],
        convert_to_numpy=True,
        normalize_embeddings=True,
    ).astype("float32")
    _, neighbor_ids = index.search(emb, top_k)

    # FAISS fills missing neighbours with -1; passing that to iloc would
    # silently return the last training row instead of "no result".
    hits = neighbor_ids[0]
    hits = hits[hits >= 0]

    sims = train_df.iloc[hits][["uniq_id", "text_clean", "price_num"]].reset_index(drop=True)
    return sims

# ============================================================
# 4. Traditional baselines (KNN, Ridge, RF, GBM, MLP)
# ============================================================

# Regression targets (prices) as plain NumPy arrays.
y_train = train_df["price_num"].to_numpy()
y_test  = test_df["price_num"].to_numpy()

def eval_regressor(model_name, model, X, y_true):
    """Score a fitted regressor on ``X`` and print/return RMSLE and MAE.

    Predictions are floored at 1e-6 for the RMSLE computation only; MAE uses
    the raw predictions.

    Returns a dict with keys ``name``, ``rmsle``, ``mae``, ``y_true`` and
    ``y_pred`` (the raw predictions).
    """
    predictions = model.predict(X)

    floored = np.maximum(predictions, 1e-6)
    rmsle = np.sqrt(mean_squared_log_error(y_true, floored))
    mae = mean_absolute_error(y_true, predictions)

    print(f"{model_name}: RMSLE={rmsle:.4f}, MAE={mae:.2f}")
    return dict(name=model_name, rmsle=rmsle, mae=mae, y_true=y_true, y_pred=predictions)

# Each baseline is fitted on the training embeddings and scored on the test
# embeddings. `fit` returns the estimator itself, so construction and fitting
# are chained.

# --- KNN baseline ---
knn = KNeighborsRegressor(n_neighbors=5, metric="euclidean", weights="distance").fit(
    train_embeddings, y_train
)
knn_results = eval_regressor("KNN", knn, test_embeddings, y_test)

# --- Ridge regression baseline ---
ridge = Ridge(alpha=1.0, random_state=RANDOM_STATE).fit(train_embeddings, y_train)
ridge_results = eval_regressor("Ridge", ridge, test_embeddings, y_test)

# --- Random Forest baseline ---
rf = RandomForestRegressor(
    n_estimators=300, max_depth=None, min_samples_leaf=5, n_jobs=-1, random_state=RANDOM_STATE
).fit(train_embeddings, y_train)
rf_results = eval_regressor("RandomForest", rf, test_embeddings, y_test)

# --- Gradient Boosting baseline ---
gbr = GradientBoostingRegressor(
    n_estimators=400, learning_rate=0.05, max_depth=3, random_state=RANDOM_STATE
).fit(train_embeddings, y_train)
gbr_results = eval_regressor("GradientBoosting", gbr, test_embeddings, y_test)

# --- Multimodal DNN baseline ---
# Input = text embedding concatenated with the numeric seller/review columns
# prepared during preprocessing.
# NOTE(review): this feature set is reconstructed from the columns kept in
# df_final; confirm it matches the intended "multimodal" input.
MM_NUMERIC_COLS = [
    "seller_rating_num",
    "seller_num_reviews_num",
    "num_reviews_num",
    "num_ratings_num",
    "num_avg_rating_num",
]

# Imputation values come from the training split only, so nothing from the
# test split leaks into preprocessing.
_mm_fill_values = train_df[MM_NUMERIC_COLS].astype("float64").median()


def _multimodal_matrix(frame, embeddings):
    """Stack ``embeddings`` with the imputed numeric columns of ``frame``."""
    numeric = (
        frame[MM_NUMERIC_COLS]
        .astype("float64")
        .fillna(_mm_fill_values)
        .fillna(0.0)  # column entirely missing in train -> median is NaN
    )
    return np.hstack([embeddings, numeric.to_numpy()])


X_train_mm = _multimodal_matrix(train_df, train_embeddings)
X_test_mm = _multimodal_matrix(test_df, test_embeddings)

mm_dnn = Pipeline([
    ("scaler", StandardScaler(with_mean=True, with_std=True)),
    ("mlp", MLPRegressor(
        hidden_layer_sizes=(256, 128),
        activation="relu",
        solver="adam",
        learning_rate_init=1e-3,
        max_iter=50,
        batch_size=256,
        random_state=RANDOM_STATE,
        early_stopping=True,
        n_iter_no_change=5,
    )),
])

mm_dnn.fit(X_train_mm, y_train)
y_pred_mm = mm_dnn.predict(X_test_mm)

# Same metric definitions as the other baselines: predictions are floored at
# 1e-6 for RMSLE only.
rmsle_mm = np.sqrt(mean_squared_log_error(y_test, np.maximum(y_pred_mm, 1e-6)))
mae_mm = mean_absolute_error(y_test, y_pred_mm)
print(f"Multimodal DNN: RMSLE={rmsle_mm:.4f}, MAE={mae_mm:.2f}")

multimodal_results = {
    "name": "Multimodal DNN",
    "rmsle": rmsle_mm,
    "mae": mae_mm,
    "y_true": y_test,
    "y_pred": y_pred_mm,
}
# ============================================================
# 5. Qwen setup (LLM pricing baselines)
# ============================================================

QWEN_MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"

# Fast tokenizer; the end-of-sequence token doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained(QWEN_MODEL_NAME, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

# dtype and device placement are both left to the library ("auto").
qwen_model = AutoModelForCausalLM.from_pretrained(QWEN_MODEL_NAME, torch_dtype="auto", device_map="auto")

def build_qwen_prompt_with_retrieval(target_text, similar_df):
    """Build the pricing prompt that includes retrieved reference listings.

    Parameters
    ----------
    target_text : str
        Description of the product to price.
    similar_df : pandas.DataFrame
        Reference listings; must provide ``text_clean`` and ``price_num``.

    Returns
    -------
    str
        Prompt ending in ``"Answer:"``. Reference listings are numbered
        1..n in row order, regardless of the DataFrame's index labels.
    """
    # Number by position, not by index label: a frame that was filtered or
    # sliced without reset_index would otherwise produce wrong (or, for
    # non-integer labels, failing) numbering.
    lines = [
        f"{rank}. Title: {title}\n   Final price: {price:.2f} USD"
        for rank, (title, price) in enumerate(
            zip(similar_df["text_clean"], similar_df["price_num"]), start=1
        )
    ]
    context_block = "\n".join(lines)
    prompt = (
        "You are an assistant that suggests fair prices for second-hand electronics.\n"
        "Given a target product and similar products with their final prices, "
        "predict a reasonable price for the target product in USD.\n"
        "Return ONLY one positive number (no currency symbol, no explanation).\n\n"
        f"Target product:\n{target_text}\n\n"
        "Similar products:\n"
        f"{context_block}\n\n"
        "Answer:"
    )
    return prompt

def build_qwen_prompt_no_retrieval(target_text):
    """Build the pricing prompt that contains only the product description.

    The returned string ends with ``"Answer:"`` so the model's continuation
    is the price itself.
    """
    prompt_lines = [
        "You are an assistant that suggests fair prices for second-hand electronics.",
        "Estimate a reasonable market price in USD for the product below.",
        "Respond with only one positive number (no currency symbol, no explanation).",
        "",
        "Product description:",
        f"{target_text}",
        "",
        "Answer:",
    ]
    return "\n".join(prompt_lines)

def qwen_generate(text, max_new_tokens=16, temperature=0.2):
    """Generate a continuation of ``text`` with the Qwen model.

    Parameters
    ----------
    text : str
        Prompt to continue.
    max_new_tokens : int
        Upper bound on the number of generated tokens.
    temperature : float or None
        Sampling temperature. A positive value samples at that temperature;
        ``0``, a negative value or ``None`` switches to greedy decoding
        (sampling requires a strictly positive temperature).

    Returns
    -------
    str
        The newly generated text only (prompt removed), stripped of
        surrounding whitespace.
    """
    inputs = tokenizer(text, return_tensors="pt").to(qwen_model.device)

    gen_kwargs = {
        "max_new_tokens": max_new_tokens,
        "pad_token_id": tokenizer.eos_token_id,
    }
    if temperature is not None and temperature > 0:
        gen_kwargs["do_sample"] = True
        gen_kwargs["temperature"] = temperature
    else:
        # Deterministic decoding instead of an error from generate().
        gen_kwargs["do_sample"] = False

    with torch.no_grad():
        out = qwen_model.generate(**inputs, **gen_kwargs)

    # generate() returns prompt + continuation; keep only the continuation.
    prompt_len = inputs["input_ids"].shape[1]
    gen_ids = out[0, prompt_len:]
    resp = tokenizer.decode(gen_ids, skip_special_tokens=True)
    return resp.strip()

# First number in the text. The first alternative accepts thousands
# separators ("1,299.99"); without it such an answer would be read as "1".
# The (?!\d) guard keeps malformed groupings like "1,2345" from matching it.
price_pattern = re.compile(r"\d{1,3}(?:,\d{3})+(?!\d)(?:\.\d+)?|\d+(?:\.\d+)?")


def _extract_price(text, max_price=5000):
    """Return the first number in ``text`` as a float, or None if unusable.

    Unusable means: no number found, or the value falls outside
    ``(0, max_price]``.
    """
    match = price_pattern.search(text)
    if match is None:
        return None
    value = float(match.group().replace(",", ""))
    # simple sanity clamp; adjust bounds if needed
    if value <= 0 or value > max_price:
        return None
    return value


def qwen_price_with_confidence(prompt, n_samples=3, temperature=0.4, max_new_tokens=16):
    """Sample the model several times and summarize the parsed prices.

    Parameters
    ----------
    prompt : str
        Prompt passed to ``qwen_generate`` for every sample.
    n_samples : int
        Number of generations to draw.
    temperature, max_new_tokens
        Forwarded to ``qwen_generate``.

    Returns
    -------
    (mean, std, outputs)
        ``mean`` and ``std`` are computed over the samples that yielded a
        usable price; both are ``None`` when no sample did. ``outputs`` is
        the list of raw generations, including the ones that were rejected.

    NOTE: when exactly one sample parses, ``std`` is 0.0, so a single
    parseable answer passes any standard-deviation threshold.
    """
    prices, outputs = [], []
    for _ in range(n_samples):
        text = qwen_generate(prompt, max_new_tokens=max_new_tokens, temperature=temperature)
        outputs.append(text)
        price = _extract_price(text)
        if price is not None:
            prices.append(price)
    if not prices:
        return None, None, outputs
    mean_p = float(np.mean(prices))
    std_p  = float(np.std(prices))
    return mean_p, std_p, outputs

# ============================================================
# 6. Qwen with retrieval + confidence filtering (main method)
# ============================================================

def evaluate_qwen_with_retrieval(
    test_df,
    top_k=5,
    n_samples=5,
    temperature=0.3,
    max_items=100,
    std_threshold=30.0,
):
    """Score retrieval-augmented Qwen pricing on the first ``max_items`` rows.

    For each row, similar training listings are retrieved, a prompt is built
    and the model is sampled ``n_samples`` times. A row is left out of the
    metrics when no price could be parsed or when the standard deviation of
    the sampled prices exceeds ``std_threshold``.

    Returns ``None`` (after printing a message) if no row survives, otherwise
    a dict with ``rmsle``, ``mae``, ``y_true``, ``y_pred`` and ``std``
    computed over the surviving rows only.
    """
    kept_truth, kept_pred, kept_std = [], [], []

    for _, item in test_df.head(max_items).iterrows():
        query = item["text_clean"]
        neighbours = retrieve_similar_for_text(query, top_k=top_k)
        estimate, spread, _ = qwen_price_with_confidence(
            build_qwen_prompt_with_retrieval(query, neighbours),
            n_samples=n_samples,
            temperature=temperature,
        )

        unparsed = estimate is None
        # low-confidence -> drop
        unstable = spread is not None and spread > std_threshold
        if unparsed or unstable:
            continue

        kept_truth.append(item["price_num"])
        kept_pred.append(estimate)
        kept_std.append(spread)

    if len(kept_truth) == 0:
        print("No valid Qwen predictions.")
        return None

    y_true = np.array(kept_truth)
    y_pred = np.array(kept_pred)
    rmsle = np.sqrt(mean_squared_log_error(y_true, np.maximum(y_pred, 1e-6)))
    mae = mean_absolute_error(y_true, y_pred)
    print(f"Qwen + retrieval on {len(y_true)} items: RMSLE={rmsle:.4f}, MAE={mae:.2f}")
    return dict(rmsle=rmsle, mae=mae, y_true=y_true, y_pred=y_pred, std=np.array(kept_std))


qwen_ret_results = evaluate_qwen_with_retrieval(
    test_df, top_k=5, n_samples=5, temperature=0.3, max_items=100, std_threshold=30.0
)

# ============================================================
# 7. Qwen no‑retrieval baseline
# ============================================================

def evaluate_qwen_no_retrieval(
    test_df,
    n_samples=5,
    temperature=0.3,
    max_items=100,
    std_threshold=30.0,
):
    """Score Qwen pricing without retrieved context on the first ``max_items`` rows.

    Each row's description alone is turned into a prompt and the model is
    sampled ``n_samples`` times. A row is left out of the metrics when no
    price could be parsed or when the standard deviation of the sampled
    prices exceeds ``std_threshold``.

    Returns ``None`` (after printing a message) if no row survives, otherwise
    a dict with ``rmsle``, ``mae``, ``y_true``, ``y_pred`` and ``std``
    computed over the surviving rows only.
    """
    kept_truth, kept_pred, kept_std = [], [], []

    for _, item in test_df.head(max_items).iterrows():
        estimate, spread, _ = qwen_price_with_confidence(
            build_qwen_prompt_no_retrieval(item["text_clean"]),
            n_samples=n_samples,
            temperature=temperature,
        )

        unparsed = estimate is None
        unstable = spread is not None and spread > std_threshold
        if unparsed or unstable:
            continue

        kept_truth.append(item["price_num"])
        kept_pred.append(estimate)
        kept_std.append(spread)

    if len(kept_truth) == 0:
        print("No valid Qwen predictions.")
        return None

    y_true = np.array(kept_truth)
    y_pred = np.array(kept_pred)
    rmsle = np.sqrt(mean_squared_log_error(y_true, np.maximum(y_pred, 1e-6)))
    mae = mean_absolute_error(y_true, y_pred)
    print(f"Qwen NO retrieval on {len(y_true)} items: RMSLE={rmsle:.4f}, MAE={mae:.2f}")
    return dict(rmsle=rmsle, mae=mae, y_true=y_true, y_pred=y_pred, std=np.array(kept_std))


qwen_no_ret_results = evaluate_qwen_no_retrieval(
    test_df, n_samples=5, temperature=0.3, max_items=100, std_threshold=30.0
)

In [None]:
# ============================================================
# 8. Plots
# ============================================================

def plot_scatter(y_true, y_pred, title):
    """Show a true-vs-predicted scatter plot coloured by absolute error.

    A dashed red ``y = x`` line from 0 to the largest value in either array
    marks perfect predictions.
    """
    plt.figure(figsize=(6, 6))

    abs_err = np.abs(y_true - y_pred)
    upper = max(y_true.max(), y_pred.max())

    points = plt.scatter(
        y_true, y_pred, c=abs_err, cmap="viridis", alpha=0.5, s=12, edgecolors="none"
    )
    plt.plot([0, upper], [0, upper], "r--", label="y = x")

    plt.xlabel("True price (USD)")
    plt.ylabel("Predicted price (USD)")
    plt.title(title)
    plt.colorbar(points).set_label("Absolute error")
    plt.legend()
    plt.grid(alpha=0.3)
    plt.tight_layout()
    plt.show()

def plot_error_hist(y_true, y_pred, title):
    """Show a 40-bin histogram of signed errors (prediction minus truth)."""
    signed_err = y_pred - y_true

    plt.figure(figsize=(6, 4))
    plt.hist(signed_err, bins=40, alpha=0.7, color="steelblue")
    # Reference line at zero error.
    plt.axvline(0, color="red", linestyle="--", label="No error")
    plt.xlabel("Prediction error (pred - true)")
    plt.ylabel("Count")
    plt.title(f"{title} – error distribution")
    plt.legend()
    plt.tight_layout()
    plt.show()

# ---- Call plots for key models ----

# Baselines that are always plotted: KNN and the tree model
# (example: Random Forest; switch to GBM if better).
for _res, _label in (
    (knn_results, "KNN baseline"),
    (rf_results, "RandomForest baseline"),
):
    plot_scatter(_res["y_true"], _res["y_pred"], _label)
    plot_error_hist(_res["y_true"], _res["y_pred"], _label)

# Qwen runs (main method first, then the no-retrieval baseline). Their
# evaluation returns None when no prediction was kept, so plot only if available.
for _res, _label in (
    (qwen_ret_results, "Qwen + retrieval"),
    (qwen_no_ret_results, "Qwen NO retrieval"),
):
    if _res is None:
        continue
    plot_scatter(_res["y_true"], _res["y_pred"], _label)
    plot_error_hist(_res["y_true"], _res["y_pred"], _label)
