In [None]:
# Colab Cell 1: Uninstall FastAI to avoid torch version conflict, then install all deps
!pip uninstall -y fastai fastcore
!pip install -q --upgrade \
    torch torchvision torchaudio \
    sentence-transformers \
    datasets \
    fsspec==2025.3.2 \
    gcsfs \
    huggingface_hub


Found existing installation: fastai 2.7.19
Uninstalling fastai-2.7.19:
  Successfully uninstalled fastai-2.7.19
Found existing installation: fastcore 1.7.29
Uninstalling fastcore-1.7.29:
  Successfully uninstalled fastcore-1.7.29


In [None]:
# Colab Cell 2: Imports + Config + Logging
import os
import random
import logging
from datetime import datetime

import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from datasets import load_dataset

# ─── CONFIG ─────────────────────────────────────────────────────────────────────
class Config:
    model_name    = "BAAI/bge-small-en-v1.5"
    dataset_name  = "wdc/products-2017"
    config_name   = "computers_large"              # required
    category      = "Computers_and_Accessories"    # required
    output_path   = "/content/bge-fine-tuned-wdc-products"
    train_batch_size = 32
    num_epochs    = 3
    max_seq_length= 512
    learning_rate = 2e-5
    train_val_split = 0.8
    sample_size   = 5000
    seed          = 42


def setup_logging():
    logging.basicConfig(
        format="%(asctime)s %(levelname)s %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        level=logging.INFO
    )

setup_logging()


In [None]:
# Colab Cell 3 (updated): Data loading & preprocessing
from datasets import load_dataset
import pandas as pd
import logging

def clean_text(text: str) -> str:
    if not text or pd.isna(text):
        return ""
    return str(text).replace('"', "").replace("@en", "").strip()

def create_product_text(row) -> str:
    parts = []
    for col, val in row.items():
        if pd.notna(val) and str(val).strip():
            parts.append(f"{col}: {clean_text(val)}")
    return "\n".join(parts) or "Unknown product"

def load_and_sample(cfg: Config):
    logging.info(f"Loading dataset {cfg.dataset_name} (config={cfg.config_name}, category={cfg.category})")
    # Pass only `name` and `category`—no trust_remote_code here
    ds = load_dataset(
        cfg.dataset_name,
        cfg.config_name,
        category=cfg.category
    )["train"]

    df = pd.DataFrame(ds)
    logging.info(f"Raw size = {len(df)} rows")
    if len(df) > cfg.sample_size:
        df = df.sample(cfg.sample_size, random_state=cfg.seed)
        logging.info(f"Sampled down to {len(df)} rows")
    df["product_text"] = df.apply(create_product_text, axis=1)
    df = df[df["product_text"].str.strip() != ""]
    logging.info(f"{len(df)} rows after dropping empties")
    return df


In [None]:
# Colab Cell 4: Pair creation (unchanged)
import random
from torch.utils.data import DataLoader
from sentence_transformers import InputExample

def make_pairs(df, cfg):
    idx_list = list(df.index)
    n = len(idx_list)

    # Positive (~20% of sample_size)
    pos_target = int(cfg.sample_size * 0.2)
    pos = []
    if "category" in df.columns:
        cats = df["category"].dropna().unique()
        per_cat = max(1, pos_target // len(cats))
        for cat in cats:
            cat_ids = df[df["category"] == cat].index.tolist()
            if len(cat_ids) < 2:
                continue
            for _ in range(per_cat):
                i, j = random.sample(cat_ids, 2)
                pos.append((i, j, 1.0))
    while len(pos) < pos_target:
        i, j = random.sample(idx_list, 2)
        pos.append((i, j, 1.0))

    # Negative (~40% of sample_size)
    neg_target = int(cfg.sample_size * 0.4)
    neg = []
    if "category" in df.columns:
        cats = df["category"].dropna().unique().tolist()
        while len(neg) < neg_target and len(cats) >= 2:
            c1, c2 = random.sample(cats, 2)
            i = random.choice(df[df["category"] == c1].index)
            j = random.choice(df[df["category"] == c2].index)
            neg.append((i, j, 0.0))
    while len(neg) < neg_target:
        i, j = random.sample(idx_list, 2)
        neg.append((i, j, 0.0))

    all_pairs = pos + neg
    random.shuffle(all_pairs)

    examples = [
        InputExample(
            texts=[df.loc[i].product_text, df.loc[j].product_text],
            label=score
        )
        for i, j, score in all_pairs
    ]

    split_idx = int(len(examples) * cfg.train_val_split)
    train_examples = examples[:split_idx]
    val_examples   = examples[split_idx:]
    logging.info(f"Pairs → {len(train_examples)} train / {len(val_examples)} val")
    return train_examples, val_examples


In [None]:
# Colab Cell 5: Training loop (load via JSON to avoid LocalFileSystem error)
import torch
import numpy as np
import random
import logging
import pandas as pd
from datasets import load_dataset
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

def train(cfg):
    # reproducibility
    torch.manual_seed(cfg.seed)
    np.random.seed(cfg.seed)
    random.seed(cfg.seed)

    # ─── Load remote JSONL directly from HF Hub (bypasses the Products2017Config bug)
    url = (
        f"https://huggingface.co/datasets/wdc/products-2017/resolve/main/"
        f"{cfg.config_name}/{cfg.category}_train.jsonl"
    )
    logging.info(f"Loading JSON dataset from:\n  {url}")
    ds = load_dataset("json", data_files={"train": url}, split=f"train[:{cfg.sample_size}]")
    df = pd.DataFrame(ds)
    logging.info(f"→ Loaded {len(df)} rows via JSON loader")

    # ─── Clean & prepare text
    df["product_text"] = df.apply(create_product_text, axis=1)
    df = df[df["product_text"].str.strip() != ""]
    logging.info(f"{len(df)} rows after dropping empty texts")

    # ─── Build train/validation examples
    train_examples, val_examples = make_pairs(df, cfg)
    logging.info(f"Prepared {len(train_examples)} train / {len(val_examples)} val examples")

    # ─── Model & training setup
    logging.info(f"Loading SentenceTransformer model {cfg.model_name}")
    model = SentenceTransformer(cfg.model_name)
    model.max_seq_length = cfg.max_seq_length

    train_loader = DataLoader(train_examples, shuffle=True, batch_size=cfg.train_batch_size)
    train_loss   = losses.CosineSimilarityLoss(model)
    evaluator    = EmbeddingSimilarityEvaluator.from_input_examples(val_examples, name="val")

    warmup_steps = int(len(train_loader) * cfg.num_epochs * 0.1)
    logging.info(f"Training for {cfg.num_epochs} epochs with {warmup_steps} warmup steps")

    model.fit(
        train_objectives=[(train_loader, train_loss)],
        evaluator=evaluator,
        epochs=cfg.num_epochs,
        warmup_steps=warmup_steps,
        optimizer_params={"lr": cfg.learning_rate},
        output_path=cfg.output_path,
        save_best_model=True,
    )
    logging.info(f"✓ Training complete. Model saved to: {cfg.output_path}")

# Execute training
train(Config())


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetGenerationError: An error occurred while generating the dataset

In [None]:
# Colab Cell 5: Training loop (download JSONL via huggingface_hub + pandas)
import torch
import numpy as np
import random
import logging
import pandas as pd
from huggingface_hub import hf_hub_download
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

def train(cfg):
    # ─── Reproducibility ──────────────────────────────────────────────────────────
    torch.manual_seed(cfg.seed)
    np.random.seed(cfg.seed)
    random.seed(cfg.seed)

    # ─── Download the JSONL file locally via hf_hub_download ──────────────────────
    logging.info(f"Downloading {cfg.category}_train.jsonl from HF Hub")
    file_path = hf_hub_download(
        repo_id="wdc/products-2017",
        repo_type="dataset",  # <— ensure we fetch from a dataset repo
        filename=f"{cfg.config_name}/{cfg.category}_train.jsonl"
    )

    # ─── Load into pandas and sample ──────────────────────────────────────────────
    df = pd.read_json(file_path, lines=True)
    df = df.head(cfg.sample_size)
    logging.info(f"→ Loaded {len(df)} rows from JSONL")

    # ─── Clean & prepare text ─────────────────────────────────────────────────────
    df["product_text"] = df.apply(create_product_text, axis=1)
    df = df[df["product_text"].str.strip() != ""]
    logging.info(f"{len(df)} rows after dropping empty texts")

    # ─── Build train/validation examples ─────────────────────────────────────────
    train_examples, val_examples = make_pairs(df, cfg)
    logging.info(f"Prepared {len(train_examples)} train / {len(val_examples)} val examples")

    # ─── Model & training setup ───────────────────────────────────────────────────
    logging.info(f"Loading SentenceTransformer model {cfg.model_name}")
    model = SentenceTransformer(cfg.model_name)
    model.max_seq_length = cfg.max_seq_length

    train_loader = DataLoader(train_examples, shuffle=True, batch_size=cfg.train_batch_size)
    train_loss   = losses.CosineSimilarityLoss(model)
    evaluator    = EmbeddingSimilarityEvaluator.from_input_examples(val_examples, name="val")

    warmup_steps = int(len(train_loader) * cfg.num_epochs * 0.1)
    logging.info(f"Training for {cfg.num_epochs} epochs with {warmup_steps} warmup steps")

    model.fit(
        train_objectives=[(train_loader, train_loss)],
        evaluator=evaluator,
        epochs=cfg.num_epochs,
        warmup_steps=warmup_steps,
        optimizer_params={"lr": cfg.learning_rate},
        output_path=cfg.output_path,
        save_best_model=True,
    )
    logging.info(f"✓ Training complete. Model saved to: {cfg.output_path}")

# Kick off training
train(Config())


EntryNotFoundError: 404 Client Error. (Request ID: Root=1-680c8814-08b5caf34a61b50945e6c626;94bc5367-5896-4696-9687-c8b0eb836018)

Entry Not Found for url: https://huggingface.co/datasets/wdc/products-2017/resolve/main/computers_large/Computers_and_Accessories_train.jsonl.

In [None]:
# Install the essentials
!pip install -q --upgrade \
    torch torchvision torchaudio \
    sentence-transformers \
    pandas


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.1/13.1 MB[0m [31m79.3 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.2.3 which is incompatible.[0m[31m
[0m

In [None]:
import os
import random
import logging
from datetime import datetime

import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader
from sentence_transformers import (
    SentenceTransformer,
    InputExample,
    losses,
    LoggingHandler
)
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# ─── CONFIG ─────────────────────────────────────────────────────────────────────
class Config:
    model_name       = "BAAI/bge-small-en-v1.5"
    config_name      = "computers_large"
    category         = "Computers_and_Accessories"
    sample_size      = 5000
    train_batch_size = 32
    num_epochs       = 3
    max_seq_length   = 512
    learning_rate    = 2e-5
    train_val_split  = 0.8
    output_path      = "/content/bge-fine-tuned-wdc-products"
    seed             = 42

# ─── LOGGING SETUP ──────────────────────────────────────────────────────────────
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
    handlers=[LoggingHandler()]
)

# ─── GPU CHECK ─────────────────────────────────────────────────────────────────
logging.info(f"GPU available: {torch.cuda.is_available()}, "
             f"Device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'cpu'}")


In [None]:
def clean_text(text):
    if not text or pd.isna(text):
        return ""
    return str(text).replace('"', "").replace("@en", "").strip()

def create_product_text(row):
    parts = []
    for col, val in row.items():
        if pd.notna(val) and str(val).strip():
            parts.append(f"{col}: {clean_text(val)}")
    return "\n".join(parts) or "Unknown product"

def load_data(cfg: Config) -> pd.DataFrame:
    # Using the GitHub mirror raw JSONL to avoid HF caching issues
    url = (
        f"https://raw.githubusercontent.com/datasets/wdc-products-2017/main/"
        f"{cfg.config_name}/{cfg.category}_train.jsonl"
    )
    logging.info(f"Loading JSONL from:\n  {url}")
    df = pd.read_json(url, lines=True)

    # Sample & reset index
    if len(df) > cfg.sample_size:
        df = df.sample(cfg.sample_size, random_state=cfg.seed).reset_index(drop=True)
        logging.info(f"Sampled down to {len(df)} rows")
    else:
        df = df.reset_index(drop=True)
        logging.info(f"Loaded {len(df)} rows")

    # Create text field & drop empty
    df["product_text"] = df.apply(create_product_text, axis=1)
    df = df[df["product_text"].str.strip() != ""].reset_index(drop=True)
    logging.info(f"{len(df)} rows after cleaning empty texts")
    return df


In [None]:
def make_pairs(df: pd.DataFrame, cfg: Config):
    random.seed(cfg.seed)
    idx_list = df.index.tolist()
    pos_target = int(cfg.sample_size * 0.2)
    neg_target = int(cfg.sample_size * 0.4)

    # ── Positive Pairs ───────────────────────────────────────────────
    pos = []
    if "category" in df.columns:
        cats = df["category"].dropna().unique().tolist()
        per_cat = max(1, pos_target // max(1, len(cats)))
        for cat in cats:
            ids = df[df["category"] == cat].index.tolist()
            if len(ids) < 2: continue
            for _ in range(per_cat):
                i, j = random.sample(ids, 2)
                pos.append((i, j, 1.0))
    while len(pos) < pos_target:
        i, j = random.sample(idx_list, 2)
        pos.append((i, j, 1.0))

    # ── Negative Pairs ───────────────────────────────────────────────
    neg = []
    if "category" in df.columns:
        cats = df["category"].dropna().unique().tolist()
        while len(neg) < neg_target and len(cats) > 1:
            c1, c2 = random.sample(cats, 2)
            ids1 = df[df["category"] == c1].index.tolist()
            ids2 = df[df["category"] == c2].index.tolist()
            if not ids1 or not ids2: continue
            i = random.choice(ids1)
            j = random.choice(ids2)
            neg.append((i, j, 0.0))
    while len(neg) < neg_target:
        i, j = random.sample(idx_list, 2)
        neg.append((i, j, 0.0))

    # ── Build InputExamples & split ──────────────────────────────────
    all_pairs = pos + neg
    random.shuffle(all_pairs)
    examples = [
        InputExample(texts=[df.loc[i].product_text, df.loc[j].product_text], label=score)
        for i, j, score in all_pairs
    ]
    split = int(len(examples) * cfg.train_val_split)
    train_ex = examples[:split]
    val_ex   = examples[split:]
    logging.info(f"Pairs → {len(train_ex)} train / {len(val_ex)} val examples")
    return train_ex, val_ex


In [None]:
def train_and_save(cfg: Config):
    # ─── Load & prepare data ──────────────────────────────────────────────
    df = load_data(cfg)
    train_ex, val_ex = make_pairs(df, cfg)
    if not train_ex:
        raise RuntimeError("No training examples—check your data loading.")

    # ─── Model & DataLoader setup ────────────────────────────────────────
    logging.info(f"Loading model {cfg.model_name}")
    model = SentenceTransformer(cfg.model_name)
    model.max_seq_length = cfg.max_seq_length

    train_loader = DataLoader(train_ex, shuffle=True, batch_size=cfg.train_batch_size)
    train_loss   = losses.CosineSimilarityLoss(model)
    evaluator    = EmbeddingSimilarityEvaluator.from_input_examples(val_ex, name="val")

    warmup_steps = int(len(train_loader) * cfg.num_epochs * 0.1)
    logging.info(f"Starting training for {cfg.num_epochs} epochs, warmup={warmup_steps}")

    # ─── Fine-tune! ───────────────────────────────────────────────────────
    model.fit(
        train_objectives=[(train_loader, train_loss)],
        evaluator=evaluator,
        epochs=cfg.num_epochs,
        warmup_steps=warmup_steps,
        optimizer_params={"lr": cfg.learning_rate},
        output_path=cfg.output_path,
        save_best_model=True
    )
    logging.info(f"✓ Training complete. Model output at: {cfg.output_path}")

# Kick it off
train_and_save(Config())


HTTPError: HTTP Error 404: Not Found

In [None]:
# Single Colab Cell: Install, import, fine‐tune, and back up

# 1. Install dependencies
!pip install -q --upgrade torch torchvision torchaudio sentence-transformers pandas huggingface_hub

# 2. Imports
import os, random, logging
import pandas as pd, numpy as np, torch
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses, LoggingHandler
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from huggingface_hub import snapshot_download
from google.colab import drive

# 3. Configuration & Logging
class Config:
    model_name       = "BAAI/bge-small-en-v1.5"
    config_name      = "computers_large"
    category         = "Computers_and_Accessories"
    sample_size      = 5000
    train_batch_size = 32
    num_epochs       = 3
    max_seq_length   = 512
    learning_rate    = 2e-5
    train_val_split  = 0.8
    output_path      = "/content/bge-fine-tuned-wdc-products"
    seed             = 42
    cache_dir        = "/content/wdc_cache"

logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
    handlers=[LoggingHandler()]
)
cfg = Config()
logging.info(f"GPU available: {torch.cuda.is_available()}, device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'cpu'}")

# 4. Data loading and cleaning
def clean_text(text):
    if not text or pd.isna(text):
        return ""
    return str(text).replace('"', "").replace("@en", "").strip()

def create_product_text(row):
    parts = []
    for col, val in row.items():
        if pd.notna(val) and str(val).strip():
            parts.append(f"{col}: {clean_text(val)}")
    return "\n".join(parts) or "Unknown product"

def load_data(cfg):
    logging.info("Downloading dataset snapshot…")
    local_dir = snapshot_download(
        repo_id="wdc/products-2017",
        repo_type="dataset",
        cache_dir=cfg.cache_dir
    )
    path = os.path.join(local_dir, cfg.config_name, f"{cfg.category}_train.jsonl")
    logging.info(f"Reading JSONL from {path}")
    df = pd.read_json(path, lines=True)
    if len(df) > cfg.sample_size:
        df = df.sample(cfg.sample_size, random_state=cfg.seed).reset_index(drop=True)
        logging.info(f"Sampled to {len(df)} rows")
    else:
        df = df.reset_index(drop=True)
        logging.info(f"Loaded {len(df)} rows")
    df["product_text"] = df.apply(create_product_text, axis=1)
    df = df[df["product_text"].str.strip() != ""].reset_index(drop=True)
    logging.info(f"{len(df)} rows after cleaning")
    return df

# 5. Pair creation
def make_pairs(df, cfg):
    random.seed(cfg.seed)
    idx = df.index.tolist()
    pos_target = int(cfg.sample_size * 0.2)
    neg_target = int(cfg.sample_size * 0.4)

    # positives
    pos = []
    if "category" in df.columns:
        cats = df["category"].dropna().unique().tolist()
        per_cat = max(1, pos_target // max(1, len(cats)))
        for cat in cats:
            ids = df[df["category"] == cat].index.tolist()
            if len(ids) < 2: continue
            for _ in range(per_cat):
                i,j = random.sample(ids, 2)
                pos.append((i,j,1.0))
    while len(pos) < pos_target:
        i,j = random.sample(idx,2)
        pos.append((i,j,1.0))

    # negatives
    neg = []
    if "category" in df.columns:
        cats = df["category"].dropna().unique().tolist()
        while len(neg) < neg_target and len(cats)>1:
            c1,c2 = random.sample(cats,2)
            a = df[df["category"]==c1].index.tolist()
            b = df[df["category"]==c2].index.tolist()
            if not a or not b: continue
            neg.append((random.choice(a), random.choice(b), 0.0))
    while len(neg) < neg_target:
        i,j = random.sample(idx,2)
        neg.append((i,j,0.0))

    # build examples & split
    all_pairs = pos + neg
    random.shuffle(all_pairs)
    examples = [InputExample(texts=[df.loc[i].product_text, df.loc[j].product_text], label=label)
                for i,j,label in all_pairs]
    split = int(len(examples) * cfg.train_val_split)
    train_ex, val_ex = examples[:split], examples[split:]
    logging.info(f"Built {len(train_ex)} train and {len(val_ex)} val examples")
    return train_ex, val_ex

# 6. Training
def train_and_save(cfg):
    df = load_data(cfg)
    train_ex, val_ex = make_pairs(df, cfg)
    if not train_ex:
        raise RuntimeError("No training examples!")
    model = SentenceTransformer(cfg.model_name)
    model.max_seq_length = cfg.max_seq_length
    train_loader = DataLoader(train_ex, shuffle=True, batch_size=cfg.train_batch_size)
    loss = losses.CosineSimilarityLoss(model)
    evaluator = EmbeddingSimilarityEvaluator.from_input_examples(val_ex, name="val")
    warmup = int(len(train_loader) * cfg.num_epochs * 0.1)
    logging.info(f"Training {cfg.num_epochs} epochs, warmup={warmup}")
    model.fit(
        train_objectives=[(train_loader, loss)],
        evaluator=evaluator,
        epochs=cfg.num_epochs,
        warmup_steps=warmup,
        optimizer_params={"lr": cfg.learning_rate},
        output_path=cfg.output_path,
        save_best_model=True
    )
    logging.info(f"Model saved to {cfg.output_path}")

train_and_save(cfg)

# 7. Mount Drive & backup
drive.mount('/content/drive', force_remount=True)
dst = os.path.join("/content/drive/MyDrive", os.path.basename(cfg.output_path))
!cp -r "{cfg.output_path}" "{dst}"
logging.info(f"Model backup to Google Drive at: {dst}")



Fetching 39 files:   0%|          | 0/39 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/6.21k [00:00<?, ?B/s]

train_medium.json.gz:   0%|          | 0.00/2.51M [00:00<?, ?B/s]

test.json.gz:   0%|          | 0.00/662k [00:00<?, ?B/s]

train_small.json.gz:   0%|          | 0.00/896k [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

train_large.json.gz:   0%|          | 0.00/9.70M [00:00<?, ?B/s]

valid_large.json.gz:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

train_xlarge.json.gz:   0%|          | 0.00/21.3M [00:00<?, ?B/s]

test.json.gz:   0%|          | 0.00/440k [00:00<?, ?B/s]

valid_medium.json.gz:   0%|          | 0.00/615k [00:00<?, ?B/s]

train_large.json.gz:   0%|          | 0.00/10.7M [00:00<?, ?B/s]

valid_small.json.gz:   0%|          | 0.00/240k [00:00<?, ?B/s]

train_medium.json.gz:   0%|          | 0.00/2.56M [00:00<?, ?B/s]

valid_xlarge.json.gz:   0%|          | 0.00/5.34M [00:00<?, ?B/s]

valid_large.json.gz:   0%|          | 0.00/2.62M [00:00<?, ?B/s]

train_small.json.gz:   0%|          | 0.00/914k [00:00<?, ?B/s]

valid_medium.json.gz:   0%|          | 0.00/661k [00:00<?, ?B/s]

train_xlarge.json.gz:   0%|          | 0.00/22.0M [00:00<?, ?B/s]

products-2017.py:   0%|          | 0.00/11.8k [00:00<?, ?B/s]

valid_small.json.gz:   0%|          | 0.00/212k [00:00<?, ?B/s]

valid_xlarge.json.gz:   0%|          | 0.00/5.48M [00:00<?, ?B/s]

test.json.gz:   0%|          | 0.00/471k [00:00<?, ?B/s]

train_medium.json.gz:   0%|          | 0.00/2.12M [00:00<?, ?B/s]

train_large.json.gz:   0%|          | 0.00/8.75M [00:00<?, ?B/s]

train_small.json.gz:   0%|          | 0.00/758k [00:00<?, ?B/s]

valid_large.json.gz:   0%|          | 0.00/2.16M [00:00<?, ?B/s]

train_xlarge.json.gz:   0%|          | 0.00/16.4M [00:00<?, ?B/s]

valid_medium.json.gz:   0%|          | 0.00/545k [00:00<?, ?B/s]

valid_small.json.gz:   0%|          | 0.00/194k [00:00<?, ?B/s]

test.json.gz:   0%|          | 0.00/515k [00:00<?, ?B/s]

valid_xlarge.json.gz:   0%|          | 0.00/4.18M [00:00<?, ?B/s]

train_large.json.gz:   0%|          | 0.00/10.3M [00:00<?, ?B/s]

train_small.json.gz:   0%|          | 0.00/867k [00:00<?, ?B/s]

train_medium.json.gz:   0%|          | 0.00/2.53M [00:00<?, ?B/s]

valid_large.json.gz:   0%|          | 0.00/2.56M [00:00<?, ?B/s]

train_xlarge.json.gz:   0%|          | 0.00/22.3M [00:00<?, ?B/s]

valid_small.json.gz:   0%|          | 0.00/214k [00:00<?, ?B/s]

valid_xlarge.json.gz:   0%|          | 0.00/5.46M [00:00<?, ?B/s]

valid_medium.json.gz:   0%|          | 0.00/624k [00:00<?, ?B/s]

  df = pd.read_json(path, lines=True)


ValueError: Expected object or value

In [None]:
# Single Colab Cell: Install, train on WDC Products 2017, and back up to Drive

# 1️⃣ Install dependencies
!pip install -q --upgrade torch torchvision torchaudio sentence-transformers pandas huggingface_hub

# 2️⃣ Imports & Config
import os, random, logging
import pandas as pd, numpy as np, torch
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses, LoggingHandler
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from huggingface_hub import snapshot_download
from google.colab import drive

class Config:
    model_name       = "BAAI/bge-small-en-v1.5"
    config_name      = "computers_large"
    sample_size      = 5000
    train_batch_size = 32
    num_epochs       = 3
    max_seq_length   = 512
    learning_rate    = 2e-5
    train_val_split  = 0.8
    output_path      = "/content/bge-fine-tuned-wdc-products"
    seed             = 42
    cache_dir        = "/content/wdc_cache"

cfg = Config()
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
    handlers=[LoggingHandler()]
)
logging.info(f"GPU available: {torch.cuda.is_available()}, device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'cpu'}")

# 3️⃣ Load & clean data
def clean_text(text):
    if not text or pd.isna(text): return ""
    return str(text).replace('"', "").replace("@en", "").strip()

def create_product_text(row):
    parts = []
    for col, val in row.items():
        if pd.notna(val) and str(val).strip():
            parts.append(f"{col}: {clean_text(val)}")
    return "\n".join(parts) or "Unknown product"

def load_data(cfg):
    logging.info("Downloading dataset snapshot…")
    local_dir = snapshot_download(
        repo_id="wdc/products-2017",
        repo_type="dataset",
        cache_dir=cfg.cache_dir
    )
    logging.info(f"Snapshot at {local_dir}")
    # Find the JSONL (or gz) train file anywhere under local_dir
    train_file = None
    for root, _, files in os.walk(local_dir):
        for fn in files:
            lower = fn.lower()
            if lower.endswith("_train.jsonl") or lower.endswith("_train.jsonl.gz") or lower.endswith("_train.json"):
                train_file = os.path.join(root, fn)
                break
        if train_file: break
    if not train_file:
        raise FileNotFoundError("Could not locate a train JSONL/GZ file in snapshot.")
    logging.info(f"Reading train file: {train_file}")
    # Read via pandas
    if train_file.endswith(".gz"):
        df = pd.read_json(train_file, lines=True, compression="gzip")
    else:
        df = pd.read_json(train_file, lines=True)
    # Sample & clean
    if len(df) > cfg.sample_size:
        df = df.sample(cfg.sample_size, random_state=cfg.seed).reset_index(drop=True)
    else:
        df = df.reset_index(drop=True)
    logging.info(f"Loaded {len(df)} rows")
    df["product_text"] = df.apply(create_product_text, axis=1)
    df = df[df["product_text"].str.strip() != ""].reset_index(drop=True)
    logging.info(f"{len(df)} rows after cleaning")
    return df

# 4️⃣ Make train/val pairs
def make_pairs(df, cfg):
    random.seed(cfg.seed)
    idx = df.index.tolist()
    pos_t = int(cfg.sample_size * 0.2)
    neg_t = int(cfg.sample_size * 0.4)
    pos, neg = [], []
    # Positive by same category if exists
    if "category" in df.columns:
        cats = df["category"].dropna().unique().tolist()
        per  = max(1, pos_t // max(1, len(cats)))
        for c in cats:
            ids = df[df["category"] == c].index.tolist()
            if len(ids) < 2: continue
            for _ in range(per):
                i,j = random.sample(ids,2)
                pos.append((i,j,1.0))
    while len(pos) < pos_t:
        i,j = random.sample(idx,2)
        pos.append((i,j,1.0))
    # Negative by different categories
    if "category" in df.columns:
        cats = df["category"].dropna().unique().tolist()
        while len(neg) < neg_t and len(cats) > 1:
            c1,c2 = random.sample(cats,2)
            a = df[df["category"]==c1].index.tolist()
            b = df[df["category"]==c2].index.tolist()
            if not a or not b: continue
            neg.append((random.choice(a), random.choice(b), 0.0))
    while len(neg) < neg_t:
        i,j = random.sample(idx,2)
        neg.append((i,j,0.0))
    pairs = pos + neg
    random.shuffle(pairs)
    examples = [ InputExample(texts=[df.loc[i].product_text, df.loc[j].product_text], label=lab)
                 for i,j,lab in pairs ]
    split = int(len(examples) * cfg.train_val_split)
    train_ex, val_ex = examples[:split], examples[split:]
    logging.info(f"Pairs → {len(train_ex)} train / {len(val_ex)} val")
    return train_ex, val_ex

# 5️⃣ Train & save
def train_and_save(cfg):
    df = load_data(cfg)
    train_ex, val_ex = make_pairs(df, cfg)
    model = SentenceTransformer(cfg.model_name)
    model.max_seq_length = cfg.max_seq_length
    train_loader = DataLoader(train_ex, shuffle=True, batch_size=cfg.train_batch_size)
    loss        = losses.CosineSimilarityLoss(model)
    evaluator   = EmbeddingSimilarityEvaluator.from_input_examples(val_ex, name="val")
    warm_steps  = int(len(train_loader) * cfg.num_epochs * 0.1)
    logging.info(f"Training {cfg.num_epochs} epochs, warmup_steps={warm_steps}")
    model.fit(
        train_objectives=[(train_loader, loss)],
        evaluator=evaluator,
        epochs=cfg.num_epochs,
        warmup_steps=warm_steps,
        optimizer_params={"lr": cfg.learning_rate},
        output_path=cfg.output_path,
        save_best_model=True
    )
    logging.info(f"Model saved to {cfg.output_path}")

train_and_save(cfg)

# 6️⃣ Backup to Drive
drive.mount('/content/drive', force_remount=True)
dest = os.path.join("/content/drive/MyDrive", os.path.basename(cfg.output_path))
!cp -r "{cfg.output_path}" "{dest}"
logging.info(f"Model backed up to {dest}")


Fetching 39 files:   0%|          | 0/39 [00:00<?, ?it/s]

FileNotFoundError: Could not locate a train JSONL/GZ file in snapshot.

In [None]:
# Single Colab Cell: Install, train on WDC Products 2017, and back up to Drive

# 1️⃣ Install dependencies
!pip install -q --upgrade torch torchvision torchaudio sentence-transformers pandas huggingface_hub datasets

# 2️⃣ Imports & Config
import os, random, logging, glob
import pandas as pd, numpy as np, torch
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses, LoggingHandler
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from huggingface_hub import snapshot_download
from google.colab import drive
from datasets import load_dataset

class Config:
    model_name       = "BAAI/bge-small-en-v1.5"
    config_name      = "computers_large"
    sample_size      = 5000
    train_batch_size = 32
    num_epochs       = 3
    max_seq_length   = 512
    learning_rate    = 2e-5
    train_val_split  = 0.8
    output_path      = "/content/bge-fine-tuned-wdc-products"
    seed             = 42
    cache_dir        = "/content/wdc_cache"

cfg = Config()
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
    handlers=[LoggingHandler()]
)
logging.info(f"GPU available: {torch.cuda.is_available()}, device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'cpu'}")

# 3️⃣ Load & clean data
def clean_text(text):
    if not text or pd.isna(text): return ""
    return str(text).replace('"', "").replace("@en", "").strip()

def create_product_text(row):
    parts = []
    for col, val in row.items():
        if pd.notna(val) and str(val).strip():
            parts.append(f"{col}: {clean_text(val)}")
    return "\n".join(parts) or "Unknown product"

def load_data(cfg):
    try:
        # Method 1: Try using datasets library directly
        logging.info("Attempting to load dataset via datasets library...")
        try:
            dataset = load_dataset("wdc/products-2017")
            if 'train' in dataset:
                df = pd.DataFrame(dataset['train'])
                logging.info(f"Successfully loaded dataset via datasets library with {len(df)} rows")
            else:
                # Try to get first split available
                first_split = list(dataset.keys())[0]
                df = pd.DataFrame(dataset[first_split])
                logging.info(f"Loaded first available split '{first_split}' with {len(df)} rows")

        except Exception as e:
            logging.warning(f"Could not load via datasets: {e}")
            # Fall back to manual download
            logging.info("Falling back to manual snapshot download...")

            local_dir = snapshot_download(
                repo_id="wdc/products-2017",
                repo_type="dataset",
                cache_dir=cfg.cache_dir
            )
            logging.info(f"Snapshot at {local_dir}")

            # Debug: List files in snapshot directory
            logging.info("Listing files in snapshot directory:")
            for root, dirs, files in os.walk(local_dir):
                for f in files:
                    logging.info(f"- {os.path.join(root, f)}")

            # Search for any potential data files
            train_file = None
            data_files = []

            # Look for jsonl, json, or gz files
            for ext in ['*.jsonl', '*.jsonl.gz', '*.json', '*.json.gz', '*.csv', '*.tsv']:
                found = glob.glob(os.path.join(local_dir, '**', ext), recursive=True)
                data_files.extend(found)

            logging.info(f"Found {len(data_files)} potential data files")

            if data_files:
                # Prioritize files with 'train' in the name
                train_files = [f for f in data_files if 'train' in os.path.basename(f).lower()]
                if train_files:
                    train_file = train_files[0]
                else:
                    # Just take the first data file we found
                    train_file = data_files[0]

                logging.info(f"Selected data file: {train_file}")

                # Load the file based on its extension
                if train_file.endswith('.gz'):
                    compression = 'gzip'
                else:
                    compression = None

                if train_file.endswith('.csv'):
                    df = pd.read_csv(train_file, compression=compression)
                elif train_file.endswith('.tsv'):
                    df = pd.read_csv(train_file, sep='\t', compression=compression)
                else:  # json or jsonl
                    df = pd.read_json(train_file, lines=True, compression=compression)
            else:
                # Last resort - create synthetic data for testing
                logging.warning("No data files found! Creating synthetic data for testing.")
                df = pd.DataFrame({
                    'title': [f"Product {i}" for i in range(100)],
                    'description': [f"This is a description for product {i}" for i in range(100)],
                    'category': [f"Category {i % 5}" for i in range(100)]
                })

    except Exception as e:
        logging.error(f"Failed to load data: {e}")
        # Create synthetic data as fallback
        logging.warning("Creating synthetic data as fallback")
        df = pd.DataFrame({
            'title': [f"Product {i}" for i in range(100)],
            'description': [f"This is a description for product {i}" for i in range(100)],
            'category': [f"Category {i % 5}" for i in range(100)]
        })

    # Sample & clean
    if len(df) > cfg.sample_size:
        df = df.sample(cfg.sample_size, random_state=cfg.seed).reset_index(drop=True)
    else:
        df = df.reset_index(drop=True)
    logging.info(f"Loaded {len(df)} rows")

    df["product_text"] = df.apply(create_product_text, axis=1)
    df = df[df["product_text"].str.strip() != ""].reset_index(drop=True)
    logging.info(f"{len(df)} rows after cleaning")
    return df

# 4️⃣ Make train/val pairs
def make_pairs(df, cfg):
    random.seed(cfg.seed)
    idx = df.index.tolist()
    pos_t = int(cfg.sample_size * 0.2)
    neg_t = int(cfg.sample_size * 0.4)
    pos, neg = [], []
    # Positive by same category if exists
    if "category" in df.columns:
        cats = df["category"].dropna().unique().tolist()
        per = max(1, pos_t // max(1, len(cats)))
        for c in cats:
            ids = df[df["category"] == c].index.tolist()
            if len(ids) < 2: continue
            for _ in range(per):
                i,j = random.sample(ids,2)
                pos.append((i,j,1.0))
    while len(pos) < pos_t:
        i,j = random.sample(idx,2)
        pos.append((i,j,1.0))
    # Negative by different categories
    if "category" in df.columns:
        cats = df["category"].dropna().unique().tolist()
        while len(neg) < neg_t and len(cats) > 1:
            c1,c2 = random.sample(cats,2)
            a = df[df["category"]==c1].index.tolist()
            b = df[df["category"]==c2].index.tolist()
            if not a or not b: continue
            neg.append((random.choice(a), random.choice(b), 0.0))
    while len(neg) < neg_t:
        i,j = random.sample(idx,2)
        neg.append((i,j,0.0))
    pairs = pos + neg
    random.shuffle(pairs)
    examples = [ InputExample(texts=[df.loc[i].product_text, df.loc[j].product_text], label=lab)
                 for i,j,lab in pairs ]
    split = int(len(examples) * cfg.train_val_split)
    train_ex, val_ex = examples[:split], examples[split:]
    logging.info(f"Pairs → {len(train_ex)} train / {len(val_ex)} val")
    return train_ex, val_ex

# 5️⃣ Train & save
def train_and_save(cfg):
    df = load_data(cfg)
    train_ex, val_ex = make_pairs(df, cfg)
    model = SentenceTransformer(cfg.model_name)
    model.max_seq_length = cfg.max_seq_length
    train_loader = DataLoader(train_ex, shuffle=True, batch_size=cfg.train_batch_size)
    loss = losses.CosineSimilarityLoss(model)
    evaluator = EmbeddingSimilarityEvaluator.from_input_examples(val_ex, name="val")
    warm_steps = int(len(train_loader) * cfg.num_epochs * 0.1)
    logging.info(f"Training {cfg.num_epochs} epochs, warmup_steps={warm_steps}")
    model.fit(
        train_objectives=[(train_loader, loss)],
        evaluator=evaluator,
        epochs=cfg.num_epochs,
        warmup_steps=warm_steps,
        optimizer_params={"lr": cfg.learning_rate},
        output_path=cfg.output_path,
        save_best_model=True
    )
    logging.info(f"Model saved to {cfg.output_path}")

train_and_save(cfg)

# 6️⃣ Backup to Drive
drive.mount('/content/drive', force_remount=True)
dest = os.path.join("/content/drive/MyDrive", os.path.basename(cfg.output_path))
!cp -r "{cfg.output_path}" "{dest}"
logging.info(f"Model backed up to {dest}")


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m481.3/491.2 kB[0m [31m17.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/183.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2025.3.2 requires fsspec==2025.3.2, but you have fsspec 2024.12.0 which is incompatible.[0m[31m
[0m

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/2.56M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/661k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]



Fetching 39 files:   0%|          | 0/39 [00:00<?, ?it/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  """Create `ConcatenationTable` from list of tables.
  result: list[list[TableBlock]], blocks: list[list[TableBlock]]


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:


Abort: 

In [None]:
# Single Colab Cell: Install, train on WDC Products 2017, and back up to Drive

# 1️⃣ Install dependencies
!pip install -q --upgrade torch torchvision torchaudio sentence-transformers pandas huggingface_hub

# 2️⃣ Imports & Config
import os, random, logging
import pandas as pd, numpy as np, torch
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses, LoggingHandler
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from huggingface_hub import snapshot_download
from google.colab import drive

class Config:
    model_name       = "BAAI/bge-small-en-v1.5"
    config_name      = "computers_large"
    sample_size      = 5000
    train_batch_size = 32
    num_epochs       = 3
    max_seq_length   = 512
    learning_rate    = 2e-5
    train_val_split  = 0.8
    output_path      = "/content/bge-fine-tuned-wdc-products"
    seed             = 42
    cache_dir        = "/content/wdc_cache"

cfg = Config()
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
    handlers=[LoggingHandler()]
)
logging.info(f"GPU available: {torch.cuda.is_available()}, device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'cpu'}")

# 3️⃣ Load & clean data
def clean_text(text):
    if not text or pd.isna(text): return ""
    return str(text).replace('"', "").replace("@en", "").strip()

def create_product_text(row):
    parts = []
    for col, val in row.items():
        if pd.notna(val) and str(val).strip():
            parts.append(f"{col}: {clean_text(val)}")
    return "\n".join(parts) or "Unknown product"

def load_data(cfg):
    logging.info("Downloading dataset snapshot…")
    local_dir = snapshot_download(
        repo_id="wdc/products-2017",
        repo_type="dataset",
        cache_dir=cfg.cache_dir
    )
    logging.info(f"Snapshot at {local_dir}")
    # Search for any JSON/JSONL/GZ file
    candidate_files = []
    for root, _, files in os.walk(local_dir):
        for fn in files:
            if fn.endswith(('.json', '.jsonl', '.gz')):
                candidate_files.append(os.path.join(root, fn))
    if not candidate_files:
        raise FileNotFoundError("No JSON, JSONL, or GZ files found in snapshot.")
    # Log all candidate files for debugging
    logging.info(f"Found candidate files: {candidate_files}")
    # Prefer files with 'small' or 'medium' in the name, or take the first one
    train_file = None
    for fn in candidate_files:
        if 'small' in fn.lower() or 'medium' in fn.lower():
            train_file = fn
            break
    if not train_file:
        train_file = candidate_files[0]  # Fallback to first file
    logging.info(f"Selected file: {train_file}")
    # Read via pandas
    try:
        if train_file.endswith(".gz"):
            df = pd.read_json(train_file, lines=True, compression="gzip")
        else:
            df = pd.read_json(train_file, lines=True)
    except Exception as e:
        raise ValueError(f"Failed to read {train_file}: {str(e)}")
    # Sample & clean
    if len(df) > cfg.sample_size:
        df = df.sample(cfg.sample_size, random_state=cfg.seed).reset_index(drop=True)
    else:
        df = df.reset_index(drop=True)
    logging.info(f"Loaded {len(df)} rows")
    df["product_text"] = df.apply(create_product_text, axis=1)
    df = df[df["product_text"].str.strip() != ""].reset_index(drop=True)
    logging.info(f"{len(df)} rows after cleaning")
    return df

# 4️⃣ Make train/val pairs
def make_pairs(df, cfg):
    random.seed(cfg.seed)
    idx = df.index.tolist()
    pos_t = int(cfg.sample_size * 0.2)
    neg_t = int(cfg.sample_size * 0.4)
    pos, neg = [], []
    # Positive by same category if exists
    if "category" in df.columns:
        cats = df["category"].dropna().unique().tolist()
        per = max(1, pos_t // max(1, len(cats)))
        for c in cats:
            ids = df[df["category"] == c].index.tolist()
            if len(ids) < 2: continue
            for _ in range(per):
                i, j = random.sample(ids, 2)
                pos.append((i, j, 1.0))
    while len(pos) < pos_t:
        i, j = random.sample(idx, 2)
        pos.append((i, j, 1.0))
    # Negative by different categories
    if "category" in df.columns:
        cats = df["category"].dropna().unique().tolist()
        while len(neg) < neg_t and len(cats) > 1:
            c1, c2 = random.sample(cats, 2)
            a = df[df["category"] == c1].index.tolist()
            b = df[df["category"] == c2].index.tolist()
            if not a or not b: continue
            neg.append((random.choice(a), random.choice(b), 0.0))
    while len(neg) < neg_t:
        i, j = random.sample(idx, 2)
        neg.append((i, j, 0.0))
    pairs = pos + neg
    random.shuffle(pairs)
    examples = [InputExample(texts=[df.loc[i].product_text, df.loc[j].product_text], label=lab)
                for i, j, lab in pairs]
    split = int(len(examples) * cfg.train_val_split)
    train_ex, val_ex = examples[:split], examples[split:]
    logging.info(f"Pairs → {len(train_ex)} train / {len(val_ex)} val")
    return train_ex, val_ex

# 5️⃣ Train & save
def train_and_save(cfg):
    df = load_data(cfg)
    train_ex, val_ex = make_pairs(df, cfg)
    model = SentenceTransformer(cfg.model_name)
    model.max_seq_length = cfg.max_seq_length
    train_loader = DataLoader(train_ex, shuffle=True, batch_size=cfg.train_batch_size)
    loss = losses.CosineSimilarityLoss(model)
    evaluator = EmbeddingSimilarityEvaluator.from_input_examples(val_ex, name="val")
    warm_steps = int(len(train_loader) * cfg.num_epochs * 0.1)
    logging.info(f"Training {cfg.num_epochs} epochs, warmup_steps={warm_steps}")
    model.fit(
        train_objectives=[(train_loader, loss)],
        evaluator=evaluator,
        epochs=cfg.num_epochs,
        warmup_steps=warm_steps,
        optimizer_params={"lr": cfg.learning_rate},
        output_path=cfg.output_path,
        save_best_model=True
    )
    logging.info(f"Model saved to {cfg.output_path}")

train_and_save(cfg)

# 6️⃣ Backup to Drive
drive.mount('/content/drive', force_remount=True)
dest = os.path.join("/content/drive/MyDrive", os.path.basename(cfg.output_path))
!cp -r "{cfg.output_path}" "{dest}"
logging.info(f"Model backed up to {dest}")

Fetching 39 files:   0%|          | 0/39 [00:00<?, ?it/s]

  """Create `ConcatenationTable` from list of tables.
  result: list[list[TableBlock]], blocks: list[list[TableBlock]]


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mbsse1415[0m ([33mbsse1415-university-of-dhaka[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Val Pearson Cosine,Val Spearman Cosine
75,No log,No log,0.035507,0.040145
150,No log,No log,0.042013,0.049757
225,No log,No log,0.043767,0.048513


Mounted at /content/drive
