# Deliverable 3

SNumbers: u264332, u264443, u264202

Names: Levente Olivér Bódi, Riccardo Zamuner, Giada Izzo

## Previous deliverable code

This is the same code of the previous deliverable minus prints and commentary

In [None]:
import re
import json
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('stopwords')
nltk.download('punkt_tab')


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/just_riccio/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/just_riccio/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [2]:
def preprocess_text(text):
    """
    Preprocess a text by tokenizing, lowercasing, removing stop words, and stemming.
    """
    
    # Tokenize the text into words
    tokens = nltk.word_tokenize(text)

    # Convert to lowercase
    tokens = [token.lower() for token in tokens]

    # Remove punctuation
    tokens = [re.sub(r"[^\w\s]", "", token) for token in tokens]
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Remove punctuation
    tokens = [re.sub(r"[^\w\s]", "", token) for token in tokens]

    # Stem the tokens
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]

    # Remove empty strings
    tokens = [token for token in tokens if token]

    return tokens

def clean_seller(text):
    """
    Clean the seller field by removing unwanted trailing phrases.
    """

    # Remove unwanted trailing phrases and everything after them
    remove_phrases = [
        "Seller changed",
        "(Not Enough Ratin",
        "(New Sell"
    ]
    for phrase in remove_phrases:
        idx = text.find(phrase)
        if idx != -1:
            text = text[:idx]
    return text.strip()

def preprocess_non_textual(document):
    """
    Preprocess non-textual fields in the document.
    """

    # Discount preprocessing: convert from string "xx% off" to integer xx
    # also taking into account documents without discount
    if isinstance(document["discount"], str) and "%" in document["discount"]:
        document["discount"] = int(document["discount"][:document["discount"].find("%")])
    else:
        document["discount"] = 0
        
    # Merge all values from product_details dictionary and preprocess
    if isinstance(document["product_details"], dict):
        details_text = " ".join(str(v) for v in document["product_details"].values())
    elif isinstance(document["product_details"], list):
        # If it's a list of dicts, merge all values from all dicts
        details_text = " ".join(str(v) for d in document["product_details"] if isinstance(d, dict) for v in d.values())
    else:
        details_text = str(document["product_details"])
    document["product_details"] = preprocess_text(details_text)

    # Convert actual_price and selling_price to integers (remove commas)
    # If actual_price is NaN, set it to discounted selling_price
    for price_field in ["actual_price", "selling_price"]:
        if isinstance(document[price_field], str):
            price_str = document[price_field].replace(",", "")
            price_val = price_str.split(".")[0]
            document[price_field] = int(price_val) if price_val.isdigit() else 0

    # If actual_price is missing or zero, set it to discounted selling_price
    if ("actual_price" not in document or document["actual_price"] == 0) and "selling_price" in document:
        document["actual_price"] = int(int(document["selling_price"])*document["discount"]/100)

    # Convert average_rating to float, set to NaN if missing or empty
    if "average_rating" in document and str(document["average_rating"]).strip() != "":
        try:
            document["average_rating"] = float(document["average_rating"])
        except ValueError:
            document["average_rating"] = float("nan")
    else:
        document["average_rating"] = float("nan")

    return document

def preprocess_document(document):
    """
    Join all preprocessing steps for a document.
    """

    document["description"] = preprocess_text(document["description"])
    document["title"] = preprocess_text(document["title"])
    document["seller"] = clean_seller(document["seller"])
    document["brand"] = document["brand"].lower().split()

    document = preprocess_non_textual(document)

    return document


In [3]:
# MODIFY THIS PATH AS NEEDED
file_path = "../../data/fashion_products_dataset.json"

with open(file_path, "r") as f:
    data = json.load(f)
    df = pd.DataFrame(data)

In [4]:
def impute_actual_price(row):
    # if actual_price is empty, try to compute it:
    # either from selling_price and discount, or just use selling_price
    if row['actual_price'] == '':
        # Convert selling_price and discount to float if not empty
        if row['selling_price'] != '' and row['discount'] != '':
            selling_price = float(str(row['selling_price']).replace(',', ''))
            discount = float(str(row['discount']).replace('%', '').replace('off', '').strip())
            return selling_price * (1 - discount / 100)
        elif row['selling_price'] != '':
            return float(str(row['selling_price']).replace(',', ''))
    return row['actual_price']

df['discount'] = df['discount'].replace('', '0')
df['actual_price'] = df.apply(impute_actual_price, axis=1)

In [5]:
# Drop the remaining products without price
df = df[(df['actual_price'] != '') & (df['selling_price'] != '')]

In [6]:
# Replace empty brand names with 'no brand'
df.loc[df['brand'] == '', 'brand'] = 'no brand'

In [7]:
df = df.apply(preprocess_document, axis=1)

In [8]:
from collections import defaultdict

def normalize_cat_token(val):
    if pd.isna(val) or str(val).strip() == "":
        return []
    # split common multi-value strings; keep a single value as 1-item list
    parts = re.split(r"[\/,;|]", str(val))
    return [re.sub(r"\s+", "_", p.strip().lower()) for p in parts if p.strip()]

In [9]:
def build_inverted_index_df(df: pd.DataFrame,id_col: str | None = None,text_cols: list[str] = ("title", "description", "product_details"),min_df: int = 1,store_positions: bool = False):
    """
    df: preprocessed dataframe (title/description/product_details are token lists).
    id_col: column holding unique ids; if None, uses df.index (as str).
    text_cols: columns with *token lists* (already stemmed, stopwords removed).
    min_df: drop terms that appear in < min_df documents.
    store_positions: if True, also keep term positions for phrase/proximity queries.
    """
    # assign doc ID's
    doc_ids = df[id_col].astype(str).tolist() if id_col else df.index.astype(str).tolist()

    per_doc_terms = []
    per_doc_sequence = []

    # gather tokens for each row
    for _, row in df.iterrows():
        tokens = []

        # text cols are already tokenized lists after preprocess_document(), we just make it robust if something slipped through
        for c in text_cols:
            if c in df.columns:
                vals = row[c]
                if isinstance(vals, (list, tuple)):
                    tokens.extend([str(t).lower() for t in vals if str(t).strip()])
                elif pd.notna(vals):
                    # if something slipped through as string, tokenize lightly:
                    tokens.extend(re.findall(r"[A-Za-z0-9]+", str(vals).lower()))


        # ensure we have a sequence for positions and a set for boolean presence
        if store_positions:
            per_doc_sequence.append(tokens[:])
        per_doc_terms.append(set(tokens))

    # build postings (term -> list[doc_id]) and df counts
    postings_tmp = defaultdict(list)
    df_count = defaultdict(int)

    for d_i, terms in enumerate(per_doc_terms):
        did = doc_ids[d_i]
        for term in terms:
            postings_tmp[term].append(did)
            df_count[term] += 1

    # min_df filter + sort postings
    postings_tmp = {t: sorted(dids) for t, dids in postings_tmp.items() if df_count[t] >= min_df}

    # vocab
    vocab = {term: tid for tid, term in enumerate(sorted(postings_tmp.keys()))}
    id2term = {tid: term for term, tid in vocab.items()}

    # final inverted index (term_id -> [doc_ids])
    inv_index = {vocab[t]: dids for t, dids in postings_tmp.items()}

    # positional index
    positional = None
    if store_positions:
        positional = {tid: defaultdict(list) for tid in inv_index.keys()}
        for d_i, seq in enumerate(per_doc_sequence):
            did = doc_ids[d_i]
            for pos, tok in enumerate(seq):
                if tok in vocab:
                    tid = vocab[tok]
                    positional[tid][did].append(pos)
        # convert inner dicts to normal dicts
        positional = {tid: dict(dmap) for tid, dmap in positional.items()}

    return {
        "vocab": vocab,            # term -> term_id
        "id2term": id2term,        # term_id -> term
        "postings": inv_index,     # term_id -> [doc_id, ...] (sorted)
        "doc_ids": doc_ids,        # all doc ids, as strings
        "positional": positional   # optional: term_id -> {doc_id: [positions]}
    }

index_obj = build_inverted_index_df(
    df,
    id_col=None,
    text_cols=df.columns,
    min_df=1,
    store_positions=False
)

In [10]:
# Quick lookups
def docs_for_term(term: str):
    """Return document IDs for a raw term or categorical token (e.g., 'brand:nike')."""
    tid = index_obj["vocab"].get(term)
    return index_obj["postings"].get(tid, []) if tid is not None else []

def doc_positions_for_term(term: str, doc_id: str):
    """Return positions of term in a specific document."""
    tid = index_obj["vocab"].get(term)
    if tid is None:
        return []
    return index_obj["positional"].get(tid, {}).get(doc_id, [])

def and_query(terms: list[str]):
    """Boolean AND over terms."""
    sets = [set(docs_for_term(t)) for t in terms]
    return sorted(set.intersection(*sets)) if sets else []

def or_query(terms: list[str]):
    """Boolean OR over terms."""
    s = set()
    for t in terms:
        s.update(docs_for_term(t))
    return sorted(s)


In [11]:
test_queries = {
    "Q1": "cotton tshirt 50 100 men blue",
    "Q2": "adidas red",
    "Q3": "denim jean skinny",
    "Q4": "dress red",
    "Q5": "leather jacket"
}

In [12]:
def calculate_tf(word, document):
    """
    Calculate term frequency for a word in a document.
    TF = Number of times term t appears in a document
    """
    return document.count(word)    
    

def calculate_idf(word, all_documents):
    """
    Calculate inverse document frequency for a word.
    IDF = log(Total number of documents / Number of documents containing term t)
    """
    num_documents_with_term = len(docs_for_term(word))
    if num_documents_with_term == 0:
        return 0
    return np.log(len(all_documents) / num_documents_with_term)

def cosine_similarity(vec1, vec2):
    """
    Calculate cosine similarity between two vectors.
    """
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    if norm_vec1 == 0 or norm_vec2 == 0:
        return 0
    return dot_product / (norm_vec1 * norm_vec2)


In [13]:
def rank_documents(query, documents, k):
    """
    Rank documents based on TF-IDF scores for the given query.
    Return the top k documents.
    """
    all_documents = [doc["description"] + doc["title"] + doc["brand"] for index, doc in documents.iterrows()]
    scores = []

    term_idfs = {term: calculate_idf(term, all_documents) for term in query}
    query_vector = np.array([calculate_tf(term, query) * term_idfs[term] for term in query])

    for index, doc in documents.iterrows():
        doc_vec = []
        doc_text = doc["description"] + doc["title"] + doc["brand"]
        for term in query:
            tf = calculate_tf(term, doc_text)
            if tf > 0:
                # used the formula tf = 1 + log_10(count)
                doc_vec.append((1 + np.log(tf)) * term_idfs[term])
            else:
                doc_vec.append(0)
        scores.append((doc, cosine_similarity(query_vector, np.array(doc_vec))))

    # Sort documents by score in descending order
    ranked_docs = sorted(scores, key=lambda x: x[1], reverse=True)

    return ranked_docs[:k]

In [14]:
def precision_at_k(retrieved_docs, relevant_docs, k):
    """
    Calculate Precision@k.
    Precision@k = (Number of relevant documents retrieved in top k) / k
    """
    retrieved_at_k = retrieved_docs[:k]
    relevant_retrieved = sum(1 for doc in retrieved_at_k if doc["pid"] in relevant_docs)
    return relevant_retrieved / k if k > 0 else 0

def recall_at_k(retrieved_docs, relevant_docs, k):
    """
    Calculate Recall@k.
    Recall@k = (Number of relevant documents retrieved in top k) / (Total number of relevant documents)
    """
    retrieved_at_k = retrieved_docs[:k]
    relevant_retrieved = sum(1 for doc in retrieved_at_k if doc["pid"] in relevant_docs)
    total_relevant = len(relevant_docs)
    return relevant_retrieved / total_relevant if total_relevant > 0 else 0

def average_precision_at_k(retrieved_docs, relevant_docs, k):
    """
    Calculate Average Precision@k.
    AP@k = Average of Precision@i for each relevant document retrieved in top k
    """
    retrieved_at_k = retrieved_docs[:k]
    relevant_retrieved = 0
    precision_sum = 0

    for i, doc in enumerate(retrieved_at_k, start=1):
        if doc["pid"] in relevant_docs:
            relevant_retrieved += 1
            precision_sum += relevant_retrieved / i

    return precision_sum / relevant_retrieved if relevant_retrieved > 0 else 0

def f1_score(precision, recall):
    """
    Calculate F1 Score.
    F1 = 2 * (Precision * Recall) / (Precision + Recall)
    """
    if precision + recall == 0:
        return 0
    return 2 * (precision * recall) / (precision + recall)

def f1_score_at_k(retrieved_docs, relevant_docs, k):
    """
    Calculate F1 Score at k.
    """
    precision = precision_at_k(retrieved_docs, relevant_docs, k)
    recall = recall_at_k(retrieved_docs, relevant_docs, k)
    return f1_score(precision, recall)

def mean_average_precision(retrieved_docs_list, relevant_docs_list, k):
    """
    Calculate Mean Average Precision (MAP) at k.
    MAP = Mean of Average Precision@k over all queries
    """
    ap_sum = 0
    num_queries = len(retrieved_docs_list)

    for retrieved_docs, relevant_docs in zip(retrieved_docs_list, relevant_docs_list):
        ap_sum += average_precision_at_k(retrieved_docs, relevant_docs, k)

    return ap_sum / num_queries if num_queries > 0 else 0

def reciprocal_rank(retrieved_docs, relevant_docs):
    """
    Calculate Reciprocal Rank (RR).
    RR = 1 / Rank of the first relevant document
    """
    rank = 0
    for i, doc in enumerate(retrieved_docs):
        if doc["pid"] in relevant_docs:
            rank = i + 1
            break
    return 1 / rank if rank > 0 else 0

def mean_reciprocal_rank(retrieved_docs_list, relevant_docs_list):
    """
    Calculate Mean Reciprocal Rank (MRR).
    MRR = Mean of Reciprocal Ranks over all queries
    """
    rr_sum = 0
    num_queries = len(retrieved_docs_list)

    for retrieved_docs, relevant_docs in zip(retrieved_docs_list, relevant_docs_list):
        rr_sum += reciprocal_rank(retrieved_docs, relevant_docs)

    return rr_sum / num_queries if num_queries > 0 else 0

def dcg_at_k(retrieved_docs, relevant_docs, k):
    """
    Calculate Discounted Cumulative Gain (DCG) at k.
    DCG@k = Sum of (relevance of document at rank i) / log2(i + 1) for i in 1 to k
    """
    dcg = 0
    for i in range(min(k, len(retrieved_docs))):
        doc = retrieved_docs[i]
        if doc["pid"] in relevant_docs:
            relevance = 1  # we only have binary relevance
        else:
            relevance = 0
        dcg += relevance / np.log2(i + 2)  # i + 2 because i starts from 0
    return dcg

def ndcg_at_k(retrieved_docs, relevant_docs, k):
    """
    Calculate Normalized Discounted Cumulative Gain (NDCG) at k.
    NDCG@k = DCG@k / IDCG@k
    """
    dcg = dcg_at_k(retrieved_docs, relevant_docs, k)
    
    ideal_retrieved_docs = [{"pid": pid} for pid in relevant_docs]
    idcg = dcg_at_k(ideal_retrieved_docs, relevant_docs, k)
    
    return dcg / idcg if idcg > 0 else 0

## Ranking & Filtering

In this part, we implement a retrieval pipeline that:
- Takes a text query as input.
- Finds all documents that contain all query terms (AND semantics).
- Sorts the matching documents by relevance using multiple ranking methods:
  1. TF‑IDF + cosine similarity
  2. BM25
  3. Our custom score (text relevance + numeric feature boosts)

We also implement a Word2Vec + cosine ranking and return top 20 lists for the 5 queries defined in Part 2.

### Global text corpus statistics for ranking
We precompute text tokens per document (description + title + brand), document lengths, and per term document frequencies over these fields, which we will reuse for TF‑IDF and BM25.

In [None]:
# Build a text-only view per document (description + title + brand)
ALL_TEXT_DOCS = [row["description"] + row["title"] + row["brand"] for _, row in df.iterrows()]
N_TEXT = len(ALL_TEXT_DOCS)

# Document lengths and average length
DOC_LENGTHS = np.array([len(toks) for toks in ALL_TEXT_DOCS], dtype=float)
AVG_DL = float(DOC_LENGTHS.mean()) if N_TEXT > 0 else 0.0

# Document frequency per term over the text only view
from collections import Counter
DF_TEXT = Counter()
for toks in ALL_TEXT_DOCS:
    DF_TEXT.update(set(toks))

def idf_text(term: str) -> float:
    """IDF using text only df: log(N / df)"""
    df_t = DF_TEXT.get(term, 0)
    if df_t == 0 or N_TEXT == 0:
        return 0.0
    return np.log(N_TEXT / df_t)

def idf_bm25(term: str) -> float:
    """BM25 IDF: log((N - df + 0.5) / (df + 0.5) + 1)"""
    df_t = DF_TEXT.get(term, 0)
    return np.log(((N_TEXT - df_t + 0.5) / (df_t + 0.5)) + 1.0)

### AND-conjunctive retrieval pipeline
We standardize query preprocessing to match the document pipeline and use the inverted index to fetch conjunctive candidates before ranking.

In [None]:
def preprocess_query(q):
    """Accepts a string or list of strings; returns normalized tokens using the same preprocessing as documents"""
    if isinstance(q, str):
        return preprocess_text(q)
    elif isinstance(q, (list, tuple)):
        return preprocess_text(" ".join(map(str, q)))
    else:
        return []

def retrieve_conjunctive_candidates(query_tokens):
    """Return candidate doc indices (list of ints) where all query terms appear (AND)"""
    if not query_tokens:
        return []
    dids = and_query(query_tokens)
    return [int(d) for d in dids]

def doc_text_tokens(row):
    return row["description"] + row["title"] + row["brand"]

def cosine(v1, v2):
    return cosine_similarity(v1, v2)

### Ranking methods
We provide three ranking functions over the conjunctive candidates:
- TF‑IDF + cosine similarity
- BM25
- Custom hybrid score (text relevance + numeric boosts)

In [None]:
def rank_tfidf_cosine(query_tokens, candidate_indices, k=10):
    # Unique terms to define vector space
    terms = list(dict.fromkeys(query_tokens))
    # Query vector (tf-idf)
    q_vec = np.array([(1 + np.log(query_tokens.count(t))) * idf_text(t) if query_tokens.count(t) > 0 else 0.0 for t in terms])

    scored = []
    for idx in candidate_indices:
        row = df.iloc[idx]
        toks = doc_text_tokens(row)
        d_vec = []
        for t in terms:
            tf = toks.count(t)
            d_vec.append(((1 + np.log(tf)) * idf_text(t)) if tf > 0 else 0.0)
        d_vec = np.array(d_vec)
        score = cosine(q_vec, d_vec)
        scored.append((row, score))
    return sorted(scored, key=lambda x: x[1], reverse=True)[:k]

def rank_bm25(query_tokens, candidate_indices, k=10, k1=1.5, b=0.75):
    scored = []
    for idx in candidate_indices:
        row = df.iloc[idx]
        toks = doc_text_tokens(row)
        dl = len(toks)
        score = 0.0
        for t in set(query_tokens):
            tf = toks.count(t)
            if tf == 0:
                continue
            idf = idf_bm25(t)
            denom = tf + k1 * (1 - b + b * (dl / (AVG_DL if AVG_DL > 0 else 1.0)))
            score += idf * (tf * (k1 + 1)) / denom
        scored.append((row, score))
    return sorted(scored, key=lambda x: x[1], reverse=True)[:k]

# Global numeric ranges for normalization
SELL_MIN, SELL_MAX = float(pd.to_numeric(df['selling_price'], errors='coerce').min()), float(pd.to_numeric(df['selling_price'], errors='coerce').max())
DISC_MIN, DISC_MAX = float(pd.to_numeric(df['discount'], errors='coerce').min()), float(pd.to_numeric(df['discount'], errors='coerce').max())
RAT_MIN, RAT_MAX = 0.0, 5.0  # ratings are on 0..5 scale

def _norm(x, lo, hi):
    try:
        xv = float(x)
    except Exception:
        xv = 0.0
    if hi <= lo:
        return 0.0
    return (xv - lo) / (hi - lo)

def rank_custom_hybrid(query_tokens, candidate_indices, k=10,
                       base='bm25', k1=1.5, b=0.75,
                       w_rating=0.30, w_discount=0.20, w_price=0.10):
    """
    Custom score = BaseTextScore * (1 + w_rating*rating_norm + w_discount*discount_norm - w_price*price_norm)
    - BaseTextScore: BM25 (default) or TF‑IDF cosine
    - Boosts: higher rating and discount help; higher price penalizes slightly
    """
    # Precompute base scores once
    if base == 'bm25':
        base_scored = rank_bm25(query_tokens, candidate_indices, k=len(candidate_indices), k1=k1, b=b)
    else:
        base_scored = rank_tfidf_cosine(query_tokens, candidate_indices, k=len(candidate_indices))

    out = []
    for row, base_score in base_scored:
        rating = 0.0 if pd.isna(row.get('average_rating', np.nan)) else float(row['average_rating'])
        discount = row.get('discount', 0)
        price = row.get('selling_price', 0)

        rating_n = _norm(rating, RAT_MIN, RAT_MAX)
        discount_n = _norm(discount, DISC_MIN, DISC_MAX)
        price_n = _norm(price, SELL_MIN, SELL_MAX)

        factor = 1.0 + (w_rating * rating_n) + (w_discount * discount_n) - (w_price * price_n)
        final_score = float(base_score) * max(factor, 0.0)
        out.append((row, final_score))

    return sorted(out, key=lambda x: x[1], reverse=True)[:k]

### TF‑IDF vs BM25: Pros and Cons

- TF‑IDF + cosine
  - Pros: simple, fast, well understood; natural cosine normalization makes it robust to document length to some extent.
  - Cons: raw tf grows unbounded and favors long documents; no saturation, so additional occurrences keep boosting; length normalization is implicit and weaker than BM25.

- BM25
  - Pros: tf saturation (diminishing returns); explicit length normalization with parameter b; strong and robust baseline in IR.
  - Cons: requires hyperparameters (k1, b); scores are not normalized to [0,1] which can make mixing with other features less straightforward.

Our custom hybrid score starts from a strong text base (BM25) and adds interpretable business signals: higher rating and bigger discount are preferred, while very high price is slightly penalized. This can better reflect user utility when text matches are similar. Downsides: requires choosing weights and assumes the same utility for all users (no personalization).

### Conjunctive retrieval + ranking (TF‑IDF, BM25, Custom)
We run the 5 queries from Part 2 through the conjunctive filter and show the top‑10 pids for each ranking method.

In [18]:
def top_k_pid_title(scored, k=10):
    out = []
    for row, s in scored[:k]:
        title_str = " ".join(row['title']) if isinstance(row['title'], list) else str(row['title'])
        out.append((row['pid'], round(float(s), 4), title_str))
    return out

demo_results = {}
for qid, q_terms in test_queries.items():
    q_tokens = preprocess_query(q_terms)
    cand_idx = retrieve_conjunctive_candidates(q_tokens)
    tfidf_res = rank_tfidf_cosine(q_tokens, cand_idx, k=10)
    bm25_res = rank_bm25(q_tokens, cand_idx, k=10)
    custom_res = rank_custom_hybrid(q_tokens, cand_idx, k=10, base='bm25')
    demo_results[qid] = {
        'TFIDF': top_k_pid_title(tfidf_res, 10),
        'BM25': top_k_pid_title(bm25_res, 10),
        'CUSTOM': top_k_pid_title(custom_res, 10)
    }

In [24]:
for qid, res in demo_results.items():
    print(f"{qid} TFIDF top-10:", res['TFIDF'])
    print(f"{qid} BM25 top-10:", res['BM25'])
    print(f"{qid} CUSTOM top-10:", res['CUSTOM'])

Q1 TFIDF top-10: [('TSHFZKM8HSCZDQER', 0.2952, 'graphic print men round neck black dark blue tshirt pack 2'), ('TSHFZPENQKGHKYTU', 0.2952, 'graphic print men round neck dark blue red tshirt pack 2'), ('TSHFZQ3HHXFYTQJG', 0.2952, 'graphic print men round neck pink dark blue tshirt pack 2'), ('TSHFZQZAZD8MYBGZ', 0.2952, 'graphic print men round neck beig dark blue tshirt pack 2'), ('TSHFZQZAE9C5UTTD', 0.2952, 'graphic print men round neck white dark blue tshirt pack 2'), ('TSHFZ9JSB5MB9AAG', 0.2936, 'solid men v neck dark blue tshirt'), ('TSHFZ9K3AX2MBCPM', 0.2936, 'solid men round neck blue tshirt'), ('TSHFKKC5HS8TEJZW', 0.2936, 'stripe men polo neck blue tshirt'), ('TSHFYPGNHMKRUWAH', 0.2923, 'print men round neck dark blue tshirt'), ('TSHFYWFQMUCGYVUV', 0.2854, 'graphic print women round neck dark blue tshirt')]
Q1 BM25 top-10: [('TSHFYPGNHMKRUWAH', 6.3481, 'print men round neck dark blue tshirt'), ('TSHFNWZUHE6PTYUG', 6.1676, 'typographi women round neck blue tshirt'), ('TSHFZ9K3AX2M

## Word2Vec + cosine ranking
We train a Word2Vec model on the corpus (description + title + brand tokens). A query or document is represented by averaging the vectors of its words. We then compute cosine similarity between the query vector and candidate document vectors.

We return the top 20 documents for each of the 5 queries under AND semantics.

In [20]:
try:
    from gensim.models import Word2Vec
except ImportError:
    raise ImportError("error")

w2v_dim = 100
w2v_model = Word2Vec(
    sentences=ALL_TEXT_DOCS,
    vector_size=w2v_dim,
    window=5,
    min_count=2,
    workers=4,
    sg=1,
    epochs=10
)

In [21]:
def average_w2v(tokens, model):
    vecs = [model.wv[t] for t in tokens if t in model.wv]
    if not vecs:
        return None
    return np.mean(vecs, axis=0)

def rank_w2v_cosine(query_tokens, candidate_indices, k=20):
    q_vec = average_w2v(query_tokens, w2v_model)
    if q_vec is None:
        return []
    scored = []
    for idx in candidate_indices:
        row = df.iloc[idx]
        d_vec = average_w2v(doc_text_tokens(row), w2v_model)
        if d_vec is None:
            continue
        score = cosine(q_vec, d_vec)
        scored.append((row, score))
    return sorted(scored, key=lambda x: x[1], reverse=True)[:k]

In [22]:
w2v_results = {}
for qid, q_terms in test_queries.items():
    q_tokens = preprocess_query(q_terms)
    cand_idx = retrieve_conjunctive_candidates(q_tokens)
    w2v_scored = rank_w2v_cosine(q_tokens, cand_idx, k=20)
    w2v_results[qid] = [(row['pid'], round(float(score), 4), " ".join(row['title']) if isinstance(row['title'], list) else str(row['title'])) for row, score in w2v_scored]

In [25]:
print("Top 20 by Word2Vec + cosine (conjunctive):")
for qid in ["Q1","Q2","Q3","Q4","Q5"]:
    print(f"{qid}:", w2v_results.get(qid, []))

Top 20 by Word2Vec + cosine (conjunctive):
Q1: [('TSHFH8HGZQYERVKF', 0.8011, 'solid men polo neck red white blue tshirt pack 3'), ('TSHFUNN2PHXF7GUH', 0.7647, 'solid men round neck dark blue tshirt'), ('TSHEU7DTUYMHMGW6', 0.7595, 'stripe women polo neck blue tshirt'), ('TSHFZQZAZD8MYBGZ', 0.7558, 'graphic print men round neck beig dark blue tshirt pack 2'), ('TSHFNWZUHE6PTYUG', 0.7549, 'typographi women round neck blue tshirt'), ('TSHFUNN2GGPB4PEH', 0.754, 'print women round neck white blue tshirt'), ('TSHFZ9JSB5MB9AAG', 0.7525, 'solid men v neck dark blue tshirt'), ('TSHFZQ3HHXFYTQJG', 0.7525, 'graphic print men round neck pink dark blue tshirt pack 2'), ('TSHFZKM8HSCZDQER', 0.7513, 'graphic print men round neck black dark blue tshirt pack 2'), ('TSHFUNN3ZJEKSAEV', 0.7512, 'print men round neck dark blue tshirt'), ('TSHFZPENQKGHKYTU', 0.7506, 'graphic print men round neck dark blue red tshirt pack 2'), ('TSHFZQZAE9C5UTTD', 0.7496, 'graphic print men round neck white dark blue tshirt p

## Beyond Word2Vec: better text representations?

- Doc2Vec: learns document embeddings directly instead of averaging word vectors.
  - Pros: captures document‑level semantics and word order context better than plain averaging.
  - Cons: requires training on a large in‑domain corpus; quality depends on hyperparameters and data size.

- Sentence embeddings (e.g., Sentence‑BERT): transformer models that produce semantically meaningful sentence/document vectors.
  - Pros: strong performance on semantic similarity out of the box; captures context and polysemy.
  - Cons: heavier to run; may require domain adaptation for best results; vectors are dense and larger.

- Weighted Word2Vec (IDF‑weighted averaging): cheap improvement over uniform averaging by giving more weight to rarer, more informative terms.
  - Pros: simple, fast, often better than plain average; easy to integrate with our existing IDF.
  - Cons: still bag‑of‑words; limited ability to capture word order or complex context.

Given our dataset size and the need for efficient retrieval, a strong next step would be to use IDF‑weighted Word2Vec or a lightweight sentence embedding model to improve semantic matching while keeping latency acceptable. For high accuracy scenarios (and if compute allows), Sentence‑BERT or similar models generally outperform Word2Vec.