# Deliverable 4

SNumbers: u264332, u264443, u264202

Names: Levente Olivér Bódi, Riccardo Zamuner, Giada Izzo

## Previous deliverable code

This is the same code as in the previous deliverable, minus prints and commentary.

In [1]:
import pandas as pd
import numpy as np
import json
import re
import os
import uuid
import datetime
from flask import Flask, request, render_template_string, jsonify
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)

True

In [2]:
def preprocess_text(text):
    """
    Preprocess a text by tokenizing, lowercasing, removing stop words, and stemming.

    Steps, in order: NLTK word tokenization, lowercasing, stripping every
    character that is neither a word character nor whitespace, stop-word
    removal, Porter stemming, and dropping tokens that ended up empty.

    Returns the list of stemmed tokens.
    """
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    # Lowercase and strip punctuation in one pass. The punctuation regex used
    # to be applied a second time after stop-word removal; that pass could not
    # change anything (the tokens were already stripped), so it is gone.
    cleaned = (
        re.sub(r"[^\w\s]", "", token.lower())
        for token in nltk.word_tokenize(text)
    )

    # Stop words are matched against the stripped, lowercased form.
    stems = (stemmer.stem(token) for token in cleaned if token not in stop_words)

    # Pure-punctuation tokens collapse to "" above; drop them here.
    return [stem for stem in stems if stem]

def clean_seller(text):
    """
    Strip known trailing annotations from a seller name.

    For each marker phrase, in order, everything from its first occurrence
    onwards is cut off. The remainder is returned without surrounding
    whitespace.
    """
    for marker in ("Seller changed", "(Not Enough Ratin", "(New Sell"):
        # partition() returns the whole string as head when the marker is absent
        text = text.partition(marker)[0]
    return text.strip()

def preprocess_non_textual(document):
    """
    Preprocess non-textual fields in the document.

    The document is modified in place and also returned:
    - discount: "xx% off" string -> integer xx (0 when there is no discount)
    - product_details: all values merged into one text and tokenized with
      preprocess_text
    - actual_price / selling_price: strings such as "1,299" -> integers
      (0 when the string is not numeric)
    - actual_price: reconstructed from selling_price and discount when it is
      missing or zero
    - average_rating: float, NaN when missing, empty or unparsable
    """

    # Discount preprocessing: convert from string "xx% off" to integer xx
    # also taking into account documents without discount
    if isinstance(document["discount"], str) and "%" in document["discount"]:
        document["discount"] = int(document["discount"][:document["discount"].find("%")])
    else:
        document["discount"] = 0

    # Merge all values from product_details dictionary and preprocess
    if isinstance(document["product_details"], dict):
        details_text = " ".join(str(v) for v in document["product_details"].values())
    elif isinstance(document["product_details"], list):
        # If it's a list of dicts, merge all values from all dicts
        details_text = " ".join(str(v) for d in document["product_details"] if isinstance(d, dict) for v in d.values())
    else:
        details_text = str(document["product_details"])
    document["product_details"] = preprocess_text(details_text)

    # Convert actual_price and selling_price to integers (remove commas,
    # keep only the part before the decimal point)
    for price_field in ["actual_price", "selling_price"]:
        if isinstance(document[price_field], str):
            price_str = document[price_field].replace(",", "")
            price_val = price_str.split(".")[0]
            document[price_field] = int(price_val) if price_val.isdigit() else 0

    # If actual_price is missing or zero, rebuild the pre-discount price.
    # selling = actual * (1 - discount/100), hence actual = selling / (1 - discount/100).
    # The previous formula (selling * discount / 100) produced the discount
    # amount instead, and 0 for products without a discount.
    if ("actual_price" not in document or document["actual_price"] == 0) and "selling_price" in document:
        selling = int(document["selling_price"])
        discount = document["discount"]
        if 0 < discount < 100:
            document["actual_price"] = int(round(selling / (1 - discount / 100)))
        else:
            # No discount (or a degenerate 100%): the best estimate is the selling price
            document["actual_price"] = selling

    # Convert average_rating to float, set to NaN if missing or empty
    if "average_rating" in document and str(document["average_rating"]).strip() != "":
        try:
            document["average_rating"] = float(document["average_rating"])
        except ValueError:
            document["average_rating"] = float("nan")
    else:
        document["average_rating"] = float("nan")

    return document

def preprocess_document(document):
    """
    Apply every preprocessing step to one document and return it.

    description and title become token lists, seller is cleaned, brand is
    lowercased and split on whitespace, then the non-textual fields are
    normalized.
    """
    for text_field in ("description", "title"):
        document[text_field] = preprocess_text(document[text_field])

    document["seller"] = clean_seller(document["seller"])
    document["brand"] = document["brand"].lower().split()

    return preprocess_non_textual(document)


In [3]:
# MODIFY THIS PATH AS NEEDED
file_path = "../../data/fashion_products_dataset.json"

# Read with an explicit encoding so decoding does not depend on the
# platform's default locale.
with open(file_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# The file handle is no longer needed once the JSON is parsed.
df = pd.DataFrame(data)

In [4]:
def impute_actual_price(row):
    """
    Return the actual (pre-discount) price for a row.

    An existing actual_price is returned untouched. When it is empty, the
    price is reconstructed from selling_price and discount, or falls back to
    selling_price alone. When selling_price is empty too, the empty
    actual_price is returned as is.
    """
    if row['actual_price'] != '':
        return row['actual_price']
    if row['selling_price'] == '':
        # Nothing to derive the price from
        return row['actual_price']

    selling_price = float(str(row['selling_price']).replace(',', ''))
    if row['discount'] == '':
        return selling_price

    discount = float(str(row['discount']).replace('%', '').replace('off', '').strip())
    # selling = actual * (1 - discount/100)  =>  actual = selling / (1 - discount/100).
    # The previous code multiplied instead of dividing, which yields a value
    # below the selling price.
    if 0 < discount < 100:
        return selling_price / (1 - discount / 100)
    # No discount, or a degenerate 100% that would divide by zero
    return selling_price

# Treat a missing discount as "0" so the imputation below can always parse it
df['discount'] = df['discount'].replace('', '0')
# Fill empty actual_price values row by row; rows that already have a price keep it
df['actual_price'] = df.apply(impute_actual_price, axis=1)

In [5]:
# Drop the remaining products without price.
# .copy() makes the filtered result an independent frame, so the in-place
# assignments that follow do not act on a slice of the unfiltered data
# (which pandas reports as SettingWithCopyWarning).
df = df[(df['actual_price'] != '') & (df['selling_price'] != '')].copy()

In [6]:
# Replace empty brand names with 'no brand' so every product carries a
# non-empty brand string before it is lowercased and split during preprocessing
df.loc[df['brand'] == '', 'brand'] = 'no brand'

In [7]:
# Run the full preprocessing pipeline on every row (axis=1 hands each row to preprocess_document)
df = df.apply(preprocess_document, axis=1)

In [8]:
from collections import defaultdict

def normalize_cat_token(val):
    """
    Turn a categorical value into a list of normalized tokens.

    The value is split on '/', ',', ';' and '|'. Each non-blank part is
    trimmed, lowercased, and has its whitespace runs replaced by '_'.
    Missing or blank values give an empty list.
    """
    if pd.isna(val) or str(val).strip() == "":
        return []

    tokens = []
    for piece in re.split(r"[\/,;|]", str(val)):
        piece = piece.strip()
        if piece:
            tokens.append(re.sub(r"\s+", "_", piece.lower()))
    return tokens

In [9]:
def build_inverted_index_df(df: pd.DataFrame,id_col: str | None = None,text_cols: list[str] = ("title", "description", "product_details"),min_df: int = 1,store_positions: bool = False):
    """
    df: preprocessed dataframe (title/description/product_details are token lists).
    id_col: column holding unique ids; if None, uses df.index (as str).
    text_cols: columns with *token lists* (already stemmed, stopwords removed).
    min_df: drop terms that appear in < min_df documents.
    store_positions: if True, also keep term positions for phrase/proximity queries.
    """
    # assign doc ID's
    doc_ids = df[id_col].astype(str).tolist() if id_col else df.index.astype(str).tolist()

    per_doc_terms = []
    per_doc_sequence = []

    # gather tokens for each row
    for _, row in df.iterrows():
        tokens = []

        # text cols are already tokenized lists after preprocess_document(), we just make it robust if something slipped through
        for c in text_cols:
            if c in df.columns:
                vals = row[c]
                if isinstance(vals, (list, tuple)):
                    tokens.extend([str(t).lower() for t in vals if str(t).strip()])
                elif pd.notna(vals):
                    # if something slipped through as string, tokenize lightly:
                    tokens.extend(re.findall(r"[A-Za-z0-9]+", str(vals).lower()))


        # ensure we have a sequence for positions and a set for boolean presence
        if store_positions:
            per_doc_sequence.append(tokens[:])
        per_doc_terms.append(set(tokens))

    # build postings (term -> list[doc_id]) and df counts
    postings_tmp = defaultdict(list)
    df_count = defaultdict(int)

    for d_i, terms in enumerate(per_doc_terms):
        did = doc_ids[d_i]
        for term in terms:
            postings_tmp[term].append(did)
            df_count[term] += 1

    # min_df filter + sort postings
    postings_tmp = {t: sorted(dids) for t, dids in postings_tmp.items() if df_count[t] >= min_df}

    # vocab
    vocab = {term: tid for tid, term in enumerate(sorted(postings_tmp.keys()))}
    id2term = {tid: term for term, tid in vocab.items()}

    # final inverted index (term_id -> [doc_ids])
    inv_index = {vocab[t]: dids for t, dids in postings_tmp.items()}

    # positional index
    positional = None
    if store_positions:
        positional = {tid: defaultdict(list) for tid in inv_index.keys()}
        for d_i, seq in enumerate(per_doc_sequence):
            did = doc_ids[d_i]
            for pos, tok in enumerate(seq):
                if tok in vocab:
                    tid = vocab[tok]
                    positional[tid][did].append(pos)
        # convert inner dicts to normal dicts
        positional = {tid: dict(dmap) for tid, dmap in positional.items()}

    return {
        "vocab": vocab,            # term -> term_id
        "id2term": id2term,        # term_id -> term
        "postings": inv_index,     # term_id -> [doc_id, ...] (sorted)
        "doc_ids": doc_ids,        # all doc ids, as strings
        "positional": positional   # optional: term_id -> {doc_id: [positions]}
    }

# Build the module-level index that the lookup helpers read.
# NOTE(review): text_cols=df.columns lets every column of df contribute terms,
# not only the token-list columns described in the function's docstring —
# confirm this is intended.
# With store_positions=False the "positional" entry of index_obj is None.
index_obj = build_inverted_index_df(
    df,
    id_col=None,
    text_cols=df.columns,
    min_df=1,
    store_positions=False
)

In [10]:
# Quick lookups
def docs_for_term(term: str):
    """Return the posting list (document ids) of a term, or [] when the term is not indexed."""
    term_id = index_obj["vocab"].get(term)
    if term_id is None:
        return []
    return index_obj["postings"].get(term_id, [])

def doc_positions_for_term(term: str, doc_id: str):
    """
    Return the positions of a term in a specific document.

    Returns [] when the term is unknown, when the document does not contain
    it, or when the index holds no positional data.
    """
    positional = index_obj["positional"]
    if positional is None:
        # The index was built without positions; there is nothing to look up.
        return []
    tid = index_obj["vocab"].get(term)
    if tid is None:
        return []
    return positional.get(tid, {}).get(doc_id, [])

def and_query(terms: list[str]):
    """Return the sorted ids of the documents containing every term (empty for no terms)."""
    posting_sets = [set(docs_for_term(term)) for term in terms]
    if not posting_sets:
        return []
    common = posting_sets[0]
    for postings in posting_sets[1:]:
        common = common & postings
    return sorted(common)

def or_query(terms: list[str]):
    """Return the sorted ids of the documents containing at least one of the terms."""
    matched = set().union(*(docs_for_term(term) for term in terms))
    return sorted(matched)


In [11]:
# Evaluation queries keyed by query id. The values are raw query strings;
# they are passed through preprocess_query before retrieval.
test_queries = {
    "Q1": "cotton tshirt 50 100 men blue",
    "Q2": "adidas red",
    "Q3": "denim jean skinny",
    "Q4": "dress red",
    "Q5": "leather jacket"
}

In [12]:
def calculate_tf(word, document):
    """
    Return the raw term frequency of a word in a document.

    The count is delegated to the document's own count() method.
    """
    occurrences = document.count(word)
    return occurrences
    

def calculate_idf(word, all_documents):
    """
    Return the inverse document frequency of a word.

    IDF = log(len(all_documents) / document frequency), where the document
    frequency comes from docs_for_term. A word with no postings gets 0.
    """
    doc_freq = len(docs_for_term(word))
    return np.log(len(all_documents) / doc_freq) if doc_freq != 0 else 0

def cosine_similarity(vec1, vec2):
    """
    Return the cosine of the angle between two vectors.

    When either vector has zero norm the similarity is defined as 0.
    """
    numerator = np.dot(vec1, vec2)
    magnitudes = (np.linalg.norm(vec1), np.linalg.norm(vec2))
    for magnitude in magnitudes:
        if magnitude == 0:
            return 0
    return numerator / (magnitudes[0] * magnitudes[1])


In [13]:
def rank_documents(query, documents, k):
    """
    Rank documents by TF-IDF cosine similarity to the query.

    query: list of (already preprocessed) query tokens.
    documents: dataframe whose description, title and brand cells are token
        lists; their concatenation is the text of a document.
    k: number of results to return.

    Returns a list of at most k (row, score) tuples, best score first.
    """
    # Walk the frame once and keep each row next to its concatenated text
    # (the frame used to be iterated twice, rebuilding the same text).
    doc_texts = [
        (doc, doc["description"] + doc["title"] + doc["brand"])
        for _, doc in documents.iterrows()
    ]
    all_documents = [text for _, text in doc_texts]

    term_idfs = {term: calculate_idf(term, all_documents) for term in query}
    # Query side uses the raw term count as tf
    query_vector = np.array([calculate_tf(term, query) * term_idfs[term] for term in query])

    scores = []
    for doc, doc_text in doc_texts:
        doc_vec = []
        for term in query:
            tf = calculate_tf(term, doc_text)
            if tf > 0:
                # Sub-linear tf scaling: 1 + ln(count). np.log is the natural
                # logarithm, not log_10.
                doc_vec.append((1 + np.log(tf)) * term_idfs[term])
            else:
                doc_vec.append(0)
        scores.append((doc, cosine_similarity(query_vector, np.array(doc_vec))))

    # Sort documents by score in descending order
    ranked_docs = sorted(scores, key=lambda x: x[1], reverse=True)

    return ranked_docs[:k]

In [14]:
def precision_at_k(retrieved_docs, relevant_docs, k):
    """
    Precision@k: relevant documents among the first k results, divided by k.

    Documents are matched through their "pid". Returns 0 when k is not positive.
    """
    hits = 0
    for doc in retrieved_docs[:k]:
        if doc["pid"] in relevant_docs:
            hits += 1
    if k > 0:
        return hits / k
    return 0

def recall_at_k(retrieved_docs, relevant_docs, k):
    """
    Recall@k: relevant documents among the first k results, divided by the
    total number of relevant documents.

    Returns 0 when there are no relevant documents.
    """
    hits = 0
    for doc in retrieved_docs[:k]:
        if doc["pid"] in relevant_docs:
            hits += 1
    n_relevant = len(relevant_docs)
    if n_relevant > 0:
        return hits / n_relevant
    return 0

def average_precision_at_k(retrieved_docs, relevant_docs, k):
    """
    Average Precision@k.

    Averages Precision@i over the ranks i (within the first k results) that
    hold a relevant document. Returns 0 when none of them is relevant.
    """
    precisions = []
    for rank, doc in enumerate(retrieved_docs[:k], start=1):
        if doc["pid"] in relevant_docs:
            # len(precisions) + 1 relevant documents seen up to this rank
            precisions.append((len(precisions) + 1) / rank)

    if not precisions:
        return 0
    return sum(precisions) / len(precisions)

def f1_score(precision, recall):
    """
    F1 score: the harmonic mean of precision and recall.

    Returns 0 when precision and recall sum to zero.
    """
    total = precision + recall
    if total == 0:
        return 0
    return 2 * (precision * recall) / total

def f1_score_at_k(retrieved_docs, relevant_docs, k):
    """F1 score computed from Precision@k and Recall@k."""
    return f1_score(
        precision_at_k(retrieved_docs, relevant_docs, k),
        recall_at_k(retrieved_docs, relevant_docs, k),
    )

def mean_average_precision(retrieved_docs_list, relevant_docs_list, k):
    """
    Mean Average Precision (MAP) at k.

    Sums Average Precision@k over the paired (retrieved, relevant) lists and
    divides by the number of retrieved lists. Returns 0 for no queries.
    """
    num_queries = len(retrieved_docs_list)
    if num_queries <= 0:
        return 0

    ap_total = 0
    for retrieved, relevant in zip(retrieved_docs_list, relevant_docs_list):
        ap_total += average_precision_at_k(retrieved, relevant, k)
    return ap_total / num_queries

def reciprocal_rank(retrieved_docs, relevant_docs):
    """
    Reciprocal Rank: 1 / (1-based rank of the first relevant document).

    Returns 0 when no retrieved document is relevant.
    """
    for position, doc in enumerate(retrieved_docs, start=1):
        if doc["pid"] in relevant_docs:
            return 1 / position
    return 0

def mean_reciprocal_rank(retrieved_docs_list, relevant_docs_list):
    """
    Mean Reciprocal Rank (MRR).

    Sums the reciprocal rank over the paired (retrieved, relevant) lists and
    divides by the number of retrieved lists. Returns 0 for no queries.
    """
    num_queries = len(retrieved_docs_list)
    if num_queries <= 0:
        return 0

    rr_total = 0
    for retrieved, relevant in zip(retrieved_docs_list, relevant_docs_list):
        rr_total += reciprocal_rank(retrieved, relevant)
    return rr_total / num_queries

def dcg_at_k(retrieved_docs, relevant_docs, k):
    """
    Discounted Cumulative Gain at k with binary relevance.

    Each of the first k results contributes relevance / log2(rank + 1), with
    rank starting at 1 and relevance 1 for relevant documents, 0 otherwise.
    """
    total = 0
    # max(k, 0): a negative k must yield no documents at all
    for rank, doc in enumerate(retrieved_docs[:max(k, 0)], start=1):
        gain = 1 if doc["pid"] in relevant_docs else 0
        total += gain / np.log2(rank + 1)
    return total

def ndcg_at_k(retrieved_docs, relevant_docs, k):
    """
    Normalized DCG at k: DCG@k divided by the ideal DCG@k.

    The ideal ranking is a list made of the relevant documents only.
    Returns 0 when the ideal DCG is 0.
    """
    ideal_ranking = [{"pid": pid} for pid in relevant_docs]
    ideal = dcg_at_k(ideal_ranking, relevant_docs, k)
    actual = dcg_at_k(retrieved_docs, relevant_docs, k)

    if ideal > 0:
        return actual / ideal
    return 0

In [15]:
# Build a text-only view per document (description + title + brand)
# Each entry is the concatenated token list of one row, in df row order.
ALL_TEXT_DOCS = [row["description"] + row["title"] + row["brand"] for _, row in df.iterrows()]
N_TEXT = len(ALL_TEXT_DOCS)

# Document lengths (token counts) and average length
DOC_LENGTHS = np.array([len(toks) for toks in ALL_TEXT_DOCS], dtype=float)
AVG_DL = float(DOC_LENGTHS.mean()) if N_TEXT > 0 else 0.0

# Document frequency per term over the text only view
from collections import Counter
DF_TEXT = Counter()
for toks in ALL_TEXT_DOCS:
    # set(toks): a document counts once per term, however often the term repeats
    DF_TEXT.update(set(toks))

def idf_text(term: str) -> float:
    """Return log(N_TEXT / df) over the text-only view, or 0.0 when the term or the corpus is empty."""
    doc_freq = DF_TEXT.get(term, 0)
    if doc_freq == 0 or N_TEXT == 0:
        return 0.0
    ratio = N_TEXT / doc_freq
    return np.log(ratio)

def idf_bm25(term: str) -> float:
    """Return the BM25 IDF of a term: log((N - df + 0.5) / (df + 0.5) + 1)."""
    doc_freq = DF_TEXT.get(term, 0)
    odds = (N_TEXT - doc_freq + 0.5) / (doc_freq + 0.5)
    return np.log(odds + 1.0)

In [16]:
def preprocess_query(q):
    """
    Normalize a query with the same preprocessing as the documents.

    Accepts a string, or a list/tuple whose items are stringified and joined
    with spaces. Any other input gives an empty token list.
    """
    if isinstance(q, (list, tuple)):
        q = " ".join(map(str, q))
    elif not isinstance(q, str):
        return []
    return preprocess_text(q)

def retrieve_conjunctive_candidates(query_tokens):
    """
    Return the positions (ints) of the rows of df in which all query terms appear (AND).

    The inverted index stores document ids taken from df.index, i.e. index
    labels. The rankers read rows with df.iloc, which is positional, so the
    labels are translated to positions here. The two only coincide while no
    row has been dropped from the frame.
    """
    if not query_tokens:
        return []
    labels = [int(d) for d in and_query(query_tokens)]
    positions = df.index.get_indexer(labels)
    # get_indexer marks labels that are not in the index with -1
    return [int(pos) for pos in positions if pos >= 0]

def doc_text_tokens(row):
    """Return the row's description, title and brand tokens concatenated in that order."""
    combined = row["description"] + row["title"]
    return combined + row["brand"]

def cosine(v1, v2):
    """Short alias for cosine_similarity."""
    similarity = cosine_similarity(v1, v2)
    return similarity

In [17]:
def rank_tfidf_cosine(query_tokens, candidate_indices, k=10):
    """
    Rank candidate rows by TF-IDF cosine similarity to the query.

    The vector space has one dimension per distinct query term (first-seen
    order). Query and document weights are (1 + ln(tf)) * idf_text(term),
    with 0 for absent terms. Returns at most k (row, score) tuples, best first.
    """
    # Distinct query terms, keeping first-seen order
    terms = list(dict.fromkeys(query_tokens))
    idf_by_term = {t: idf_text(t) for t in terms}

    def weight(tf, term):
        # Log-scaled tf times idf; terms that do not occur weigh nothing
        if tf > 0:
            return (1 + np.log(tf)) * idf_by_term[term]
        return 0.0

    q_vec = np.array([weight(query_tokens.count(t), t) for t in terms])

    results = []
    for idx in candidate_indices:
        row = df.iloc[idx]
        doc_tokens = doc_text_tokens(row)
        d_vec = np.array([weight(doc_tokens.count(t), t) for t in terms])
        results.append((row, cosine(q_vec, d_vec)))

    results.sort(key=lambda pair: pair[1], reverse=True)
    return results[:k]

def rank_bm25(query_tokens, candidate_indices, k=10, k1=1.5, b=0.75):
    """
    Rank candidate rows with BM25.

    Every distinct query term present in a document adds
    idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * dl / avg_dl)).
    Returns at most k (row, score) tuples, best first.
    """
    # Guard against an empty corpus, where the average length is 0
    avg_len = AVG_DL if AVG_DL > 0 else 1.0

    results = []
    for idx in candidate_indices:
        row = df.iloc[idx]
        doc_tokens = doc_text_tokens(row)
        length_norm = k1 * (1 - b + b * (len(doc_tokens) / avg_len))

        total = 0.0
        for term in set(query_tokens):
            tf = doc_tokens.count(term)
            if tf != 0:
                total += idf_bm25(term) * (tf * (k1 + 1)) / (tf + length_norm)
        results.append((row, total))

    results.sort(key=lambda pair: pair[1], reverse=True)
    return results[:k]

# Global numeric ranges for normalization
# Price and discount bounds are read from the data; values that cannot be
# parsed as numbers are coerced to NaN by to_numeric before min/max are taken.
SELL_MIN, SELL_MAX = float(pd.to_numeric(df['selling_price'], errors='coerce').min()), float(pd.to_numeric(df['selling_price'], errors='coerce').max())
DISC_MIN, DISC_MAX = float(pd.to_numeric(df['discount'], errors='coerce').min()), float(pd.to_numeric(df['discount'], errors='coerce').max())
# Rating bounds are fixed rather than derived from the data
RAT_MIN, RAT_MAX = 0.0, 5.0  # ratings are on 0..5 scale

def _norm(x, lo, hi):
    """
    Min-max normalize x against the range [lo, hi].

    Values that cannot be converted to float count as 0.0. An empty or
    inverted range (hi <= lo) gives 0.0. The result is not clipped.
    """
    try:
        value = float(x)
    except Exception:
        value = 0.0
    return 0.0 if hi <= lo else (value - lo) / (hi - lo)

def rank_custom_hybrid(query_tokens, candidate_indices, k=10,
                       base='bm25', k1=1.5, b=0.75,
                       w_rating=0.30, w_discount=0.20, w_price=0.10):
    """
    Rank candidates by a text score adjusted with product metadata.

    score = text_score * max(1 + w_rating * rating_n + w_discount * discount_n
                               - w_price * price_n, 0)

    The text score is BM25 when base == 'bm25' and TF-IDF cosine otherwise.
    rating_n, discount_n and price_n are min-max normalized against the
    module-level ranges; a missing rating counts as 0.
    Returns at most k (row, score) tuples, best first.
    """
    # Score all candidates with the text model first
    n_candidates = len(candidate_indices)
    if base == 'bm25':
        text_ranked = rank_bm25(query_tokens, candidate_indices, k=n_candidates, k1=k1, b=b)
    else:
        text_ranked = rank_tfidf_cosine(query_tokens, candidate_indices, k=n_candidates)

    def boost(row):
        # Multiplicative factor from rating, discount and price; never negative
        raw_rating = row.get('average_rating', np.nan)
        rating = 0.0 if pd.isna(raw_rating) else float(raw_rating)
        bonus = w_rating * _norm(rating, RAT_MIN, RAT_MAX)
        bonus_discount = w_discount * _norm(row.get('discount', 0), DISC_MIN, DISC_MAX)
        penalty = w_price * _norm(row.get('selling_price', 0), SELL_MIN, SELL_MAX)
        return max(1.0 + bonus + bonus_discount - penalty, 0.0)

    rescored = [(row, float(text_score) * boost(row)) for row, text_score in text_ranked]
    rescored.sort(key=lambda pair: pair[1], reverse=True)
    return rescored[:k]

In [18]:
def top_k_pid_title(scored, k=10):
    """
    Summarize the first k (row, score) pairs as (pid, score, title) tuples.

    The score is rounded to 4 decimals; a tokenized title is joined with
    spaces, anything else is stringified.
    """
    def title_text(row):
        title = row['title']
        if isinstance(title, list):
            return " ".join(title)
        return str(title)

    return [
        (row['pid'], round(float(score), 4), title_text(row))
        for row, score in scored[:k]
    ]

# Run every test query through the three rankers and keep the top results
# per query id. The rankings used to be computed and then thrown away, which
# left demo_results empty.
demo_results = {}
for qid, q_terms in test_queries.items():
    q_tokens = preprocess_query(q_terms)
    cand_idx = retrieve_conjunctive_candidates(q_tokens)
    tfidf_res = rank_tfidf_cosine(q_tokens, cand_idx, k=10)
    bm25_res = rank_bm25(q_tokens, cand_idx, k=10)
    custom_res = rank_custom_hybrid(q_tokens, cand_idx, k=10, base='bm25')
    demo_results[qid] = {
        "tfidf": top_k_pid_title(tfidf_res, k=10),
        "bm25": top_k_pid_title(bm25_res, k=10),
        "custom": top_k_pid_title(custom_res, k=10),
    }


In [19]:
from gensim.models import Word2Vec


# Train a Word2Vec model on the per-document token lists (description + title + brand).
# NOTE(review): no seed is passed and several workers are used, so the learned
# vectors presumably differ between runs — confirm whether reproducibility matters.
w2v_dim = 100
w2v_model = Word2Vec(
    sentences=ALL_TEXT_DOCS,
    vector_size=w2v_dim,
    window=5,
    min_count=2,
    workers=4,
    sg=1,
    epochs=10
)

In [20]:
def average_w2v(tokens, model):
    """
    Return the mean vector of the tokens found in model.wv.

    Tokens missing from the model are skipped. Returns None when none of the
    tokens is known.
    """
    known_vectors = []
    for token in tokens:
        if token in model.wv:
            known_vectors.append(model.wv[token])
    if known_vectors:
        return np.mean(known_vectors, axis=0)
    return None

def rank_w2v_cosine(query_tokens, candidate_indices, k=20):
    """
    Rank candidate rows by cosine similarity of averaged Word2Vec vectors.

    Returns [] when no query token is known to the model. Candidates without
    any known token are left out. Returns at most k (row, score) tuples,
    best first.
    """
    query_vec = average_w2v(query_tokens, w2v_model)
    if query_vec is None:
        return []

    results = []
    for idx in candidate_indices:
        row = df.iloc[idx]
        doc_vec = average_w2v(doc_text_tokens(row), w2v_model)
        if doc_vec is not None:
            results.append((row, cosine(query_vec, doc_vec)))

    results.sort(key=lambda pair: pair[1], reverse=True)
    return results[:k]

In [21]:
# Top-20 Word2Vec ranking for every test query, stored per query id as
# (pid, rounded score, title string) tuples
w2v_results = {}
for qid, q_terms in test_queries.items():
    q_tokens = preprocess_query(q_terms)
    cand_idx = retrieve_conjunctive_candidates(q_tokens)
    w2v_scored = rank_w2v_cosine(q_tokens, cand_idx, k=20)
    w2v_results[qid] = []
    for row, score in w2v_scored:
        w2v_results[qid].append((row['pid'], round(float(score), 4), " ".join(row['title']) if isinstance(row['title'], list) else str(row['title'])))

## RAG & Search Engine

In [32]:
# Configuration
# Dataset location, relative to the notebook's working directory
DATA_PATH = "../../data/fashion_products_dataset.json"
from dotenv import load_dotenv
# presumably loads variables from a local .env file into the environment — verify the file exists
load_dotenv()
# None when GROQ_API_KEY is not set in the environment
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

In [23]:
class SearchEngine:
    """
    BM25 search over the product dataset with a rating/discount boost.

    The dataset is loaded and cleaned once at construction; an inverted index
    (term -> row positions) is built over the title, description and brand
    tokens of every product.
    """

    # Shared by all instances and filled on first use: loading the stop-word
    # list and creating a stemmer for every text field of every product is
    # needlessly expensive.
    _stop_words = None
    _stemmer = None

    def __init__(self, file_path):
        self.df = self.load_and_clean_data(file_path)
        self.index = self.build_index()
        self.doc_lengths = self.compute_doc_lengths()
        # An empty dataset would make np.mean return NaN
        self.avg_dl = float(np.mean(self.doc_lengths)) if len(self.doc_lengths) else 0.0
        self.N = len(self.df)
        print("Search Engine Initialized.")

    def preprocess_text(self, text):
        """Tokenize, lowercase, strip punctuation, drop stop words and stem. Non-strings give []."""
        if not isinstance(text, str):
            return []
        cls = type(self)
        if cls._stop_words is None:
            cls._stop_words = frozenset(stopwords.words('english'))
            cls._stemmer = PorterStemmer()
        stop_words, stemmer = cls._stop_words, cls._stemmer

        tokens = (re.sub(r"[^\w\s]", "", t.lower()) for t in nltk.word_tokenize(text))
        return [stemmer.stem(t) for t in tokens if t and t not in stop_words]

    @staticmethod
    def _selling_price(raw):
        """Parse a selling price such as '1,299' into float; empty or unparsable values give 0.0."""
        if not raw:
            return 0.0
        try:
            return float(str(raw).replace(',', ''))
        except ValueError:
            return 0.0

    @staticmethod
    def _discount_percent(raw):
        """Extract the integer percentage from a string such as '69% off'; 0 when there is none."""
        if isinstance(raw, str) and '%' in raw:
            match = re.search(r'\d+', raw)
            if match:
                return int(match.group())
        return 0

    @staticmethod
    def _actual_price(raw_actual, selling_price, discount):
        """
        Return the pre-discount price as float.

        A given actual price is parsed (falling back to the selling price when
        unparsable). A missing one is reconstructed from
        selling = actual * (1 - discount/100).
        """
        if raw_actual:
            try:
                return float(str(raw_actual).replace(',', ''))
            except ValueError:
                return selling_price
        if 0 < discount < 100:
            return selling_price / (1 - discount / 100)
        return selling_price

    def load_and_clean_data(self, path):
        """Load the JSON dataset and add processed token columns, numeric columns and a uid."""
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        df = pd.DataFrame(data)

        def clean_row(row):
            # 1. Prices
            sp = self._selling_price(row['selling_price'])
            disc = self._discount_percent(row['discount'])
            ap = self._actual_price(row['actual_price'], sp, disc)

            # 2. Text
            row['processed_title'] = self.preprocess_text(row['title'])
            row['processed_desc'] = self.preprocess_text(row['description'])
            brand = row['brand'] if row['brand'] else "no brand"
            row['processed_brand'] = [brand.lower()]

            # Combine for indexing
            row['all_tokens'] = row['processed_title'] + row['processed_desc'] + row['processed_brand']

            # 3. Numeric Types for sorting
            row['selling_price_val'] = sp
            row['actual_price_val'] = ap
            row['discount_val'] = disc
            try:
                row['rating_val'] = float(row['average_rating'])
            except (TypeError, ValueError):
                # Missing or empty rating
                row['rating_val'] = 0.0

            return row

        df = df.apply(clean_row, axis=1)
        # Generate a unique string ID for URLs if pid is missing or complex
        df['uid'] = [str(uuid.uuid4()) for _ in range(len(df))]
        return df

    def build_index(self):
        """Build the inverted index: term -> list of row positions containing it."""
        inv_index = defaultdict(list)
        # Row positions (not index labels) are stored, because search() reads
        # rows with iloc.
        for pos, tokens in enumerate(self.df['all_tokens']):
            # Use set for boolean retrieval to avoid duplicates per doc
            for term in set(tokens):
                inv_index[term].append(pos)
        return inv_index

    def compute_doc_lengths(self):
        """Return the token count of every document as an array."""
        return np.array([len(tokens) for tokens in self.df['all_tokens']])

    def get_idf(self, term):
        """Return the BM25 IDF of a term: log(1 + (N - df + 0.5) / (df + 0.5))."""
        df_count = len(self.index.get(term, []))
        return np.log(1 + (self.N - df_count + 0.5) / (df_count + 0.5))

    def search(self, query, k=20):
        """
        Return the top k products for a query as dicts with a 'search_score' key.

        Candidates are all documents containing at least one query term (OR).
        They are scored with BM25 (k1=1.5, b=0.75) and the score is multiplied
        by a boost: +0.2 for a rating above 4.0, +0.1 for a discount above 30.
        """
        q_tokens = self.preprocess_text(query)

        # 1. Retrieve (disjunctive OR to ensure recall, ranked by BM25)
        doc_scores = defaultdict(float)
        k1, b = 1.5, 0.75

        relevant_indices = set()
        for t in q_tokens:
            relevant_indices.update(self.index.get(t, []))

        if not relevant_indices:
            return []

        avg_dl = self.avg_dl if self.avg_dl > 0 else 1.0
        idf_by_term = {t: self.get_idf(t) for t in set(q_tokens)}

        for idx in relevant_indices:
            row = self.df.iloc[idx]
            doc_tokens = row['all_tokens']
            dl = len(doc_tokens)
            score = 0
            # A term repeated in the query contributes once per repetition
            for t in q_tokens:
                tf = doc_tokens.count(t) # Note: In prod, pre-calculate TF
                if tf > 0:
                    num = tf * (k1 + 1)
                    den = tf + k1 * (1 - b + b * (dl / avg_dl))
                    score += idf_by_term[t] * (num / den)

            # 2. Custom Boosting (Rating & Discount)
            boost = 1.0
            if row['rating_val'] > 4.0:
                boost += 0.2
            if row['discount_val'] > 30:
                boost += 0.1

            doc_scores[idx] = score * boost

        sorted_docs = sorted(doc_scores.items(), key=lambda x: x[1], reverse=True)[:k]

        results = []
        for idx, score in sorted_docs:
            result = self.df.iloc[idx].to_dict()
            result['search_score'] = score
            results.append(result)

        return results

# Initialize Engine
# Construction loads and cleans the dataset at DATA_PATH and builds the index
engine = SearchEngine(DATA_PATH)

Search Engine Initialized.


In [33]:
from groq import Groq

class RAGSystem:
    """Generate a short shopping summary of search results with a Groq-hosted LLM."""

    def __init__(self, api_key):
        self.api_key = api_key
        self.client = None

        try:
            self.client = Groq(api_key=api_key)
        except Exception as e:
            # Keep the object usable; generate_summary reports the missing client
            print(f"Error initializing Groq Client: {e}")

    def generate_summary(self, query, top_results):
        """
        Summarize the top search results for a query.

        Only the first 5 results are sent to the model, each with its title,
        price, rating and the first 200 characters of its description.

        Returns the model's answer, or an explanatory message when there are
        no results, the API key is the placeholder, the client is missing, or
        the request fails. This method does not raise.
        """
        if not top_results:
            return "No products found to summarize."

        # Check the preconditions before doing any work on the results
        if self.api_key == "YOUR_GROQ_API_KEY":
            return "LLM Summary Placeholder: Please insert a valid Groq API Key."

        if self.client is None:
            return "Error: Groq Client not initialized. Check your API Key."

        lines = []
        for i, res in enumerate(top_results[:5]): # Only top 5
            # .get with fallbacks: a result with a missing or None field must
            # not abort the whole summary
            title = res.get('title', 'N/A')
            price = res.get('selling_price', 'N/A')
            rating = res.get('average_rating', 'N/A')
            description = str(res.get('description') or '')[:200]
            lines.append(f"Item {i+1}: {title}. Price: {price}. Rating: {rating}. Description: {description}...\n")
        context_text = "".join(lines)

        prompt = f"""
        You are an expert Fashion Shopping Assistant. 
        User Query: "{query}"
        
        Based ONLY on the following available products, provide a 3-sentence summary helping the user choose. 
        Highlight the best value (high rating + low price) if it exists.
        
        Products:
        {context_text}
        
        Summary:
        """

        try:
            response = self.client.chat.completions.create(
                model="moonshotai/kimi-k2-instruct-0905", 
                messages=[{"role": "user", "content": prompt}],
                max_tokens=150
            )

            content = response.choices[0].message.content
            if not content:
                return "Error generating summary: empty response from the model."
            return content.strip()

        except Exception as e:
            return f"Error generating summary: {str(e)}"

# Shared RAG helper; if the Groq client cannot be created, the constructor
# prints the error and leaves the client unset
rag = RAGSystem(GROQ_API_KEY)

In [25]:
class AnalyticsStore:
    """
    In-memory web-analytics log of sessions, searches and clicks.

    All records are plain dicts appended to lists; timestamps are ISO-8601
    strings taken from the local clock at the moment of recording.
    """

    def __init__(self):
        # Sessions (user context): 'session_id', 'user_agent', 'ip', 'timestamp'
        self.sessions = []
        # Requests (search queries): 'search_id', 'session_id', 'query',
        # 'num_results', 'timestamp'
        self.searches = []
        # Clicks (interaction): 'session_id', 'search_id', 'doc_uid', 'rank',
        # 'timestamp', 'dwell_time'
        self.clicks = []

    @staticmethod
    def _now():
        """Return the current local time as an ISO-8601 string."""
        return datetime.datetime.now().isoformat()

    def track_session(self, user_agent, ip):
        """Record a new session and return its freshly generated id."""
        record = {
            'session_id': str(uuid.uuid4()),
            'user_agent': user_agent,
            'ip': ip,
            'timestamp': self._now(),
        }
        self.sessions.append(record)
        return record['session_id']

    def track_search(self, session_id, query, num_results):
        """Record a search issued within `session_id` and return its id."""
        record = {
            'search_id': str(uuid.uuid4()),
            'session_id': session_id,
            'query': query,
            'num_results': num_results,
            'timestamp': self._now(),
        }
        self.searches.append(record)
        return record['search_id']

    def track_click(self, session_id, search_id, doc_uid, rank):
        """Record a click on `doc_uid`; dwell time starts at 0."""
        record = {
            'session_id': session_id,
            'search_id': search_id,
            'doc_uid': doc_uid,
            'rank': rank,
            'timestamp': self._now(),
            # Filled in later by update_dwell_time (Beacon request).
            'dwell_time': 0,
        }
        self.clicks.append(record)

    def update_dwell_time(self, session_id, doc_uid, duration):
        """Set the dwell time on the newest click matching session and doc."""
        newest_match = next(
            (
                entry
                for entry in reversed(self.clicks)
                if entry['session_id'] == session_id and entry['doc_uid'] == doc_uid
            ),
            None,
        )
        if newest_match is not None:
            newest_match['dwell_time'] = duration

    def get_stats(self):
        """
        Summarize the log for the dashboard.

        Returns a dict with the total number of searches and clicks, the
        click-through rate in percent (clicks per search, rounded to two
        decimals, 0 when there are no searches), the 5 most common queries
        as (query, count) pairs, and the last 5 session and click records.
        """
        search_count = len(self.searches)
        click_count = len(self.clicks)

        query_frequency = Counter(entry['query'] for entry in self.searches)

        if search_count > 0:
            click_through = click_count / search_count * 100
        else:
            click_through = 0

        return {
            'total_searches': search_count,
            'total_clicks': click_count,
            'ctr': round(click_through, 2),
            'top_queries': query_frequency.most_common(5),
            'raw_sessions': self.sessions[-5:],
            'raw_clicks': self.clicks[-5:],
        }

# Single shared in-memory store used by the Flask routes and the dashboard;
# its contents live only as long as this Python process.
analytics = AnalyticsStore()

In [26]:
# --- CSS STYLES ---
# Shared stylesheet: a complete <style> element that the page templates
# splice into their <head> through string concatenation. It covers the
# search box, the AI summary box (.rag-box), result cards and the
# dashboard grid/cards/table.
CSS = """
<style>
    body { font-family: 'Segoe UI', sans-serif; max-width: 1000px; margin: 0 auto; padding: 20px; background-color: #f9f9f9; }
    .header { text-align: center; margin-bottom: 40px; }
    .search-box { display: flex; justify-content: center; gap: 10px; margin-bottom: 30px; }
    input[type="text"] { width: 60%; padding: 12px; border: 1px solid #ddd; border-radius: 4px; font-size: 16px; }
    button { padding: 12px 24px; background-color: #333; color: white; border: none; border-radius: 4px; cursor: pointer; }
    button:hover { background-color: #555; }
    
    .rag-box { background: #e8f4fd; border: 1px solid #b6dbf9; padding: 15px; border-radius: 8px; margin-bottom: 20px; }
    .rag-title { font-weight: bold; color: #0056b3; margin-bottom: 5px; }
    
    .result-item { background: white; padding: 20px; margin-bottom: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.05); }
    .result-title a { font-size: 18px; color: #333; text-decoration: none; font-weight: bold; }
    .result-meta { color: #666; font-size: 14px; margin: 5px 0; }
    .price { color: #d9534f; font-weight: bold; }
    
    .dashboard-grid { display: grid; grid-template-columns: 1fr 1fr; gap: 20px; }
    .card { background: white; padding: 20px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }
    table { width: 100%; border-collapse: collapse; }
    th, td { padding: 8px; text-align: left; border-bottom: 1px solid #ddd; }
</style>
"""

# --- TEMPLATES ---

# Landing page: a search form that submits the field `q` via GET to /search,
# plus a link to /dashboard. It contains no Jinja placeholders.
HOME_TEMPLATE = """
<!DOCTYPE html>
<html>
<head><title>Fashion Search</title>""" + CSS + """</head>
<body>
    <div class="header">
        <h1>Fashion Search Engine</h1>
        <p>Find the best clothes with AI</p>
    </div>
    <form action="/search" method="get" class="search-box">
        <input type="text" name="q" placeholder="e.g. Red cotton summer dress" required>
        <button type="submit">Search</button>
    </form>
    <div style="text-align:center;">
        <a href="/dashboard">View Analytics Dashboard</a>
    </div>
</body>
</html>
"""

# Results page. Template variables: `query`, `summary`, `results`,
# `session_id` and `search_id`. Each entry of `results` is read through the
# fields uid, title, selling_price, discount_val, discount, average_rating,
# brand and description.
# Click tracking is done server-side: every result links to /product/<uid>
# with sid, qid and rank as query parameters. The trackClick() JS function is
# an empty placeholder and sends nothing.
RESULTS_TEMPLATE = """
<!DOCTYPE html>
<html>
<head><title>Results for {{ query }}</title>""" + CSS + """</head>
<body>
    <div class="header">
        <form action="/search" method="get" class="search-box">
            <input type="text" name="q" value="{{ query }}">
            <button type="submit">Search</button>
        </form>
    </div>

    <div class="rag-box">
        <div class="rag-title">AI Summary</div>
        <div>{{ summary }}</div>
    </div>

    {% for doc in results %}
    <div class="result-item">
        <div class="result-title">
            <a href="/product/{{ doc.uid }}?sid={{ session_id }}&qid={{ search_id }}&rank={{ loop.index }}" 
               onclick="trackClick('{{ session_id }}', '{{ search_id }}', '{{ doc.uid }}', {{ loop.index }})">
               {{ doc.title }}
            </a>
        </div>
        <div class="result-meta">
            <span class="price">{{ doc.selling_price }}</span> 
            {% if doc.discount_val > 0 %} <span style="color:green">({{ doc.discount }} off)</span> {% endif %}
            | Rating: {{ doc.average_rating }} ★ | Brand: {{ doc.brand }}
        </div>
        <div style="color:#555;">{{ doc.description[:150] }}...</div>
    </div>
    {% endfor %}
    
    <script>
    function trackClick(sid, qid, docId, rank) {
        // We use the href for navigation, but we could fire an async fetch here if we prevented default
        // The simple link with query params handles the tracking on the server side /product route
    }
    </script>
</body>
</html>
"""

# Product detail page. Template variables: `doc` (read through title,
# selling_price, brand, average_rating, description, product_details, url
# and uid) and `session_id`.
# The inline script measures how long the page stays open and posts
# {session_id, doc_uid, duration} as JSON text to /track_dwell when the page
# unloads.
# `session_id` originates from the `sid` query parameter, so both values
# embedded in JavaScript go through `|tojson`: it emits a correctly quoted and
# escaped JS string, whereas plain HTML escaping inside <script> mangles
# characters such as & or quotes and lets a backslash break the script.
DETAILS_TEMPLATE = """
<!DOCTYPE html>
<html>
<head><title>{{ doc.title }}</title>""" + CSS + """</head>
<body>
    <a href="javascript:history.back()">← Back to results</a>
    
    <div class="card" style="margin-top:20px;">
        <h1>{{ doc.title }}</h1>
        <h2 class="price">{{ doc.selling_price }}</h2>
        <p><strong>Brand:</strong> {{ doc.brand }}</p>
        <p><strong>Rating:</strong> {{ doc.average_rating }} / 5.0</p>
        <hr>
        <h3>Description</h3>
        <p>{{ doc.description }}</p>
        <br>
        <h3>Product Details</h3>
        <p>{{ doc.product_details }}</p>
        <br>
        <a href="{{ doc.url }}" target="_blank" style="background:black; color:white; padding:10px 20px; text-decoration:none; border-radius:4px;">Buy on Original Site</a>
    </div>

    <script>
        // WEB ANALYTICS: Dwell Time Tracking
        let startTime = Date.now();
        let sid = {{ session_id|tojson }};
        let docUid = {{ doc.uid|tojson }};

        window.addEventListener("beforeunload", function() {
            let endTime = Date.now();
            let duration = (endTime - startTime) / 1000; // seconds
            
            // Send beacon (reliable on page unload)
            navigator.sendBeacon("/track_dwell", JSON.stringify({
                session_id: sid,
                doc_uid: docUid,
                duration: duration
            }));
        });
    </script>
</body>
</html>
"""

# Analytics dashboard. Template variables:
#   `stats`      - read through total_searches, total_clicks, ctr,
#                  raw_clicks and raw_sessions.
#   `chart_data` - serialized with `tojson` for Chart.js; read through
#                  q_labels, q_data, time_labels, time_searches, time_clicks.
# The session log falls back to 'unknown' when no user agent was recorded
# (a request without a User-Agent header stores None), so slicing it cannot
# fail and take the whole page down.
DASHBOARD_TEMPLATE = """
<!DOCTYPE html>
<html>
<head>
    <title>Analytics Dashboard</title>
    """ + CSS + """
    <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
    <style>
        .chart-container {
            position: relative; 
            height: 300px; 
            width: 100%; 
            margin-bottom: 20px;
        }
    </style>
</head>
<body>
    <h1>Web Analytics Dashboard</h1>
    <a href="/">← Back to Search</a>
    <br><br>

    <div class="dashboard-grid">
        <div class="card">
            <h3>Key Metrics</h3>
            <p><strong>Total Searches:</strong> {{ stats.total_searches }}</p>
            <p><strong>Total Clicks:</strong> {{ stats.total_clicks }}</p>
            <p><strong>CTR:</strong> {{ stats.ctr }}%</p>
        </div>
        
        <div class="card">
            <h3>Popular Trends</h3>
            <div class="chart-container">
                <canvas id="queryChart"></canvas>
            </div>
            <p style="font-size: 0.9em; color: #666;">
                *Visualizes the top 5 most frequent search terms.
            </p>
        </div>
    </div>
    
    <br>

    <div class="card">
        <h3>User Activity Timeline</h3>
        <p style="font-size: 0.9em; color: #666;">
            Comparing search volume vs. product clicks over time (per minute).
        </p>
        <div class="chart-container">
            <canvas id="timeChart"></canvas>
        </div>
    </div>

    <br>
    
    <div class="card">
        <h3>Recent Interactions (Log)</h3>
        <table>
            <tr><th>Time</th><th>Type</th><th>Detail</th></tr>
            {% for c in stats.raw_clicks|reverse %}
            <tr>
                <td>{{ c.timestamp }}</td>
                <td>CLICK</td>
                <td>Rank {{ c.rank }} (Dwell: {{ c.dwell_time|round(1) }}s)</td>
            </tr>
            {% endfor %}
            {% for s in stats.raw_sessions|reverse %}
            <tr>
                <td>{{ s.timestamp }}</td>
                <td>SESSION</td>
                <td>{{ (s.user_agent or 'unknown')[:30] }}...</td>
            </tr>
            {% endfor %}
        </table>
    </div>

    <script>
        // Parse data passed from Flask
        const chartData = {{ chart_data | tojson }};

        // 1. TOP QUERIES BAR CHART
        const ctxQuery = document.getElementById('queryChart').getContext('2d');
        new Chart(ctxQuery, {
            type: 'bar',
            data: {
                labels: chartData.q_labels,
                datasets: [{
                    label: 'Search Count',
                    data: chartData.q_data,
                    backgroundColor: 'rgba(54, 162, 235, 0.6)',
                    borderColor: 'rgba(54, 162, 235, 1)',
                    borderWidth: 1
                }]
            },
            options: {
                responsive: true,
                maintainAspectRatio: false,
                scales: { y: { beginAtZero: true, ticks: { stepSize: 1 } } }
            }
        });

        // 2. ACTIVITY LINE CHART
        const ctxTime = document.getElementById('timeChart').getContext('2d');
        new Chart(ctxTime, {
            type: 'line',
            data: {
                labels: chartData.time_labels,
                datasets: [
                    {
                        label: 'Searches',
                        data: chartData.time_searches,
                        borderColor: 'rgba(54, 162, 235, 1)',
                        backgroundColor: 'rgba(54, 162, 235, 0.2)',
                        tension: 0.3,
                        fill: true
                    },
                    {
                        label: 'Clicks',
                        data: chartData.time_clicks,
                        borderColor: 'rgba(255, 99, 132, 1)',
                        backgroundColor: 'rgba(255, 99, 132, 0.2)',
                        tension: 0.3,
                        fill: true
                    }
                ]
            },
            options: {
                responsive: true,
                maintainAspectRatio: false,
                scales: { y: { beginAtZero: true, ticks: { stepSize: 1 } } }
            }
        });
    </script>

</body>
</html>
"""

In [27]:
from collections import defaultdict

def prepare_chart_data(analytics):
    """
    Build the data structure consumed by the dashboard's Chart.js charts.

    1. Top queries: labels and counts taken from analytics.get_stats(),
       whose 'top_queries' entry is a list of (query, count) pairs.
    2. Activity timeline: searches and clicks counted per minute and aligned
       on one shared, chronologically sorted axis. Only minutes with at
       least one event appear.

    Returns a dict with the keys 'q_labels', 'q_data', 'time_labels',
    'time_searches' and 'time_clicks'.
    """
    def minute_of(event):
        # ISO timestamp 'YYYY-MM-DDTHH:MM:SS...' -> 'YYYY-MM-DD HH:MM'
        return event['timestamp'][:16].replace('T', ' ')

    ranked_queries = analytics.get_stats()['top_queries']

    # One counter pair per minute that saw any activity.
    per_minute = defaultdict(lambda: {'searches': 0, 'clicks': 0})
    event_streams = (
        ('searches', analytics.searches),
        ('clicks', analytics.clicks),
    )
    for kind, events in event_streams:
        for event in events:
            per_minute[minute_of(event)][kind] += 1

    # The bucket keys sort chronologically as plain strings, so the line
    # chart runs left to right.
    minutes = sorted(per_minute)

    return {
        'q_labels': [pair[0] for pair in ranked_queries],
        'q_data': [pair[1] for pair in ranked_queries],
        'time_labels': minutes,
        'time_searches': [per_minute[m]['searches'] for m in minutes],
        'time_clicks': [per_minute[m]['clicks'] for m in minutes],
    }

In [34]:
# Flask application object; the route functions below register on it.
app = Flask(__name__)

# Cookie that carries the analytics session id from one request to the next.
ANALYTICS_COOKIE = 'analytics_sid'

def _get_or_create_session():
    """
    Return the analytics session id for the current request.

    The id stored in the session cookie is reused when the analytics store
    still knows it. Otherwise (first visit, or an id the in-memory store does
    not contain) a new session is registered with the request's user agent
    and remote address.
    """
    sid = request.cookies.get(ANALYTICS_COOKIE)
    # Linear scan of the stored sessions; only ids issued by this store count.
    if sid and any(s['session_id'] == sid for s in analytics.sessions):
        return sid
    return analytics.track_session(request.headers.get('User-Agent'), request.remote_addr)

def _render_with_session(sid, template, **context):
    """Render `template` and attach the analytics session cookie to the response."""
    response = app.make_response(render_template_string(template, **context))
    response.set_cookie(ANALYTICS_COOKIE, sid, httponly=True, samesite='Lax')
    return response

@app.before_request
def ensure_session():
    """Placeholder hook run before every request; it currently does nothing."""
    # Session handling lives in _get_or_create_session(), called by the page
    # routes that need a session id.
    pass

@app.route('/')
def home():
    """Serve the landing page, starting an analytics session if none exists."""
    sid = _get_or_create_session()
    return _render_with_session(sid, HOME_TEMPLATE)

@app.route('/search')
def search():
    """
    Run a search for the `q` query parameter and render the results page.

    Steps: resolve the visitor's analytics session, retrieve up to 15
    results from the search engine, log the search, generate the AI summary
    and render the results together with the session and search ids used
    for click tracking.
    """
    query = request.args.get('q', '')

    # 1. Analytics: Get or Create Session (one session per visitor, not per request)
    sid = _get_or_create_session()

    # 2. Search Engine
    results = engine.search(query, k=15)

    # 3. Analytics: Track Search
    search_id = analytics.track_search(sid, query, len(results))

    # 4. RAG: Generate Summary
    summary = rag.generate_summary(query, results)

    return _render_with_session(
        sid,
        RESULTS_TEMPLATE,
        query=query,
        results=results,
        summary=summary,
        session_id=sid,
        search_id=search_id
    )

@app.route('/product/<uid>')
def product_detail(uid):
    """
    Show the detail page of product `uid` and log the click that led here.

    Query parameters (all optional): `sid` (session id), `qid` (search id)
    and `rank` (position of the product in the result list).

    Returns the rendered detail page, or a 404 response when no product with
    this uid exists. Clicks are only logged for existing products.
    """
    # Retrieve query params for analytics
    sid = request.args.get('sid', 'unknown')
    qid = request.args.get('qid', 'unknown')
    # type=int stores the rank as a number and falls back to the default
    # when the parameter is not a valid integer.
    rank = request.args.get('rank', 0, type=int)

    # 1. Find Document (before tracking, so unknown uids leave no click record)
    matches = engine.df[engine.df['uid'] == uid]
    if matches.empty:
        return "Product not found", 404
    doc = matches.iloc[0].to_dict()

    # 2. Analytics: Track Click
    analytics.track_click(sid, qid, uid, rank)

    return render_template_string(
        DETAILS_TEMPLATE,
        doc=doc,
        session_id=sid
    )

@app.route('/track_dwell', methods=['POST'])
def track_dwell():
    """
    Receive a dwell-time report sent by the detail page via the Beacon API.

    The request body is JSON text with the keys 'session_id', 'doc_uid' and
    'duration' (seconds). The body is parsed directly rather than relying on
    the request's content type, because the page sends it as a plain string.

    Returns {"status": "success"} on success, or a 400 response with
    {"status": "error", ...} when the body is not a JSON object, a key is
    missing, or the duration is not a finite, non-negative number.
    """
    import math

    try:
        data = json.loads(request.data)
    except ValueError:
        # Covers malformed JSON as well as undecodable bytes.
        return jsonify({"status": "error", "message": "invalid JSON body"}), 400

    if not isinstance(data, dict):
        return jsonify({"status": "error", "message": "JSON object expected"}), 400

    try:
        session_id = data['session_id']
        doc_uid = data['doc_uid']
        duration = float(data['duration'])
    except (KeyError, TypeError, ValueError):
        return jsonify({"status": "error", "message": "missing or invalid fields"}), 400

    # The dashboard rounds dwell times, so only real, sensible numbers are stored.
    if not math.isfinite(duration) or duration < 0:
        return jsonify({"status": "error", "message": "invalid duration"}), 400

    analytics.update_dwell_time(session_id, doc_uid, duration)
    return jsonify({"status": "success"})

@app.route('/dashboard')
def dashboard():
    """Render the analytics dashboard from the current stats and chart data."""
    # Aggregate stats first, then the chart series derived from the same store.
    template_context = {
        'stats': analytics.get_stats(),
        'chart_data': prepare_chart_data(analytics),
    }
    return render_template_string(DASHBOARD_TEMPLATE, **template_context)
# Script entry point: start Flask's built-in server when this module is run
# directly.
if __name__ == '__main__':
    # Run the server
    # host='0.0.0.0' listens on all network interfaces, so the app is also
    # reachable from other machines on the network, not only from localhost.
    print("Starting Flask Server...")
    app.run(host='0.0.0.0', port=5000)

Starting Flask Server...
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://10.80.134.73:5000
Press CTRL+C to quit
127.0.0.1 - - [03/Dec/2025 14:56:16] "GET /search?q=men+jeans HTTP/1.1" 200 -
127.0.0.1 - - [03/Dec/2025 14:56:25] "GET /product/c4c281e3-adc5-45d8-b935-a9351e3cf625?sid=f695c40c-8a6b-4c9f-9489-8eeb63f482b3&qid=12c216da-fdc3-477a-9395-9532df0ac829&rank=1 HTTP/1.1" 200 -
127.0.0.1 - - [03/Dec/2025 14:56:28] "POST /track_dwell HTTP/1.1" 200 -
127.0.0.1 - - [03/Dec/2025 14:56:30] "GET /dashboard HTTP/1.1" 200 -
