Data Preparation (Pandas + NLTK)
Clean and Normalize Data (Local with Pandas + NLTK)

In [None]:
# Install packages
%pip install pandas nltk pyspark elasticsearch sentence-transformers xgboost langchain openai faiss-cpu --quiet


In [None]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources that preprocess_text relies on (tokenizer models,
# English stop words, WordNet for lemmatization).
# 'punkt_tab' is the tokenizer resource that nltk.word_tokenize loads on
# NLTK >= 3.8.2; 'punkt' is kept so older NLTK releases keep working.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Load data
# Both CSV paths are relative to the notebook's working directory.
df1 = pd.read_csv("nike-data-1.csv")
df2 = pd.read_csv("nike-data-2.csv")

# Merge
# Drop df2's 'uniq_id' when present (presumably so the combined frame does not
# end up with two 'uniq_id' columns -- confirm against the CSV headers).
if 'uniq_id' in df2.columns:
    df2 = df2.drop(columns=['uniq_id'])
# Give both frames a fresh 0..n-1 index so the column-wise concat below pairs
# rows by position.
df1 = df1.reset_index(drop=True)
df2 = df2.reset_index(drop=True)
# NOTE(review): axis=1 places df2's columns next to df1's, matching row i with
# row i. This assumes both files describe the same products in the same order;
# if their lengths differ, the shorter side is padded with NaN. Confirm.
combined_df = pd.concat([df2, df1], axis=1)
# Add a 1-based sequential id as the first column.
combined_df.insert(0, 'new_id', range(1, len(combined_df) + 1))

# Preprocess text
def _text_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use.

    The pair is cached on the function object so the stop-word corpus is read
    and the lemmatizer constructed once, not once per token.
    """
    if not hasattr(_text_resources, "_cached"):
        _text_resources._cached = (
            frozenset(stopwords.words('english')),
            WordNetLemmatizer(),
        )
    return _text_resources._cached


def preprocess_text(text):
    """Normalize free text into a space-joined string of lemmatized tokens.

    Steps: lower-case, remove every character that is not an ASCII letter,
    digit or whitespace, tokenize with NLTK, drop English stop words,
    lemmatize each remaining token, and join the result with single spaces.

    Args:
        text: Value to clean; coerced with ``str()``. ``None`` and float NaN
            (how pandas represents a missing cell) produce an empty string
            rather than the literal tokens "none" / "nan".

    Returns:
        str: The cleaned text, possibly empty.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    stop_words, lemmatizer = _text_resources()
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', '', str(text).lower())
    tokens = [t for t in nltk.word_tokenize(cleaned) if t not in stop_words]
    return ' '.join(lemmatizer.lemmatize(t) for t in tokens)

# Add cleaned copies of the title and description columns; the original
# 'Title' and 'Product Description' columns are left as they are.
combined_df['processed_title'] = combined_df['Title'].apply(preprocess_text)
combined_df['processed_description'] = combined_df['Product Description'].apply(preprocess_text)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
from pyspark.sql import SparkSession
# Start (or reuse) a Spark session running locally with master "local[*]".
spark = SparkSession.builder.master("local[*]").appName("SearchPreprocessing").getOrCreate()
# Mirror the pandas frame as a Spark DataFrame and print the inferred schema.
# NOTE(review): df_spark is not referenced by any later cell visible here --
# confirm it is still needed.
df_spark = spark.createDataFrame(combined_df)
df_spark.printSchema()



root
 |-- new_id: long (nullable = true)
 |-- Title: string (nullable = true)
 |-- Subtitle: string (nullable = true)
 |-- Product Description: string (nullable = true)
 |-- url: string (nullable = true)
 |-- name: string (nullable = true)
 |-- sub_title: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- model: double (nullable = true)
 |-- color: string (nullable = true)
 |-- price: double (nullable = true)
 |-- currency: string (nullable = true)
 |-- availability: string (nullable = true)
 |-- description: string (nullable = true)
 |-- raw_description: string (nullable = true)
 |-- avg_rating: double (nullable = true)
 |-- review_count: double (nullable = true)
 |-- images: string (nullable = true)
 |-- available_sizes: string (nullable = true)
 |-- uniq_id: string (nullable = true)
 |-- scraped_at: string (nullable = true)
 |-- processed_title: string (nullable = true)
 |-- processed_description: string (nullable = true)



 Sentence Embedding + FAISS Index

In [None]:
from sentence_transformers import SentenceTransformer
import faiss

# Load model and encode
# One embedding per row of combined_df, in row order, computed from the
# already-cleaned description text.
model = SentenceTransformer('all-MiniLM-L6-v2')
descriptions = combined_df['processed_description'].tolist()
embeddings = model.encode(descriptions, show_progress_bar=True)

# Normalize for cosine similarity
# normalize_L2 rescales the rows of `embeddings` in place; on unit-length
# vectors the inner product used by the index below equals cosine similarity.
faiss.normalize_L2(embeddings)

# Build index
# Vectors are added in combined_df row order, so an id returned by the index is
# a row position in combined_df (compute_features looks rows up with .iloc).
index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(embeddings)


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Feature Generation for Query

In [None]:
from sentence_transformers import util
import numpy as np

def compute_features(query, top_k=20):
    """Retrieve the nearest descriptions for a query and build ranking features.

    Encodes ``query`` with the notebook-level ``model``, searches the
    notebook-level FAISS ``index`` for the ``top_k`` closest description
    vectors, and returns one feature row per hit.

    NOTE(review): the query is embedded and tokenized as given, while the
    indexed text is the cleaned 'processed_description' column -- confirm
    whether the query should go through the same cleaning.

    Args:
        query: Query string.
        top_k: Maximum number of results to retrieve.

    Returns:
        pandas.DataFrame with columns 'new_id', 'Title', 'semantic_sim'
        (inner-product score from the index) and 'keyword_match_score'
        (fraction of query tokens found in the description). It has fewer than
        ``top_k`` rows when the index holds fewer vectors.
    """
    query_vec = model.encode([query])
    faiss.normalize_L2(query_vec)
    scores, positions = index.search(query_vec, top_k)

    # The query tokens do not depend on the hit, so compute them once.
    query_tokens = set(query.lower().split())

    results = []
    for idx, score in zip(positions[0], scores[0]):
        # FAISS fills unused result slots with id -1 when the index holds fewer
        # than top_k vectors; .iloc[-1] would silently return the last row.
        if idx < 0:
            continue
        row = combined_df.iloc[idx]

        # Optional keyword match; str() guards against a non-string cell.
        text_tokens = set(str(row['processed_description']).lower().split())
        if query_tokens:
            keyword_match_score = len(query_tokens & text_tokens) / len(query_tokens)
        else:
            keyword_match_score = 0.0

        results.append({
            "new_id": row["new_id"],
            "Title": row["Title"],
            "semantic_sim": float(score),
            "keyword_match_score": keyword_match_score
        })

    # Naming the columns keeps the schema stable even when there are no hits.
    return pd.DataFrame(
        results,
        columns=["new_id", "Title", "semantic_sim", "keyword_match_score"],
    )


Assign Relevance Labels (Single Query)

In [None]:

# Retrieve and featurize the top 20 matches for a single query.
# NOTE(review): "your subcategory here" is a placeholder string -- replace it
# with a real query before relying on the labels derived from these results.
results_df = compute_features(query="your subcategory here", top_k=20)

def assign_relevance(row, high_threshold=0.6, mid_threshold=0.4):
    """Map a row's semantic similarity to a graded relevance label.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a 'semantic_sim'
            entry.
        high_threshold: Similarity strictly above this value is labelled 3.
        mid_threshold: Similarity strictly above this value (but not above
            ``high_threshold``) is labelled 2.

    Returns:
        int: 3, 2 or 1. Anything not above ``mid_threshold`` is labelled 1.
    """
    sim = row['semantic_sim']
    if sim > high_threshold:
        return 3
    if sim > mid_threshold:
        return 2
    return 1

# Label every retrieved row, then print how many rows received each grade.
results_df['relevance'] = results_df.apply(assign_relevance, axis=1)
print(results_df['relevance'].value_counts())




relevance
1    20
Name: count, dtype: int64


Assign Relevance Labels (Per-Row Sub-title Query)

In [None]:
from sentence_transformers import util

# Ensure the column name for subcategory is correct (e.g., 'sub_title' or 'subcategory')
search_col = 'sub_title'  # change this to your actual subcategory column name

def compute_row_semantic_sim(row):
    """Cosine similarity between a row's query column and its cleaned description.

    The query text is read from the column named by the notebook-level
    ``search_col``; both texts are passed through ``str()`` and encoded
    separately with the notebook-level ``model``.

    Args:
        row: DataFrame row providing ``search_col`` and 'processed_description'.

    Returns:
        float: The cosine similarity as a plain Python number.
    """
    texts = (str(row[search_col]), str(row['processed_description']))
    query_vec, description_vec = (model.encode(text) for text in texts)
    similarity = util.cos_sim(query_vec, description_vec)
    return similarity.item()

# Score every row: similarity between its `search_col` text and its cleaned
# description. This encodes two texts per row, so it is slow on large frames.
combined_df['semantic_sim'] = combined_df.apply(compute_row_semantic_sim, axis=1)

# Assign relevance based on semantic similarity
# Same cut-offs as assign_relevance: > 0.6 -> 3, (0.4, 0.6] -> 2, otherwise 1.
combined_df['relevance'] = 1  # default
combined_df.loc[combined_df['semantic_sim'] > 0.6, 'relevance'] = 3
combined_df.loc[(combined_df['semantic_sim'] > 0.4) & (combined_df['semantic_sim'] <= 0.6), 'relevance'] = 2

# Show how many rows received each relevance grade.
print(combined_df['relevance'].value_counts())



relevance
1    391
2      9
Name: count, dtype: int64


Train XGBoost Ranker

In [None]:
def compute_keyword_match_score(row):
    """Fraction of the row's query tokens that also occur in its description.

    Both the ``search_col`` text and 'processed_description' are passed through
    ``str()``, lower-cased and split on whitespace into token sets.

    Args:
        row: DataFrame row providing ``search_col`` and 'processed_description'.

    Returns:
        float: Shared tokens divided by the number of query tokens, or 0.0 when
        the query has no tokens.
    """
    wanted = set(str(row[search_col]).lower().split())
    present = set(str(row['processed_description']).lower().split())
    if not wanted:
        return 0.0
    return len(wanted & present) / len(wanted)

# Per-row lexical-overlap feature, used next to 'semantic_sim' when training.
combined_df['keyword_match_score'] = combined_df.apply(compute_keyword_match_score, axis=1)


In [None]:
from xgboost import XGBRanker
from sklearn.model_selection import train_test_split

# NOTE(review): 'relevance' is derived purely from thresholds on
# 'semantic_sim', which is also a feature here, so the ranker can recover the
# labels directly from that column -- confirm this is intended.
features = ['keyword_match_score', 'semantic_sim']
X = combined_df[features]
y = combined_df['relevance']

# A fixed seed makes the split, and every metric computed from it, reproducible
# across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# XGBRanker needs the size of each query group; all training rows are treated
# as one group.
# NOTE(review): 'semantic_sim' was computed against each row's own `search_col`
# text, so the rows do not actually share a single query -- confirm that one
# group is the intended setup.
train_group = [len(X_train)]

model_xgb = XGBRanker(objective='rank:pairwise', n_estimators=100)
model_xgb.fit(X_train, y_train, group=train_group)

# Class balance of the training labels.
print(y_train.value_counts())



relevance
1    314
2      6
Name: count, dtype: int64


In [None]:
# Score the held-out rows with the trained ranker.
y_pred = model_xgb.predict(X_test)
# Rebind X_test to a copy carrying the true label and the predicted score.
# From here on X_test has two extra columns, so later cells must select the
# feature columns explicitly before calling predict again.
X_test = X_test.copy()
X_test['y_true'] = y_test.values
X_test['y_pred'] = y_pred

# Top predictions
X_test.sort_values("y_pred", ascending=False).head(5)


Unnamed: 0,keyword_match_score,semantic_sim,y_true,y_pred
12,0.2,0.410618,2,3.25796
72,0.0,0.458807,2,3.25796
107,0.0,0.58346,2,3.25796
69,0.0,0.045934,1,-3.257959
52,0.0,0.120018,1,-3.257959


Ranking Evaluation Metrics (Precision@K, Recall@K, MRR, NDCG)

In [None]:
import numpy as np
from sklearn.metrics import ndcg_score

def precision_at_k(y_true, y_pred, k=10, rel_threshold=1):
    """Fraction of the top-k ranked items whose label exceeds ``rel_threshold``.

    Args:
        y_true: Relevance labels (list, NumPy array or pandas Series).
        y_pred: Predicted scores, aligned with ``y_true``; higher ranks first.
        k: Cut-off. The denominator is always ``k``, even when fewer than ``k``
            items are supplied.
        rel_threshold: An item counts as relevant when its label is strictly
            greater than this value.

    Returns:
        float: Number of relevant items in the top k divided by ``k``.
    """
    # Coerce so positional indexing works for lists and for pandas Series
    # (a Series with a non-default index would otherwise be indexed by label).
    y_true = np.asarray(y_true)
    order = np.argsort(y_pred)[::-1][:k]
    return np.sum(y_true[order] > rel_threshold) / k

def recall_at_k(y_true, y_pred, k=10, rel_threshold=1):
    """Share of all relevant items that appear in the top-k ranked items.

    Args:
        y_true: Relevance labels (list, NumPy array or pandas Series).
        y_pred: Predicted scores, aligned with ``y_true``; higher ranks first.
        k: Cut-off for the ranked list.
        rel_threshold: An item counts as relevant when its label is strictly
            greater than this value.

    Returns:
        float: Relevant items in the top k divided by all relevant items, or
        0.0 when there are no relevant items at all.
    """
    # Coerce so positional indexing works for lists and for pandas Series
    # (a Series with a non-default index would otherwise be indexed by label).
    y_true = np.asarray(y_true)
    num_relevant = np.sum(y_true > rel_threshold)
    if num_relevant == 0:
        return 0.0
    order = np.argsort(y_pred)[::-1][:k]
    return np.sum(y_true[order] > rel_threshold) / num_relevant

def mean_reciprocal_rank(y_true, y_pred, rel_threshold=1):
    """Reciprocal rank of the first relevant item in one ranked list.

    This scores a single list; averaging over several queries is left to the
    caller.

    Args:
        y_true: Relevance labels (list, NumPy array or pandas Series).
        y_pred: Predicted scores, aligned with ``y_true``; higher ranks first.
        rel_threshold: An item counts as relevant when its label is strictly
            greater than this value.

    Returns:
        float: ``1 / rank`` of the first relevant item (rank starts at 1), or
        0.0 when no item is relevant.
    """
    # Coerce so positional indexing works for lists and for pandas Series
    # (a Series with a non-default index would otherwise be indexed by label).
    y_true = np.asarray(y_true)
    order = np.argsort(y_pred)[::-1]
    hits = np.flatnonzero(y_true[order] > rel_threshold)
    if hits.size == 0:
        return 0.0
    return 1.0 / (int(hits[0]) + 1)

def evaluate_group(y_true, y_pred, k=10, rel_threshold=1):
    """Compute all four ranking metrics for one group of items.

    Args:
        y_true: Relevance labels for the group.
        y_pred: Predicted scores, aligned with ``y_true``.
        k: Cut-off used for precision, recall and NDCG.
        rel_threshold: Labels strictly above this value count as relevant for
            precision, recall and reciprocal rank.

    Returns:
        tuple: ``(precision@k, recall@k, reciprocal rank, NDCG@k)``.
    """
    return (
        precision_at_k(y_true, y_pred, k, rel_threshold),
        recall_at_k(y_true, y_pred, k, rel_threshold),
        mean_reciprocal_rank(y_true, y_pred, rel_threshold),
        ndcg_score([y_true], [y_pred], k=k),
    )

features = ['keyword_match_score', 'semantic_sim']

# Make sure to select only the features for prediction
# (X_test may carry extra 'y_true' / 'y_pred' columns added in an earlier cell).
X_test_features = X_test[features]
y_true = y_test.values

y_pred = model_xgb.predict(X_test_features)

# Define group_test - size of each query group in test set, adjust as needed
# NOTE(review): these groups are consecutive slices of 10 test rows, not real
# query groups, and any remainder after the last full group is not evaluated.
group_test = [10] * (len(X_test_features) // 10)  # example assuming groups of size 10

# Walk the test arrays group by group; start/end delimit the current slice.
start = 0
for idx, group_size in enumerate(group_test):
    end = start + group_size
    y_true_group = y_true[start:end]
    y_pred_group = y_pred[start:end]

    # rel_threshold=1 means only labels 2 and 3 count as relevant.
    p, r, mrr, ndcg = evaluate_group(y_true_group, y_pred_group, k=10, rel_threshold=1)

    print(f"Group {idx + 1} Metrics:")
    print(f"  Precision@10: {p:.4f}")
    print(f"  Recall@10: {r:.4f}")
    print(f"  MRR: {mrr:.4f}")
    print(f"  NDCG@10: {ndcg:.4f}\n")

    start = end

# Raw labels and scores of the first 10 test rows, for a visual sanity check.
print("Example y_true and y_pred from group 1:")
print("y_true:", y_true[0:10])
print("y_pred:", y_pred[0:10])


Group 1 Metrics:
  Precision@10: 0.0000
  Recall@10: 0.0000
  MRR: 0.0000
  NDCG@10: 1.0000

Group 2 Metrics:
  Precision@10: 0.0000
  Recall@10: 0.0000
  MRR: 0.0000
  NDCG@10: 1.0000

Group 3 Metrics:
  Precision@10: 0.1000
  Recall@10: 1.0000
  MRR: 1.0000
  NDCG@10: 1.0000

Group 4 Metrics:
  Precision@10: 0.0000
  Recall@10: 0.0000
  MRR: 0.0000
  NDCG@10: 1.0000

Group 5 Metrics:
  Precision@10: 0.2000
  Recall@10: 1.0000
  MRR: 1.0000
  NDCG@10: 1.0000

Group 6 Metrics:
  Precision@10: 0.0000
  Recall@10: 0.0000
  MRR: 0.0000
  NDCG@10: 1.0000

Group 7 Metrics:
  Precision@10: 0.0000
  Recall@10: 0.0000
  MRR: 0.0000
  NDCG@10: 1.0000

Group 8 Metrics:
  Precision@10: 0.0000
  Recall@10: 0.0000
  MRR: 0.0000
  NDCG@10: 1.0000

Example y_true and y_pred from group 1:
y_true: [1 1 1 1 1 1 1 1 1 1]
y_pred: [-3.2579587 -3.2579587 -3.2579587 -3.2579587 -3.2579587 -3.2579587
 -3.2579587 -3.2579587 -3.2579587 -3.2579587]


Save the trained model as a .pkl file

In [None]:
import pickle

# Assume your trained model is called `model_xgb`
# Serialize the fitted ranker to 'xgbranker_model.pkl' in the working directory.
# NOTE(review): unpickling executes code, so only load this file from a trusted
# source; a pickled model may also not load under a different xgboost version --
# confirm whether a native XGBoost model file is preferable.
with open('xgbranker_model.pkl', 'wb') as f:
    pickle.dump(model_xgb, f)

print("Model saved as xgbranker_model.pkl")


Model saved as xgbranker_model.pkl
