In [1]:
import pandas as pd 
import numpy as np  
import faiss 
import lightgbm as lgb
from sklearn.model_selection import train_test_split

In [None]:
from sentence_transformers import SentenceTransformer

In [4]:
dataset = pd.read_csv("Dataset/final_dataset.csv")
dataset.head(2)

Unnamed: 0,uniq_id,product_id,product_category,product_brand,product_name,product_price,product_description,product_currency,product_tags,product_reviews_count,...,product_image_url_jpeg,popularity_score,brand_mean_rating,category_mean_rating,crawl_month,crawl_dayofweek,categorical_L1,categorical_L2,categorical_L3,categorical_L4
0,2612,971,premium beauty premium makeup premium lips pre...,NARS,nars velvet matte lip pencil endangered red,11.25,,USD,nars velvet matte lip pencil endangered red wa...,22.0,...,https://i5.walmartimages.com/asr/44271b01-c56e...,13.47,4.44,4.29,10,4,Premium Beauty,Premium Makeup,Premium Lips,Premium Lip Liner
1,3524,2947,premium beauty premium makeup premium nail pol...,OPI,opi nail polish lacquer holiday hello kitty 5o...,10.95,,USD,opi nail polish lacquer holiday hello kitty 5o...,4.0,...,https://i5.walmartimages.com/asr/1d59e429-8acb...,4.83,4.28,3.0,7,4,Premium Beauty,Premium Makeup,Premium Nail Polish & Care,Premium Nail Kits


In [5]:
## just edge cases 

dataset = dataset.drop_duplicates("product_id").reset_index(drop = True)


## spletting the dataset for no data leakage later 
train_ids , validate_ids = train_test_split(dataset["product_id"].unique() , test_size = 0.2 , random_state = 42)


train_df = dataset[dataset["product_id"].isin(train_ids)].reset_index(drop= True)
validate_df = dataset[dataset["product_id"].isin(validate_ids)].reset_index(drop = True)

In [6]:
dataset.columns

Index(['uniq_id', 'product_id', 'product_category', 'product_brand',
       'product_name', 'product_price', 'product_description',
       'product_currency', 'product_tags', 'product_reviews_count',
       'crawl_date', 'crawl_time', 'rating_for_model',
       'product_image_url_jpeg', 'popularity_score', 'brand_mean_rating',
       'category_mean_rating', 'crawl_month', 'crawl_dayofweek',
       'categorical_L1', 'categorical_L2', 'categorical_L3', 'categorical_L4'],
      dtype='object')

In [7]:
def features(dataset):
    dataset = dataset.copy()

    #just an edge case we have already clean the data 
    dataset["product_description"] = dataset["product_description"].fillna("")
    dataset["product_brand"]       = dataset["product_brand"].fillna("NA")
    dataset["product_category"]    = dataset["product_category"].fillna("NA")
    dataset["rating_for_model"]    = dataset["rating_for_model"].fillna(0)
    dataset["product_reviews_count"] = dataset["product_reviews_count"].fillna(0)

    dataset["text"] =( dataset["product_name"] + " " + dataset["product_brand"] + " " +
                      dataset["product_category"] + " "+ dataset["product_description"]
                     )
    return dataset

train_df = features(train_df)
validate_df = features(validate_df)

In [8]:
train_df.columns

Index(['uniq_id', 'product_id', 'product_category', 'product_brand',
       'product_name', 'product_price', 'product_description',
       'product_currency', 'product_tags', 'product_reviews_count',
       'crawl_date', 'crawl_time', 'rating_for_model',
       'product_image_url_jpeg', 'popularity_score', 'brand_mean_rating',
       'category_mean_rating', 'crawl_month', 'crawl_dayofweek',
       'categorical_L1', 'categorical_L2', 'categorical_L3', 'categorical_L4',
       'text'],
      dtype='object')

In [9]:
## Bayesain rating formula
# (v / (v + m)) * R + (m / (v + m)) * C  - R = product avg rating , v = no. reviews , c = global mean rating , m = minimum reviews to trust a rating 

In [10]:
c = train_df["rating_for_model"].mean()
m = train_df["product_reviews_count"].quantile(0.7)


def bayesain_rating(rows):
    v = rows["product_reviews_count"]
    R = rows["rating_for_model"]

    return (v/(v+ m))*R + (m/(v + m))*c

train_df["bayesain_rating"] = train_df.apply(bayesain_rating , axis = 1)
validate_df["bayesain_rating"] = validate_df.apply(bayesain_rating , axis = 1)


train_df["popularity"] = np.log1p(train_df["product_reviews_count"])
validate_df["popularity"] = np.log1p(validate_df["product_reviews_count"])

In [11]:
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

train_embeddings = model.encode(train_df["text"].tolist() , normalize_embeddings = True,
                               show_progress_bar = True)

validate_embedding = model.encode(validate_df["text"].tolist() ,  normalize_embeddings = True ,show_progress_bar = True)

Batches: 100%|█████████████████████████████████████████████████████████████████████████| 121/121 [04:42<00:00,  2.34s/it]
Batches: 100%|███████████████████████████████████████████████████████████████████████████| 31/31 [01:09<00:00,  2.25s/it]


In [12]:
emb_dimensions  = train_embeddings.shape[1]

index = faiss.IndexFlatIP(emb_dimensions)

index.add(train_embeddings)


In [13]:
def get_candidates(querry , k = 30):
    scores , indices = index.search (querry.reshape(1,-1) , k)
    return scores [0] ,indices[0]

In [14]:
train_rows = []
n = len(train_df)

for q_id in range(n):
    cand_ids, sims = get_candidates(train_embeddings[q_id])

    for c_id, sim in zip(cand_ids, sims):
        c_id = int(c_id)
        sim = float(sim)
        
        if c_id == q_id:
            continue

        train_rows.append({"query_id": q_id,"similarity": float(sim),

              "bayesain_rating": train_df.loc[c_id, "bayesain_rating"],
            "popularity": train_df.loc[c_id, "popularity"],
            "rating": train_df.loc[c_id, "rating_for_model"],
            "same_category": int(train_df.loc[c_id, "product_category"] ==  train_df.loc[q_id, "product_category"]),

            "label": float(sim) * train_df.loc[c_id, "bayesain_rating"]
        })

rank_df = pd.DataFrame(train_rows)

In [15]:
print(train_df.columns)


Index(['uniq_id', 'product_id', 'product_category', 'product_brand',
       'product_name', 'product_price', 'product_description',
       'product_currency', 'product_tags', 'product_reviews_count',
       'crawl_date', 'crawl_time', 'rating_for_model',
       'product_image_url_jpeg', 'popularity_score', 'brand_mean_rating',
       'category_mean_rating', 'crawl_month', 'crawl_dayofweek',
       'categorical_L1', 'categorical_L2', 'categorical_L3', 'categorical_L4',
       'text', 'bayesain_rating', 'popularity'],
      dtype='object')


In [16]:
features = ["similarity","bayesain_rating","popularity","rating","same_category"]


queries = rank_df["query_id"].unique()
train_q, val_q = train_test_split(queries, test_size=0.2, random_state=42)

train_rank = rank_df[rank_df["query_id"].isin(train_q)].copy()
val_rank   = rank_df[rank_df["query_id"].isin(val_q)].copy()

train_rank["label"] = train_rank["bayesain_rating"].round().astype(int)
val_rank["label"]   = val_rank["bayesain_rating"].round().astype(int)


x_train = train_rank[features]
y_train = train_rank["label"].round().astype(int)

x_val = val_rank[features]
y_val = val_rank["label"].round().astype(int)


group_train = train_rank.groupby("query_id").size().to_numpy()
group_val   = val_rank.groupby("query_id").size().to_numpy()

In [17]:
sum(group_train) == len(x_train)

np.True_

In [18]:
rank = lgb.LGBMRanker(objective = "lambdarank" , metric = "ndcg" , n_estimators = 1000 , learning_rate = 0.05 , num_leaves =31,
                       subsample = 0.8 , colsample_bytree = 0.8 , random_state = 42 , label_gain=np.linspace(0, 1, 10000))

rank.fit(x_train,y_train , group = group_train , eval_set =[(x_val , y_val)], eval_group =[group_val],
            eval_at = [5,10], callbacks = [lgb.early_stopping(stopping_rounds=30), 
                                              lgb.log_evaluation(period=50)])

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003573 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 266
[LightGBM] [Info] Number of data points in the train set: 92160, number of used features: 5
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[1]	valid_0's ndcg@5: 1	valid_0's ndcg@10: 1


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,1000
,subsample_for_bin,200000
,objective,'lambdarank'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [19]:

FEATURES = ["similarity", "bayesain_rating", "popularity", "rating_for_model", "same_category"]


def get_candidates(query_embedding, k=30):
    scores, indices = index.search(query_embedding.reshape(1, -1), k)
    return indices[0], scores[0]


def recommend_product(product_name, top_n=5):
    if product_name not in train_df["product_name"].values:
        print("Product not found! Try a valid product from training data.")
        return None
    

    id = train_df[train_df["product_name"] == product_name].index[0]
    

    cand_id, sims = get_candidates(train_embeddings[id])

    candidates = []
    features = []
    for c_id, sim in zip(cand_id, sims):
        if c_id == id:
            continue  
        candidates.append(c_id)
        features.append([
            sim,
            train_df.loc[c_id, "bayesain_rating"],
            train_df.loc[c_id, "popularity"],
            train_df.loc[c_id, "rating_for_model"],
            int(train_df.loc[c_id, "product_category"] == train_df.loc[id, "product_category"])
        ])
    
    features = pd.DataFrame(features, columns = FEATURES)
    
 
    scores = rank.predict(features)
    

    results = train_df.loc[candidates].copy()
    results["score"] = scores
    

    return results.sort_values("score", ascending=False).head(top_n)[
        ["product_name", "product_image_url_jpeg", "product_category", "rating_for_model", "product_reviews_count", "score"]
    ]


In [20]:
sample_product = "nars velvet matte lip pencil endangered red"  # replace with any product from your training set
top_recs = recommend_product(sample_product, top_n=5)
print(top_recs)


                                           product_name  \
2971                nars audacious lipstick liv 0 14 oz   
3104  nars lip liner 0 01 oz waimea nars velvet lip ...   
189              nars powermatte lip pigment give it up   
157              nabi matte long lasting lip gloss ruby   
194    l a colors auto lipliner pencil burgundy 0 01 oz   

                                 product_image_url_jpeg  \
2971  https://i5.walmartimages.com/asr/5b1ce1d4-b852...   
3104  https://i5.walmartimages.com/asr/c2144e00-60fd...   
189   https://i5.walmartimages.com/asr/6f22783d-bb9e...   
157   https://i5.walmartimages.com/asr/b6e40804-2c98...   
194   https://i5.walmartimages.com/asr/427a2cf0-c946...   

                                       product_category  rating_for_model  \
2971  premium beauty premium makeup premium lips pre...              5.00   
3104  premium beauty premium makeup premium lips pre...              4.29   
189   premium beauty premium makeup premium lips pre...    

## Saving model 

In [23]:
import os
print(os.getcwd())

D:\Machine_learning_projects\Product_recommendation_system


In [25]:
BASE_DIR = r"D:\Machine_learning_projects\Product_recommendation_system"
ARTIFACTS_DIR = os.path.join(BASE_DIR, "artifacts")

os.makedirs(ARTIFACTS_DIR, exist_ok=True)

print("Artifacts directory:", ARTIFACTS_DIR)

Artifacts directory: D:\Machine_learning_projects\Product_recommendation_system\artifacts


In [26]:
faiss.write_index(index , os.path.join(ARTIFACTS_DIR,"faiss.index"))

In [29]:
rank.booster_.save_model(os.path.join(ARTIFACTS_DIR,"lgbm_ranker.txt"))

<lightgbm.basic.Booster at 0x29349266e90>

In [30]:
import pickle

meta_bundle = {"train_df":train_df , "train_embedding":train_embeddings , "features" :FEATURES}

with open(os.path.join(ARTIFACTS_DIR , "recommendation.pkl"),"wb") as f :
    pickle.dump(meta_bundle,f,protocol = pickle.HIGHEST_PROTOCOL)
