In [5]:
import pandas as pd, joblib, os
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
# Input dataset; the path is resolved relative to the current working directory.
RAW = "BigBasket Products.csv"

# Folder that receives every file this notebook writes.
SAVE_DIR = "artefacts"

# Create the output folder up front; exist_ok=True makes re-running the cell harmless.
os.makedirs(name=SAVE_DIR, exist_ok=True)

In [9]:
# Read the raw CSV named by RAW into a DataFrame.
df = pd.read_csv(filepath_or_buffer=RAW)

# Bare last expression: the notebook renders a preview of the loaded frame.
df

Unnamed: 0,index,product,category,sub_category,brand,sale_price,market_price,type,rating,description
0,1,Garlic Oil - Vegetarian Capsule 500 mg,Beauty & Hygiene,Hair Care,Sri Sri Ayurveda,220.00,220.0,Hair Oil & Serum,4.1,This Product contains Garlic Oil that is known...
1,2,Water Bottle - Orange,"Kitchen, Garden & Pets",Storage & Accessories,Mastercook,180.00,180.0,Water & Fridge Bottles,2.3,"Each product is microwave safe (without lid), ..."
2,3,"Brass Angle Deep - Plain, No.2",Cleaning & Household,Pooja Needs,Trm,119.00,250.0,Lamp & Lamp Oil,3.4,"A perfect gift for all occasions, be it your m..."
3,4,Cereal Flip Lid Container/Storage Jar - Assort...,Cleaning & Household,Bins & Bathroom Ware,Nakoda,149.00,176.0,"Laundry, Storage Baskets",3.7,Multipurpose container with an attractive desi...
4,5,Creme Soft Soap - For Hands & Body,Beauty & Hygiene,Bath & Hand Wash,Nivea,162.00,162.0,Bathing Bars & Soaps,4.4,Nivea Creme Soft Soap gives your skin the best...
...,...,...,...,...,...,...,...,...,...,...
27550,27551,"Wottagirl! Perfume Spray - Heaven, Classic",Beauty & Hygiene,Fragrances & Deos,Layerr,199.20,249.0,Perfume,3.9,Layerr brings you Wottagirl Classic fragrant b...
27551,27552,Rosemary,Gourmet & World Food,Cooking & Baking Needs,Puramate,67.50,75.0,"Herbs, Seasonings & Rubs",4.0,Puramate rosemary is enough to transform a dis...
27552,27553,Peri-Peri Sweet Potato Chips,Gourmet & World Food,"Snacks, Dry Fruits, Nuts",FabBox,200.00,200.0,Nachos & Chips,3.8,We have taken the richness of Sweet Potatoes (...
27553,27554,Green Tea - Pure Original,Beverages,Tea,Tetley,396.00,495.0,Tea Bags,4.2,"Tetley Green Tea with its refreshing pure, ori..."


In [14]:
# Columns that feed the combined text used for vectorisation.
keep_cols = ["product", "brand", "category", "sub_category", "description"]

# Narrow the frame to those columns, drop rows that lack a product name or a
# description, and remove rows repeating the same (product, description) pair.
df = (
    df[keep_cols]
    .dropna(subset=["product", "description"])
    .drop_duplicates(subset=["product", "description"])
    # Renumber rows 0..n-1. Without this the filtered frame keeps gaps in its
    # index, so a label in `df` would not equal the row position in any
    # positional structure built from it (e.g. a one-row-per-product matrix).
    # reset_index also returns a fresh frame, so adding columns to `df` later
    # does not operate on a filtered slice.
    .reset_index(drop=True)
)

In [13]:
# Show the column names currently present in `df`.
# NOTE(review): the recorded output below lists the raw columns, so this cell
# appears to have been executed before the column-filtering cell above; on a
# top-to-bottom run it would list only the kept columns.
print(list(df.columns))

['index', 'product', 'category', 'sub_category', 'brand', 'sale_price', 'market_price', 'type', 'rating', 'description']


In [16]:
# Fields that are glued together, in this order, into one searchable string.
text_fields = ["product", "brand", "category", "sub_category", "description"]

# Missing values become empty strings so the concatenation never yields NaN.
filled = df[text_fields].fillna("")

# Join the fields left to right with a single space between neighbours.
combined = filled[text_fields[0]]
for field in text_fields[1:]:
    combined = combined + " " + filled[field]

df["text"] = combined

In [17]:
# TF-IDF configuration (unfitted at this point).
vectorizer = TfidfVectorizer(
    lowercase=True,         # fold case before tokenising
    stop_words="english",   # drop scikit-learn's built-in English stop words
    ngram_range=(1, 2),     # use single words and two-word phrases as terms
    min_df=2,               # ignore terms found in fewer than 2 documents
    max_features=50_000,    # cap the vocabulary at the most frequent terms
)

In [18]:
# Fit the vectorizer on the combined text and encode it in the same pass;
# the result has one row per row of `df`, in the same order.
tfidf_matrix = vectorizer.fit_transform(raw_documents=df["text"])

In [19]:
# Destination paths inside SAVE_DIR for the three outputs.
products_path = os.path.join(SAVE_DIR, "products_clean.csv")
vectorizer_path = os.path.join(SAVE_DIR, "vectorizer.joblib")
matrix_path = os.path.join(SAVE_DIR, "tfidf_matrix.joblib")

# Cleaned product table; index=False keeps the DataFrame index out of the file.
df.to_csv(products_path, index=False)

# Fitted vectorizer and the matrix it produced.
joblib.dump(vectorizer, vectorizer_path)
joblib.dump(tfidf_matrix, matrix_path)

['artefacts/tfidf_matrix.joblib']

In [None]:
# Final status line naming the folder the artefacts were written to.
print("Prepared data & saved artefacts to", SAVE_DIR)