In [1]:
import re

import numpy as np
import pandas as pd

In [2]:
#Visualization Libraries

import matplotlib.pyplot as plt
import seaborn as sns

#Text Handling Libraries
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

In [3]:
# Raw string keeps the Windows backslashes literal; in a normal string '\D' and '\B'
# are invalid escape sequences that newer Python versions warn about.
# NOTE(review): this is an absolute, machine-specific path - consider a configurable data directory.
df = pd.read_csv(r'D:\DataSet Information\BigBasket Products.csv')

df.head()

Unnamed: 0,index,product,category,sub_category,brand,sale_price,market_price,type,rating,description
0,1,Garlic Oil - Vegetarian Capsule 500 mg,Beauty & Hygiene,Hair Care,Sri Sri Ayurveda,220.0,220.0,Hair Oil & Serum,4.1,This Product contains Garlic Oil that is known...
1,2,Water Bottle - Orange,"Kitchen, Garden & Pets",Storage & Accessories,Mastercook,180.0,180.0,Water & Fridge Bottles,2.3,"Each product is microwave safe (without lid), ..."
2,3,"Brass Angle Deep - Plain, No.2",Cleaning & Household,Pooja Needs,Trm,119.0,250.0,Lamp & Lamp Oil,3.4,"A perfect gift for all occasions, be it your m..."
3,4,Cereal Flip Lid Container/Storage Jar - Assort...,Cleaning & Household,Bins & Bathroom Ware,Nakoda,149.0,176.0,"Laundry, Storage Baskets",3.7,Multipurpose container with an attractive desi...
4,5,Creme Soft Soap - For Hands & Body,Beauty & Hygiene,Bath & Hand Wash,Nivea,162.0,162.0,Bathing Bars & Soaps,4.4,Nivea Creme Soft Soap gives your skin the best...


In [4]:
df.shape

(27555, 10)

In [5]:
# Drop the redundant 'index' column; errors='ignore' keeps the cell re-runnable
# once the column has already been removed.
df = df.drop(columns='index', errors='ignore')

In [6]:
df.shape

(27555, 9)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27555 entries, 0 to 27554
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   product       27554 non-null  object 
 1   category      27555 non-null  object 
 2   sub_category  27555 non-null  object 
 3   brand         27554 non-null  object 
 4   sale_price    27555 non-null  float64
 5   market_price  27555 non-null  float64
 6   type          27555 non-null  object 
 7   rating        18929 non-null  float64
 8   description   27440 non-null  object 
dtypes: float64(3), object(6)
memory usage: 1.9+ MB


In [8]:
df.columns

Index(['product', 'category', 'sub_category', 'brand', 'sale_price',
       'market_price', 'type', 'rating', 'description'],
      dtype='object')

In [9]:
df.isnull().sum()

product            1
category           0
sub_category       0
brand              1
sale_price         0
market_price       0
type               0
rating          8626
description      115
dtype: int64

In [10]:
# Share of missing values in each column, as a percentage of all rows

print('Percentage Null Data In Each Column')
print('-'*30)
null_counts = df.isnull().sum()
total_count = df.shape[0]
for col, null_count in null_counts.items():
    print("{} : {:.2f}".format(col, null_count / total_count * 100))

Percentage Null Data In Each Column
------------------------------
product : 0.00
category : 0.00
sub_category : 0.00
brand : 0.00
sale_price : 0.00
market_price : 0.00
type : 0.00
rating : 31.30
description : 0.42


In [11]:
# Overall share of missing cells across the whole frame

print('Total Null Data')
null_count = df.isnull().sum().sum()
# df.size is rows * columns; np.product was deprecated in NumPy 1.25 and removed in 2.0.
total_count = df.size
print("{:.2f}".format(null_count/total_count * 100))

Total Null Data
3.53


So overall about 3.5% of the data is missing, but 31% of the ratings are missing. Since we are going to create a recommender system, 
let's drop the rows with null values, as there will still be about 68% of the rows (18,840 of 27,555) left for recommendation purposes, which is enough for us.

In [12]:
# Drop every row that has a null in any column (dropna() is called without a subset).
df = df.dropna()

In [13]:
df.isnull().sum()

product         0
category        0
sub_category    0
brand           0
sale_price      0
market_price    0
type            0
rating          0
description     0
dtype: int64

In [14]:
df.head()

Unnamed: 0,product,category,sub_category,brand,sale_price,market_price,type,rating,description
0,Garlic Oil - Vegetarian Capsule 500 mg,Beauty & Hygiene,Hair Care,Sri Sri Ayurveda,220.0,220.0,Hair Oil & Serum,4.1,This Product contains Garlic Oil that is known...
1,Water Bottle - Orange,"Kitchen, Garden & Pets",Storage & Accessories,Mastercook,180.0,180.0,Water & Fridge Bottles,2.3,"Each product is microwave safe (without lid), ..."
2,"Brass Angle Deep - Plain, No.2",Cleaning & Household,Pooja Needs,Trm,119.0,250.0,Lamp & Lamp Oil,3.4,"A perfect gift for all occasions, be it your m..."
3,Cereal Flip Lid Container/Storage Jar - Assort...,Cleaning & Household,Bins & Bathroom Ware,Nakoda,149.0,176.0,"Laundry, Storage Baskets",3.7,Multipurpose container with an attractive desi...
4,Creme Soft Soap - For Hands & Body,Beauty & Hygiene,Bath & Hand Wash,Nivea,162.0,162.0,Bathing Bars & Soaps,4.4,Nivea Creme Soft Soap gives your skin the best...


In [15]:
df.shape

(18840, 9)

In [16]:
df['product'].value_counts().head(10)

Turmeric Powder/Arisina Pudi          23
Soft Drink                            12
Cow Ghee/Tuppa                        11
Ghee/Tuppa                            11
Powder - Coriander                    10
Casting Creme Gloss Hair Color        10
Colorsilk Hair Colour With Keratin    10
Hand Sanitizer                         9
Anti Dandruff Shampoo                  9
Extra Virgin Olive Oil                 9
Name: product, dtype: int64

In [17]:
df['category'].value_counts().head(10)

Beauty & Hygiene            5460
Kitchen, Garden & Pets      2494
Snacks & Branded Foods      2468
Gourmet & World Food        2364
Foodgrains, Oil & Masala    2173
Cleaning & Household        2091
Bakery, Cakes & Dairy        665
Beverages                    630
Baby Care                    495
Name: category, dtype: int64

In [18]:
df['sub_category'].value_counts().head(10)

Skin Care                1641
Hair Care                 818
Bath & Hand Wash          808
Masalas & Spices          764
Storage & Accessories     658
Men's Grooming            649
Fragrances & Deos         627
Crockery & Cutlery        621
Ready To Cook & Eat       557
Organic Staples           550
Name: sub_category, dtype: int64

In [19]:
df['brand'].value_counts().head(10)

bb Royal          278
BB Home           172
Amul              153
Himalaya          139
Cello             104
BIOTIQUE          103
DP                101
Keya              101
Organic Tattva     99
MTR                97
Name: brand, dtype: int64

In [20]:
df.type.value_counts()

Face Care                 1094
Men's Deodorants           404
Shampoo & Conditioner      390
Blended Masalas            343
Containers Sets            332
                          ... 
Health Supplements           1
Bagels & Baguette            1
Gift Wraps & Bags            1
Gourmet Tea & Tea Bags       1
Gourmet Popcorn              1
Name: type, Length: 358, dtype: int64

## Content Based Filtering 

In [21]:
df.head()

Unnamed: 0,product,category,sub_category,brand,sale_price,market_price,type,rating,description
0,Garlic Oil - Vegetarian Capsule 500 mg,Beauty & Hygiene,Hair Care,Sri Sri Ayurveda,220.0,220.0,Hair Oil & Serum,4.1,This Product contains Garlic Oil that is known...
1,Water Bottle - Orange,"Kitchen, Garden & Pets",Storage & Accessories,Mastercook,180.0,180.0,Water & Fridge Bottles,2.3,"Each product is microwave safe (without lid), ..."
2,"Brass Angle Deep - Plain, No.2",Cleaning & Household,Pooja Needs,Trm,119.0,250.0,Lamp & Lamp Oil,3.4,"A perfect gift for all occasions, be it your m..."
3,Cereal Flip Lid Container/Storage Jar - Assort...,Cleaning & Household,Bins & Bathroom Ware,Nakoda,149.0,176.0,"Laundry, Storage Baskets",3.7,Multipurpose container with an attractive desi...
4,Creme Soft Soap - For Hands & Body,Beauty & Hygiene,Bath & Hand Wash,Nivea,162.0,162.0,Bathing Bars & Soaps,4.4,Nivea Creme Soft Soap gives your skin the best...


In [22]:
df.shape

(18840, 9)

Notice that a product can be in multiple categories and sub_categories, separated by `&` or a comma.
Let's split them into lists for further processing.

In [23]:
def rmv_spc(a):
    """Return ``a`` with leading and trailing whitespace removed."""
    return a.strip()


def get_list(a):
    """Split a label string into a list of stripped parts.

    The string is split on '& ', ', ', '*' and newline,
    e.g. 'A & B, C' -> ['A', 'B', 'C'].
    """
    # Raw string so '\*' reaches the regex engine without an invalid-escape warning.
    return [rmv_spc(part) for part in re.split(r'& |, |\*|\n', a)]

In [24]:
get_list

<function __main__.<lambda>(a)>

In [25]:
get_list('A & B, C')

['A', 'B', 'C']

In [26]:
# Turn the "A & B, C" style strings in these columns into lists of labels.
for col in ['category', 'sub_category', 'type']:
    # Only split values that are still strings, so re-running this cell
    # does not pass already-split lists to get_list.
    df[col] = df[col].apply(lambda value: get_list(value) if isinstance(value, str) else value)

In [27]:
df.head()

Unnamed: 0,product,category,sub_category,brand,sale_price,market_price,type,rating,description
0,Garlic Oil - Vegetarian Capsule 500 mg,"[Beauty, Hygiene]",[Hair Care],Sri Sri Ayurveda,220.0,220.0,"[Hair Oil, Serum]",4.1,This Product contains Garlic Oil that is known...
1,Water Bottle - Orange,"[Kitchen, Garden, Pets]","[Storage, Accessories]",Mastercook,180.0,180.0,"[Water, Fridge Bottles]",2.3,"Each product is microwave safe (without lid), ..."
2,"Brass Angle Deep - Plain, No.2","[Cleaning, Household]",[Pooja Needs],Trm,119.0,250.0,"[Lamp, Lamp Oil]",3.4,"A perfect gift for all occasions, be it your m..."
3,Cereal Flip Lid Container/Storage Jar - Assort...,"[Cleaning, Household]","[Bins, Bathroom Ware]",Nakoda,149.0,176.0,"[Laundry, Storage Baskets]",3.7,Multipurpose container with an attractive desi...
4,Creme Soft Soap - For Hands & Body,"[Beauty, Hygiene]","[Bath, Hand Wash]",Nivea,162.0,162.0,"[Bathing Bars, Soaps]",4.4,Nivea Creme Soft Soap gives your skin the best...


To avoid duplicates, we will convert everything to lowercase and also remove the spaces between words.
This ensures that our recommender doesn't treat the "Chocolate" in "Chocolate IceCream" and the "Chocolate" in "Chocolate Bar" as the same.

In [28]:
def cleaner(x):
    """Lower-case text and remove its spaces.

    A list is cleaned element by element, a string is cleaned directly,
    and any other value yields an empty string.
    """
    def _squash(text):
        # Remove spaces first, then lower-case the result.
        return str.lower(text.replace(" ", ""))

    if isinstance(x, list):
        return [_squash(item) for item in x]
    if isinstance(x, str):
        return _squash(x)
    return ''
        

In [29]:
df.head(4)

Unnamed: 0,product,category,sub_category,brand,sale_price,market_price,type,rating,description
0,Garlic Oil - Vegetarian Capsule 500 mg,"[Beauty, Hygiene]",[Hair Care],Sri Sri Ayurveda,220.0,220.0,"[Hair Oil, Serum]",4.1,This Product contains Garlic Oil that is known...
1,Water Bottle - Orange,"[Kitchen, Garden, Pets]","[Storage, Accessories]",Mastercook,180.0,180.0,"[Water, Fridge Bottles]",2.3,"Each product is microwave safe (without lid), ..."
2,"Brass Angle Deep - Plain, No.2","[Cleaning, Household]",[Pooja Needs],Trm,119.0,250.0,"[Lamp, Lamp Oil]",3.4,"A perfect gift for all occasions, be it your m..."
3,Cereal Flip Lid Container/Storage Jar - Assort...,"[Cleaning, Household]","[Bins, Bathroom Ware]",Nakoda,149.0,176.0,"[Laundry, Storage Baskets]",3.7,Multipurpose container with an attractive desi...


 We will now be joining the values of category, sub_category, type and brand

In [30]:
def merge(x):
    """Build one space-separated string from a row's category, sub_category, brand and type.

    category, sub_category and type are sequences of strings joined with spaces;
    brand is a single string used as-is.
    """
    parts = [
        ' '.join(x['category']),
        ' '.join(x['sub_category']),
        x['brand'],
        ' '.join(x['type']),
    ]
    return ' '.join(parts)

# Apply merge to every row (axis=1) and store the combined text in a new 'merging' column.
df['merging'] = df.apply(merge, axis=1)

In [31]:
df.merging.head(5)

0    Beauty Hygiene Hair Care Sri Sri Ayurveda  Hai...
1    Kitchen Garden Pets Storage Accessories Master...
2     Cleaning Household Pooja Needs Trm Lamp Lamp Oil
3    Cleaning Household Bins Bathroom Ware Nakoda L...
4    Beauty Hygiene Bath Hand Wash Nivea Bathing Ba...
Name: merging, dtype: object

We need to Count the String Vectors and then compute the Cosine Similarity Score.

In [32]:
df.head()

Unnamed: 0,product,category,sub_category,brand,sale_price,market_price,type,rating,description,merging
0,Garlic Oil - Vegetarian Capsule 500 mg,"[Beauty, Hygiene]",[Hair Care],Sri Sri Ayurveda,220.0,220.0,"[Hair Oil, Serum]",4.1,This Product contains Garlic Oil that is known...,Beauty Hygiene Hair Care Sri Sri Ayurveda Hai...
1,Water Bottle - Orange,"[Kitchen, Garden, Pets]","[Storage, Accessories]",Mastercook,180.0,180.0,"[Water, Fridge Bottles]",2.3,"Each product is microwave safe (without lid), ...",Kitchen Garden Pets Storage Accessories Master...
2,"Brass Angle Deep - Plain, No.2","[Cleaning, Household]",[Pooja Needs],Trm,119.0,250.0,"[Lamp, Lamp Oil]",3.4,"A perfect gift for all occasions, be it your m...",Cleaning Household Pooja Needs Trm Lamp Lamp Oil
3,Cereal Flip Lid Container/Storage Jar - Assort...,"[Cleaning, Household]","[Bins, Bathroom Ware]",Nakoda,149.0,176.0,"[Laundry, Storage Baskets]",3.7,Multipurpose container with an attractive desi...,Cleaning Household Bins Bathroom Ware Nakoda L...
4,Creme Soft Soap - For Hands & Body,"[Beauty, Hygiene]","[Bath, Hand Wash]",Nivea,162.0,162.0,"[Bathing Bars, Soaps]",4.4,Nivea Creme Soft Soap gives your skin the best...,Beauty Hygiene Bath Hand Wash Nivea Bathing Ba...


In [33]:
# Write the processed frame to final_df.csv without the row index.
df.to_csv("final_df.csv",index=False)

In [34]:
# Token counts over the merged text, with English stop words removed.
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df['merging'])

We need to Count the String Vectors and then compute the Cosine Similarity Score.

In [35]:
# Pairwise cosine similarity between all rows of count_matrix.
# NOTE(review): the result is a dense n x n array; with the 18,840 rows reported by df.shape
# that is roughly 2.8 GB if stored as float64 - confirm enough memory is available.
cosine_sim = cosine_similarity(count_matrix, count_matrix)

cosine_sim

array([[1.        , 0.        , 0.08451543, ..., 0.        , 0.        ,
        0.16116459],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.08451543, 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.16116459, 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [36]:
import pickle

# Assuming you have already calculated the cosine similarity and stored it in the cosine_sim variable.

# Save cosine_sim as a pickle file
# NOTE(review): only load this file back from a trusted location; unpickling can execute arbitrary code.
with open('cosine_sim.pkl', 'wb') as file:
    pickle.dump(cosine_sim, file)

In [37]:
# Renumber the rows 0..n-1 so that row labels match the row positions of cosine_sim.
# drop=True discards the old labels instead of inserting them as an 'index' column,
# which also keeps this cell safe to re-run.
df = df.reset_index(drop=True)

# Map product name -> row position. Product names are not unique in this data,
# so looking up a duplicated name returns several positions.
indices = pd.Series(df.index, index=df['product'])

In [38]:
def get_recommendations(title, cosine_sim = cosine_sim, top_n = 10):
    """Return the names of the products most similar to ``title``.

    Parameters
    ----------
    title : str
        Product name to look up in ``indices``.
    cosine_sim : array-like
        Square similarity matrix indexed by row position of ``df``.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    list of str
        Up to ``top_n`` product names, most similar first. The queried
        row itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]

    # Product names are not unique, so the lookup can return several positions;
    # use the first matching row in that case.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Exclude the queried row explicitly rather than assuming it sorts first:
    # other rows can tie with it at similarity 1.0.
    sim_scores = [(i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx]

    # Stable sort, highest similarity first.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    similarity_indices = [i for i, _ in sim_scores[:top_n]]

    return df['product'].iloc[similarity_indices].tolist()

In [39]:
recommendation = get_recommendations('Water Bottle - Orange', cosine_sim)

In [40]:
recommendation

['Glass Water Bottle - Aquaria Organic Purple',
 'H2O Unbreakable Water Bottle - Pink',
 'Water Bottle H2O Purple',
 'H2O Unbreakable Water Bottle - Green',
 'Regel Tritan Plastic Sports Water Bottle - Black',
 'Glass Water Bottle With Maroon Cap - BB1245MRN',
 'Loopy Pet water Bottle - Violet',
 'Ivory Premium Glass Bottle - With Yellow Floral',
 'Double Walled Glass Bottle With Cream Cap - BB1241CRM',
 'Water Bottle - Insulated, Assorted Colour, Stainless Steel']

In [41]:
 get_recommendations('Cadbury Perk - Chocolate Bar', cosine_sim)

['Nutties Chocolate Pack',
 '5 Star Chocolate Bar',
 'Dairy Milk Silk - Hazelnut Chocolate Bar',
 'Perk - Chocolate, Home Treats, 175.5 g, 27 Units',
 'Dark Milk Chocolate Bar',
 'Dairy Milk Silk Mousse - Chocolate Bar',
 'Dark Milk Chocolate Bar',
 'Chocolate Bar - Fuse',
 'Choclairs Gold Coffee',
 '5 Star Chocolate Home Pack, 200 g, 20 units']

In [42]:
get_recommendations('Hand Sanitizer - 70% Alcohol Base', cosine_sim)

['The Hand Sanitizer With Moisturisers',
 'Skincare Hand Wash Refill',
 'Wipeout Germ Killing Soap',
 'Classic Sandalwood & Tulsi Hand Wash',
 'Hand Sanitizer',
 'Antibacterial Germ Kill Spray (No Gas) – Safe On Skin, Safe On Surfaces',
 'Safe Handz Hand Wash - Lemon & Glycerine With Coconut Milk',
 'Hand Wash - Orange Peel',
 'Advanced Hand Sanitizer',
 'Germ Protection Hand Wash Pump']

In [43]:
get_recommendations('Germ - Removal Multipurpose Wipes', cosine_sim)

['Disinfectant Surface Cleaning Spray',
 'Germ Kill Spray',
 'Disinfectant Surface Sanitizer',
 'Veggie Wash',
 'Vegetable & Fruit Wash Spray with 100% Natural Action',
 'Gadget Disinfectant - For Mobiles & Laptops',
 'Eco Multipurpose Sanitising Wipes - Fresh Lemon',
 'Clothes Disinfectant & Refreshing Spray',
 'Veg Wash Prime - Disinfectant Pre-Wash For Whole Vegetables & Fruits',
 'Multi-Surface Wipes']