## Rating base recommendation system

In [102]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

import os
from scipy.sparse import coo_matrix

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
nltk.download("punkt")
nltk.download("stopwords")
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [104]:
#read dataset
df = pd.read_csv(r'C:\Users\DELL\Desktop\cleaned_train_data.csv')
df.columns

Index(['product_id', 'product_name', 'category', 'discounted_price',
       'actual_price', 'rating', 'rating_count', 'about_product'],
      dtype='object')

In [106]:
average_ratings = df.groupby(['product_name','rating_count','category'])['rating'].mean().reset_index()

In [108]:
top_rated_items = average_ratings.sort_values(by='rating', ascending=False)

rating_base_recommendation = top_rated_items.head(10)

In [110]:
rating_base_recommendation.loc[:,'rating'] = rating_base_recommendation['rating'].astype(int)
rating_base_recommendation.loc[:,'rating_count'] = rating_base_recommendation['rating_count'].astype(int)

In [114]:
print("Rating Base Recommendation System: (Trending Products)")
rating_base_recommendation.loc[:, ['product_name','rating','rating_count','category']] = rating_base_recommendation[['product_name','rating','rating_count','category']]
rating_base_recommendation

Rating Base Recommendation System: (Trending Products)


Unnamed: 0,product_name,rating_count,category,rating
880,"REDTECH USB-C to Lightning Cable 3.3FT, [Apple...",0.0,Computers&Accessories|Accessories&Peripherals|...,5.0
1055,Syncwire LTG to USB Cable for Fast Charging Co...,5.0,Computers&Accessories|Accessories&Peripherals|...,5.0
85,Amazon Basics Wireless Mouse | 2.4 GHz Connect...,23.0,Computers&Accessories|Accessories&Peripherals|...,5.0
751,"Oratech Coffee Frother electric, milk frother ...",28.0,Home&Kitchen|Kitchen&HomeAppliances|SmallKitch...,4.0
1053,Swiffer Instant Electric Water Heater Faucet T...,53803.0,"Home&Kitchen|Heating,Cooling&AirQuality|WaterH...",4.0
511,"Instant Pot Air Fryer, Vortex 2QT, Touch Contr...",3964.0,Home&Kitchen|Kitchen&HomeAppliances|SmallKitch...,4.0
368,"FIGMENT Handheld Milk Frother Rechargeable, 3-...",1729.0,Home&Kitchen|Kitchen&HomeAppliances|SmallKitch...,4.0
241,Campfire Spring Chef Prolix Instant Portable W...,2591.0,"Home&Kitchen|Heating,Cooling&AirQuality|WaterH...",4.0
1023,Sony Bravia 164 cm (65 inches) 4K Ultra HD Sma...,5935.0,"Electronics|HomeTheater,TV&Video|Televisions|S...",4.0
694,Multifunctional 2 in 1 Electric Egg Boiling St...,2300.0,Home&Kitchen|Kitchen&HomeAppliances|SmallKitch...,4.0


## Content base recommendation system

In [116]:
print(df.columns)


Index(['product_id', 'product_name', 'category', 'discounted_price',
       'actual_price', 'rating', 'rating_count', 'about_product'],
      dtype='object')


In [118]:
print(df.head())


   product_id                                       product_name  \
0           0  D-Link DWA-131 300 Mbps Wireless Nano USB Adap...   
1           1  TP-Link Nano USB WiFi Dongle 150Mbps High Gain...   
2           2  Duracell Plus AAA Rechargeable Batteries (750 ...   
3           3  Logitech B100 Wired USB Mouse, 3 yr Warranty, ...   
4           4  Logitech M235 Wireless Mouse, 1000 DPI Optical...   

                                            category  discounted_price  \
0  Computers&Accessories|NetworkingDevices|Networ...             507.0   
1  Computers&Accessories|NetworkingDevices|Networ...             749.0   
2  Electronics|GeneralPurposeBatteries&BatteryCha...             399.0   
3  Computers&Accessories|Accessories&Peripherals|...             279.0   
4  Computers&Accessories|Accessories&Peripherals|...             699.0   

   actual_price  rating  rating_count  \
0        1208.0     4.1        8131.0   
1        1339.0     4.2      179692.0   
2         499.0     4.3

In [120]:
df_content = df[[ 'product_name','category', 'about_product', 'product_id', 'rating']]

In [124]:
# Update 'product_search_name' and 'category' using .loc[]
df_content.loc[:, 'product_search_name'] = df_content['category'].apply(
    lambda x: x.split("|")[-1] if x else x  # Simplified check for empty string
)

df_content.loc[:, 'category'] = df_content['category'].apply(
    lambda x: " ".join(x.split("|")[:2]) if x else x  # Simplified check for empty string
)

In [126]:
print(f"Original string present in the category column is:\n {df['category'][0]}")
print("-" *20)
print(f" The extracted category is:\n {df_content['category'][0]}")
print("-" *20)
print(f"The extracted product search name is:\n {df_content['product_search_name'][0]}")

Original string present in the category column is:
 Computers&Accessories|NetworkingDevices|NetworkAdapters|WirelessUSBAdapters
--------------------
 The extracted category is:
 Computers&Accessories NetworkingDevices
--------------------
The extracted product search name is:
 Computers&Accessories NetworkingDevices


In [128]:
df_content.loc[:, 'product_search_name'] = df_content['product_search_name'].str.lower()

In [132]:
df_content.loc[:, 'description'] = (
    df_content['category'].fillna('') + 
    df_content['about_product'].fillna('') + 
    df_content['product_name'].fillna('')
)


In [134]:
df_content.head(2)

Unnamed: 0,product_name,category,about_product,product_id,rating,product_search_name,description
0,D-Link DWA-131 300 Mbps Wireless Nano USB Adap...,Computers&Accessories NetworkingDevices,Connects your computer to a high-speed wireles...,0,4.1,computers&accessories networkingdevices,Computers&Accessories NetworkingDevicesConnect...
1,TP-Link Nano USB WiFi Dongle 150Mbps High Gain...,Computers&Accessories NetworkingDevices,150 Mbps Wi-Fi —— Exceptional wireless speed u...,1,4.2,computers&accessories networkingdevices,Computers&Accessories NetworkingDevices150 Mbp...


In [136]:
df_content['description'][2]

"Electronics GeneralPurposeBatteries&BatteryChargersDuracell Rechargeable AAA 750mAh batteries stay charged for up to 12 months, when not in use|Unused Duracell Rechargeable AAA 750mAh batteries are guaranteed to last 3 years|Duracell Rechargeable AAA 750mAh batteries come pre-charged, ready to use|Duracell Rechargeable AAA 750mAh batteries can be recharged 100's times|Suited for regularly used devices like wireless mouse or babyphone|They are available in AA 1300mAh and AAA 750mAh sizes|They work in any NiMH chargerDuracell Plus AAA Rechargeable Batteries (750 mAh) Pack of 4"

In [138]:
#Covert to lower case
df_content.loc[:, 'description'] = df_content['description'].str.lower()

In [140]:
def stem(text):
    if isinstance(text, str):  # Check if text is a string
        y = []
        for i in text.split():
            y.append(ps.stem(i))  # Assuming ps is your stemmer
        return " ".join(y)
    return text  # Return the original value if it's not a string


In [142]:
df_content.loc[:, 'description'] = df_content['description'].apply(stem)

In [143]:
#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

df_content.loc[:, 'description'] = df_content['description'].fillna('')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(df_content['description']).toarray()

In [144]:
#Output the shape of tfidf_matrix
tfidf_matrix.shape

(1351, 10951)

In [148]:
tfidf_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [150]:
similarity = cosine_similarity(tfidf_matrix)
similarity

array([[1.        , 0.26969795, 0.01523116, ..., 0.0028744 , 0.02716936,
        0.00710969],
       [0.26969795, 1.        , 0.00969605, ..., 0.00939581, 0.04106641,
        0.01133409],
       [0.01523116, 0.00969605, 1.        , ..., 0.02104322, 0.01651457,
        0.00373394],
       ...,
       [0.0028744 , 0.00939581, 0.02104322, ..., 1.        , 0.00511526,
        0.04263457],
       [0.02716936, 0.04106641, 0.01651457, ..., 0.00511526, 1.        ,
        0.01259805],
       [0.00710969, 0.01133409, 0.00373394, ..., 0.04263457, 0.01259805,
        1.        ]])

In [152]:
sorted(list(enumerate(similarity[0])), reverse = True, key = lambda x: x[1])[:5]

[(0, 0.9999999999999998),
 (370, 0.31064184588910804),
 (387, 0.2905280941918484),
 (320, 0.28777153806642775),
 (444, 0.28076871309863394)]

In [154]:
def recommend(name):
    matched_products = df_content[df_content['product_search_name'] == name]
    if matched_products.empty:
        print(f"No matches found for '{name}'.")
        return None  # or handle accordingly
    
    product_index = matched_products.index[0]
    distances = similarity[product_index]
    product_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]
    return product_list
