# Import libraries

In [1325]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

import os
from scipy.sparse import coo_matrix

# Read Dataset

In [1328]:
#read dataset
df = pd.read_csv('amazon.csv')
df.columns

Index(['product_id', 'product_name', 'category', 'discounted_price',
       'actual_price', 'discount_percentage', 'rating', 'rating_count',
       'about_product', 'user_id', 'user_name', 'review_id', 'review_title',
       'review_content', 'img_link', 'product_link'],
      dtype='object')

In [1330]:
df.head()

Unnamed: 0,product_id,product_name,category,discounted_price,actual_price,discount_percentage,rating,rating_count,about_product,user_id,user_name,review_id,review_title,review_content,img_link,product_link
0,B07JW9H4J1,Wayona Nylon Braided USB to Lightning Fast Cha...,Computers&Accessories|Accessories&Peripherals|...,₹399,"₹1,099",64%,4.2,24269,High Compatibility : Compatible With iPhone 12...,"AG3D6O4STAQKAY2UVGEUV46KN35Q,AHMY5CWJMMK5BJRBB...","Manav,Adarsh gupta,Sundeep,S.Sayeed Ahmed,jasp...","R3HXWT0LRP0NMF,R2AJM3LFTLZHFO,R6AQJGUP6P86,R1K...","Satisfied,Charging is really fast,Value for mo...",Looks durable Charging is fine tooNo complains...,https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Wayona-Braided-WN3LG1-Sy...
1,B098NS6PVG,Ambrane Unbreakable 60W / 3A Fast Charging 1.5...,Computers&Accessories|Accessories&Peripherals|...,₹199,₹349,43%,4.0,43994,"Compatible with all Type C enabled devices, be...","AECPFYFQVRUWC3KGNLJIOREFP5LQ,AGYYVPDD7YG7FYNBX...","ArdKn,Nirbhay kumar,Sagar Viswanathan,Asp,Plac...","RGIQEG07R9HS2,R1SMWZQ86XIN8U,R2J3Y1WL29GWDE,RY...","A Good Braided Cable for Your Type C Device,Go...",I ordered this cable to connect my phone to An...,https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Ambrane-Unbreakable-Char...
2,B096MSW6CT,Sounce Fast Phone Charging Cable & Data Sync U...,Computers&Accessories|Accessories&Peripherals|...,₹199,"₹1,899",90%,3.9,7928,【 Fast Charger& Data Sync】-With built-in safet...,"AGU3BBQ2V2DDAMOAKGFAWDDQ6QHA,AESFLDV2PT363T2AQ...","Kunal,Himanshu,viswanath,sai niharka,saqib mal...","R3J3EQQ9TZI5ZJ,R3E7WBGK7ID0KV,RWU79XKQ6I1QF,R2...","Good speed for earlier versions,Good Product,W...","Not quite durable and sturdy,https://m.media-a...",https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Sounce-iPhone-Charging-C...
3,B08HDJ86NZ,boAt Deuce USB 300 2 in 1 Type-C & Micro USB S...,Computers&Accessories|Accessories&Peripherals|...,₹329,₹699,53%,4.2,94363,The boAt Deuce USB 300 2 in 1 cable is compati...,"AEWAZDZZJLQUYVOVGBEUKSLXHQ5A,AG5HTSFRRE6NL3M5S...","Omkar dhale,JD,HEMALATHA,Ajwadh a.,amar singh ...","R3EEUZKKK9J36I,R3HJVYCLYOY554,REDECAZ7AMPQC,R1...","Good product,Good one,Nice,Really nice product...","Good product,long wire,Charges good,Nice,I bou...",https://m.media-amazon.com/images/I/41V5FtEWPk...,https://www.amazon.in/Deuce-300-Resistant-Tang...
4,B08CF3B7N1,Portronics Konnect L 1.2M Fast Charging 3A 8 P...,Computers&Accessories|Accessories&Peripherals|...,₹154,₹399,61%,4.2,16905,[CHARGE & SYNC FUNCTION]- This cable comes wit...,"AE3Q6KSUK5P75D5HFYHCRAOLODSA,AFUGIFH5ZAFXRDSZH...","rahuls6099,Swasat Borah,Ajay Wadke,Pranali,RVK...","R1BP4L2HH9TFUP,R16PVJEXKV6QZS,R2UPDB81N66T4P,R...","As good as original,Decent,Good one for second...","Bought this instead of original apple, does th...",https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Portronics-Konnect-POR-1...


In [1332]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1465 entries, 0 to 1464
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   product_id           1465 non-null   object
 1   product_name         1465 non-null   object
 2   category             1465 non-null   object
 3   discounted_price     1465 non-null   object
 4   actual_price         1465 non-null   object
 5   discount_percentage  1465 non-null   object
 6   rating               1465 non-null   object
 7   rating_count         1463 non-null   object
 8   about_product        1465 non-null   object
 9   user_id              1465 non-null   object
 10  user_name            1465 non-null   object
 11  review_id            1465 non-null   object
 12  review_title         1465 non-null   object
 13  review_content       1465 non-null   object
 14  img_link             1465 non-null   object
 15  product_link         1465 non-null   object
dtypes: obj

In [1334]:
#display the shape of the data
df.shape

(1465, 16)

In [1336]:
#describe the data
df.describe().T

Unnamed: 0,count,unique,top,freq
product_id,1465,1351,B07JW9H4J1,3
product_name,1465,1337,"Fire-Boltt Ninja Call Pro Plus 1.83"" Smart Wat...",5
category,1465,211,Computers&Accessories|Accessories&Peripherals|...,233
discounted_price,1465,550,₹199,53
actual_price,1465,449,₹999,120
discount_percentage,1465,92,50%,56
rating,1465,28,4.1,244
rating_count,1463,1143,9378,9
about_product,1465,1293,[CHARGE & SYNC FUNCTION]- This cable comes wit...,6
user_id,1465,1194,"AHIKJUDTVJ4T6DV6IUGFYZ5LXMPA,AE55KTFVNXYFD5FPY...",10


# Data Cleaning and Preprocessing

In [1339]:
#drop unnecessary columns
train_data = df.drop(['discount_percentage', 'user_name', 'review_id', 'review_title', 'review_content', 'product_link'],axis=1)
train_data

Unnamed: 0,product_id,product_name,category,discounted_price,actual_price,rating,rating_count,about_product,user_id,img_link
0,B07JW9H4J1,Wayona Nylon Braided USB to Lightning Fast Cha...,Computers&Accessories|Accessories&Peripherals|...,₹399,"₹1,099",4.2,24269,High Compatibility : Compatible With iPhone 12...,"AG3D6O4STAQKAY2UVGEUV46KN35Q,AHMY5CWJMMK5BJRBB...",https://m.media-amazon.com/images/W/WEBP_40237...
1,B098NS6PVG,Ambrane Unbreakable 60W / 3A Fast Charging 1.5...,Computers&Accessories|Accessories&Peripherals|...,₹199,₹349,4.0,43994,"Compatible with all Type C enabled devices, be...","AECPFYFQVRUWC3KGNLJIOREFP5LQ,AGYYVPDD7YG7FYNBX...",https://m.media-amazon.com/images/W/WEBP_40237...
2,B096MSW6CT,Sounce Fast Phone Charging Cable & Data Sync U...,Computers&Accessories|Accessories&Peripherals|...,₹199,"₹1,899",3.9,7928,【 Fast Charger& Data Sync】-With built-in safet...,"AGU3BBQ2V2DDAMOAKGFAWDDQ6QHA,AESFLDV2PT363T2AQ...",https://m.media-amazon.com/images/W/WEBP_40237...
3,B08HDJ86NZ,boAt Deuce USB 300 2 in 1 Type-C & Micro USB S...,Computers&Accessories|Accessories&Peripherals|...,₹329,₹699,4.2,94363,The boAt Deuce USB 300 2 in 1 cable is compati...,"AEWAZDZZJLQUYVOVGBEUKSLXHQ5A,AG5HTSFRRE6NL3M5S...",https://m.media-amazon.com/images/I/41V5FtEWPk...
4,B08CF3B7N1,Portronics Konnect L 1.2M Fast Charging 3A 8 P...,Computers&Accessories|Accessories&Peripherals|...,₹154,₹399,4.2,16905,[CHARGE & SYNC FUNCTION]- This cable comes wit...,"AE3Q6KSUK5P75D5HFYHCRAOLODSA,AFUGIFH5ZAFXRDSZH...",https://m.media-amazon.com/images/W/WEBP_40237...
...,...,...,...,...,...,...,...,...,...,...
1460,B08L7J3T31,Noir Aqua - 5pcs PP Spun Filter + 1 Spanner | ...,Home&Kitchen|Kitchen&HomeAppliances|WaterPurif...,₹379,₹919,4,1090,SUPREME QUALITY 90 GRAM 3 LAYER THIK PP SPUN F...,"AHITFY6AHALOFOHOZEOC6XBP4FEA,AFRABBODZJZQB6Z4U...",https://m.media-amazon.com/images/I/41fDdRtjfx...
1461,B01M6453MB,Prestige Delight PRWO Electric Rice Cooker (1 ...,Home&Kitchen|Kitchen&HomeAppliances|SmallKitch...,"₹2,280","₹3,045",4.1,4118,"230 Volts, 400 watts, 1 Year","AFG5FM3NEMOL6BNFRV2NK5FNJCHQ,AGEINTRN6Z563RMLH...",https://m.media-amazon.com/images/I/41gzDxk4+k...
1462,B009P2LIL4,Bajaj Majesty RX10 2000 Watts Heat Convector R...,"Home&Kitchen|Heating,Cooling&AirQuality|RoomHe...","₹2,219","₹3,080",3.6,468,International design and styling|Two heat sett...,"AGVPWCMAHYQWJOQKMUJN4DW3KM5Q,AF4Q3E66MY4SR7YQZ...",https://m.media-amazon.com/images/W/WEBP_40237...
1463,B00J5DYCCA,Havells Ventil Air DSP 230mm Exhaust Fan (Pist...,"Home&Kitchen|Heating,Cooling&AirQuality|Fans|E...","₹1,399","₹1,890",4,8031,Fan sweep area: 230 MM ; Noise level: (40 - 45...,"AF2JQCLSCY3QJATWUNNHUSVUPNQQ,AFDMLUXC5LS5RXDJS...",https://m.media-amazon.com/images/W/WEBP_40237...


## Data Transformation

### Encode Product ID column

In [1387]:
categorical = ['product_id']

#display categorical column
le = LabelEncoder()
train_data[categorical].describe()

Unnamed: 0,product_id
count,1351.0
mean,675.0
std,390.144418
min,0.0
25%,337.5
50%,675.0
75%,1012.5
max,1350.0


## Data Formatting

### Convert String type Numerical Data columns to float

In [1389]:
#Product price and discount price should be float.
train_data['actual_price'] = train_data['actual_price'].replace({'₹': '', ',': ''}, regex=True).astype(float)

train_data['discounted_price'] = train_data['discounted_price'].replace({'₹': '', ',': ''}, regex=True).astype(float)

In [1349]:
#check all the unique values in the rating column before converting it to a float
sorted(train_data['rating'].unique(), reverse=False)

['2',
 '2.3',
 '2.6',
 '2.8',
 '2.9',
 '3',
 '3.0',
 '3.1',
 '3.2',
 '3.3',
 '3.4',
 '3.5',
 '3.6',
 '3.7',
 '3.8',
 '3.9',
 '4',
 '4.0',
 '4.1',
 '4.2',
 '4.3',
 '4.4',
 '4.5',
 '4.6',
 '4.7',
 '4.8',
 '5.0',
 '|']

In [1351]:
#rating column has special character '|'  which needs to be removed or replaced

#check which row has special character
special_character = train_data[train_data['rating'] == '|']

# replace the special character with NaN
train_data['rating'] = train_data['rating'].replace('|', None)

# Convert the 'rating' column to numeric (this will convert invalid entries to NaN)
train_data['rating'] = pd.to_numeric(train_data['rating'], errors='coerce')

#Fill missing values (NaN) with the average rating
avg_rating = train_data['rating'].mean()
train_data['rating'] = train_data['rating'].fillna(avg_rating)

In [1353]:
# Convert rating_count column to float by removing commas
train_data['rating_count'] = train_data['rating_count'].str.replace(',', '', regex=True).astype(float)

In [1355]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1465 entries, 0 to 1464
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   product_id        1465 non-null   int32  
 1   product_name      1465 non-null   object 
 2   category          1465 non-null   object 
 3   discounted_price  1465 non-null   float64
 4   actual_price      1465 non-null   float64
 5   rating            1465 non-null   float64
 6   rating_count      1463 non-null   float64
 7   about_product     1465 non-null   object 
 8   user_id           1465 non-null   object 
 9   img_link          1465 non-null   object 
dtypes: float64(4), int32(1), object(5)
memory usage: 108.9+ KB


## Handling Missing values

In [1358]:
#check for any null values
train_data.isnull().sum()

product_id          0
product_name        0
category            0
discounted_price    0
actual_price        0
rating              0
rating_count        2
about_product       0
user_id             0
img_link            0
dtype: int64

In [1360]:
# Fill missing values in 'Product Rating' with a default value (e.g., 0)
train_data['rating_count'] = train_data['rating_count'].fillna(0)

In [1362]:
train_data.isnull().sum()

product_id          0
product_name        0
category            0
discounted_price    0
actual_price        0
rating              0
rating_count        0
about_product       0
user_id             0
img_link            0
dtype: int64

## Handling duplicate Values

In [1365]:
#check if duplicates have
train_data.duplicated().sum()

19

In [1368]:
# if there are any duplicates in product_id
train_data['product_id'].duplicated().any()

True

In [1370]:
# "True" means some rows share same product ids
duplicate_rows = train_data[train_data['product_id'].duplicated(keep=False)]
print(duplicate_rows['product_id'] )

0        346
1        848
2        819
3        643
4        588
        ... 
1007    1262
1010     319
1017    1261
1018    1329
1019     133
Name: product_id, Length: 206, dtype: int32


In [1376]:
# Combine duplicates
train_data = train_data.groupby('product_id').agg({
    'product_name': 'first', # use the first product name
    'category': 'first', # use the first category
    'discounted_price': 'first', # use the first discounted_price
    'actual_price': 'first', # use the first actual_price
    'rating': 'mean',  # take the average rating
    'rating_count': 'mean' , # take the average rating_count
    'about_product': 'first', # use the first about_product
}).reset_index()

# Display the updated DataFrame
print(train_data)

      product_id                                       product_name  \
0              0  D-Link DWA-131 300 Mbps Wireless Nano USB Adap...   
1              1  TP-Link Nano USB WiFi Dongle 150Mbps High Gain...   
2              2  Duracell Plus AAA Rechargeable Batteries (750 ...   
3              3  Logitech B100 Wired USB Mouse, 3 yr Warranty, ...   
4              4  Logitech M235 Wireless Mouse, 1000 DPI Optical...   
...          ...                                                ...   
1346        1346  WIDEWINGS Electric Handheld Milk Wand Mixer Fr...   
1347        1347  Khaitan ORFin Fan heater for Home and kitchen-...   
1348        1348  Oratech Coffee Frother electric, milk frother ...   
1349        1349  REDTECH USB-C to Lightning Cable 3.3FT, [Apple...   
1350        1350  Swiffer Instant Electric Water Heater Faucet T...   

                                               category  discounted_price  \
0     Computers&Accessories|NetworkingDevices|Networ...             50

In [1391]:
# check now any duplicates have
train_data.duplicated().sum()

0

In [1380]:
# Specify the file path where you want to save the cleaned data
output_file_path = 'cleaned_train_data.csv'

# Save the cleaned DataFrame to a CSV file
train_data.to_csv(output_file_path, index=False)