## Import libraries

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

import os
from scipy.sparse import coo_matrix

## Read Dataset

In [78]:
#read dataset
df = pd.read_csv('amazon.csv')
df.columns

Index(['product_id', 'product_name', 'category', 'discounted_price',
       'actual_price', 'discount_percentage', 'rating', 'rating_count',
       'about_product', 'user_id', 'user_name', 'review_id', 'review_title',
       'review_content', 'img_link', 'product_link'],
      dtype='object')

In [116]:
train_data = df[['product_id','product_name', 'category','actual_price', 'rating', 'rating_count', 'about_product', 'img_link', 'product_link']]
train_data.head(3)

Unnamed: 0,product_id,product_name,category,actual_price,rating,rating_count,about_product,img_link,product_link
0,B07JW9H4J1,Wayona Nylon Braided USB to Lightning Fast Cha...,Computers&Accessories|Accessories&Peripherals|...,"₹1,099",4.2,24269,High Compatibility : Compatible With iPhone 12...,https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Wayona-Braided-WN3LG1-Sy...
1,B098NS6PVG,Ambrane Unbreakable 60W / 3A Fast Charging 1.5...,Computers&Accessories|Accessories&Peripherals|...,₹349,4.0,43994,"Compatible with all Type C enabled devices, be...",https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Ambrane-Unbreakable-Char...
2,B096MSW6CT,Sounce Fast Phone Charging Cable & Data Sync U...,Computers&Accessories|Accessories&Peripherals|...,"₹1,899",3.9,7928,【 Fast Charger& Data Sync】-With built-in safet...,https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Sounce-iPhone-Charging-C...


In [132]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1465 entries, 0 to 1464
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   product_id     1465 non-null   object
 1   product_name   1465 non-null   object
 2   category       1465 non-null   object
 3   actual_price   1465 non-null   object
 4   rating         1465 non-null   object
 5   rating_count   1465 non-null   object
 6   about_product  1465 non-null   object
 7   img_link       1465 non-null   object
 8   product_link   1465 non-null   object
dtypes: object(9)
memory usage: 103.1+ KB


In [120]:
#display the shape of the data
train_data.shape

(1465, 9)

In [122]:
#describe the data
train_data.describe().T

Unnamed: 0,count,unique,top,freq
product_id,1465,1351,B07JW9H4J1,3
product_name,1465,1337,"Fire-Boltt Ninja Call Pro Plus 1.83"" Smart Wat...",5
category,1465,211,Computers&Accessories|Accessories&Peripherals|...,233
actual_price,1465,449,₹999,120
rating,1465,28,4.1,244
rating_count,1463,1143,9378,9
about_product,1465,1293,[CHARGE & SYNC FUNCTION]- This cable comes wit...,6
img_link,1465,1412,https://m.media-amazon.com/images/I/413sCRKobN...,3
product_link,1465,1465,https://www.amazon.in/Wayona-Braided-WN3LG1-Sy...,1


# Data Cleaning

## Handling Missing values

In [124]:
#check for any null values
train_data.isnull().sum()

product_id       0
product_name     0
category         0
actual_price     0
rating           0
rating_count     2
about_product    0
img_link         0
product_link     0
dtype: int64

In [126]:
# Fill missing values in 'Product Rating' with a default value (e.g., 0)
train_data['rating_count'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['rating_count'].fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['rating_count'].fillna(0, inplace=True)


In [128]:
train_data.isnull().sum()

product_id       0
product_name     0
category         0
actual_price     0
rating           0
rating_count     0
about_product    0
img_link         0
product_link     0
dtype: int64

In [135]:
#check if duplicates have
train_data.duplicated().sum()

0

## Correct Data Types

In [170]:
#Product price should be float.
train_data['actual_price']= train_data['actual_price'].replace( {'\₹': '' , ',': ''}, regex=True).astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['actual_price']= train_data['actual_price'].replace( {'\₹': '' , ',': ''}, regex=True).astype(float)


In [172]:
#check all the unique values in the rating column before converting it to a float
sorted(df['rating'].unique(), reverse=False)

['2',
 '2.3',
 '2.6',
 '2.8',
 '2.9',
 '3',
 '3.0',
 '3.1',
 '3.2',
 '3.3',
 '3.4',
 '3.5',
 '3.6',
 '3.7',
 '3.8',
 '3.9',
 '4',
 '4.0',
 '4.1',
 '4.2',
 '4.3',
 '4.4',
 '4.5',
 '4.6',
 '4.7',
 '4.8',
 '5.0',
 '|']

In [174]:
#rating column has special character '|'  which needs to be removed or replaced

#check which row has special character
special_character = train_data[train_data['rating'] == '|']

# replace the special character with NaN
train_data['rating'] = train_data['rating'].replace('|', None)

# Convert the 'rating' column to numeric (this will convert invalid entries to NaN)
train_data['rating'] = pd.to_numeric(train_data['rating'], errors='coerce')

#Fill missing values (NaN) with the average rating
avg_rating = train_data['rating'].mean()
train_data['rating'] = train_data['rating'].fillna(avg_rating)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['rating'] = train_data['rating'].replace('|', None)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['rating'] = pd.to_numeric(train_data['rating'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['rating'] = train_data['rating'].fillna(avg_rating)


In [178]:
#convert rating_count colmn to float
train_data['rating_count'] = pd.to_numeric(train_data['rating_count'], errors='coerce')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['rating_count'] = pd.to_numeric(train_data['rating_count'], errors='coerce')


In [180]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1465 entries, 0 to 1464
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   product_id     1465 non-null   object 
 1   product_name   1465 non-null   object 
 2   category       1465 non-null   object 
 3   actual_price   1465 non-null   float64
 4   rating         1465 non-null   float64
 5   rating_count   328 non-null    float64
 6   about_product  1465 non-null   object 
 7   img_link       1465 non-null   object 
 8   product_link   1465 non-null   object 
dtypes: float64(3), object(6)
memory usage: 103.1+ KB
