# Amazon - Ratings (Beauty Products)
__Author__ : Mohammad Rouintan , 400222042

__Course__ : Undergraduate Data Science Course

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.decomposition import TruncatedSVD
from surprise import Dataset,SVD , Reader, model_selection
from surprise.model_selection import cross_validate, KFold

In [2]:
ratings = pd.read_csv('/kaggle/input/amazon-ratings/ratings_Beauty.csv')
ratings.dropna(inplace=True)
ratings.drop_duplicates(inplace=True)
ratings.shape

(2023070, 4)

In [3]:
ratings.head()

Unnamed: 0,UserId,ProductId,Rating,Timestamp
0,A39HTATAQ9V7YF,205616461,5.0,1369699200
1,A3JM6GV9MNOF9X,558925278,3.0,1355443200
2,A1Z513UWSAAO0F,558925278,5.0,1404691200
3,A1WMRR494NWEWV,733001998,4.0,1382572800
4,A3IAAVS479H7M7,737104473,1.0,1274227200


In [4]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2023070 entries, 0 to 2023069
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   UserId     object 
 1   ProductId  object 
 2   Rating     float64
 3   Timestamp  int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 61.7+ MB


In [5]:
ratings.columns

Index(['UserId', 'ProductId', 'Rating', 'Timestamp'], dtype='object')

In [6]:
ratings.nunique()

UserId       1210271
ProductId     249274
Rating             5
Timestamp       4231
dtype: int64

## Recommendation System
### Model-based collaborative filtering system


In [7]:
ratings_sample = ratings.sample(25_000, random_state=42)

In [8]:
ratings_matrix = ratings_sample.pivot_table(values='Rating', index='UserId', columns='ProductId', fill_value=0)
ratings_matrix.head()

ProductId,1304511081,3227001381,7535842801,832900094X,9571044822,9602617918,9746427962,9748776093,9759091062,9788071198,...,B00KWFDBKE,B00L0C529Q,B00L2KVF9W,B00L31Z15E,B00L3K91OW,B00L5JHZJO,B00L5KTZ0K,B00LBEXC3Q,B00LJEACWC,B00LLPT4HI
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A0010876CNE3ILIM9HV0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A004205218STRNUW6PPPA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A00473363TJ8YSZ3YAGG9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A00540051HYJDHVJB2FRO,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A0109071NOSV8FSC7VNY,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
ratings_matrix.shape

(24442, 17434)

In [10]:
X = ratings_matrix.T
X.head()

UserId,A0010876CNE3ILIM9HV0,A004205218STRNUW6PPPA,A00473363TJ8YSZ3YAGG9,A00540051HYJDHVJB2FRO,A0109071NOSV8FSC7VNY,A0116899HIQEDWSBJJG9,A01198201H0E3GHV2Z17I,A0126073315N22OLC73MZ,A01288351ESHZ2KNAXBJ7,A0139874ED7NYUB55TSR,...,AZXC4VH7CHFXR,AZXJ9RA2EQK1E,AZYLNZLSXSK9D,AZYM7TVEU0NM,AZYP4WUX6VHRH,AZYPAWSYSCISH,AZYUPMWCE8JBV,AZYZPOORIDNLR,AZZ24BJWAFFES,AZZK6830KW86T
ProductId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1304511081,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3227001381,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7535842801,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
832900094X,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9571044822,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
X.shape

(17434, 24442)

### Decomposing the Matrix

In [12]:
SVD = TruncatedSVD(n_components=20)
decomposed_matrix = SVD.fit_transform(X)
decomposed_matrix.shape

(17434, 20)

### Correlation Matrix

In [13]:
correlation_matrix = np.corrcoef(decomposed_matrix)
correlation_matrix.shape

(17434, 17434)

B0000AOWL4Assuming the customer buys Product ID # B0000AOWL4 (randomly chosen)

In [14]:
X.index[250]

'B0000AOWL4'

In [15]:
i = "B0000AOWL4"

product_names = list(X.index)
product_ID = product_names.index(i)
product_ID

250

In [16]:
correlation_product_ID = correlation_matrix[product_ID]
correlation_product_ID.shape

(17434,)

### Recommending top 20 highly correlated products in sequence

In [17]:
Recommended = list(X.index[correlation_product_ID > 0.90])
Recommended.remove(i) 
Recommended[0:20]

['B00006FE30',
 'B00017XN7E',
 'B0002JKQ8K',
 'B000E3FHJM',
 'B000GCUU98',
 'B000OG50Z0',
 'B000RGYIDM',
 'B000V5BXO6',
 'B0017QT5UM',
 'B001E7688Q',
 'B001MJT6IA',
 'B004GL7VE2',
 'B004OB2C3Y',
 'B004XJPVAS',
 'B004Y0V2C2',
 'B004ZEMHDA',
 'B0075HPJEI',
 'B008FIH4IK',
 'B008JHQ8HU',
 'B00BCBSJG8']