# Data Analysis

In [None]:
import numpy as np
import pandas as pd
import kagglehub
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Download (or reuse the cached copy of) the Kaggle dataset and read the
# ratings CSV. The file has no header row, so columns get integer labels.
path = kagglehub.dataset_download("saurav9786/amazon-product-reviews")
csv_path = path + '/ratings_Electronics (1).csv'
products = pd.read_csv(csv_path, header=None)

Downloading from https://www.kaggle.com/api/v1/datasets/download/saurav9786/amazon-product-reviews?dataset_version_number=1...


100%|██████████| 109M/109M [00:05<00:00, 20.0MB/s]

Extracting files...





In [None]:
# Preview the loaded ratings. On a fresh top-to-bottom run the columns are still
# the integer labels 0-3 here, because the CSV is read with header=None and the
# rename/drop cells come later.
products

Unnamed: 0,user_id,product_id,rating
0,AKM1MP6P0OYPR,0132793040,5.0
1,A2CX7LUOHB2NDG,0321732944,5.0
2,A2NWSAGRHCP8N5,0439886341,1.0
3,A2WNBOD3WNDNKT,0439886341,3.0
4,A1GI0U4ZRJA8WN,0439886341,1.0
...,...,...,...
7824477,A2YZI3C9MOHC0L,BT008UKTMW,5.0
7824478,A322MDK0M89RHN,BT008UKTMW,5.0
7824479,A1MH90R0ADMIK0,BT008UKTMW,4.0
7824480,A10M2KEFPEQDHN,BT008UKTMW,4.0


In [None]:
# Give the positional columns of the header-less CSV readable names.
column_names = {0: 'user_id', 1: 'product_id', 2: 'rating', 3: 'timestamp'}
products = products.rename(columns=column_names)

In [None]:
# The timestamp is not used by any of the recommenders below.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# is already gone no longer raises KeyError.
products = products.drop(columns=['timestamp'], errors='ignore')

In [None]:
# Keep only ratings from users with at least `min_users_rating` ratings AND
# for products with at least `min_ratings_product` ratings. Both counts are
# taken on the data as it is before this filter is applied.
min_users_rating = 40
min_ratings_product = 10
user_counts = products["user_id"].value_counts()
product_counts = products["product_id"].value_counts()

active_users = user_counts[user_counts >= min_users_rating].index
popular_products = product_counts[product_counts >= min_ratings_product].index

from_active_user = products["user_id"].isin(active_users)
for_popular_product = products["product_id"].isin(popular_products)
products = products[from_active_user & for_popular_product].reset_index(drop=True)

In [None]:
# Inspect the ratings that remain after the activity filter.
products

Unnamed: 0,user_id,product_id,rating
0,A3BY5KCNQZXV5U,0594451647,5.0
1,AT09WGFUM934H,0594481813,3.0
2,A3BMUBUC1N77U8,0972683275,4.0
3,A6J8D9V5S9MBE,0972683275,5.0
4,A3CLWR1UUZT6TG,0972683275,5.0
...,...,...,...
153120,A26VF18X91983P,B00L3YHF6O,5.0
153121,A2XRMQA6PJ5ZJ8,B00L3YHF6O,5.0
153122,A3A4ZAIBQWKOZS,B00L3YHF6O,5.0
153123,AOVTLYTHVDNUX,B00L3YHF6O,5.0


In [None]:
# Number of ratings per rating value.
products.rating.value_counts()

Unnamed: 0_level_0,count
rating,Unnamed: 1_level_1
5.0,87201
4.0,38518
3.0,14654
2.0,6523
1.0,6229


In [None]:
# Count missing values per column.
products.isna().sum()

Unnamed: 0,0
user_id,0
product_id,0
rating,0


# Rank-Based Recommendation System

In [None]:
# Per-product summary: mean rating and number of ratings.
rating_by_product = products.groupby('product_id')['rating']

average_rating = rating_by_product.mean()
count_rating = rating_by_product.count()

ratings = pd.DataFrame(dict(avg_rating=average_rating, rating_count=count_rating))

In [None]:
# Top products by average rating among those with more than `interactions`
# ratings; the rating count breaks ties in the average.
interactions = 50
no_recommendations = 7
(
    ratings.loc[ratings['rating_count'] > interactions]
    .sort_values(by=['avg_rating', 'rating_count'], ascending=False)
    .head(no_recommendations)
)

Unnamed: 0_level_0,avg_rating,rating_count
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1
B0052SCU8U,4.957143,70
B001TH7GUU,4.882883,111
B001TH7T2U,4.87931,58
B0000BZL1P,4.875,72
B00BQ4F9ZA,4.852459,61
B003ES5ZUU,4.847059,255
B0019EHU8G,4.842857,140


# Collaborative Filtering Recommendation System

In [None]:
from sklearn.neighbors import NearestNeighbors

# Wide user x product table of ratings. Pairs without a rating are filled
# with 0, so downstream code must read 0 as "not rated", not as a score.
user_item_matrix = (
    products
    .pivot(index='user_id', columns='product_id', values='rating')
    .fillna(0)
)


In [None]:
# Preview the user x product matrix; 0.0 marks a pair with no rating (from fillna(0)).
user_item_matrix

product_id,0594451647,0594481813,0972683275,1400501466,1400501520,1400501776,1400532620,1400532655,140053271X,1400599997,...,B00KSLCU72,B00KVNY2KA,B00KWHMR6G,B00KWMNDDM,B00KYMCJF8,B00L21HC7A,B00L2442H0,B00L26YDA4,B00L3YHF6O,B00L403O94
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A100UD67AHFODS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A100WO06OQR8BQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A105S56ODHGJEK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A105TOJ6LTVMBG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A108XABRHAA9E7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AZOK5STV85FBJ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AZQGJ5CEAJGXB,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AZV2U6GU5QA6C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AZYJE40XW6MFG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# User-to-user neighbour model: cosine distance between rows of the
# user x product matrix, 10 neighbours per query by default.
knn = NearestNeighbors(
    n_neighbors=10,
    metric='cosine',
    algorithm='auto',
)
knn.fit(user_item_matrix)

In [None]:
def recommend_products(user_id, no_recommendations=2, n_neighbors=10):
  """Recommend products rated by the user's nearest neighbours.

  Neighbours are looked up with the fitted ``knn`` model on the user's row of
  ``user_item_matrix``. Candidates are all products those neighbours rated
  that ``user_id`` has not rated. They are ranked by the number of neighbours
  who rated the product, then by the neighbours' mean rating, then by product
  id, so the result is deterministic across runs.

  Args:
    user_id: Row label of the user in ``user_item_matrix``.
    no_recommendations: Maximum number of product ids to return.
    n_neighbors: Number of neighbours requested from ``knn``. The query user
      counts as one of them, so at most ``n_neighbors - 1`` other users
      contribute candidates.

  Returns:
    List of product ids, best candidate first. It may hold fewer than
    ``no_recommendations`` items.

  Raises:
    KeyError: If ``user_id`` is not a row of ``user_item_matrix``.
  """
  user_vector = user_item_matrix.loc[user_id].values.reshape(1, -1)
  # kneighbors cannot return more neighbours than there are fitted rows.
  n_query = min(n_neighbors, len(user_item_matrix))
  _, indices = knn.kneighbors(user_vector, n_neighbors=n_query)

  # Exclude the query user by label instead of assuming it is always the
  # first hit: with tied distances it may appear at another position.
  neighbour_ids = user_item_matrix.index[indices[0]]
  similar_users = neighbour_ids[neighbour_ids != user_id][:max(n_query - 1, 0)]

  already_rated = products.loc[products["user_id"] == user_id, "product_id"]
  from_neighbours = products["user_id"].isin(similar_users)
  unseen = ~products["product_id"].isin(already_rated)
  candidates = products[from_neighbours & unseen]

  # Rank instead of slicing an unordered set, whose iteration order for
  # strings changes between interpreter sessions.
  ranked = (
      candidates.groupby("product_id")["rating"]
      .agg(["count", "mean"])
      .reset_index()
      .sort_values(["count", "mean", "product_id"], ascending=[False, False, True])
  )
  return ranked["product_id"].head(no_recommendations).tolist()

recommend_products("A100WO06OQR8BQ")

['B00763WNAO', 'B009NUK6S4']

In [None]:
def view_neighbors_rating(user_id, recommended_products, k=5):
  """Show how the user's nearest neighbours rated the given products.

  Args:
    user_id: Row label of the user in ``user_item_matrix``.
    recommended_products: Iterable of product ids to inspect.
    k: Number of neighbours requested from ``knn``. The query user counts as
      one of them, so at most ``k - 1`` other users are reported.

  Returns:
    Dict keyed by product id. Each value is a ``{neighbour_id: rating}`` dict
    holding only the neighbours who actually rated the product, or the
    string ``"No ratings from neighbours"`` when none of them did (or the
    product is not a column of ``user_item_matrix``).

  Raises:
    KeyError: If ``user_id`` is not a row of ``user_item_matrix``.
  """
  no_ratings = "No ratings from neighbours"

  user_vector = user_item_matrix.loc[user_id].values.reshape(1, -1)
  # Honour `k` (previously ignored) but never ask for more rows than exist.
  n_query = min(k, len(user_item_matrix))
  _, indices = knn.kneighbors(user_vector, n_neighbors=n_query)

  # Exclude the query user by label; it is not guaranteed to be the first hit
  # when distances tie.
  neighbour_ids = user_item_matrix.index[indices[0]]
  neighbour_ids = neighbour_ids[neighbour_ids != user_id][:max(n_query - 1, 0)]
  neighbor_rating = user_item_matrix.loc[neighbour_ids]

  neighbors_ratings = {}

  for product_id in recommended_products:
    if product_id not in neighbor_rating.columns:
      neighbors_ratings[product_id] = no_ratings
      continue

    # The matrix stores 0 for "not rated" (it was filled with fillna(0)), so
    # dropna() would keep every row; keep only real ratings instead.
    column = neighbor_rating[product_id]
    rating_for_product = column[column > 0]

    if not rating_for_product.empty:
      neighbors_ratings[product_id] = rating_for_product.to_dict()
    else:
      neighbors_ratings[product_id] = no_ratings

  return neighbors_ratings


In [None]:
# Inspect how the neighbours of one user rated two candidate products.
view_neighbors_rating("A100WO06OQR8BQ", ['B00763WNAO', 'B009NUK6S4'], k=10)

{'B00763WNAO': {'A3963R7EPE3A7E': 0.0,
  'A2Y3WWPUKIJ59I': 0.0,
  'A30UP2KKD5IQEP': 0.0,
  'A1CST2WUA32GP0': 0.0,
  'A1TP2RW7KDI5AZ': 4.0,
  'A298GL2D0BHGKZ': 0.0,
  'A15XI2BEGGFEOW': 0.0,
  'A2L4ZGN7GZJ95T': 0.0,
  'A3D822N1K2IAQD': 0.0},
 'B009NUK6S4': {'A3963R7EPE3A7E': 0.0,
  'A2Y3WWPUKIJ59I': 0.0,
  'A30UP2KKD5IQEP': 0.0,
  'A1CST2WUA32GP0': 0.0,
  'A1TP2RW7KDI5AZ': 0.0,
  'A298GL2D0BHGKZ': 0.0,
  'A15XI2BEGGFEOW': 0.0,
  'A2L4ZGN7GZJ95T': 0.0,
  'A3D822N1K2IAQD': 4.0}}

# Matrix-Factorization-Based Method

In [None]:
from sklearn.decomposition import TruncatedSVD

# Factorize the user x product matrix into 7 latent components.
# random_state is fixed because the default solver is randomized; without it
# the factors (and every score derived from them) change from run to run.
svd = TruncatedSVD(n_components=7, random_state=42)
matrix_factorized = svd.fit_transform(user_item_matrix)

# Low-rank reconstruction: (users x 7) . (7 x products) -> users x products.
# Its entries are used below as predicted scores for unrated products.
reconstructed_matrix = np.dot(matrix_factorized, svd.components_)

In [None]:
# Largest reconstructed score in the first row (first user) of the matrix.
reconstructed_matrix[0].max()

np.float64(1.470485540765645)

In [None]:
# Column position of the largest reconstructed score for the first user.
reconstructed_matrix[0].argmax()

np.int64(20945)

In [None]:
# Shape of the per-user latent factors returned by fit_transform.
matrix_factorized.shape

(2644, 7)

In [None]:
# Shape of the component matrix learned by the SVD (components x products).
svd.components_.shape

(7, 41640)

In [None]:
# Shape of the reconstruction: one row per row of matrix_factorized and one
# column per column of svd.components_.
reconstructed_matrix.shape

(2644, 41640)

In [None]:
def recommend_products_svd(user_id, no_recommendations=3):
  """Recommend unrated products with the highest SVD-reconstructed score.

  Args:
    user_id: Row label of the user in ``user_item_matrix``.
    no_recommendations: Maximum number of recommendations to return.

  Returns:
    List of ``(product_id, predicted_score)`` tuples, highest score first.
    Only products the user has not rated (matrix entry not greater than 0)
    are considered. Ties keep the order of the unrated product index.

  Raises:
    KeyError: If ``user_id`` is not a row of ``user_item_matrix``.
  """
  # Row of reconstructed scores for this user.
  user_idx = user_item_matrix.index.get_loc(user_id)
  predicted_ratings = reconstructed_matrix[user_idx]

  # Products the user already rated (0 means "not rated" in the matrix).
  user_ratings = user_item_matrix.loc[user_id]
  rated_products = user_ratings[user_ratings > 0].index

  # Products the user has not rated yet.
  unrated_products = user_item_matrix.columns.difference(rated_products)

  # One vectorised position lookup instead of a get_loc call per product.
  column_positions = user_item_matrix.columns.get_indexer(unrated_products)
  unrated_scores = predicted_ratings[column_positions]

  # A stable sort on the negated scores gives descending order while keeping
  # tied products in their original order, like list.sort(reverse=True).
  best_first = np.argsort(-unrated_scores, kind="stable")[:no_recommendations]

  return [(unrated_products[i], unrated_scores[i]) for i in best_first]


recommend_products_svd("A100WO06OQR8BQ", no_recommendations=3)


[('B000N99BBC', np.float64(1.5432904658573514)),
 ('B004CLYEDC', np.float64(1.5217884623883016)),
 ('B00829TIEK', np.float64(1.258321367879577))]

In [None]:
# Average rating and rating count for product B000N99BBC.
ratings[ratings.index == 'B000N99BBC']

Unnamed: 0_level_0,avg_rating,rating_count
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1
B000N99BBC,4.768116,207


In [None]:
# Average rating and rating count for product B004CLYEDC.
ratings[ratings.index == 'B004CLYEDC']

Unnamed: 0_level_0,avg_rating,rating_count
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1
B004CLYEDC,4.683824,136


In [None]:
# Average rating and rating count for product B00829TIEK.
ratings[ratings.index == 'B00829TIEK']

Unnamed: 0_level_0,avg_rating,rating_count
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1
B00829TIEK,4.407407,189
