### Context

This dataset is from ecommerce website http://ecom.uelstore.com/
And community can use this dataset to build the recommendation system.

Volunteers are mainly students of the University of Economics - Law, Vietnam National University, Ho Chi Minh City, using their accounts to register and conduct ratings.

### Content
We provide 3 file datasets with JSon format
-678 users (id, nickname)
-732 products of all kinds (id, name)
-130754 product - reviews (rating, date)


### Inspiration

This is use to build recommendation

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('input/dataset/'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

input/dataset/customers.json
input/dataset/products.json
input/dataset/ratings.json


# Read Customers

In [3]:
customers=pd.read_json("input/dataset/customers.json")

In [4]:
customers.size

1356

In [5]:
customers

Unnamed: 0,Id,NickName
0,103603,1000kgthanh
1,103760,999999999ok
2,103829,ac7ive
3,1,admin
4,103839,ahkk.nguyen
...,...,...
673,103904,yenxaome
674,103610,yoneteru
675,103718,young.che
676,23251,zeatop939


# Read Products

In [6]:
products=pd.read_json("input/dataset/products.json")

In [7]:
products

Unnamed: 0,Id,Name,UnitPrice
0,1,Build your own computer,1200.0
1,2,Digital Storm VANQUISH 3 Custom Performance PC,1259.0
2,3,Lenovo IdeaCentre 600 All-in-One PC,500.0
3,4,Apple MacBook Pro 13-inch,1800.0
4,5,Asus N551JK-XO076H Laptop,1500.0
...,...,...,...
686,687,Bird Box,1.0
687,688,Snowpiercer,2.0
688,689,Edge of Tomorrow,1.0
689,690,Ponyo,3.0


In [8]:
products.size

2073

# Read Ratings

In [9]:
ratings=pd.read_json("input/dataset/ratings.json")

In [10]:
ratings

Unnamed: 0,CustomerID,ProductID,Rate,CreateDate
0,103416,619,1,2018/01/01 01:36:30
1,103654,411,1,2018/01/01 01:36:35
2,103954,298,3,2018/01/01 01:36:38
3,103672,361,5,2018/01/01 01:37:15
4,103960,536,5,2018/01/01 02:36:25
...,...,...,...,...
130749,103907,501,1,2022/03/16 22:25:10
130750,103907,200,1,2022/03/16 22:49:28
130751,103907,184,1,2022/03/16 22:53:35
130752,103907,211,1,2022/03/16 23:14:47


We don't care CreateDate attribute, so we remove this column for ratings

In [11]:
ratings.drop('CreateDate', inplace=True, axis=1)

In [12]:
ratings

Unnamed: 0,CustomerID,ProductID,Rate
0,103416,619,1
1,103654,411,1
2,103954,298,3
3,103672,361,5
4,103960,536,5
...,...,...,...
130749,103907,501,1
130750,103907,200,1
130751,103907,184,1
130752,103907,211,1


In [13]:
class MatrixFactorization(object):
    def __init__(self, Y,customers,products, K, lam = 0.1, Xinit = None, Winit = None, learning_rate = 0.5, max_iter = 1000, print_every = 100):
        self.Y = Y # represents the utility matrix
        self.K = K
        self.lam = lam # regularization parameter
        self.learning_rate = learning_rate # for gradient descent
        self.max_iter = max_iter # maximum number of iterations
        self.print_every = print_every # print loss after each a few iters
        self.customers=customers
        self.products=products
        self.n_users = int(np.max(Y[:, 0])) + 1
        self.n_items = int(np.max(Y[:, 1])) + 1
        #self.n_users = customers.size
        #self.n_items = products.size
        self.n_ratings = Y.shape[0] # number of known ratings
        self.X = np.random.randn(self.n_items, K) if Xinit is None\
        else Xinit
        self.W = np.random.randn(K, self.n_users) if Winit is None\
        else Winit
        self.b = np.random.randn(self.n_items) # item biases
        self.d = np.random.randn(self.n_users) # user biases
    def loss(self):
        L = 0
        for i in range(self.n_ratings):
            # user_id, item_id, rating
            n, m, rating = int(self.Y[i,0]), int(self.Y[i,1]), self.Y[i,2]
            L += 0.5*(self.X[m].dot(self.W[:, n])\
            + self.b[m] + self.d[n] - rating)**2
        L /= self.n_ratings
        # regularization, don’t ever forget this
        return L + 0.5*self.lam*(np.sum(self.X**2) + np.sum(self.W**2))
    def updateXb(self):
        products = np.array(self.products["Id"])
        for m in range(0,products.size):
            pId = products[m]
            # get all users who rated item m and corresponding ratings
            ids = np.where(self.Y[:, 1] == pId)[0] # row indices of items m
            if ids.size>0:
                user_ids, ratings=self.Y[ids, 0].astype(np.int32),self.Y[ids, 2]
                Wm, dm = self.W[:, user_ids], self.d[user_ids]
                for i in range(30): # 30 iteration for each sub problem
                    xm = self.X[m]
                    error = xm.dot(Wm) + self.b[m] + dm - ratings
                    grad_xm = error.dot(Wm.T)/self.n_ratings + self.lam*xm
                    grad_bm = np.sum(error)/self.n_ratings
                    # gradient descent
                    self.X[m] -= self.learning_rate*grad_xm.reshape(-1).astype('float64')
                    self.b[m] -= self.learning_rate*grad_bm
    def updateWd(self): # and d
        customers = np.array(self.customers["Id"])
        for n in range(0,customers.size):
            custId=customers[n]
            # get all items rated by user n, and the corresponding ratings
            ids = np.where(self.Y[:,0] == custId)[0] #indexes of items rated by n
            if ids.size>0:
                item_ids,ratings=self.Y[ids, 1].astype(np.int32), self.Y[ids, 2]
                Xn, bn = self.X[item_ids], self.b[item_ids]
                for i in range(30): # 30 iteration for each sub problem
                    wn = self.W[:, n]
                    error = Xn.dot(wn) + bn + self.d[n] - ratings
                    grad_wn = Xn.T.dot(error)/self.n_ratings + self.lam*wn
                    grad_dn = np.sum(error)/self.n_ratings
                    grad_dn=grad_dn
                    # gradient descent
                    self.W[:, n] -= self.learning_rate*grad_wn.reshape(-1).astype('float64')
                    self.d[n] -= self.learning_rate*grad_dn
    def fit(self):
        for it in range(self.max_iter):
            self.updateWd()
            self.updateXb()
            if (it + 1) % self.print_every == 0:
                rmse_train = self.evaluate_RMSE(self.Y)
                print("iter = %d, loss = %.4f, RMSE train = %.4f"%(it + 1,
                self.loss(), rmse_train))
    def predict(self, u, i):
        """
        predict the rating of user u for item i
        """
        try:
            u, i = int(u), int(i)
            pred = self.X[i, :].dot(self.W[:, u]) + self.b[i] + self.d[u]
            return max(0, min(5, pred))  # 5-scale in Ecommerce
        except:
            return  0        
    def evaluate_RMSE(self, rate_test):
        n_tests = rate_test.shape[0] # number of test
        SE = 0 # squared error
        for n in range(n_tests):
            pred = self.predict(rate_test[n, 0], rate_test[n, 1])
            SE += (pred - rate_test[n, 2])**2
        RMSE = np.sqrt(SE/n_tests)
        return RMSE

In [14]:
rate_train =ratings[0:129000]
rate_train = np.array(rate_train)
rate_test = ratings[129001:]
rate_test= np.array(rate_test)
print('Number of traing rates:', rate_train.shape[0])
print('Number of test rates:', rate_test.shape[0])

Number of traing rates: 129000
Number of test rates: 1753


In [15]:
# indices start from 0
rate_train[:, :2] -= 1
rate_test[:, :2] -= 1
mf = MatrixFactorization(rate_train,customers,products,K = 50, lam = .01, print_every = 5, learning_rate = 50,max_iter = 30)
mf.fit()
# evaluate on test data
RMSE = mf.evaluate_RMSE(rate_test)
print("\nMatrix Factorization CF, RMSE = %.4f" %RMSE)

iter = 5, loss = 25919.5239, RMSE train = 1.7961
iter = 10, loss = 25919.5239, RMSE train = 1.7961
iter = 15, loss = 25919.5239, RMSE train = 1.7961
iter = 20, loss = 25919.5239, RMSE train = 1.7961
iter = 25, loss = 25919.5239, RMSE train = 1.7961
iter = 30, loss = 25919.5239, RMSE train = 1.7961

Matrix Factorization CF, RMSE = 1.8589


In [16]:
rate_test

array([[103400,    462,      4],
       [103400,    288,      4],
       [103400,    475,      4],
       ...,
       [103906,    183,      1],
       [103906,    210,      1],
       [103906,    165,      1]], dtype=int64)

# Test Recommendation

Test 10 Customer

In [17]:
expected_score=3.8
print("Expected Score =",expected_score)
for c in customers.values[0:10]:
    customerId=c[0]
    customerName=c[1]
    print("Customer [",customerId,customerName,"], recommendation products:")
    for p in products.values:
        productId=p[0]
        productName=p[1]
        result=mf.predict(customerId,productId)
        if result>=expected_score:
            print("\t Recommend Product [",productName, "] Score=",result)

Expected Score = 3.8
Customer [ 103603 1000kgthanh ], recommendation products:
Customer [ 103760 999999999ok ], recommendation products:
Customer [ 103829 ac7ive ], recommendation products:
Customer [ 1 admin ], recommendation products:
Customer [ 103839 ahkk.nguyen ], recommendation products:
	 Recommend Product [ Ponyo ] Score= 5
Customer [ 103981 akyshin ], recommendation products:
	 Recommend Product [ Build your own computer ] Score= 4.054357987888604
	 Recommend Product [ Digital Storm VANQUISH 3 Custom Performance PC ] Score= 4.234840612785966
	 Recommend Product [ Lenovo IdeaCentre 600 All-in-One PC ] Score= 4.164122375567865
	 Recommend Product [ Apple MacBook Pro 13-inch ] Score= 4.065152073037899
	 Recommend Product [ Asus N551JK-XO076H Laptop ] Score= 4.0572496148525365
	 Recommend Product [ Samsung Series 9 NP900X4C Premium Ultrabook ] Score= 3.917828051935049
	 Recommend Product [ HP Spectre XT Pro UltraBook ] Score= 4.242279949254149
	 Recommend Product [ HP Envy 6-1180c

# Comparing Other Models

In [18]:
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import NormalPredictor
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNWithZScore
from surprise import KNNBaseline
from surprise import SVD
from surprise import BaselineOnly
from surprise import SVDpp
from surprise import NMF
from surprise import SlopeOne
from surprise import CoClustering
from surprise.accuracy import rmse
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV

# Ratings Distribution

In [19]:
from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)

data = ratings['Rate'].value_counts().sort_index(ascending=False)
trace = go.Bar(x = data.index,
               text = ['{:.1f} %'.format(val) for val in (data.values / ratings.shape[0] * 100)],
               textposition = 'auto',
               textfont = dict(color = '#000000'),
               y = data.values,
               )
# Create layout
layout = dict(title = 'Distribution Of {} ratings'.format(ratings.shape[0]),
              xaxis = dict(title = 'Rating'),
              yaxis = dict(title = 'Count'),
              width=500,
              height=500,
              autosize=False,
             )
# Create plot
fig = go.Figure(data=[trace], layout=layout)
fig.show()

In [20]:
data

Rate
5    35512
4    30458
3    15838
2    21070
1    27876
Name: count, dtype: int64

In [21]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['CustomerID','ProductID','Rate']], reader)

In [None]:
data

In [22]:
from surprise import Dataset, KNNBaseline, Reader

reader = Reader(rating_scale=(1, 5))

rate_train =ratings[0:104603]
#rate_train = np.array(rate_train)
rate_test = ratings[104604:]
#rate_test= np.array(rate_test)

train_Dataset = Dataset.load_from_df(rate_train[['CustomerID','ProductID','Rate']], reader)
valid_Dataset = Dataset.load_from_df(rate_test[['CustomerID','ProductID','Rate']], reader)

In [23]:
rate_train

Unnamed: 0,CustomerID,ProductID,Rate
0,103416,619,1
1,103654,411,1
2,103954,298,3
3,103672,361,5
4,103960,536,5
...,...,...,...
104598,103391,498,4
104599,103883,666,5
104600,103749,372,3
104601,103394,153,3


In [24]:
rate_test

Unnamed: 0,CustomerID,ProductID,Rate
104604,103415,266,3
104605,103687,419,4
104606,103730,603,2
104607,103862,372,2
104608,14902,166,1
...,...,...,...
130749,103907,501,1
130750,103907,200,1
130751,103907,184,1
130752,103907,211,1


In [25]:
train_Dataset = train_Dataset.build_full_trainset()

In [26]:
algo = KNNBaseline()
algo.fit(train_Dataset)


Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x1bc74832500>

In [27]:
testset = [valid_Dataset.df.iloc[i].to_list() for i in range(len(valid_Dataset.df))]

In [28]:
predictions=algo.test(testset)

In [29]:
acrmse=accuracy.rmse(predictions)
print(acrmse)

acmse=accuracy.mse(predictions)
print(acmse)

acmae=accuracy.mae(predictions)
print(acmae)

RMSE: 1.0753
1.0753009768574298
MSE: 1.1563
1.1562721908305427
MAE:  0.7782
0.7781571143764785


In [30]:
prediction=algo.test([[103416,619,1]])

In [31]:
prediction

[Prediction(uid=103416, iid=619, r_ui=1, est=2.180316384116525, details={'actual_k': 40, 'was_impossible': False})]

In [32]:
acrmse=accuracy.rmse(prediction)
print(acrmse)

RMSE: 1.1803
1.1803163841165252


In [33]:
# Make sure these two are in the training focus , And the real value is 4
uid = 103416 # user 
iid = 619 # goods 
# Get forecasts for specific users and items 
pred = algo.predict(uid, iid, r_ui=1, verbose=True)

user: 103416     item: 619        r_ui = 1.00   est = 2.18   {'actual_k': 40, 'was_impossible': False}


In [34]:
# Make sure these two are in the training focus , And the real value is 4
uid = 103954 # user 
iid = 298 # goods 
# Get forecasts for specific users and items 
pred = algo.predict(uid, iid, r_ui=3, verbose=True)

user: 103954     item: 298        r_ui = 3.00   est = 3.26   {'actual_k': 40, 'was_impossible': False}


https://surprise.readthedocs.io/en/stable/accuracy.html

from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import NormalPredictor
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNWithZScore
from surprise import KNNBaseline
from surprise import SVD
from surprise import BaselineOnly
from surprise import SVDpp
from surprise import NMF
from surprise import SlopeOne
from surprise import CoClustering
from surprise.accuracy import rmse
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV


reader = Reader(rating_scale=(1, 5))

rate_train =ratings[0:129000]

rate_test = ratings[129001:]


train_Dataset = Dataset.load_from_df(rate_train[['CustomerID','ProductID','Rate']], reader)
valid_Dataset = Dataset.load_from_df(rate_test[['CustomerID','ProductID','Rate']], reader)

train_Dataset = train_Dataset.build_full_trainset()

testset = [valid_Dataset.df.iloc[i].to_list() for i in range(len(valid_Dataset.df))]

benchmark = []

algorithms = [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]

print ("Attempting: ", str(algorithms), '\n\n\n')

for algorithm in algorithms:
    print("Starting: " ,str(algorithm))
    
    algorithm.fit(train_Dataset)
    
    predictions=algorithm.test(testset)
    
    acrmse=accuracy.rmse(predictions)
    
    acmse=accuracy.mse(predictions)
    
    acmae=accuracy.mae(predictions)
    
    metrics=[str(algorithm).split(' ')[0].split('.')[-1],acrmse,acmse,acmae]
    
    benchmark.append(metrics)   

print ('\n\tDONE\n')

benchmark

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
report=pd.DataFrame(benchmark,columns=["Algorithm","RMSE","MSE","MAE"])

report.set_index("Algorithm")

# Comparing all

In [36]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import json

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import NormalPredictor
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNWithZScore
from surprise import KNNBaseline
from surprise import SVD
from surprise import BaselineOnly
from surprise import SVDpp
from surprise import NMF
from surprise import SlopeOne
from surprise import CoClustering
from surprise.accuracy import rmse
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV

arrTrainFile=["input/trainset/Rating_0.1_117671_train.json",
          "input/trainset/Rating_0.2_104667_train.json",
          "input/trainset/Rating_0.3_91601_train.json",
          "input/trainset/Rating_0.4_78326_train.json",
          "input/trainset/Rating_0.5_65268_train.json",
          "input/trainset/Rating_0.6_52178_train.json",
          "input/trainset/Rating_0.7_39146_train.json",
          "input/trainset/Rating_0.8_26187_train.json",
          "input/trainset/Rating_0.9_12940_train.json"
         ]

arrTestFile=["input/trainset/Rating_0.1_13083_test.json",
          "input/trainset/Rating_0.2_26087_test.json",
          "input/trainset/Rating_0.3_39153_test.json",
          "input/trainset/Rating_0.4_52428_test.json",
          "input/trainset/Rating_0.5_65486_test.json",
          "input/trainset/Rating_0.6_78576_test.json",
          "input/trainset/Rating_0.7_91608_test.json",
          "input/trainset/Rating_0.8_104567_test.json",
          "input/trainset/Rating_0.9_117814_test.json"
         ]

reader = Reader(rating_scale=(1, 5))

nFile=len(arrTestFile)
for i in range(0,nFile):
    trainFile=arrTrainFile[i]
    testFile=arrTestFile[i]
    
    with open(trainFile, encoding='utf-8-sig') as f:
        data = json.load(f)
    rate_train =pd.DataFrame(data)
    
    with open(testFile, encoding='utf-8-sig') as f:
        data = json.load(f)
    rate_test =pd.DataFrame(data)


    train_Dataset = Dataset.load_from_df(rate_train[['CustomerID','ProductID','Rate']], reader)
    valid_Dataset = Dataset.load_from_df(rate_test[['CustomerID','ProductID','Rate']], reader)

    train_Dataset = train_Dataset.build_full_trainset()

    testset = [valid_Dataset.df.iloc[i].to_list() for i in range(len(valid_Dataset.df))]

    benchmark = []

    #algorithms = [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]
    algorithms = [SVD(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]

    
    print ("Attempting: ", str(algorithms), '\n\n\n')
    
    for algorithm in algorithms:
        print("Starting: " ,str(algorithm))

        algorithm.fit(train_Dataset)

        predictions=algorithm.test(testset)

        acrmse=accuracy.rmse(predictions)

        acmse=accuracy.mse(predictions)

        acmae=accuracy.mae(predictions)

        metrics=[str(algorithm).split(' ')[0].split('.')[-1],acrmse,acmse,acmae]

        benchmark.append(metrics)   

    print ('\n\tDONE\n')
    report=pd.DataFrame(benchmark,columns=["Algorithm","RMSE","MSE","MAE"])
    report.set_index("Algorithm")
    print("-------------------------------------------------")
    print ("train file: ", trainFile)
    print ("test file: ", testFile)
    print(report)

Attempting:  [<surprise.prediction_algorithms.matrix_factorization.SVD object at 0x000001BC74830A30>, <surprise.prediction_algorithms.slope_one.SlopeOne object at 0x000001BC748335E0>, <surprise.prediction_algorithms.matrix_factorization.NMF object at 0x000001BC77AA79D0>, <surprise.prediction_algorithms.random_pred.NormalPredictor object at 0x000001BC77AA7970>, <surprise.prediction_algorithms.knns.KNNBaseline object at 0x000001BC77AA4130>, <surprise.prediction_algorithms.knns.KNNBasic object at 0x000001BC77AA7A00>, <surprise.prediction_algorithms.knns.KNNWithMeans object at 0x000001BC77AA7A90>, <surprise.prediction_algorithms.knns.KNNWithZScore object at 0x000001BC77AA7A30>, <surprise.prediction_algorithms.baseline_only.BaselineOnly object at 0x000001BC77AA6B90>, <surprise.prediction_algorithms.co_clustering.CoClustering object at 0x000001BC77AA6830>] 



Starting:  <surprise.prediction_algorithms.matrix_factorization.SVD object at 0x000001BC74830A30>
RMSE: 1.2105
MSE: 1.4653
MAE:  0.85

KeyboardInterrupt: 