In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [3]:
dataset = pd.read_csv(r"C:\Users\User\Downloads\upload\recommender_system\ratings_Beauty.csv")

In [4]:
dataset

Unnamed: 0,UserId,ProductId,Rating,Timestamp
0,A39HTATAQ9V7YF,0205616461,5.0,1369699200
1,A3JM6GV9MNOF9X,0558925278,3.0,1355443200
2,A1Z513UWSAAO0F,0558925278,5.0,1404691200
3,A1WMRR494NWEWV,0733001998,4.0,1382572800
4,A3IAAVS479H7M7,0737104473,1.0,1274227200
...,...,...,...,...
2023065,A3DEHKPFANB8VA,B00LORWRJA,5.0,1405296000
2023066,A3DEHKPFANB8VA,B00LOS7MEE,5.0,1405296000
2023067,AG9TJLJUN5OM3,B00LP2YB8E,5.0,1405382400
2023068,AYBIB14QOI9PC,B00LPVG6V0,5.0,1405555200


In [5]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2023070 entries, 0 to 2023069
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   UserId     object 
 1   ProductId  object 
 2   Rating     float64
 3   Timestamp  int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 61.7+ MB


In [6]:
dataset.isnull().sum()

UserId       0
ProductId    0
Rating       0
Timestamp    0
dtype: int64

In [7]:
dataset['UserId'].nunique(), dataset['UserId'].value_counts().max(), dataset['UserId'].value_counts().min()

(1210271, 389, 1)

In [8]:
dataset['ProductId'].nunique(), dataset['ProductId'].value_counts().max(), dataset['ProductId'].value_counts().min()

(249274, 7533, 1)

In [9]:
user_frequency = pd.DataFrame(dataset.groupby('UserId')['ProductId'].nunique() > 1).reset_index()
old_user = list(user_frequency['UserId'][user_frequency['ProductId']])

In [10]:
item_frequency = pd.DataFrame(dataset.groupby('ProductId')['UserId'].nunique() > 1).reset_index()
old_item = list(item_frequency['ProductId'][item_frequency['UserId']])

In [11]:
dataset = dataset[dataset['UserId'].isin(old_user)]

In [12]:
dataset = dataset[dataset['ProductId'].isin(old_item)]

In [13]:
dataset.shape

(1069875, 4)

In [14]:
dataset = dataset.drop(columns='Timestamp')

In [15]:
# Convert UserId and ProductId from str to int
dataset['UserId'] = dataset['UserId'].astype('category').cat.codes
dataset['ProductId'] = dataset['ProductId'].astype('category').cat.codes

In [16]:
dataset

Unnamed: 0,UserId,ProductId,Rating
1,216177,0,3.0
9,174839,1,4.0
10,27051,1,5.0
11,279924,1,4.0
14,311302,2,5.0
...,...,...,...
2023056,113194,133210,5.0
2023057,43897,133210,5.0
2023060,204475,133210,5.0
2023063,134792,133210,5.0


### Autoencoder

In [17]:
from scipy.sparse import lil_matrix, csr_matrix, save_npz, load_npz
from scipy.sparse import save_npz, load_npz
import keras.backend as K
from sklearn.utils import shuffle

In [18]:
N = dataset['UserId'].nunique() + 1     # number of users
M = dataset['ProductId'].nunique() + 1  # number of items

In [19]:
# split into train and test
df = shuffle(dataset)
cutoff = int(0.8*len(df))
df_train = df.iloc[:cutoff]
df_test = df.iloc[cutoff:]

### Preprocessing

In [20]:
# Get rid of sparse/zero value, convert matrix to sparse format
# Our aim is to fill in (reconstruct) missing ratings, the zero values (missing field) should not be interpreted as a value
# More effiecient, save memory and faster computation

A = lil_matrix((N,M))   # initialise sparse matrix
count = 0

In [21]:
def update_train(row):
    global count
    count += 1
    # print progress every 100,000
    if count % 100000 == 0:
        print("processed: %.3f" % (float(count)/cutoff))

    # fill in rating data into lil_matix (list of lists)
    i = int(row['UserId'])
    j = int(row['ProductId'])
    A[i, j] = row['Rating']

df_train.apply(update_train, axis=1)


processed: 0.117
processed: 0.234
processed: 0.351
processed: 0.467
processed: 0.584
processed: 0.701
processed: 0.818
processed: 0.935


1641097    None
343862     None
1132293    None
1713720    None
995742     None
           ... 
1473064    None
186263     None
776926     None
855916     None
1115821    None
Length: 855900, dtype: object

In [22]:
# csr format
# - data array
#  - row pointers
#  - column indices

In [23]:
A = A.tocsr()   # convert to CSR format for faster matrix operations
# mask = (A > 0)  # mask, to tell which entries exist and which do not
save_npz("Atrain.npz", A)

In [45]:
# initialise a sparse matrix for test set
A_test = lil_matrix((N, M))
count = 0

In [46]:
def update_test(row):
    global count
    count += 1
    # print progress every 100,000
    if count % 100000 == 0:
        print("processed: %.3f" % (float(count)/len(df_test)))

    # fill in rating data into lil_matix (list of lists)
    i = int(row['UserId'])
    j = int(row['ProductId'])
    A_test[i, j] = row['Rating']

df_test.apply(update_test, axis=1)

processed: 0.467
processed: 0.935


1723873    None
911158     None
677201     None
528456     None
1107983    None
           ... 
1981983    None
388045     None
1696563    None
471846     None
1093670    None
Length: 213975, dtype: object

In [47]:
A_test = A_test.tocsr()
# mask_test = (A_test > 0)
save_npz("Atest.npz", A_test)

In [48]:
import keras.backend as K
from keras.models import Model
from keras.layers import Input, Dropout, Dense
from keras.regularizers import l2
from keras.optimizers import SGD

In [49]:
# define config
batch_size = 128
epochs = 20
reg = 0.0001
# reg = 0

In [50]:
# load saved sparse
A = load_npz("Atrain.npz")
A_test = load_npz("Atest.npz")

# mask, to tell which entries exist and which do not
mask = (A > 0) * 1.0
mask_test = (A_test > 0) * 1.0

In [51]:
# make copies before shuffle
A_copy = A.copy()
mask_copy = mask.copy()
A_test_copy = A_test.copy
mask_test_copy = mask_test.copy()

In [52]:
N, M = A.shape
print("N:", N, "M:", M)
print("N // batch_size:", N // batch_size)

N: 321367 M: 133212
N // batch_size: 2510


In [53]:
# center the data
mu = A.sum() / mask.sum()
print("mu:", mu)

mu: 4.198586283444327


In [54]:
# build 1 hidden layer autoencoder
i = Input(shape=(M,))
# bigger hidden layer size seems to help!
x = Dropout(0.7)(i)
x = Dense(700, activation='tanh', kernel_regularizer=l2(reg))(x)
# x = Dropout(0.5)(x)
x = Dense(M, kernel_regularizer=l2(reg))(x)

In [55]:
def custom_loss(y_true, y_pred):
    mask = K.cast(K.not_equal(y_true, 0), dtype='float32')

    diff = y_pred - y_true
    sqdiff = diff * diff * mask

    sse = K.sum(K.sum(sqdiff))  # 2D sum, then 1D (vector) sum / or use axis=None
    n = K.sum(K.sum(mask))
    return sse / n

In [56]:
def generator(A, M):
    while True:
        A, M = shuffle(A, M)
        for i in range(A.shape[0] // batch_size + 1):
            upper = min((i+1)*batch_size, A.shape[0])   # get index
            # batch by batch processing
            a = A[i*batch_size:upper].toarray()
            m = M[i*batch_size:upper].toarray()
            
            a = a - mu * m
            # m2 = (n.random.random(a.shape) > 0.5)
            # noisy = a * m2
            noisy = a   # no noise
            yield noisy, a

In [57]:
def test_generator(A, M, A_test, M_test):
    while True:
        for i in range(A.shape[0] // batch_size + 1):
            upper = min((i+1)*batch_size, A.shape[0])   # get index
            # batch by batch processing
            a = A[i*batch_size:upper].toarray()
            m = M[i*batch_size:upper].toarray()
            at = A_test[i*batch_size:upper].toarray()
            mt = M_test[i*batch_size:upper].toarray()

            a = a - mu * m           # train set
            at = at - mu * mt        # test set
            yield a, at

In [58]:
model = Model(i, x)
model.compile(
    loss=custom_loss,
    optimizer=SGD(lr=0.08, momentum=0.9),
    # optimizer='adam',
    metrics=[custom_loss]
)

  super().__init__(name, **kwargs)


In [None]:
r = model.fit(
    generator(A, mask),
    validation_data = test_generator(A_copy, mask_copy, A_test_copy, mask_test_copy),
    epochs = epochs,
    steps_per_epoch = A.shape[0] / batch_size + 1,
    validation_steps = A_test.shape[0] // batch_size + 1,
)

Epoch 1/20

In [None]:
print(r.history.keys())


NameError: name 'r' is not defined

In [None]:
# plot losses
plt.plot(r.history['loss'], label='train loss')
plt.plot(r.history['val_loss'], label='test loss')
plt.legend()
plt.show()

In [None]:
# plot mse
plt.plot(r.history['custom_loss'], label="train_mse")
plt.plot(r.history['val_custom_loss'], label="test_mse")
plt.legend()
plt.show()

- The autoencoder doesn’t explicitly learn user-item relationships like collaborative filtering does.
- It just tries to compress the rating matrix and reconstruct it, without knowing that rows are users and columns are items.
- This means the semantic meaning of users & items is lost—it treats the matrix just like any image pixel matrix.

🚨 Issue with Using Autoencoders for Recommendations:
- Unlike images, where pixels next to each other have spatial meaning, the order of users and items in the rating matrix is arbitrary.

- The autoencoder might learn some latent patterns in the rating distribution.
- But it won’t understand user preferences or item similarities explicitly.
- This is why autoencoders can work well for image regeneration but not necessarily for recommendations.

### Step 1: Input Batch (Generator)
- generator(A, M) yields batches of (noisy_A, A), where:
$$ noisy_A: The input rating matrix (with some normalization applied). $$
$$ A: The original rating matrix (used as ground truth for reconstruction). $$

### Step 2: Encoding (Compression)
- The autoencoder reduces the dimensionality of noisy_A, creating a latent representation (hidden compressed features).

### Step 3: Decoding (Reconstruction)
- The model reconstructs a rating matrix (same shape as A), trying to match the original ratings.