In [1]:
import numpy as np
import pandas as pd

# Load the raw book-ratings CSV. Later cells read its `Id` (item) and
# `User_id` (user) columns. numpy is imported here as well because later
# cells use `np.dtype` / `np.int8`; without it a fresh
# "Restart Kernel -> Run All" fails with NameError.
reviews_data = pd.read_csv("../data/raw/Books_rating.csv")

In [10]:
# M: number of distinct item identifiers in the `Id` column.
# `unique()` keeps a missing value (if any) as one entry, so it is counted too.
M = len(reviews_data["Id"].unique())
print(f"The number of items M = {M}")

The number of items M = 221998


In [9]:
# N: number of distinct user identifiers in the `User_id` column.
# `unique()` keeps a missing value (if any) as one entry, so it is counted too.
N = len(reviews_data["User_id"].unique())
print(f"The number of users N = {N}")

The number of users N = 1008973


In [13]:
# A dense matrix needs one cell for every (item, user) combination.
total_cells = M * N
print(f"The size of the sparse weight matrix is MxN, which contains {total_cells} cells")

The size of the sparse weight matrix is MxN, which contains 223989988054 cells


In [30]:
def calculate_array_size(data_type, num_items):
    """Estimate the memory footprint of a dense array.

    Args:
        data_type: Anything accepted by ``np.dtype`` (e.g. ``np.int8``).
        num_items: Number of elements the array would hold.

    Returns:
        The size as a float, using 1024**3 bytes per unit (i.e. GiB,
        although the notebook labels the result "GB").
    """
    bytes_per_unit = 1024 ** 3
    item_bytes = np.dtype(data_type).itemsize
    return item_bytes * num_items / bytes_per_unit

# Usage example
# Worst case: one int8 cell for every (item, user) combination.
num_items = M * N
data_type = np.int8
array_size_in_gb = calculate_array_size(data_type=data_type, num_items=num_items)

print(f"The array will be approximately {array_size_in_gb:.2f} GB")

The array will be approximately 208.61 GB


We have a lot of items and users, so storing a dense matrix over all of them is impractical. We will need an efficient way of performing the calculations.

Possible filters for a pair of users $i$ and $i'$:
- Threshold of common ratings: If two users have fewer than a certain number of commonly reviewed items, the pair won't be considered.
- Threshold of weights: If the absolute value of the weight for two users is less than a threshold, it won't be considered.
- Threshold of neighbors: For each user, only the weights with the largest absolute values will be considered.

We still don't know the distributions of many of these variables, but we can get an upper bound on the storage size using the threshold of neighbors.

In [52]:
# Storage needed when each of the N users keeps only `n` neighbor weights,
# for n = 1, 21, 41, ..., 501.
neighbor_counts = range(1, 520, 20)
pd.DataFrame(
    {
        "neighbors": list(neighbor_counts),
        "size (GB)": [calculate_array_size(data_type, N * n) for n in neighbor_counts],
    }
)

Unnamed: 0,neighbors,size (GB)
0,1,0.00094
1,21,0.019733
2,41,0.038527
3,61,0.05732
4,81,0.076114
5,101,0.094908
6,121,0.113701
7,141,0.132495
8,161,0.151288
9,181,0.170082


We can see that using this threshold alone already gives us a size that we can work with.

## Working with a sample

In this section we will work with a sample of the users with the highest review counts, to minimize the sparsity of the weight matrix.

In [50]:
# value_counts() sorts by descending review count, so the head of the result
# holds the most active users; keep roughly the top 1% of N.
reviews_per_user = reviews_data["User_id"].value_counts()
sample_size = int(N / 100)
top_active_users = reviews_per_user.head(sample_size).index

In [51]:
# Show the selected user IDs (pandas abbreviates the long Index repr).
top_active_users

Index(['A14OJS0VWMOSWO', 'AFVQZQ8PW0L', 'A1D2C0WDCSHUWZ', 'AHD101501WCN1',
       'A1X8VZWTOG8IS6', 'A1K1JW1C5CUSUZ', 'A20EEWWSFMZ1PN', 'A1S3C5OFU508P3',
       'A1N1YEMTI9DJ86', 'A2OJW07GQRNJUT',
       ...
       'A3C9GITUZ3V8GT', 'A325I3POKV8LR', 'A1IZ6OYX3ZP7RC', 'ARER05SO3BWWJ',
       'A2O8K7L29RLY54', 'A32NR5QPDZSD5D', 'ABZF8NDWIRVT2', 'A8QMDH75AKI5B',
       'A1631PDU0WGN4W', 'A2TCUQOY9M7SXC'],
      dtype='object', name='User_id', length=10089)