In [1]:
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import blosc
import pickle

In [45]:
# Read './sl_rating.parquet' with pyarrow and convert the Arrow table to a pandas DataFrame.
df = pq.read_table('./sl_rating.parquet').to_pandas()

In [46]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,movie_id,user_id,rating,date
52525,28,1990901,5,2004-02-16
52528,28,765331,3,2003-03-20
52529,28,1987434,4,2005-07-14
52532,28,2193455,4,2005-01-19
52533,28,1468812,4,2003-01-14


### Add indices to users and movies

In [47]:
# Count distinct movies and users. dropna=False keeps the count identical to
# len(unique()), which would include a missing value as one entry.
num_movies = df['movie_id'].nunique(dropna=False)
num_users = df['user_id'].nunique(dropna=False)

In [48]:
# Display both distinct counts as a (movies, users) tuple.
num_movies, num_users

(1000, 10000)

In [49]:
# Build lookups in both directions between raw ids and dense 0-based indices.
# Indices follow the order in which `unique()` returns the ids.
# `unique()` is evaluated once per column and reused for both directions,
# instead of rescanning the full column for every dictionary.
movie_ids = df.movie_id.unique()
movie_to_idx = {movie: idx for idx, movie in enumerate(movie_ids)}
idx_to_movie = dict(enumerate(movie_ids))

user_ids = df.user_id.unique()
user_to_idx = {user: idx for idx, user in enumerate(user_ids)}
idx_to_user = dict(enumerate(user_ids))

In [50]:
# Attach the dense indices as new columns by looking each raw id up in its mapping.
df['movie_idx'] = df['movie_id'].map(movie_to_idx)
df['user_idx'] = df['user_id'].map(user_to_idx)

In [51]:
# Check the frame dimensions after adding the two index columns.
df.shape

(6491181, 6)

In [52]:
# Preview the frame again to inspect the new movie_idx / user_idx columns.
df.head()

Unnamed: 0,movie_id,user_id,rating,date,movie_idx,user_idx
52525,28,1990901,5,2004-02-16,0,0
52528,28,765331,3,2003-03-20,0,1
52529,28,1987434,4,2005-07-14,0,2
52532,28,2193455,4,2005-01-19,0,3
52533,28,1468812,4,2003-01-14,0,4


In [53]:
# Convert the indexed frame to an Arrow table and persist it as Parquet.
table = pa.Table.from_pandas(df)
pq.write_table(table, where='../data/rating_indexed.parquet')

In [54]:
# Bundle the four lookup tables under their own names so they can be saved together.
dictionaries = dict(
    movie_to_idx=movie_to_idx,
    idx_to_movie=idx_to_movie,
    user_to_idx=user_to_idx,
    idx_to_user=idx_to_user,
)

In [55]:
# Persist the lookup tables. The context manager guarantees the file is
# flushed and closed even if pickling fails part-way through.
with open('sl_dictionaries.pkl', 'wb') as pkl_file:
    pickle.dump(dictionaries, pkl_file)

## Compression with BLOSC

In [56]:
# Re-check the frame dimensions before extracting arrays for compression.
df.shape

(6491181, 6)

In [57]:
# Re-derive the counts from the index columns: largest index plus one.
num_users = 1 + df['user_idx'].max()
num_movies = 1 + df['movie_idx'].max()

In [58]:
# Display the counts derived from the index columns as a (movies, users) tuple.
num_movies, num_users

(1000, 10000)

In [60]:
# Number of rows in the frame; shown as the cell output.
size = len(df)
size

6491181

In [61]:
# Pull the three columns out as NumPy arrays with explicit narrow dtypes.
# NOTE(review): no range check is done before casting - confirm that every
# value fits its target dtype (int8 / int32 / int16) if the input data changes.
ratings = df['rating'].values.astype(np.int8)
users = df['user_idx'].values.astype(np.int32)
movies = df['movie_idx'].values.astype(np.int16)

In [62]:
# Confirm the dtypes of the three arrays after casting.
ratings.dtype,users.dtype,movies.dtype

(dtype('int8'), dtype('int32'), dtype('int16'))

In [24]:
# Compress the ratings array with Blosc (zstd codec, level 9, shuffle on)
# and write the packed bytes to 'ratings.dat'.
a = ratings
# compress_ptr only receives a start address and an item count, so this
# relies on `a` occupying one contiguous buffer.
address = a.__array_interface__['data'][0]
packed = blosc.compress_ptr(
    address,
    a.size,
    a.dtype.itemsize,
    clevel=9,
    shuffle=True,
    cname='zstd',
)
with pa.OSFile('ratings.dat', 'wb') as f:
    f.write(packed)

In [25]:
# Compress the user-index array with Blosc (zstd codec, level 9, shuffle on)
# and write the packed bytes to 'users.dat'.
a = users
# compress_ptr only receives a start address and an item count, so this
# relies on `a` occupying one contiguous buffer.
address = a.__array_interface__['data'][0]
packed = blosc.compress_ptr(
    address,
    a.size,
    a.dtype.itemsize,
    clevel=9,
    shuffle=True,
    cname='zstd',
)
with pa.OSFile('users.dat', 'wb') as f:
    f.write(packed)

In [26]:
# Compress the movie-index array with Blosc (zstd codec, level 9, shuffle on)
# and write the packed bytes to 'movies.dat'.
a = movies
# compress_ptr only receives a start address and an item count, so this
# relies on `a` occupying one contiguous buffer.
address = a.__array_interface__['data'][0]
packed = blosc.compress_ptr(
    address,
    a.size,
    a.dtype.itemsize,
    clevel=9,
    shuffle=True,
    cname='zstd',
)
with pa.OSFile('movies.dat', 'wb') as f:
    f.write(packed)

## Testing Blosc decompression

In [67]:
# Pre-allocate uninitialised output buffers for decompression. Each buffer
# uses `size` elements and the same dtype its source array was cast to
# before compression (int8 ratings, int16 movies, int32 users).
rating_rec = np.empty(size, dtype=np.int8)
movie_rec = np.empty(size, dtype=np.int16)
user_rec = np.empty(size, dtype=np.int32)

In [68]:
rating_file = 'ratings.dat'
users_file = 'users.dat'
movies_file = 'movies.dat'


def _read_compressed(path):
    """Return the entire contents of `path`, closing the file handle afterwards."""
    with pa.OSFile(path) as f:
        return f.readall()


# decompress_ptr writes straight into the memory behind each pre-allocated
# array, so every buffer must already be large enough for its payload.
# The last call is left as a bare expression so its return value is still
# shown as the cell output.
blosc.decompress_ptr(_read_compressed(users_file), user_rec.__array_interface__['data'][0])
blosc.decompress_ptr(_read_compressed(movies_file), movie_rec.__array_interface__['data'][0])
blosc.decompress_ptr(_read_compressed(rating_file), rating_rec.__array_interface__['data'][0])

6491181

In [40]:
# Show the decompressed ratings buffer.
rating_rec

array([5, 3, 4, ..., 4, 4, 1], dtype=int8)

In [41]:
# Show the original ratings array for a visual comparison with rating_rec.
ratings

array([5, 3, 4, ..., 4, 4, 1], dtype=int8)

In [42]:
# Round-trip check for ratings. np.array_equal compares shape and contents in
# vectorised code and returns a plain bool, instead of iterating millions of
# NumPy booleans through the builtin all().
np.array_equal(rating_rec, ratings)

True

In [43]:
# Round-trip check for user indices. np.array_equal compares shape and
# contents in vectorised code and returns a plain bool, instead of iterating
# millions of NumPy booleans through the builtin all().
np.array_equal(user_rec, users)

True

In [44]:
# Round-trip check for movie indices. np.array_equal compares shape and
# contents in vectorised code and returns a plain bool, instead of iterating
# millions of NumPy booleans through the builtin all().
np.array_equal(movie_rec, movies)

True

In [64]:
# Distinct values of the rating column in the frame. The recorded output shows
# they are stored as strings (object dtype), not integers.
df.rating.unique()

array(['5', '3', '4', '1', '2'], dtype=object)

In [69]:
# Distinct values in the decompressed int8 ratings buffer.
np.unique(rating_rec)

array([1, 2, 3, 4, 5], dtype=int8)