In [16]:
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import blosc
import pickle

In [2]:
# Load the raw ratings from Parquet and convert the Arrow table to pandas.
df = (
    pq.read_table('../data/intermediate/rating.parquet')
      .to_pandas()
)

In [3]:
# Preview the first five rows of the raw ratings.
df.head(n=5)

Unnamed: 0,movie_id,user_id,rating
0,1,1488844,3
1,1,822109,5
2,1,885013,4
3,1,30878,4
4,1,823519,3


### Add indices to users and movies

In [4]:
# Count the distinct movie ids and user ids present in the ratings.
num_movies = df['movie_id'].unique().size
num_users = df['user_id'].unique().size

In [5]:
# Display (distinct movies, distinct users).
(num_movies, num_users)

(17770, 480189)

In [6]:
# Build 0-based contiguous indices for movies and users, numbered in the
# order unique() returns the ids. unique() scans every row of the frame, so
# it is computed once per column and both lookup directions are derived from
# that single array.
unique_movie_ids = df.movie_id.unique()
movie_to_idx = {movie_id: idx for idx, movie_id in enumerate(unique_movie_ids)}
idx_to_movie = dict(enumerate(unique_movie_ids))

unique_user_ids = df.user_id.unique()
user_to_idx = {user_id: idx for idx, user_id in enumerate(unique_user_ids)}
idx_to_user = dict(enumerate(unique_user_ids))

In [7]:
# Translate each raw id through its lookup table and store the result as a
# new index column on the frame.
df['movie_idx'] = df['movie_id'].map(movie_to_idx)
df['user_idx'] = df['user_id'].map(user_to_idx)

In [8]:
# (rows, columns) of the frame after the two index columns were added.
df.shape

(100480507, 5)

In [9]:
# Preview the first five rows, now including movie_idx and user_idx.
df.head(n=5)

Unnamed: 0,movie_id,user_id,rating,movie_idx,user_idx
0,1,1488844,3,0,0
1,1,822109,5,0,1
2,1,885013,4,0,2
3,1,30878,4,0,3
4,1,823519,3,0,4


In [None]:
# Persist the indexed ratings: convert the frame to an Arrow table, then
# write that table out as Parquet.
table = pa.Table.from_pandas(df)
pq.write_table(
    table,
    where='../data/intermediate/rating_indexed.parquet',
)

In [14]:
# Bundle the four lookup tables under their own names so they can be
# serialised together.
dictionaries = dict(
    movie_to_idx=movie_to_idx,
    idx_to_movie=idx_to_movie,
    user_to_idx=user_to_idx,
    idx_to_user=idx_to_user,
)

In [18]:
# Save all four lookup tables in a single pickle. The context manager closes
# the file handle (flushing it to disk) as soon as the dump finishes, instead
# of leaving an open handle behind in the kernel.
with open('dictionaries.pkl', 'wb') as pickle_file:
    pickle.dump(dictionaries, pickle_file)

## Compression with BLOSC

In [24]:
# (rows, columns) of the indexed frame whose columns are compressed below.
df.shape

(100480507, 5)

In [10]:
# Indices are 0-based, so the number of distinct ids is the largest index + 1.
num_movies = df['movie_idx'].max() + 1
num_users = df['user_idx'].max() + 1

In [11]:
# Display (distinct movies, distinct users) as derived from the index columns.
(num_movies, num_users)

(17770, 480189)

In [12]:
# Total number of rating rows.
size = len(df)

In [19]:
def _narrow(values, dtype):
    """Cast an integer array to a smaller integer dtype without wrapping.

    ``astype`` silently wraps values that do not fit the target type, so the
    value range is checked before casting.

    Parameters
    ----------
    values : numpy.ndarray
        Integer array to convert.
    dtype : numpy integer dtype
        Target integer type.

    Returns
    -------
    numpy.ndarray
        A new array of ``dtype`` holding the same values.

    Raises
    ------
    OverflowError
        If any value lies outside the representable range of ``dtype``.
    """
    limits = np.iinfo(dtype)
    # min()/max() raise on empty arrays, and an empty array cannot overflow.
    if values.size and (values.min() < limits.min or values.max() > limits.max):
        raise OverflowError(
            f'values in [{values.min()}, {values.max()}] do not fit in {np.dtype(dtype).name}'
        )
    return values.astype(dtype)


# Extract each column as a compact NumPy array using the narrowest integer
# type chosen for it.
ratings = _narrow(df.rating.values, np.int8)
users = _narrow(df.user_idx.values, np.int32)
movies = _narrow(df.movie_idx.values, np.int16)

In [None]:
# Drop the DataFrame name; the cells below work only with the NumPy arrays
# extracted from it (ratings, users, movies).
del df

In [20]:
# Display the dtypes of the three extracted arrays, in the order
# (ratings, users, movies).
tuple(column.dtype for column in (ratings, users, movies))

(dtype('int8'), dtype('int32'), dtype('int16'))

In [21]:
# Compress the ratings array straight from its memory buffer with Blosc
# (zstd codec, level 9, shuffle enabled) and write the packed bytes to disk.
a = ratings
packed = blosc.compress_ptr(
    a.__array_interface__['data'][0],  # address of the array's data buffer
    a.size,                            # number of items
    typesize=a.dtype.itemsize,
    clevel=9,
    shuffle=True,
    cname='zstd',
)
with pa.OSFile('ratings.dat', 'wb') as out_file:
    out_file.write(packed)

In [22]:
# Compress the user-index array straight from its memory buffer with Blosc
# (zstd codec, level 9, shuffle enabled) and write the packed bytes to disk.
a = users
packed = blosc.compress_ptr(
    a.__array_interface__['data'][0],  # address of the array's data buffer
    a.size,                            # number of items
    typesize=a.dtype.itemsize,
    clevel=9,
    shuffle=True,
    cname='zstd',
)
with pa.OSFile('users.dat', 'wb') as out_file:
    out_file.write(packed)

In [23]:
# Compress the movie-index array straight from its memory buffer with Blosc
# (zstd codec, level 9, shuffle enabled) and write the packed bytes to disk.
a = movies
packed = blosc.compress_ptr(
    a.__array_interface__['data'][0],  # address of the array's data buffer
    a.size,                            # number of items
    typesize=a.dtype.itemsize,
    clevel=9,
    shuffle=True,
    cname='zstd',
)
with pa.OSFile('movies.dat', 'wb') as out_file:
    out_file.write(packed)

## Testing Blosc decompression

In [25]:
# Allocate the buffers that the decompressed data will be written into.
# The length is taken from the array that was compressed rather than from a
# hard-coded row count: decompress_ptr is handed only a raw memory address,
# so an undersized buffer would be overrun instead of raising an error.
size = len(ratings)
rating_rec = np.empty(size, dtype=np.int8)
movie_rec = np.empty(size, dtype=np.int16)
user_rec = np.empty(size, dtype=np.int32)

In [26]:
rating_file = 'ratings.dat'
users_file = 'users.dat'
movies_file = 'movies.dat'

# Read each packed file and decompress it directly into the matching
# pre-allocated buffer. Each file is opened in a context manager so its
# handle is closed once the bytes have been read.
with pa.OSFile(users_file) as f:
    blosc.decompress_ptr(f.readall(), user_rec.__array_interface__['data'][0])
with pa.OSFile(movies_file) as f:
    blosc.decompress_ptr(f.readall(), movie_rec.__array_interface__['data'][0])
with pa.OSFile(rating_file) as f:
    rating_bytes_written = blosc.decompress_ptr(
        f.readall(), rating_rec.__array_interface__['data'][0]
    )

# Keep the ratings result as the cell's displayed value.
rating_bytes_written

100480507

In [29]:
# Round-trip check for ratings. np.array_equal does the comparison inside
# NumPy; the builtin all() would iterate over every element in Python.
np.array_equal(rating_rec, ratings)

True

In [30]:
# Round-trip check for user indices. np.array_equal does the comparison
# inside NumPy; the builtin all() would iterate over every element in Python.
np.array_equal(user_rec, users)

True

In [31]:
# Round-trip check for movie indices. np.array_equal does the comparison
# inside NumPy; the builtin all() would iterate over every element in Python.
np.array_equal(movie_rec, movies)

True