In [2]:
import pandas as pd
import numpy as np

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
cd drive/Shareddrives/Filmes Pra TI - Machine Learning/Clone do Projeto - Luana/machinelearning

/content/drive/Shareddrives/Filmes Pra TI - Machine Learning/Clone do Projeto - Luana/machinelearning


In [5]:
ls

[0m[01;34mdata[0m/    Makefile    README.md    requirements.txt  test_environment.py
[01;34mdocs[0m/    [01;34mmodels[0m/     [01;34mreferences[0m/  setup.py          tox.ini
LICENSE  [01;34mnotebooks[0m/  [01;34mreports[0m/     [01;34msrc[0m/


# Loading reduced train data

In [6]:
folder = 'data/processed/'

In [7]:
train = pd.read_csv(folder +'very_reduced_train_ratings.csv')
val = pd.read_csv(folder +'very_reduced_val_ratings.csv')
test = pd.read_csv(folder +'very_reduced_test_ratings.csv')

In [8]:
train.head()

Unnamed: 0,rating,Liked,movieIndex,userIndex
0,4.5,1,1495,0
1,5.0,1,98,0
2,4.5,1,276,0
3,4.0,1,316,0
4,3.0,1,475,0


## Preprocess Data

converting Pandas dataframe to dict as it works as a lookup table, but even faster.

A rating matrix would have M x N dimension. If we need to look for a rating of user n and movie m, in the worst case it would be O(MN) because we have empty positions.
However, if we use a dictionare instead, we can populate a dict such as {[user n, movie m] : 2.5}, and it will have the size of the amount of ratings.

In fact, we are going to create 3 dictionaries:
- user2movie: given a user as index, returns all movies that it have rated
- movie2user: given a movie as index, returns all users that have rated that movie before
- usermovie2rating: given a movie and a user, returns the rating

In [16]:
from numba.core import types
from numba.typed import Dict
from numba import jit, cuda

In [13]:
types

<module 'numba.core.types' from '/usr/local/lib/python3.7/dist-packages/numba/core/types/__init__.py'>

In [19]:
user2movie = Dict.empty(
    key_type=types.int32,
    value_type=types.int32[:],
)
movie2user = Dict.empty(
    key_type=types.int32,
    value_type=types.int32[:],
)
usermovie2rating = Dict.empty(
    key_type=types.int32,
    value_type=types.float64[:],
)

@jit(nopython=True)
def create_usermovie_dicts(row):

    i = int(row.userIndex)
    j = int(row.movieIndex)

    if i not in user2movie:
        user2movie[i] = [j]
    else:
        user2movie[i].append(j)

    if j not in movie2user:
        movie2user[j] = [i]
    else:
        movie2user[j].append(i)

    usermovie2rating[(i, j)] = row.rating

In [20]:
_ = train.apply(create_usermovie_dicts, axis=1)

TypingError: ignored

In [17]:
usermovie2rating_val =  Dict()

def create_usermovie_val(row):

    i = int(row.userIndex)
    j = int(row.movieIndex)
    usermovie2rating_val[(i, j)] = row.rating

usermovie2rating_test =  Dict()

def create_usermovie_test(row):

    i = int(row.userIndex)
    j = int(row.movieIndex)
    usermovie2rating_test[(i, j)] = row.rating

In [18]:
_ = val.apply(create_usermovie_val, axis=1)
_ = test.apply(create_usermovie_test, axis=1)

In [19]:
import pickle

with open(folder +'IDS_DICT/user2movie_train.pickle', 'wb') as handle:
    pickle.dump(user2movie, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open(folder +'IDS_DICT/movie2user_train.pickle', 'wb') as handle:
    pickle.dump(movie2user, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open(folder +'IDS_DICT/usermovie2rating_train.pickle', 'wb') as handle:
    pickle.dump(usermovie2rating, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open(folder +'IDS_DICT/usermovie2rating_val.pickle', 'wb') as handle:
    pickle.dump(usermovie2rating_val, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open(folder +'IDS_DICT/usermovie2rating_test.pickle', 'wb') as handle:
    pickle.dump(usermovie2rating_test, handle, protocol=pickle.HIGHEST_PROTOCOL)

# with open(folder +'IDS_DICT/movies_ids.pickle', 'rb') as handle:
#     b = pickle.load(handle)