# Imports

In [1]:
import sys

import pandas as pd
import numpy as np

sys.path.append("..")

from constants import UDATA_PATH, NUM_RATINGS, NUM_USERS, RATINGS_PATH

# Data Loading

In [2]:
# Load the tab-separated ratings file. It has no header row, so the column
# labels are supplied at read time instead of being patched onto the frame
# afterwards.
udata = pd.read_csv(
    UDATA_PATH,
    sep="\t",
    header=None,
    names=["user_id", "item_id", "rating", "timestamp"],
)
udata.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


Let's convert the data to a matrix, keeping only the `user_id`, `item_id`, and `rating` columns.

In [3]:
# Turn the table into a plain NumPy array and keep only its first three
# columns (user_id, item_id, rating); the trailing timestamp column is dropped.
udata = np.array(udata)[:, :3]
udata[:5]

array([[196, 242,   3],
       [186, 302,   3],
       [ 22, 377,   1],
       [244,  51,   2],
       [166, 346,   1]], dtype=int64)

# Data conversion

Let's now create the ratings table for the `Collaborative Filtering` method.

It will have the following structure:

| | film_1| film_2 |
|:-:|:-:|:-:|
|user_1|3|5|
|user_2|0|4|

In our case, zero means that there is no rating from the user.

In [4]:
# Dense ratings table: row index = user_id, column index = film_id, cell value
# = rating, with 0.0 meaning "no rating". Each dimension gets one extra slot so
# the ids from the data can be used directly as indices (presumably because ids
# start at 1 -- row 0 and column 0 stay empty in the preview below).
# NOTE(review): NUM_RATINGS sizes the film axis here; confirm in `constants`
# that it holds the number of films rather than the number of rating rows.
ratings = np.zeros((NUM_USERS + 1, NUM_RATINGS + 1))

# Scatter every (user_id, film_id, rating) row into the table with a single
# vectorized assignment instead of a Python-level loop over all rows.
# Assumes each (user_id, film_id) pair occurs at most once in `udata`; with
# repeated pairs NumPy does not promise which rating is kept -- TODO confirm.
ratings[udata[:, 0], udata[:, 1]] = udata[:, 2]

ratings[:10, :10]

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 5., 3., 4., 3., 3., 5., 4., 1., 5.],
       [0., 4., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 4., 3., 0., 0., 0., 0., 0., 0., 0.],
       [0., 4., 0., 0., 0., 0., 0., 2., 4., 4.],
       [0., 0., 0., 0., 5., 0., 0., 5., 5., 5.],
       [0., 0., 0., 0., 0., 0., 0., 3., 0., 0.],
       [0., 0., 0., 0., 0., 0., 5., 4., 0., 0.]])

# Save ratings table

In [5]:
# Persist the ratings table to RATINGS_PATH in NumPy's binary .npy format.
np.save(file=RATINGS_PATH, arr=ratings)