# <b><i> Create user oriented table </i></b>

The purpose of this notebook is to create the user-item pivot table used by the LightFM and AutoRec models.

# > Import

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE

# > Preprocessing data

### Load rating data from file

In [3]:
# Column names are supplied explicitly via `names=`, so pandas does not take
# them from the file. Only the first three columns are kept; the timestamp is
# not used when building the user-item matrix.
ml_rating_filename = "ratings_re2.csv"
rating_csv_columns = ["userId", "movieId", "rating", "timestamp"]
kept_columns = ["userId", "movieId", "rating"]

ml_ratings = pd.read_csv(ml_rating_filename, names=rating_csv_columns)[kept_columns]

### Average duplicate rating values

In [4]:
# Replace every rating with the mean rating of its (userId, movieId) pair, so
# that repeated ratings of the same movie by the same user become identical
# rows, then collapse those identical rows into one.
pair_mean_rating = ml_ratings.groupby(['userId', 'movieId'])['rating'].transform('mean')
ml_ratings = ml_ratings.assign(rating=pair_mean_rating).drop_duplicates()

### Pivot to a user-centric matrix

In [5]:
from scipy.sparse import csr_matrix
from pandas.api.types import CategoricalDtype

In [6]:
# Count the lines of the movie index file. The context manager closes the
# handle deterministically, and the generator avoids loading every line into
# memory just to count them.
with open("moviesIdx2.txt") as movie_index_file:
    n_movie = sum(1 for _ in movie_index_file)

# Users: one category per distinct userId, sorted so that category codes follow
# ascending userId order.
user_c = CategoricalDtype(sorted(ml_ratings.userId.unique()), ordered=True)

# Movies: categories are 0 .. n_movie-1 (one per line of the index file) rather
# than only the ids present in ml_ratings, because some movies have no rating
# at all and must still get a column.
# NOTE(review): this presumes movieId values are 0-based positions in
# moviesIdx2.txt -- confirm against the script that produced ratings_re2.csv.
movie_c = CategoricalDtype(list(range(n_movie)))

In [7]:
# Build the user x movie rating matrix in CSR form.
#
# Each rating row is mapped to a (row, col) position through the categorical
# codes: `row` is the position of the userId in user_c, `col` the position of
# the movieId in movie_c. A value that is not among the categories gets code
# -1, which csr_matrix rejects as a negative index.
matrix_shape = (len(user_c.categories), len(movie_c.categories))

row = ml_ratings["userId"].astype(user_c).cat.codes
col = ml_ratings["movieId"].astype(movie_c).cat.codes

sparse_matrix = csr_matrix((ml_ratings["rating"], (row, col)), shape=matrix_shape)

Check the dimensions of `sparse_matrix` and its number of stored elements.

In [8]:
sparse_matrix

<138493x15440 sparse matrix of type '<class 'numpy.float64'>'
	with 14094614 stored elements in Compressed Sparse Row format>

### Remove ratings greater than 5

In [9]:
# Ratings above MAX_RATING are treated as invalid and removed.
#
# Work on the stored values directly instead of assigning through a sparse
# boolean mask: the mask form builds an intermediate sparse matrix plus index
# arrays, and it only overwrites the values with 0, leaving them as explicitly
# stored entries that still count as "stored elements".
MAX_RATING = 5.0
sparse_matrix.data[sparse_matrix.data > MAX_RATING] = 0

# Drop the zeroed entries from the sparsity structure so they are really gone.
# Note: this also drops any other explicitly stored 0 values.
sparse_matrix.eliminate_zeros()

print("max ratings :", sparse_matrix.max())

max ratings : 5.0


### Save sparse CSR

In [10]:
# Persist the CSR matrix with pickle so the model notebooks can load it.
import pickle

filename = "ratings.csr"
# The context manager flushes and closes the file even if dump() raises; a bare
# open() passed inline is only closed whenever the object is garbage collected.
with open(filename, 'wb') as output_file:
    pickle.dump(sparse_matrix, output_file)

In [11]:
# Reaching this cell means every preprocessing cell above ran without raising.
status_message = "Preprocessing success!"
print(status_message)

Preprocessing success!


---