# Create user-oriented rating table

Convert the raw rating data (one row per rating) into a user × movie sparse matrix.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE

## Preprocessing data

In [2]:
ml_rating_filename = "ratings_re2.csv"

# Load the ratings. Column names are supplied explicitly (so the first line of
# the file is read as data), and only the three columns used below are kept;
# the timestamp column is never materialised.
# Then collapse repeated (userId, movieId) pairs into a single row holding the
# mean rating. sort=False keeps the pairs in order of first appearance, which
# is the row order the previous transform('mean') + drop_duplicates() produced,
# without assigning into a column-sliced frame (a SettingWithCopyWarning source).
ml_ratings = (
    pd.read_csv(
        ml_rating_filename,
        names=["userId", "movieId", "rating", "timestamp"],
        usecols=["userId", "movieId", "rating"],
    )
    .groupby(["userId", "movieId"], as_index=False, sort=False)["rating"]
    .mean()
)

# Invariant relied on when building the sparse matrix: one rating per pair.
assert not ml_ratings.duplicated(["userId", "movieId"]).any()

# Persist the de-duplicated ratings without index or header row.
ml_ratings.to_csv("ratings_re.csv", index=False, header=False)

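Note: the raw ratings contain duplicate (userId, movieId) pairs, which is why the ratings are averaged per pair and de-duplicated above.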

### Pivot the ratings into a user-centric matrix

In [3]:
from scipy.sparse import csr_matrix
from pandas.api.types import CategoricalDtype

In [4]:
# Read every line of the movie index file (trailing newlines are kept by
# readlines()). The context manager closes the file handle, which the bare
# open(...).readlines() call left open.
with open("moviesIdx2.txt") as movie_file:
    movie_list = movie_file.readlines()

In [5]:
# Ordered categorical dtypes over the ids that actually occur in the ratings.
# Their category order defines the row (user) and column (movie) positions of
# the sparse matrix built from them.
rated_user_ids = sorted(ml_ratings["userId"].unique())
rated_movie_ids = sorted(ml_ratings["movieId"].unique())

user_c = CategoricalDtype(categories=rated_user_ids, ordered=True)
movie_c = CategoricalDtype(categories=rated_movie_ids, ordered=True)
# Alternative that was considered: one category per entry of movie_list,
# i.e. range(0, len(movie_list)). Not used because some movies have no rating at all.

In [6]:
# Build the user x movie rating matrix in CSR form.
# Casting the id columns to the ordered categorical dtypes turns each id into
# its positional code, which serves directly as the row / column index.
row = ml_ratings["userId"].astype(user_c).cat.codes
col = ml_ratings["movieId"].astype(movie_c).cat.codes

# One row per user category, one column per movie category.
matrix_shape = (len(user_c.categories), len(movie_c.categories))
sparse_matrix = csr_matrix((ml_ratings["rating"], (row, col)), shape=matrix_shape)

In [7]:
# Display the matrix summary (shape, dtype and number of stored elements).
sparse_matrix

<138493x15085 sparse matrix of type '<class 'numpy.float64'>'
	with 14094614 stored elements in Compressed Sparse Row format>

### Find the movies without any rating

In [8]:
# Every index in [0, 15532) that is missing from the rated-movie categories.
# NOTE(review): 15532 is presumably the total number of movie indices
# (cf. len(movie_list)) — confirm and replace the literal if so.
no_ratings = [idx for idx in range(15532) if idx not in movie_c.categories]

no_ratings

[44,
 143,
 157,
 198,
 226,
 229,
 236,
 237,
 286,
 348,
 397,
 401,
 467,
 473,
 521,
 535,
 551,
 567,
 574,
 614,
 661,
 666,
 717,
 739,
 755,
 757,
 761,
 770,
 801,
 845,
 849,
 865,
 875,
 918,
 992,
 998,
 1162,
 1166,
 1283,
 1336,
 1372,
 1497,
 1549,
 1562,
 1565,
 1570,
 1582,
 1591,
 1747,
 1764,
 1805,
 1851,
 1859,
 1869,
 1875,
 1887,
 1960,
 1987,
 1997,
 2028,
 2030,
 2133,
 2141,
 2194,
 2212,
 2256,
 2297,
 2377,
 2390,
 2447,
 2484,
 2507,
 2578,
 2607,
 2645,
 2647,
 2677,
 2697,
 2812,
 2856,
 3007,
 3019,
 3039,
 3051,
 3098,
 3351,
 3480,
 3483,
 3539,
 3549,
 3641,
 3668,
 3769,
 3778,
 3923,
 3970,
 4014,
 4038,
 4047,
 4064,
 4090,
 4173,
 4209,
 4246,
 4278,
 4346,
 4347,
 4372,
 4374,
 4451,
 4462,
 4476,
 4497,
 4529,
 4539,
 4548,
 4658,
 4681,
 4711,
 4721,
 4872,
 4880,
 4921,
 5043,
 5225,
 5317,
 5530,
 5570,
 5614,
 5690,
 5754,
 5801,
 5938,
 6014,
 6046,
 6076,
 6218,
 6267,
 6280,
 6351,
 6465,
 6479,
 6485,
 6516,
 6536,
 6624,
 6735,
 6827,
 

### Remove ratings > 5 (set them to 0)

In [9]:
# Largest entry of the rating matrix, checked before zeroing values above 5.0.
sparse_matrix.max()

5.0

In [10]:
# Zero out every stored rating above 5.0 by masking the CSR data array in place
# (the sparsity structure is left untouched; zeroed entries stay stored).
too_large = sparse_matrix.data > 5.0
sparse_matrix.data[too_large] = 0

In [11]:
# Re-check the largest entry after the values above 5.0 were set to 0.
sparse_matrix.max()

5.0

### Normalize

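Optional step (this is a note, not an executed cell): scale the ratings by 0.2 so that the maximum of 5.0 becomes 1.0, using `sparse_matrix = sparse_matrix.multiply(0.2)`.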

## Save sparse CSR

In [12]:
# Save the CSR rating matrix to disk with pickle.
import pickle

filename = "ratings.csr"
# The context manager flushes and closes the file even if pickling fails;
# the previous bare open(...) left the handle to the garbage collector.
with open(filename, "wb") as csr_file:
    pickle.dump(sparse_matrix, csr_file)

---

## Make Dataframe version

In [None]:
# Wrap the CSR matrix in a DataFrame with sparse columns, labelled by the user
# ids (rows) and movie ids (columns) the matrix was built from.
# pd.SparseDataFrame was removed in pandas 1.0; the sparse accessor is its
# replacement and builds the frame from a scipy sparse matrix, with unstored
# entries acting as the fill value 0.
ratings_pivot = pd.DataFrame.sparse.from_spmatrix(
    sparse_matrix,
    index=user_c.categories,
    columns=movie_c.categories,
)

In [None]:
# Preview the first rows of the user x movie rating frame.
ratings_pivot.head()

## Save dataframe

In [None]:
# Save the DataFrame version of the rating matrix to disk with pickle.
import pickle

# MOVIELENS_DATASET is not defined in any cell shown in this notebook, so a
# fresh-kernel run would raise NameError here. Use it as the path prefix when
# it exists; otherwise fall back to the working directory, like "ratings.csr".
dataset_dir = globals().get("MOVIELENS_DATASET", "")
filename = dataset_dir + "ratings.df"

# Close the file deterministically instead of leaking the handle.
with open(filename, "wb") as df_file:
    pickle.dump(ratings_pivot, df_file)