# How to run this program

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Loading data

In [2]:
# Raw string: backslashes are already literal here, so they must not be doubled.
RATINGS_PATH = r"D:\data\ratings.csv"
RATING_COLUMNS = ['userId', 'movieId', 'rating']

# Load only the columns that are used (the timestamp column is never parsed).
# `reindex` pins the column order to RATING_COLUMNS regardless of the order in
# the file, and yields a standalone frame that is safe to add columns to later.
data = pd.read_csv(RATINGS_PATH, usecols=RATING_COLUMNS).reindex(columns=RATING_COLUMNS)
data.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [3]:
# Distinct ids in ascending order (the set comprehension drops duplicates).
user_ids = sorted({uid for uid in data['userId']})
movie_ids = sorted({mid for mid in data['movieId']})
n_users, n_movies = len(user_ids), len(movie_ids)

print(
    "number of users: {}\nnumber of movies: {}".format(n_users, n_movies)
)

number of users: 610
number of movies: 9724


In [4]:
# Number of distinct users that rated each movie, most-rated movies first.
raters_per_movie = data.groupby('movieId')['userId'].nunique()
vector_sizes = raters_per_movie.sort_values(ascending=False)

print(vector_sizes.head())
print(
    'on average, each movie is rated {} times'.format(vector_sizes.mean())
)

movieId
356     329
318     317
296     307
593     279
2571    278
Name: userId, dtype: int64
on average, each movie is rated 10.369806663924312 times


# Mean centering

In [5]:
# Group the ratings by user; the grouping object is reused for mean centering.
user_group = data.groupby('userId')

# Per-user summary table with two columns: mean rating and number of ratings.
user_ratings = user_group['rating']
user_means = user_ratings.agg(['mean', 'count'])

In [6]:
def mean_centering(ratings):
    """Return `ratings` with their own mean subtracted from every value."""
    return ratings - ratings.mean()


# Applied group-wise, so every rating is shifted by the mean of its own user.
# NOTE(review): the means are taken over all of `data`, i.e. before the
# train/test split, so held-out ratings also contribute to each user's mean --
# confirm this is intended.
data['meanCenteredRating'] = user_group['rating'].transform(mean_centering)
data.head()

Unnamed: 0,userId,movieId,rating,meanCenteredRating
0,1,1,4.0,-0.366379
1,1,3,4.0,-0.366379
2,1,6,4.0,-0.366379
3,1,47,5.0,0.633621
4,1,50,5.0,0.633621


In [7]:
# Preview the per-user statistics: mean rating and number of ratings.
user_means.head()

Unnamed: 0_level_0,mean,count
userId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,4.366379,232
2,3.948276,29
3,2.435897,39
4,3.555556,216
5,3.636364,44


# Splitting data

In [8]:
# Hold out 5% of the rating rows for testing; the fixed seed makes the split
# reproducible across runs.
data_train, data_test = train_test_split(
    data,
    test_size=0.05,
    random_state=42,
)

In [9]:
# userId <-> matrix row index (position of the id in the sorted `user_ids` list)
row2user = dict(enumerate(user_ids))
user2row = {uid: row for row, uid in row2user.items()}

# movieId <-> matrix column index (position of the id in the sorted `movie_ids` list)
col2movie = dict(enumerate(movie_ids))
movie2col = {mid: col for col, mid in col2movie.items()}

In [10]:
# turn ratings data in table format into a user-item rating matrix
def data_to_matrix(data):
    """Build a dense user x movie matrix of mean-centered ratings.

    Relies on the notebook-level globals ``n_users``, ``n_movies``,
    ``user2row`` and ``movie2col``.

    Parameters
    ----------
    data : pandas.DataFrame
        Ratings table with 'userId', 'movieId' and 'meanCenteredRating' columns.

    Returns
    -------
    numpy.ndarray
        float32 array of shape (n_users, n_movies). Entry
        ``[user2row[u], movie2col[m]]`` holds the mean-centered rating of
        movie ``m`` by user ``u``; entries without a rating are NaN.

    Raises
    ------
    KeyError
        If ``data`` contains a userId or movieId that is missing from
        ``user2row`` / ``movie2col``.
    """
    mat = np.full((n_users, n_movies), np.nan, dtype=np.float32)
    if len(data) == 0:
        return mat

    # If a (user, movie) pair occurs more than once, keep the last occurrence,
    # which is what a row-by-row fill would leave in the matrix.
    ratings = data.drop_duplicates(subset=['userId', 'movieId'], keep='last')

    # Translate ids to matrix coordinates in bulk; ids absent from the
    # mapping come back as NaN.
    rows = ratings['userId'].map(user2row)
    cols = ratings['movieId'].map(movie2col)

    unknown = rows.isna() | cols.isna()
    if unknown.any():
        bad = ratings.loc[unknown, ['userId', 'movieId']].iloc[0]
        raise KeyError(
            'unknown userId/movieId pair: ({}, {})'.format(bad['userId'], bad['movieId'])
        )

    # One vectorised assignment instead of a Python-level loop over rows.
    mat[rows.to_numpy(dtype=np.intp), cols.to_numpy(dtype=np.intp)] = (
        ratings['meanCenteredRating'].to_numpy()
    )
    return mat

# User-item matrix built from the training rows only.
train_ratings = data_to_matrix(data_train)