In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Loading data

In [2]:
# Read the ratings CSV and keep only the three columns used below,
# which drops the timestamp column.
# NOTE(review): the path is absolute and machine-specific — consider making it configurable.
rating_columns = ['userId', 'movieId', 'rating']
data = pd.read_csv(r"D:\\data\\ratings.csv")[rating_columns]
data.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [3]:
# Distinct user and movie ids in ascending order; set() removes the duplicates.
user_ids, movie_ids = (sorted(set(data[column])) for column in ('userId', 'movieId'))
n_users, n_movies = len(user_ids), len(movie_ids)

print("number of users: {}\nnumber of movies: {}".format(n_users, n_movies))

number of users: 610
number of movies: 9724


In [4]:
# Number of distinct users who rated each movie, most-rated movies first.
raters_per_movie = data.groupby('movieId')['userId'].nunique()
vector_sizes = raters_per_movie.sort_values(ascending=False)
print(vector_sizes.head())
print('on average, each movie is rated {} times'.format(vector_sizes.mean()))

movieId
356     329
318     317
296     307
593     279
2571    278
Name: userId, dtype: int64
on average, each movie is rated 10.369806663924312 times


# Mean centering

In [5]:
# Group the ratings per user; the same grouping is reused for mean centering below.
user_group = data.groupby('userId')
# Per-user mean rating and number of ratings, indexed by userId.
user_means = user_group['rating'].agg(['mean', 'count'])

In [6]:
def mean_centering(ratings):
    """Subtract the mean of `ratings` from each of its values."""
    return ratings - ratings.mean()

# Center every rating on the mean rating of the user who gave it.
data['meanCenteredRating'] = user_group['rating'].transform(mean_centering)
data.head()

Unnamed: 0,userId,movieId,rating,meanCenteredRating
0,1,1,4.0,-0.366379
1,1,3,4.0,-0.366379
2,1,6,4.0,-0.366379
3,1,47,5.0,0.633621
4,1,50,5.0,0.633621


In [7]:
user_means.head()

Unnamed: 0_level_0,mean,count
userId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,4.366379,232
2,3.948276,29
3,2.435897,39
4,3.555556,216
5,3.636364,44


# Splitting data

In [8]:
# Hold out 5% of the rating rows for testing; random_state fixes the shuffle.
# NOTE(review): meanCenteredRating was computed from all rows before this split,
# so each user's mean also reflects their held-out ratings — confirm this is intended.
data_train, data_test = train_test_split(data, test_size=0.05, random_state=42)

In [9]:
# userId <-> row index of the ratings matrix (positions follow the order of user_ids)
user2row = dict(zip(user_ids, range(len(user_ids))))
row2user = dict(enumerate(user_ids))

# movieId <-> column index of the ratings matrix (positions follow the order of movie_ids)
movie2col = dict(zip(movie_ids, range(len(movie_ids))))
col2movie = dict(enumerate(movie_ids))

In [10]:
# turn ratings data in table format into a user-item rating matrix
def data_to_matrix(data):
    """Build the dense user-item matrix of mean-centered ratings.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns 'userId', 'movieId' and 'meanCenteredRating'.

    Returns
    -------
    numpy.ndarray
        float32 array of shape (n_users, n_movies). Cells without a rating
        in `data` are NaN.

    Raises
    ------
    KeyError
        If a userId / movieId is not present in `user2row` / `movie2col`.
    """
    mat = np.full((n_users, n_movies), np.nan, dtype=np.float32)
    n_rows = len(data)
    # Translate the ids to matrix coordinates once, then fill all rated cells
    # with a single indexed assignment instead of a slow per-row iterrows() loop.
    rows = np.fromiter((user2row[u] for u in data['userId']), dtype=np.intp, count=n_rows)
    cols = np.fromiter((movie2col[m] for m in data['movieId']), dtype=np.intp, count=n_rows)
    mat[rows, cols] = data['meanCenteredRating'].to_numpy()
    return mat

train_ratings = data_to_matrix(data_train)

# Compute similarity matrix

In [34]:
# create a blank similarity matrix; NaN marks item pairs whose similarity has not
# been computed, so an unfilled entry is never mistaken for a real value
sim_matrix = np.full((n_movies, n_movies), np.nan, dtype=np.float32)
sim_matrix.shape

(9724, 9724)

In [35]:
# remove co-elements from 2 vectors if at least one of them is NaN
def remove_nans(a, b):
    """Keep only the positions at which neither vector is NaN.

    Intended for two 1-D arrays of equal length. Returns the two filtered
    vectors, still aligned element by element.
    """
    paired = np.column_stack((a, b))        # one row per position: [a_i, b_i]
    keep = ~np.isnan(paired).any(axis=1)    # rows that contain no NaN
    kept = paired[keep]
    return kept[:, 0], kept[:, 1]

In [36]:
# calculate a similarity value given 2 vectors
def calsim(item1, item2, min_co_elements=1):
    """Cosine similarity of two vectors over the positions where both have a value.

    Positions at which either vector is NaN are dropped first (via `remove_nans`).

    Parameters
    ----------
    item1, item2 : numpy.ndarray
        Vectors of equal length, NaN where a value is missing.
    min_co_elements : int, default 1
        Minimum number of shared (non-NaN) positions required.

    Returns
    -------
    float
        dot(item1, item2) / (|item1| * |item2|) over the shared positions, or NaN
        when no positions are shared, fewer than `min_co_elements` are shared,
        or either remaining vector has zero norm (cosine is undefined).
    """
    item1, item2 = remove_nans(item1, item2)
    # item1 and item2 have the same size at this point
    if item1.size == 0 or item1.size < min_co_elements:
        return np.nan
    denom = np.linalg.norm(item1) * np.linalg.norm(item2)
    if denom == 0:
        # An all-zero vector has no direction. Without this guard the division
        # below is 0/0, which yields NaN but also emits a RuntimeWarning.
        return np.nan
    return item1.dot(item2) / denom

In [37]:
# Load the similarity matrix cached on disk.
# If 'sim_matrix.npy' is not available, skip this cell and run the
# (commented-out) computation in the next cell instead.
sim_matrix = np.load('sim_matrix.npy')

In [38]:
# calculate all the similarities
# for item1 in range(n_movies):
#     item1vector = train_ratings[:, item1]
#     for item2 in range(item1, n_movies):
#         item2vector = train_ratings[:, item2]
#         sim = calsim(item1vector, item2vector, min_co_elements=2)
#         sim_matrix[item1, item2] = sim
#         sim_matrix[item2, item1] = sim
#     if (item1+1) % 50 == 0 or item1+1 == n_movies:
#         print("Progress: {}/{} ({:.2f} %) items calculated".format(item1+1, n_movies, (item1+1)*100/n_movies))

In [39]:
# The similarity matrix takes a long time to compute,
# so caching it on disk saves time in future runs.
np.save('sim_matrix', sim_matrix)

In [40]:
# Share of sim_matrix entries that are NaN, i.e. item pairs without a similarity value.
print('fractions of similarity matrix that are NaN:', np.isnan(sim_matrix).mean())

fractions of similarity matrix that are NaN: 0.03304207225977497


# Recommendation

In [41]:
# define a predict function which receives row and column in the ratings matrix
def predict(ratings, user_item, sim_threshold, debug=True):
    """Predict one user's rating of one item from that user's other ratings.

    The prediction is the similarity-weighted average of the user's ratings,
    with weights taken from column `desired_item` of the global `sim_matrix`:

        sum(s_i * r_i) / sum(|s_i|)

    The weights are normalised by their absolute values so that negative
    similarities (possible when `sim_threshold` is negative) cannot cancel the
    denominator or flip the sign of the result. For non-negative similarities
    this equals the plain sum of the weights.

    Parameters
    ----------
    ratings : numpy.ndarray
        2-D array indexed [user_row, item_column]; NaN marks a missing rating.
    user_item : tuple
        (user_row, item_column) to predict.
    sim_threshold : float
        Items whose similarity to the target item is below this are ignored.
    debug : bool, default True
        Print the similarity and rating of every item that contributes.

    Returns
    -------
    float
        The weighted average, or NaN when no item contributes.
    """
    desired_user, desired_item = user_item
    n_items = ratings.shape[1]
    sims = sim_matrix[:n_items, desired_item]
    user_ratings = ratings[desired_user, :]

    # NaN similarities compare as False here and are excluded by isnan below;
    # errstate only silences the "invalid value" warning some NumPy versions emit.
    with np.errstate(invalid='ignore'):
        below_threshold = sims < sim_threshold
    usable = ~(np.isnan(sims) | below_threshold | np.isnan(user_ratings))
    usable &= np.arange(n_items) != desired_item  # never use the target item itself
    neighbours = np.flatnonzero(usable)

    if debug:
        for item in neighbours:
            print('sim and rating of item {}:'.format(item), sims[item], user_ratings[item])

    # Accumulate in float64 to limit rounding error over many float32 terms.
    weights = sims[neighbours].astype(np.float64)
    values = user_ratings[neighbours].astype(np.float64)
    rating_sum = float(np.dot(weights, values))
    total_sim = float(np.abs(weights).sum())
    return rating_sum / total_sim if total_sim else np.nan

In [42]:
# Debug run: predict the mean-centered rating at (user row 0, item column 30),
# printing every contributing item, and show the stored training value beside it.
predict(train_ratings, (0, 30), sim_threshold = 0.), train_ratings[0, 30]

sim and rating of item 130: 0.15915701 0.6336207
sim and rating of item 136: 0.00030422444 0.6336207
sim and rating of item 201: 0.06322966 -0.36637932
sim and rating of item 291: 0.012006057 0.6336207
sim and rating of item 320: 0.07370741 0.6336207
sim and rating of item 325: 0.026950352 -0.36637932
sim and rating of item 367: 0.051242784 -1.3663793
sim and rating of item 436: 0.057251804 -1.3663793
sim and rating of item 484: 0.043667253 -0.36637932
sim and rating of item 508: 0.0005498808 -0.36637932
sim and rating of item 546: 0.0958714 -1.3663793
sim and rating of item 559: 0.109139256 -1.3663793
sim and rating of item 594: 0.0714052 -1.3663793
sim and rating of item 615: 0.020978423 -1.3663793
sim and rating of item 632: 0.013212537 -0.36637932
sim and rating of item 701: 0.00033464318 0.6336207
sim and rating of item 720: 0.04851188 0.6336207
sim and rating of item 767: 0.08294536 -1.3663793
sim and rating of item 781: 0.0360465 0.6336207
sim and rating of item 786: 0.18401113 

(-0.43642004626075936, nan)

In [43]:
# Movie metadata; recommend_msg() looks up 'title' and 'genres' here by 'movieId'.
# NOTE(review): the path is absolute and machine-specific — consider making it configurable.
movie_df =pd.read_csv( r"D:\\data\\movies.csv")
movie_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [44]:
def recommend(ratings, desired_user, sim_threshold):
    """Rank every item for one user by its predicted rating.

    Items the user has not rated are scored with `predict`; items the user has
    already rated get -inf so they are never ranked above a predicted item.

    Parameters
    ----------
    ratings : numpy.ndarray
        2-D array indexed [user_row, item_column]; NaN marks a missing rating.
    desired_user : int
        Row of the user in `ratings`.
    sim_threshold : float
        Passed through to `predict`.

    Returns
    -------
    (scores_argsort, scores_sort) : tuple of numpy.ndarray
        Item column indices and their scores, best first. Finite predictions
        come first (descending), then the -inf entries of already rated items,
        and finally the NaN entries of items that could not be predicted.
    """
    n_items = ratings.shape[1]
    scores = np.empty(n_items, dtype=np.float64)
    for item in range(n_items):
        if np.isnan(ratings[desired_user, item]):
            scores[item] = predict(ratings, (desired_user, item), sim_threshold, debug=False)
        else:
            # we don't want to recommend movies that the user has already rated
            # (np.inf: the np.infty alias was removed in NumPy 2.0)
            scores[item] = -np.inf

    # argsort places NaN last in ascending order, so after reversing they come
    # first; rotating by the NaN count moves them back to the end.
    descending = np.argsort(scores)[::-1]
    no_of_nan = np.count_nonzero(np.isnan(scores))
    scores_argsort = np.roll(descending, -no_of_nan)
    # Index with the final order so indices and scores always stay aligned.
    scores_sort = scores[scores_argsort]
    return scores_argsort, scores_sort

def recommend_msg(user_row, scores_argsort, scores_sort, how_many=10):
    """Build a table of the top recommendations for one user.

    Parameters
    ----------
    user_row : int
        Row of the user in the ratings matrix; must be the user the scores
        were computed for, because that user's mean rating is added back.
    scores_argsort, scores_sort : array-like
        Item column indices and their mean-centered scores, best first
        (as returned by `recommend`).
    how_many : int, default 10
        Maximum number of recommendations to list.

    Returns
    -------
    pandas.DataFrame
        Columns 'movieId', 'title', 'genres', 'rating', indexed from 1.
        'rating' is the score plus the user's mean rating. Entries whose score
        is not finite (-inf / NaN) are left out, so the table may have fewer
        than `how_many` rows.
    """
    m = user_means.loc[row2user[user_row]]['mean']
    print('User mean rating:', m)
    columns = ['movieId', 'title', 'genres', 'rating']
    limit = max(how_many, 0)
    # Collect the rows first and build the frame once, instead of growing it
    # row by row; slicing also copes with how_many exceeding the item count.
    records = []
    for col, score in zip(scores_argsort[:limit], scores_sort[:limit]):
        if not np.isfinite(score):
            continue  # no usable prediction for this item
        movie_id = col2movie[col]
        movie = movie_df.loc[movie_df['movieId'] == movie_id].iloc[0]
        records.append([movie_id, movie['title'], movie['genres'], score + m])
    msg = pd.DataFrame(records, columns=columns, index=range(1, len(records) + 1))
    msg['movieId'] = msg['movieId'].astype(np.int32)
    return msg

In [None]:
# Rank every movie for the user in matrix row 0, using a similarity threshold of 0.
scores_argsort, scores_sort = recommend(train_ratings, 0, 0.)

In [None]:
# Ranked item column indices and their predicted mean-centered scores.
scores_argsort, scores_sort

In [None]:
# user_row must be the same matrix row that was passed to recommend() (row 0);
# otherwise a different user's mean rating is added to these scores.
recommend_msg(0, scores_argsort, scores_sort, how_many=10)