# BLU11 - Exercises Notebook

In [None]:
import os

import hashlib

import numpy as np
np.seterr(divide='ignore', invalid='ignore')

import scipy as sp
from scipy.sparse import coo_matrix
from sklearn.metrics.pairwise import cosine_similarity

# 1 About the Data

The data under the `/data/` folder was randomly generated, using the Python `faker` package.

It replicates what we would expect from a real-world dataset, corresponding to an RS to recommend movies to users.

We have three main files: `/data/user_profiles.csv`, `/data/item_profiles.csv`, and `/data/ratings.csv`.

The files `user_profiles.csv` and `item_profiles.csv` contain profiles (for users and items, respectively), while `ratings.csv` has the traditional structure:
* UserID, a `sha256` string identifying the user
* ItemID, a `sha256` string identifying the item
* Rating, drawn from the set of possible ratings $S = \{1, 2, 3, 4, 5\}$.
* Timestamp.

User profiles follow the structure: UserID, Username, Name, Sex, Mail and Birthday. 

Item profiles, in line with the example in the learning materials, contain ItemID and Genre.

We build content-based and collaborative filtering pipelines, to provide movie recommendations to users.

# 2 Make Ratings

At the core of any RS is our base model and, with it, the Ratings matrix.

## 1.1 Read Data (graded)

We start by creating all the arrays we need to complete the exercise.

The data is somewhat different this time because user and item ID are both strings.

Although we have all users in the user profiles, we don't readily know what the minimum and maximum values are for items.

For users and items, we want the arrays with all possible values, as well as arrays with the values in ratings and the profiles. 

In [None]:
def make_data(): 
    """Load every array the exercise needs from the CSV files under ``data``.

    Exercise stub: each ``raise NotImplementedError()`` marks one step that
    must be replaced with an implementation built on ``read_data``. Until
    then the function raises on its first placeholder.

    Returns:
        The tuple ``(users, users_from_ratings, items_from_ratings,
        items_from_profiles, ratings, genres)``.

    Raises:
        NotImplementedError: While any placeholder is still in place.
    """
    
    # Locations of the three input files, relative to the working directory.
    path_users = os.path.join('data', 'user_profiles.csv')
    path_items = os.path.join('data', 'item_profiles.csv')
    path_ratings = os.path.join('data', 'ratings.csv')
    
    # users = read_data(...)
    # users_from_ratings = read_data(...)
    # YOUR CODE HERE
    raise NotImplementedError()
    
    # Sort the users array using ndarray.argsort.
    # This ensures a consistent order for the rows of the ratings matrix
    # downstream.
    # YOUR CODE HERE
    raise NotImplementedError()
    
    # items_from_ratings = read_data(...)
    # items_from_profiles = read_data(...)
    # YOUR CODE HERE
    raise NotImplementedError()
    
    # ratings = read_data(...)
    # YOUR CODE HERE
    raise NotImplementedError()
    
    # genres = read_data(...)
    # YOUR CODE HERE
    raise NotImplementedError()
    
    return users, users_from_ratings, items_from_ratings, items_from_profiles, ratings, genres


def read_data(path, dtype, column):
    """Read one column of a CSV file into a rank-1 array (exercise stub).

    Args:
        path: Location of the CSV file to read.
        dtype: Presumably the NumPy dtype of the returned array -- confirm
            when implementing.
        column: Presumably selects the single column to read -- confirm
            when implementing.

    Returns:
        A rank-1 array, once implemented.

    Raises:
        NotImplementedError: Always, until the stub is filled in.
    """
    # Use np.genfromtxt to build a general function that reads the data into
    # arrays with a single column. Ignore the header row and use ',' as the
    # delimiter.
    # The return value is a rank-1 array.
    # YOUR CODE HERE
    raise NotImplementedError()

In [None]:
# Build every array once. The checks below pin each array's shape and
# spot-check one element of each through its SHA-256 digest.
(users, users_from_ratings, items_from_ratings,
 items_from_profiles, ratings, genres) = make_data()


# users
assert users.shape == (1000,)

expected_hash = 'f6a9047f7a89eddeba41b154132d54b7fca78311c7fc783245f5f0ae10b9ec0c'
assert hashlib.sha256(users[510]).hexdigest() == expected_hash

# users_from_ratings
assert users_from_ratings.shape == (5000,)

expected_hash = 'af8577a869a89c1cdbf88936bfc83cb4c5049254dc84bc2f5c34cbfbce0415c2'
assert hashlib.sha256(users_from_ratings[3433]).hexdigest() == expected_hash

# items_from_ratings
assert items_from_ratings.shape == (5000,)

expected_hash = 'c5218decaeea9fe00b0cf56498219f6d99dcfc47a9257bc93614f8df19193c82'
assert hashlib.sha256(items_from_ratings[3433]).hexdigest() == expected_hash

# ratings
assert ratings.shape == (5000,)

expected_hash = 'f0a0278e4372459cca6159cd5e71cfee638302a7b9ca9b05c34181ac0a65ac5d'
assert hashlib.sha256(ratings[3433]).hexdigest() == expected_hash

# items_from_profiles
assert items_from_profiles.shape == (4900,)

expected_hash = '47fe7fb144b5a61d78b87d9000b400010731dbe9ed417486cb1e89bd02b60015'
assert hashlib.sha256(items_from_profiles[3340]).hexdigest() == expected_hash

# genres
assert genres.shape == (4900,)

expected_hash = '85f1c8c8e324b6be99b13732edd1770eb0d200d15becbd659cc47ff5e060ac43'
assert hashlib.sha256(genres[3340]).hexdigest() == expected_hash

## 1.2 Make Items (graded)

The set of all *known* items $I$ is given by $I_{ratings} \cup I_{profiles}$, i.e., the union of the items in ratings and item profiles.

In [None]:
def make_items(items_from_ratings, items_from_profiles):
    """Build the array of all known items (exercise stub).

    The set of known items is the union of the items found in the ratings
    and the items found in the item profiles.

    Args:
        items_from_ratings: Item IDs taken from the ratings.
        items_from_profiles: Item IDs taken from the item profiles.

    Returns:
        A sorted rank-1 array of unique items, once implemented.

    Raises:
        NotImplementedError: Always, until the stub is filled in.
    """
    
    # Use np.concatenate to create a single array with all the items.
    # No asserts depend on the order in which you concatenate the arrays.
    # items =
    # YOUR CODE HERE
    raise NotImplementedError()
    
    # Return a sorted array of unique items, as a rank-1 array.
    # Sorting ensures a consistent order for the columns of the ratings matrix.
    # YOUR CODE HERE
    raise NotImplementedError()

In [None]:
items = make_items(items_from_ratings, items_from_profiles)


# Expected number of unique items across ratings and profiles.
assert items.shape == (6322,)

# Spot-check one element of the sorted array through its SHA-256 digest.
expected_hash = 'aeeb2b5edeaeb8343409c3809378ec8a271891b2d9e2334f7853047fb445ba5d'
assert hashlib.sha256(items[2863]).hexdigest() == expected_hash

## 1.3 Ratings Matrix (graded)

As always, we make the indispensable user-items ratings matrix.

We are ready to build it, since we have all the users and all the items we need to account for.

We start by building two helper functions to find the row and column indices for each rating.

In [None]:
# Define the two helpers that the next cell calls:
#   make_ratings_rows(users, users_from_ratings)
#   make_ratings_cols(items, items_from_ratings)
# Each should return one index per rating: the position of the rating's user
# in `users` (rows) or of the rating's item in `items` (cols).
# YOUR CODE HERE
raise NotImplementedError()

In [None]:
rows = make_ratings_rows(users, users_from_ratings)
cols = make_ratings_cols(items, items_from_ratings)


# One row index and one column index per rating.
assert rows.shape == cols.shape == (5000,)

# Row indices must be valid positions in `users`.
assert np.all(rows >= 0)
assert np.all(rows < users.shape[0])

expected_hash = 'ae86f791757dce0e3800c3803b560df4d3825c2cbf254ef5b9f8ea3bdea8fdcc'
assert hashlib.sha256(rows[4457]).hexdigest() == expected_hash

# Column indices must be valid positions in `items`.
assert np.all(cols >= 0)
assert np.all(cols < items.shape[0])

expected_hash = 'f5c50ec3895168b69eb366805086712f30164e9ff0b0e8a26e109140efc2da6f'
assert hashlib.sha256(cols[4457]).hexdigest() == expected_hash

Note that we build the matrix differently, because, unlike in previous examples, both the user and item IDs are *strings*.

Above, we are building the row and column indices from scratch, instead of using user and item IDs like we did in the learning materials.

Also, since we have the complete sets of users and items, we can use them to infer the dimensions (number of rows and columns) of the ratings matrix.

(Since, given `users` and `items`, we know *how many* users and items there are in our dataset.)

In [None]:
def make_ratings(users, items, users_from_ratings, items_from_ratings, ratings):
    """Build the user-item ratings matrix (exercise stub).

    Args:
        users: Array of all users; its length gives the number of rows.
        items: Array of all items; its length gives the number of columns.
        users_from_ratings: User ID of each rating.
        items_from_ratings: Item ID of each rating.
        ratings: Rating values.

    Returns:
        A CSR sparse matrix, once implemented.

    Raises:
        NotImplementedError: Always, until the stub is filled in.
    """
    
    # rows = make_ratings_rows(...)
    # YOUR CODE HERE
    raise NotImplementedError()
    
    # cols = make_ratings_cols(...)
    # YOUR CODE HERE
    raise NotImplementedError()
    
    # The matrix dimensions follow from the complete sets of users and items.
    # nrows = ...
    # ncols = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    shape = (nrows, ncols)
    
    # data = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    
    # Construct a COO sparse matrix.
    # coo =
    # YOUR CODE HERE
    raise NotImplementedError()
    
    # Return a CSR sparse matrix.
    # YOUR CODE HERE
    raise NotImplementedError()

In [None]:
R = make_ratings(users, items, users_from_ratings, items_from_ratings, ratings)


# Check against the public `scipy.sparse.csr_matrix` name; the
# `scipy.sparse.csr` submodule path is a deprecated private namespace.
assert isinstance(R, sp.sparse.csr_matrix)
# One row per user and one column per known item.
assert R.shape == (1000, 6322)
# One non-zero entry per rating.
assert R.count_nonzero() == 5000

# 2 Content-based Recommendations

Now, we move to the pipeline of content-based filtering recommendations.

## 2.1 Make Item Profiles (graded)

The first step, as we've seen in the learning materials, is to build the Item Profiles. Shall we?

Again, we start with the helper function, to generate the rows.

In [None]:
def make_profiles_rows(items, items_from_profiles):
    """Find the row index in ``items`` of every profiled item (exercise stub).

    Args:
        items: Array of all known items.
        items_from_profiles: Item IDs taken from the item profiles.

    Returns:
        An array built from ``rows``, once implemented.

    Raises:
        NotImplementedError: Always, until the stub is filled in.
    """
    # Combine a list comprehension with np.argwhere to find, for each element
    # of `items_from_profiles`, its index in `items`.
    # Remember: to extract the first element of the resulting array use [0, 0].
    # rows = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    return np.array(rows)

In [None]:
def make_item_profiles(items, items_from_profiles, genres):
    """Build the TF-IDF weighted item-profiles matrix (exercise stub).

    Args:
        items: Array of all known items.
        items_from_profiles: Item IDs taken from the item profiles.
        genres: Genre of each entry in the item profiles.

    Returns:
        A CSR sparse matrix, once implemented.

    Raises:
        NotImplementedError: Always, until the stub is filled in.
    """
    
    # Use np.unique to get the unique genres, the column indices and the value
    # counts that we use in the TF-IDF bit.
    # genres_unique, genres_cols, genres_count = ...
    # YOUR CODE HERE
    raise NotImplementedError()
     
    # rows = make_profiles_rows(...)
    # YOUR CODE HERE
    raise NotImplementedError()
    
    # cols = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    
    # nrows = ...
    # ncols = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    shape = (nrows, ncols)
    
    # Use NumPy to compute the Inverse Document Frequency (IDF), as we've 
    # seen in the learning materials.
    # idf = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    
    # Use idf to give the right weight to each row, as we've seen in the
    # materials.
    # data = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    
    # Construct the sparse matrix as COO.
    # coo = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    
    # Return a CSR sparse matrix.
    # YOUR CODE HERE
    raise NotImplementedError()

In [None]:
I = make_item_profiles(items, items_from_profiles, genres)


# Check against the public `scipy.sparse.csr_matrix` name; the
# `scipy.sparse.csr` submodule path is a deprecated private namespace.
assert isinstance(I, sp.sparse.csr_matrix)
# One row per known item; 16 columns are expected.
assert I.shape == (6322, 16)
# One non-zero entry per item profile.
assert I.count_nonzero() == 4900

## 2.2 Profile Learner (graded)

We have successfully built Item Profiles using TF-IDF.

Time to test our algebra skills to uncover User Profiles.

In [None]:
def profile_learner(R, I):
    """Derive the user profiles from ratings and item profiles (exercise stub).

    Args:
        R: Ratings matrix.
        I: Item-profiles matrix.

    Returns:
        The user profiles, once implemented. The grading cell expects a CSR
        sparse matrix of shape (1000, 16).

    Raises:
        NotImplementedError: Always, until the stub is filled in.
    """
    # YOUR CODE HERE
    raise NotImplementedError()

In [None]:
U = profile_learner(R, I)


# Check against the public `scipy.sparse.csr_matrix` name; the
# `scipy.sparse.csr` submodule path is a deprecated private namespace.
assert isinstance(U, sp.sparse.csr_matrix)
# One row per user, with the same number of columns as the item profiles.
assert U.shape == (1000, 16)
assert U.count_nonzero() == 2207

## 2.3 Content-based Prediction (graded)

We generate predictions by computing the similarities between user and item profiles.

In this exercise, we don't want our output to be dense, i.e., we must return a sparse matrix. 

Some functions allow us to pass a `dense_output=False` parameter to ensure the output is sparse if the inputs are sparse.

In [None]:
def predict_content_based(U, I):
    """Predict by comparing user profiles with item profiles (exercise stub).

    Args:
        U: User-profiles matrix.
        I: Item-profiles matrix.

    Returns:
        A sparse matrix with the user-item similarities, once implemented.

    Raises:
        NotImplementedError: Always, until the stub is filled in.
    """
    # Return a sparse matrix with similarities; the output must not be dense.
    # YOUR CODE HERE
    raise NotImplementedError()

In [None]:
R_pred = predict_content_based(U, I)


# Check against the public `scipy.sparse.csr_matrix` name; the
# `scipy.sparse.csr` submodule path is a deprecated private namespace.
assert isinstance(R_pred, sp.sparse.csr_matrix)
# Same shape as the ratings matrix.
assert R_pred.shape == (1000, 6322)
assert R_pred.count_nonzero() == 879378

## 2.4 Best-item Content-based (graded)

We want to exclude previously rated items and recommend the best match to users.

In [None]:
def best_item_content_based(ratings, preds):
    """Pick each user's best-matching item among those not yet rated (stub).

    Args:
        ratings: Ratings matrix, used to identify previously rated items.
        preds: Predicted ratings; copied first, so the caller's matrix is
            left untouched by the copy step.

    Returns:
        The indices of the maximum value per row, once implemented. The
        grading cell expects shape (1000, 1).

    Raises:
        NotImplementedError: Always, until the stub is filled in.
    """
    
    # Work on a copy so that the changes below do not reach `preds`.
    preds_ = preds.copy()
    # Convert preds to a LIL sparse matrix, which is more efficient for
    # changing the sparsity structure.
    # preds_ =
    # YOUR CODE HERE
    raise NotImplementedError()
    # Replace the predicted ratings for previously rated items with zero.
    # YOUR CODE HERE
    raise NotImplementedError()
    # Since the changes are done, convert the matrix back to CSR.
    # preds_ = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    # Return the indices of the maximum value per row.
    # YOUR CODE HERE
    raise NotImplementedError()

In [None]:
L = best_item_content_based(R, R_pred)


# One recommended item index per user.
assert L.shape == (1000, 1)

# Spot-check the item chosen for one user through its SHA-256 digest ...
expected_hash = 'd50dcf8f0079368cc10c2be55ae3fe0c9b30946bda75aacb841cefa0128f7710'
assert hashlib.sha256(L[665][0, 0]).hexdigest() == expected_hash

# ... and the predicted value stored for that user-item pair.
expected_hash = '608b3f640ca82d78dedccd8d8b7ea423dab7b02f1fa007268a02aaf5969bb9e1'
assert hashlib.sha256(R_pred[665, L[665][0, 0]]).hexdigest() == expected_hash

# 3 Collaborative-filtering

Using the Ratings matrix, we can also provide collaborative filtering based recommendations.

## 3.1 User Similarities (graded)

We compute the user similarities.

Again, we want our output, i.e., the similarities matrix, to be sparse. We can do it using the `dense_output` parameter.

In [None]:
def user_similarities(ratings):
    """Compute the user-user similarities from the ratings (exercise stub).

    Args:
        ratings: Ratings matrix.

    Returns:
        A sparse matrix with user-user similarities, once implemented.

    Raises:
        NotImplementedError: Always, until the stub is filled in.
    """
    # Return a sparse matrix with user-user similarities.
    # YOUR CODE HERE
    raise NotImplementedError()

In [None]:
U_sim = user_similarities(R)


# Check against the public `scipy.sparse.csr_matrix` name; the
# `scipy.sparse.csr` submodule path is a deprecated private namespace.
assert isinstance(U_sim, sp.sparse.csr_matrix)
# Square matrix with one row and one column per user.
assert U_sim.shape == (1000, 1000)
assert U_sim.count_nonzero() == 3513

## 3.2 User-based Predictions (graded)

Based on the user similarities, we compute predictions as a weighted average of the ratings of other users.

(Refer back to the learning materials for the formula.)

In [None]:
def predict_collaborative_filtering_user(ratings, sims):
    """Predict ratings from user-user similarities (exercise stub).

    Predictions are to be computed as a weighted average of the ratings of
    other users; the formula is in the learning materials.

    Args:
        ratings: Ratings matrix.
        sims: User-user similarities.

    Returns:
        The predictions, once implemented.

    Raises:
        NotImplementedError: Always, until the stub is filled in.
    """
    # preds = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    # Replace missing values (which result from division by zero) with zero.
    # YOUR CODE HERE
    raise NotImplementedError()
    # Return the predictions.
    # YOUR CODE HERE
    raise NotImplementedError()

In [None]:
R_pred_cf_u = predict_collaborative_filtering_user(R, U_sim)


# Same shape as the ratings matrix.
assert R_pred_cf_u.shape == (1000, 6322)
# Expected number of non-zero predictions.
assert R_pred_cf_u[R_pred_cf_u.nonzero()].size == 17691

## 3.4 Item Similarities (graded)

Alternatively, we can do recommendations based on item-item collaborative filtering.

Without surprises, we start by computing item similarities.

In [None]:
def item_similarities(ratings):
    """Compute the item-item similarities from the ratings (exercise stub).

    Args:
        ratings: Ratings matrix.

    Returns:
        A sparse matrix with item-item similarities, once implemented.

    Raises:
        NotImplementedError: Always, until the stub is filled in.
    """
    # Return a sparse matrix with item-item similarities.
    # YOUR CODE HERE
    raise NotImplementedError()

In [None]:
I_sim = item_similarities(R)


# Check against the public `scipy.sparse.csr_matrix` name; the
# `scipy.sparse.csr` submodule path is a deprecated private namespace.
assert isinstance(I_sim, sp.sparse.csr_matrix)
# Square matrix with one row and one column per known item.
assert I_sim.shape == (6322, 6322)
assert I_sim.count_nonzero() == 28667

## 3.5 Item-based Predictions (graded)

As the last step, we do the predictions, as a weighted average of the ratings of other items.

(Formula can be found in the learning materials.)

In [None]:
def predict_collaborative_filtering_item(ratings, sims):
    """Predict ratings from item-item similarities (exercise stub).

    Predictions are to be computed as a weighted average of the ratings of
    other items; the formula is in the learning materials.

    Args:
        ratings: Ratings matrix.
        sims: Item-item similarities.

    Returns:
        The predictions, once implemented.

    Raises:
        NotImplementedError: Always, until the stub is filled in.
    """
    # preds = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    # Replace missing values with zero.
    # YOUR CODE HERE
    raise NotImplementedError()
    # Return the predictions.
    # YOUR CODE HERE
    raise NotImplementedError()

In [None]:
R_pred_cf_i = predict_collaborative_filtering_item(R, I_sim)


# Same shape as the ratings matrix.
assert R_pred_cf_i.shape == (1000, 6322)
# Expected number of non-zero predictions.
assert R_pred_cf_i[R_pred_cf_i.nonzero()].size == 17691
# Predictions must stay between 0 and 5.
assert R_pred_cf_i.min() >= 0
assert R_pred_cf_i.max() <= 5

## 3.6 Top-*N* (graded)

As for the last step, we want a top-*N* list with collaborative filtering recommendations.

In [None]:
def top_n_collaborative_filtering(ratings, preds, n):
    """Build a top-N recommendation list for every user (exercise stub).

    Args:
        ratings: Ratings matrix, used to identify previously rated items.
        preds: Predicted ratings.
        n: Number of recommendations per user.

    Returns:
        A top-N list by user, once implemented. The grading cell expects
        shape (1000, 5) for ``n=5``.

    Raises:
        NotImplementedError: Always, until the stub is filled in.
    """
    # Replace the predicted ratings for previously rated items with zero.
    # YOUR CODE HERE
    raise NotImplementedError()
    # Return a top-N list by user.
    # YOUR CODE HERE
    raise NotImplementedError()

In [None]:
# Top-5 lists from the user-based and the item-based predictions.
top_5_cf_u = top_n_collaborative_filtering(R, R_pred_cf_u, 5)
top_5_cf_i = top_n_collaborative_filtering(R, R_pred_cf_i, 5)


# Five recommendations for each of the 1000 users, from both variants.
assert top_5_cf_u.shape == top_5_cf_i.shape == (1000, 5)