# BLU10 - Exercises Notebook

In [None]:
import hashlib

import os
import numpy as np
import time

# 1 Hacker News algorithm

In this exercise we implement one of the most famous algorithms in the world:
* The Hacker News ranking algorithm.

Excited? Let's get to it. How the algorithm works:

$$Score = \frac{(P-1)}{(T+2)^G}$$

The points of an item are $P$. (We write $P-1$ to negate the submitter's vote.)

$T$ stands for the time since submission, while $G$ stands for gravity and defaults to 1.8.

These algorithms are necessary because they show us how to use context in non-personalized recommendations.

Gravity and time make the algorithm behave in such a way that:
* The score decreases as time since submission increases
* The score decreases faster when we increase gravity.

This is a strategic decision, because content sites such as HN are under pressure to serve fresh content at all times.

# 2 Vectorization

You are expected to use vectorization and avoid `for` loops at all times.

If you don't practice now, it will be harder when facing bigger datasets (i.e., the Hackathon).

# 3 Ratings matrix (not-graded)

The first thing we will do is build the ratings matrix, just like we did in the classes.

In [None]:
def make_ratings():
    """Build the dense user-by-item ratings matrix.

    The (user, item, rating) triples returned by ``read_ratings`` are
    scattered into a 2-D array with one row per distinct user id and one
    column per distinct item id, both in the sorted order produced by
    ``np.unique``. Cells without a rating stay at 0.

    Returns:
        A 2-D float array of shape (number of users, number of items).
    """
    triples = read_ratings()
    user_ids, item_ids, values = triples[:, 0], triples[:, 1], triples[:, 2]

    # return_inverse gives, for every rating, the position of its id among
    # the sorted unique ids, which is exactly its row / column index.
    unique_users, row_idx = np.unique(user_ids, return_inverse=True)
    unique_items, col_idx = np.unique(item_ids, return_inverse=True)

    matrix = np.zeros((unique_users.size, unique_items.size))
    matrix[row_idx, col_idx] = values
    return matrix


def read_ratings():
    """Load the first three columns of the ratings CSV as a float array.

    The file is looked up relative to the current working directory and its
    header line is skipped.

    Returns:
        A 2-D array with one row per CSV record and three columns.
    """
    csv_file = os.path.join('data', 'ml-latest-small', 'ratings.csv')
    return np.genfromtxt(
        csv_file,
        delimiter=',',
        skip_header=1,
        usecols=[0, 1, 2],
    )


# Build the ratings matrix once; the bare name on the last line makes the
# notebook display it as the cell output.
R = make_ratings()
R

So here we are: users as rows, items as columns and ratings as values.

# 4 Votes matrix (graded)

We will adapt the algorithm, so that:

$$Score = \frac{P}{(T+2)^G}$$

But the first challenge we face is that we have ratings, not votes.

Thus, our first goal is to use a threshold to define positive votes. We will use a rating above 3 (concretely, ratings below 4 are set to 0 in the code below).

We will adapt our `make_ratings` function to create `make_votes`.

In [None]:
def make_votes():
    """Build the user-by-item votes matrix (exercise stub).

    The steps follow the same recipe as ``make_ratings``, with one extra
    step first: ratings lower than 4 are replaced by 0, so that only
    positive votes keep a non-zero value.

    Returns:
        The ``votes`` array built in the last step.

    Raises:
        NotImplementedError: Until every placeholder below is replaced by
            an implementation.
    """

    ratings = read_ratings()
    
    # Replace ratings < 4 with 0.
    # You need to combine masking, indexing and assignment.
    # YOUR CODE HERE
    raise NotImplementedError()
    
    # users, user_pos = ...
    # items, item_pos = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    
    # Initialize the array with zeros, votes = ...
    # Fill in the array , votes = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    
    return votes


# Votes matrix, reused by the grading cell and the later steps.
P = make_votes()

In [None]:
# Grading checks for make_votes: one row per user, one column per item.
assert(P.shape == (671, 9066))

# Individual cells are checked through the SHA-256 digest of the element's
# raw bytes rather than a literal value (presumably so the answer is not
# revealed). Because bytes are hashed, the dtype has to match as well.
expected_hash = '400c52dd5bd0047d64c0582af027b387a7938a64283d0d4f727331140cb6462c'
assert(hashlib.sha256(P[0, 931]).hexdigest() == expected_hash)

# Same digest as the previous check, i.e. both cells must hold the same value.
expected_hash = '400c52dd5bd0047d64c0582af027b387a7938a64283d0d4f727331140cb6462c'
assert(hashlib.sha256(P[0, 1515]).hexdigest() == expected_hash)

expected_hash = 'af5570f5a1810b7af78caf4bc70a660f0df51e42baf91d4de5b2328de0e83dfc'
assert(hashlib.sha256(P[0, 4546]).hexdigest() == expected_hash)

# 5 Count votes (graded)

Now that we have a proper matrix, we count the votes.

In [None]:
def count_votes(arr):
    """Count the positive votes received by every item (exercise stub).

    Args:
        arr: Votes matrix with users as rows and items as columns.

    Returns:
        A row vector holding one vote count per column of ``arr``.

    Raises:
        NotImplementedError: Until the placeholder below is implemented.
    """
    # Transform the values into booleans, using comparison.
    # Sum over the rows to count the votes.
    # The expected output is a row vector.
    # YOUR CODE HERE
    raise NotImplementedError()


# Display the vote counts as the cell output.
count_votes(P)

In [None]:
# Grading checks for count_votes.
votes = count_votes(P)

# The result must be two-dimensional: a single row with one entry per item.
assert(votes.shape == (1, 9066))

# Spot checks on single columns, compared through the SHA-256 digest of the
# raw bytes of the one-element slice (so the dtype must match too).
expected_hash = 'b8282e78d87c2a42f7c0f5b543c0d7760d3fe5a3860a25c20ec57ddaa4d80b81'
assert(hashlib.sha256(votes[:, 1]).hexdigest() == expected_hash)

expected_hash = '72976ee8f1497b6bcf1c12b3fdaf0cdc41ceab14646fe0fb49f37895b5889895'
assert(hashlib.sha256(votes[:, 10]).hexdigest() == expected_hash)

expected_hash = 'f090d8def1e8b5306741957d592246cc08a6e5c88bfe9fe501389cb17c35d971'
assert(hashlib.sha256(votes[:, 100]).hexdigest() == expected_hash)

expected_hash = '048db3815473d7aab19caa136e5cd923a3ac45293f3df2aa5a759cfa96c81332'
assert(hashlib.sha256(votes[:, 1000]).hexdigest() == expected_hash)

# 6 Timestamps matrix (graded)

We are done with voting. So far, this was a standard non-personalized approach.

Time to make things fun, and combine voting with context, namely time.

In [None]:
def make_timestamps():
    """Build the user-by-item matrix of rating timestamps (exercise stub).

    Intended to follow the same recipe as the ratings matrix: compute the
    user and item positions, create a matrix of zeros and fill it in with
    the values returned by ``read_timestamps``.

    Returns:
        The ``arr`` matrix built in the last step.

    Raises:
        NotImplementedError: Until every placeholder below is replaced by
            an implementation.
    """

    timestamps = read_timestamps()
    
    # users, user_pos = ...
    # items, item_pos = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    
    # Create the matrix of zeros, arr = ...
    # Fill it in, arr = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    
    return arr


def read_timestamps():
    """Load the data needed for the timestamps matrix (exercise stub).

    Returns:
        The ``data`` array read from ratings.csv; presumably the user id,
        item id and timestamp columns -- TODO confirm against the solution.

    Raises:
        NotImplementedError: Until the placeholder below is implemented.
    """
    # Make sure you read ratings.csv properly.
    # YOUR CODE HERE
    raise NotImplementedError()
    
    return data


# Timestamps matrix, reused by the grading cell and the later steps.
T = make_timestamps()

In [None]:
# Grading checks for make_timestamps: the matrix must line up cell by cell
# with the votes matrix.
assert(T.shape == P.shape)

# Spot checks on single cells, compared through the SHA-256 digest of the
# element's raw bytes (so the dtype must match too).
expected_hash = '717372b3a4caf73c8bc3575b77033c9d2dfa2643540b9175f6756b3cb0ad6651'
assert(hashlib.sha256(T[0, 30]).hexdigest() == expected_hash)

expected_hash = '64506e7145f725b946bf27ffec291f67eaab1d05c2f5292835cc0afafee90836'
assert(hashlib.sha256(T[0, 859]).hexdigest() == expected_hash)

expected_hash = '416e4a0846ac491812331fa072aba6cc760912ab94221625a084c5811873a8bf'
assert(hashlib.sha256(T[670, 4597]).hexdigest() == expected_hash)

# 7 Time of publication (graded)

We will consider the time of publication to be the timestamp of the first rating.

In [None]:
def time_of_publication(arr):
    """Return the earliest rating timestamp of every item (exercise stub).

    The time of publication of an item is taken to be the timestamp of its
    first rating.

    Args:
        arr: Timestamps matrix with users as rows and items as columns,
            where 0 marks a missing rating.

    Returns:
        A row vector with the minimum timestamp per item.

    Raises:
        NotImplementedError: Until the placeholders below are implemented.
    """
    # Use masking and indexing to transform zero into np.nan
    # (the np.NaN alias was removed in NumPy 2.0).
    # YOUR CODE HERE
    raise NotImplementedError()
    # Use nanmin to extract the minimum timestamp per item.
    # The result must be a row vector.
    # YOUR CODE HERE
    raise NotImplementedError()


# Display the publication times as the cell output.
time_of_publication(T)

In [None]:
# Grading checks for time_of_publication.
time_of_pub = time_of_publication(T)

# The result must be two-dimensional: a single row with one entry per item.
assert(time_of_pub.shape == (1, 9066))

# Spot checks on single columns, compared through the SHA-256 digest of the
# raw bytes of the one-element slice (so the dtype must match too).
expected_hash = '9be8cdc2a4d84390391ae936510a4ac60b7fd5cf05580283aebdf896dc21e728'
assert(hashlib.sha256(time_of_pub[:, 2]).hexdigest() == expected_hash)

expected_hash = '394da251e3341ab186c257b94534287253cadaf685a0cafaa6e06b4ff1e8fc29'
assert(hashlib.sha256(time_of_pub[:, 20]).hexdigest() == expected_hash)

expected_hash = 'fc48957135a2abe765d889fec7a4dda875484904f7a89356ebaf65028ec3db10'
assert(hashlib.sha256(time_of_pub[:, 200]).hexdigest() == expected_hash)

expected_hash = 'ec8169903098d08dd523c9cadc2d836477b670f241ccd2ab452d1eea88ae54c5'
assert(hashlib.sha256(time_of_pub[:, 2000]).hexdigest() == expected_hash)

# 8 Time since publication (graded)

The next step is to implement the time since publication.

In [None]:
def time_since_publication(arr):
    """Return the time elapsed since each item was published (exercise stub).

    Args:
        arr: Publication times; the notebook passes the output of
            ``time_of_publication``.

    Returns:
        A new matrix with the time since publication, same shape as ``arr``.

    Raises:
        NotImplementedError: Until the placeholder below is implemented.
    """
    # Fixed reference for "now" instead of a live clock; presumably a Unix
    # timestamp in seconds, pinned so the graded results are reproducible
    # -- TODO confirm the unit against ratings.csv.
    current_time = 1533391699.2220602
    # Create a new matrix with the time since publication.
    # YOUR CODE HERE
    raise NotImplementedError()

    
# Display the elapsed times as the cell output.
time_since_publication(time_of_publication(T))

In [None]:
# Grading checks for time_since_publication.
time_since_pub = time_since_publication(time_of_pub)

# The result must keep the row-vector shape of its input.
assert(time_since_pub.shape == (1, 9066))

# Spot checks on single columns, compared through the SHA-256 digest of the
# raw bytes of the one-element slice (so the dtype must match too).
expected_hash = '4762f89e5e4b40854635c44597a71df6a2e6c89f1e39604ffbb88ab73993377c'
assert(hashlib.sha256(time_since_pub[:, 2]).hexdigest() == expected_hash)

expected_hash = 'b6c53b01aa1747d036addd9b80ea3f8305cba3d0d8351908dd4804178e4b60f3'
assert(hashlib.sha256(time_since_pub[:, 20]).hexdigest() == expected_hash)

expected_hash = 'a7fd4d854b5f88214f32ff31af9d6666f347976981b3ee5093644e83bdcb4c15'
assert(hashlib.sha256(time_since_pub[:, 200]).hexdigest() == expected_hash)

expected_hash = '3e136ead263ebeb64b51d882453494096d30645316aa08bdcb24942e87c0e32a'
assert(hashlib.sha256(time_since_pub[:, 2000]).hexdigest() == expected_hash)

# 9 Gravity (graded)

Finally, we implement gravity.

In [None]:
def gravity(arr, g=1.8):
    """Compute the denominator of the score formula (exercise stub).

    Args:
        arr: Time since publication, the ``T`` of the formula.
        g: Gravity exponent, the ``G`` of the formula; defaults to 1.8.

    Returns:
        A matrix of the same shape as ``arr`` holding ``(T + 2) ** G``.

    Raises:
        NotImplementedError: Until the placeholder below is implemented.
    """
    # Return a matrix with the values + 2, raised to the power of gravity
    # (see formula).
    # YOUR CODE HERE
    raise NotImplementedError()


# Display the gravity-adjusted times as the cell output.
gravity(time_since_publication(time_of_publication(T)))

In [None]:
# Grading checks for gravity, evaluated with the default exponent.
G = gravity(time_since_pub)

# The result must keep the row-vector shape of its input.
assert(G.shape == (1, 9066))

# Spot checks on single columns, compared through the SHA-256 digest of the
# raw bytes of the one-element slice (so the dtype must match too).
expected_hash = '8b62dbd48bbb7359717410c9fe6f946f31f9aec54dbaca3289ed9b2b398b2f25'
assert(hashlib.sha256(G[:, 3]).hexdigest() == expected_hash)

expected_hash = 'aaf3889f7c3be75463c563c7563f06c82bef0b8d5d459a8c5cd2adf68fcaf4c8'
assert(hashlib.sha256(G[:, 30]).hexdigest() == expected_hash)

expected_hash = '125addaecaf28b3d75c818b5663476b9d8091ef1bae6a3e6e4655a50d169a2de'
assert(hashlib.sha256(G[:, 300]).hexdigest() == expected_hash)

expected_hash = '7350ad251ac369b2278003606237d1e1b24c833826845bec87fb686ba1c20a08'
assert(hashlib.sha256(G[:, 3000]).hexdigest() == expected_hash)

# 10 Predict score (graded)

The key computation in any RS is the prediction bit.

Here, we compute the expected utility of a given item, using the Hacker News formula.

In [None]:
def predict_score():
    """Compute the adapted Hacker News score of every item (exercise stub).

    Implements ``Score = P / (T + 2) ** G``: the vote count of each item
    divided by its gravity-adjusted time since publication.

    Returns:
        A row vector with one score per item.

    Raises:
        NotImplementedError: Until every placeholder below is replaced by
            an implementation.
    """
    
    # Count votes, such as P = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    
    # Time since publication, with gravity
    # G = ...
    # YOUR CODE HERE
    raise NotImplementedError()
    
    # Return the required division.
    # YOUR CODE HERE
    raise NotImplementedError()


# Score every item once; the bare name on the last line makes the notebook
# display the scores as the cell output.
S = predict_score()
S

In [None]:
# Grading checks for predict_score: a single row with one score per item.
assert(S.shape == (1, 9066))

# Spot checks on single columns, compared through the SHA-256 digest of the
# raw bytes of the one-element slice (so the dtype must match too).
expected_hash = '017426ad32c0bafed33adb8e76265bceb64e9ec7821080b6c21f8ce0bf54f119'
assert(hashlib.sha256(S[:, 4]).hexdigest() == expected_hash)

expected_hash = 'af5570f5a1810b7af78caf4bc70a660f0df51e42baf91d4de5b2328de0e83dfc'
assert(hashlib.sha256(S[:, 40]).hexdigest() == expected_hash)

expected_hash = '4469d9a3985e163aebff3c4ea8152654f3c624a182764e820ed707855b1c9c4c'
assert(hashlib.sha256(S[:, 400]).hexdigest() == expected_hash)

expected_hash = '83bb006b9895552a70c9684e9af8beb1d2a9b2ed2fb193e4d8da190c4adcd41e'
assert(hashlib.sha256(S[:, 4000]).hexdigest() == expected_hash)

# 11 Best-item (graded)

Knowing the predicted utility for each item, we want to find out which item is the best.

In [None]:
def best_item():
    """Return the index of the highest-scoring item (exercise stub).

    Returns:
        The index of the best score within the output of
        ``predict_score``.

    Raises:
        NotImplementedError: Until the placeholder below is implemented.
    """
    scores = predict_score()
    # Return the index of the best score.
    # YOUR CODE HERE
    raise NotImplementedError()


# Index of the best item, checked by the grading cell.
i = best_item()

In [None]:
# Grading check for best_item. The index is hashed through its raw bytes, so
# it must be a NumPy value exposing a buffer; a plain Python int would make
# hashlib raise a TypeError.
expected_hash = '32665ec558675bf4e936b43e29cb00315abd00c3381a887cace8378fefd091c1'
assert(hashlib.sha256(i).hexdigest() == expected_hash)

# 12 Top-*N* (graded)

However, the most common approach to providing recommendations is the top-$N$ list.

The last step is to define a function that returns the top-$N$ items, based on our score.

In [None]:
def top_n(n):
    """Return the indexes of the ``n`` best-scoring items (exercise stub).

    Args:
        n: Number of items to include in the list.

    Returns:
        The indexes of the top-N items according to ``predict_score``.

    Raises:
        NotImplementedError: Until the placeholder below is implemented.
    """
    scores = predict_score()
    # Return the indexes of the top-N items.
    # YOUR CODE HERE
    raise NotImplementedError()


# Top-5 list; the bare name on the last line makes the notebook display it
# as the cell output.
L = top_n(n=5)
L

In [None]:
# Grading check for top_n. The whole result is hashed through its raw bytes,
# so it must be a buffer-exposing object such as a NumPy array (a Python list
# would raise a TypeError), with matching dtype and element order.
# NOTE(review): hashlib needs a contiguous buffer, so a reversed view such as
# arr[::-1] probably has to be copied first -- confirm.
expected_hash = 'b8615989e73e34455bd773809690f5c5580eb9ecab36dbb275f2c99f1f3ac15a'
assert(hashlib.sha256(L).hexdigest() == expected_hash)