# BLU10 - Exercises Notebook

In [None]:
# Import all the necessary dependencies
import os
import numpy as np
import scipy as sp
import pandas as pd

import scipy.sparse

from mlxtend.frequent_patterns import apriori
import hashlib # for grading purposes

## Q0: Create the ratings matrix (ungraded)

In [None]:
# Relative location of the ratings CSV file.
path = os.path.join('data', 'ml-latest-small', 'ratings.csv')
# Load the file and keep a random draw of 10493 rows; the fixed random_state
# makes both the selection and the resulting row order reproducible.
data = pd.read_csv(path).sample(n=10493, random_state=200)
# Preview the first rows (notebook cell output).
data.head()

We'll ignore the _timestamp_ column and use the _rating_ column as the only source of information for our recommender system. Remember that if you had data on other types of interactions between users and movies, you could create a metric that incorporates all of that information in the ratings matrix (for example, by averaging them).

Keep the following ratings matrix schema in your mind while developing non-personalized systems. These systems rely heavily on the ratings matrix, so maybe also write it on a piece of paper to remember it better!

<img align="left" width="413" height="239" src="./media/ratings_matrix3.png">


In the following exercise you will build the ratings matrix with users as rows and products as columns.

Tip: you can use the pandas' _pivot_ function or flex your numpy muscles with the _genfromtxt_ function (it is good for your health!).

In [None]:
def make_ratings(data: pd.DataFrame):
    """
    Build the ratings matrix from the per-user, per-product ratings data.

    Exercise stub: the body is a placeholder and raises NotImplementedError
    until it is replaced by a solution.

    Parameters
        data - the original data with ratings per user and product.
        
    Returns:
        R - (numpy.ndarray) Ratings matrix built from the userId, movieId and
            rating columns, with users as rows and products as columns.
        hint: don't forget to put zeros on places where you do not have ratings
    
    Extra Hint: Your input is a pandas DataFrame but you want to output an array (use .to_numpy)!
    """
    # YOUR CODE HERE
    
    # YOUR CODE HERE
    raise NotImplementedError()
    

# Build the ratings matrix from the sampled data and show it (notebook cell output).
R = make_ratings(data)
R

In [None]:
# Summarise the matrix dimensions: R.shape[0] is reported as the number of
# users and R.shape[1] as the number of items (notebook cell output).
f"We have {R.shape[0]} users and {R.shape[1]} items."

In [None]:
# Grading checks for R: each one takes the SHA-256 digest of the string form
# of a property of R and compares it with a precomputed digest.

# Check 1: the shape of the ratings matrix.
expected_hash = '0825c15053e635376af0a569e8f37cfaef0e1dfce37ae6878517e14e061f13c4'
assert hashlib.sha256(str(R.shape).encode()).hexdigest() == expected_hash

# Check 2: the sum of R[0] (the first row).
expected_hash_1 = '8ab31b5afaea56114427e1f01b81d001b079a0f59539f6db3f099816ca794055'
assert hashlib.sha256(str(R[0].sum()).encode()).hexdigest() == expected_hash_1

# Check 3: the sum of R[:,0] (the first column).
expected_hash_2 = 'b5967724d1225caa9c6af28a9b333a29e6d5c11a24e9d381acf5c3377524b776'
assert hashlib.sha256(str(R[:,0].sum()).encode()).hexdigest() == expected_hash_2



## Q1: Convert the Ratings Matrix to a Sparse Representation

In this exercise, convert the ratings matrix to a sparse representation - use any method you want. 

Hint: Remember what we did with the scipy library!

In [None]:
def get_csr(R):
    """
    Convert the ratings matrix to a Compressed Sparse Row representation.

    Exercise stub: the body is a placeholder and raises NotImplementedError
    until it is replaced by a solution.

    Parameters
        R - The original Ratings Matrix.
    
    Returns
        H_ - The Compressed Sparse Row Matrix
    """
    # YOUR CODE HERE
    raise NotImplementedError()
    
# Sparse representation of R; the grading cell that follows hashes str(H_).
H_ = get_csr(R)

In [None]:
# Grading check: the SHA-256 digest of the string form of H_ must match the
# precomputed digest.
expected_hash = 'b265829173d9fab7c76766a5e089cf84879d0bdcbf61aa51146afd659b291fc3'
assert hashlib.sha256(str(H_).encode()).hexdigest() == expected_hash

## Q2: What is the space efficiency of converting to sparse?

In this exercise, let's understand how much space we save (that is, the percentage of entries that are zero in the original ratings matrix).

Calculate that percentage in the `get_sparsity_score` function below:

In [None]:
def get_sparsity_score(R):
    """
    Compute the sparsity score of the ratings matrix as a percentage.

    Exercise stub: the body is a placeholder and raises NotImplementedError
    until it is replaced by a solution.

    Parameters
        R - Ratings Matrix
        
    Returns:
        sparsity_score - (float) Sparsity Score of R. In percentage, rounded to 2 decimal places.
        NOTE(review): the original wording was "rounded up to 2 decimal places";
        presumably ordinary rounding is meant - confirm against the grading hash.
    """
    # YOUR CODE HERE
    # YOUR CODE HERE
    raise NotImplementedError()
    
# Sparsity score of R; checked by hash and displayed in the cells that follow.
sparsity_score = get_sparsity_score(R)

In [None]:
# Grading check: the SHA-256 digest of str(sparsity_score) must match the
# precomputed digest, so the exact string form of the value matters.
expected_hash = '19385d6767a3ff1e11ffa2a5521c3b1a5dfcb0cfd6cdb9a172155e690f9404d5'
assert hashlib.sha256(str(sparsity_score).encode()).hexdigest() == expected_hash

In [None]:
# Show the computed sparsity score as a percentage (notebook cell output).
f"The Sparsity Score is {sparsity_score}%."

## Q3: Popular Items - What are the Top-3 Most Rated items?
More ratings show us the current trends but not necessarily the best suggestions - still, let's check which items have received the most ratings.

In this exercise you will have to retrieve the indexes of the products so you may need to recreate the ratings matrix as a dataframe or come up with another creative solution!

**Hint: To get the IDs, it's easier to work with the ratings matrix as a data frame!**

In [None]:
def top3items(R, n=3):
    """
    Find the product ids of the top-n most rated items.

    Exercise stub: the body is a placeholder and raises NotImplementedError
    until it is replaced by a solution.

    Parameters
        R - Ratings matrix
        n - Number of Top-n items to retrieve (defaults to 3)
        
    Returns
        most_rated - (list) list of product ids of  
        the top-n most rated items
    """
    # YOUR CODE HERE
    # YOUR CODE HERE
    raise NotImplementedError()

# Top-3 most rated items (default n); the grading cell that follows hashes str(most_wanted).
most_wanted = top3items(R)

In [None]:
# Grading check: the SHA-256 digest of str(most_wanted) must match the
# precomputed digest.
expected_hash = '3f54e8a66e4924557721acdcb1d40e4399376f780ce6d2b11f48931fc5e1376f'
assert hashlib.sha256(str(most_wanted).encode()).hexdigest() == expected_hash

## Q4: Influencers - What are the Top-5 Most Active Users?

In [None]:
def get_influencers(R, n=5):
    """
    Find the ids of the top-n most active users.

    Exercise stub: the body is a placeholder and raises NotImplementedError
    until it is replaced by a solution.

    Parameters
        R - Ratings matrix
        n - number of top-n most active users (defaults to 5)
        
    Returns
        influencers - (list) list of 
        ids of the top-n most active users
    """
    
    # YOUR CODE HERE
    raise NotImplementedError()
    
    # Unreachable while the placeholder raise above is in place. `top_ids` is
    # not bound anywhere in this stub, so the solution must assign it.
    return top_ids

# Top-5 most active users (default n); the grading cell that follows hashes str(influencers).
influencers = get_influencers(R)

In [None]:
# Grading check: the SHA-256 digest of str(influencers) must match the
# precomputed digest.
expected_hash = '1879949fb971e59003fafcd4ee77e7a49d4e103d630b4fa2b38ce4268634f1e7'
assert hashlib.sha256(str(influencers).encode()).hexdigest() == expected_hash

## Q5: Elite - What are the Top-7 Best Rated Items (On Average)?

Since this can be biased by a low number of ratings, we require items to have at least 10 ratings. Use the mean rating to obtain the ids of the top-rated products.

Hint: In this exercise, to filter the movies by their number of ratings, it may be easier to use the original data and then reconstruct the ratings matrix!

In [None]:
def elite(data, n=7, k=10):
    """
    Find the top-n items with the best mean rating among sufficiently rated items.

    Exercise stub: the body is a placeholder and raises NotImplementedError
    until it is replaced by a solution.

    Parameters
        data - The original ratings.
        n - Top-n items (defaults to 7)
        k - Minimum number of ratings (defaults to 10)
        
    Returns
        best_items - (numpy.ndarray) array for top-n best mean rated items.
                     Your indices should refer only to items with more than k ratings (subset of original matrix).
                     NOTE(review): the exercise text asks for "at least 10 ratings"
                     while this line says "more than k" - confirm which threshold
                     the grading hash expects.
    """
    
    # YOUR CODE HERE
    # YOUR CODE HERE
    raise NotImplementedError()

    
# Top-7 best rated items with the default thresholds; the grading cell that
# follows hashes str(best_items).
best_items = elite(data)

In [None]:
# Grading check: the SHA-256 digest of str(best_items) must match the
# precomputed digest.
expected_hash = '0ed83bd0067fb63b2c591d0039829a3a9bc6887a7bb8f076bc15f6e6944e9216'
assert hashlib.sha256(str(best_items).encode()).hexdigest() == expected_hash

## Q6: Apriori - What are the 4 most common 2-piece itemsets?
We define "common itemsets" as sets of at least 2 different items that are rated together by at least 0.5% of the users (ahem, support!).
Show your results sorted by support in descending order.

Hint: Check the mlxtend documentation for help: http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/

In [None]:
def getBundlesSolution(data, n=None, min_support=None, top=None):
    """
    Find the most common itemsets of a given size.

    Exercise stub: the body is a placeholder and raises NotImplementedError
    until it is replaced by a solution.

    Parameters
        data: The ratings data; the call below passes the `data` DataFrame.
              (The original docstring listed this as "R: Ratings Matrix",
              which does not match the parameter name.)
        n: Number of items in each itemset
        min_support: Minimum percentage of users that contains the itemset
        top: Number of most common itemsets
        
    Return
        df: the return dataframe should have two columns ["support", "itemsets"],
            with the support percentage and the itemsets.
    """
    # YOUR CODE HERE
    # YOUR CODE HERE
    raise NotImplementedError()
    

# 2-item itemsets with a minimum support of 0.5%.
# NOTE(review): the section heading asks for the 4 most common itemsets but
# this call passes top=5 - confirm which value the grading hashes expect.
df = getBundlesSolution(data, n=2, min_support=0.005, top=5)

In [None]:
# Grading check 1: the SHA-256 digest of str(df.shape) must match the
# precomputed digest.
expected_hash = 'f7441550a0ca5274581d023417c99540e3a8a4cca68824a87cbe6d95c07742ea'
assert hashlib.sha256(str(df.shape).encode()).hexdigest() == expected_hash

# Grading check 2: digest of the value in the first row, second column of df
# (presumably the itemset with the highest support, given the documented
# column order and sorting - confirm). `expected_hash` is rebound here.
expected_hash = '9278d41b216e936c8ac0bcb11af99bad893ebfd9679886766e8dba86e3d335cf'
assert hashlib.sha256(str(df.iloc[0,1]).encode()).hexdigest() == expected_hash