# BLU10 - Exercises Notebook

In [None]:
# Import all the necessary dependencies
import os
import numpy as np
import scipy as sp
import pandas as pd

import scipy.sparse

from mlxtend.frequent_patterns import apriori
import hashlib # for grading purposes

## Q0: Create the ratings matrix (ungraded)

In [None]:
def read_data(path=None):
    """
    Load the raw ratings data (exercise placeholder, not implemented yet).

    Parameters:
        path - filepath for the original file. The call below passes no
               argument, so the implementation has to cope with path=None.
        
    Returns
        data - (numpy.ndarray) array with the userId, movieId and rating

    Raises
        NotImplementedError - always, until the placeholder below is replaced
    """
    # YOUR CODE HERE
    raise NotImplementedError()

# Load the data once at cell level; make_ratings(data) in a later cell reads this global.
data = read_data()
# Bare expression: evaluates `data` as the last statement of the cell.
data

In [None]:
def make_ratings(data):
    """
    Build the ratings matrix from the raw data (exercise placeholder, not implemented yet).

    Parameters
        data - data originated from the function read_data
        
    Returns
        R - Ratings matrix. Later cells read R.shape[0] as the number of users
            and R.shape[1] as the number of items.

    Raises
        NotImplementedError - always, until the placeholder below is replaced
    """
    # YOUR CODE HERE
    raise NotImplementedError()

# Build the ratings matrix once; every following question works on this global R.
R = make_ratings(data)
# Bare expression: evaluates `R` as the last statement of the cell.
R

In [None]:
# Summarise the dimensions of R: shape[0] is reported as users, shape[1] as items.
f"We have {R.shape[0]} user and {R.shape[1]} items."

In [None]:
# Q0 check: the SHA-256 digest of str(R.shape) must equal the expected digest,
# i.e. only the shape of R is verified here, not its contents.
expected_hash = '3e68e59862d4b598351ba37b6103d22d125e0ec19f8c91a97188ffc78e156137'
assert hashlib.sha256(str(R.shape).encode()).hexdigest() == expected_hash

## Q1: Convert the Ratings Matrix to a Sparse Representation

In [None]:
def get_csr(R):
    """
    Convert the ratings matrix to CSR format (exercise placeholder, not implemented yet).

    Parameters
        R - Ratings Matrix
    
    Returns
        H_ - Compressed Sparse Row Matrix. The grading cell hashes str(H_),
             and a later cell calls H_.todense(), so the returned object must
             support both.

    Raises
        NotImplementedError - always, until the placeholder below is replaced
    """
    # YOUR CODE HERE
    raise NotImplementedError()
    
# Sparse representation of R, reused by the Q2 cells below.
H_ = get_csr(R)

In [None]:
# Q1 check: the SHA-256 digest of the string form of H_ (str(H_)) must equal the expected digest.
expected_hash = '1d6952c195999c1f9de9e2e0f47e3c748cd784aa9578645e656f34ec6d546170'
assert hashlib.sha256(str(H_).encode()).hexdigest() == expected_hash

## Q2: What is the space efficiency of converting to a sparse representation?
Give the result as a percentage, rounded up to 2 decimal places.

In [None]:
def get_sparsity_score(R):
    """
    Compute the sparsity score of the ratings matrix (exercise placeholder, not implemented yet).

    Parameters
        R - Ratings Matrix
        
    Returns:
        sparsity_score - (float) Sparsity Score of R. In percentage, rounded up to 2 decimal places.
                         The grading cell hashes str(sparsity_score), so the exact
                         string form of the returned value matters.

    Raises
        NotImplementedError - always, until the placeholder below is replaced
    """
    # YOUR CODE HERE
    raise NotImplementedError()
    
# Computed once at cell level; checked and reported in the next cells.
sparsity_score = get_sparsity_score(R)

In [None]:
# Q2 check: the SHA-256 digest of str(sparsity_score) must equal the expected digest.
# NOTE(review): this is the same digest used to check sparsity_representation_savings
# further down, so both values must have an identical string form - confirm this is intended.
expected_hash = '41a05f83a0b06fc85074bed9caee5fcf753ffde064f211c8ff11f1a29edd2b72'
assert hashlib.sha256(str(sparsity_score).encode()).hexdigest() == expected_hash

In [None]:
# Report the sparsity score; the value is formatted as-is and followed by a percent sign.
f"The Sparsity Score is {sparsity_score}%."

In [None]:
def get_sparsity_representation_savings(H_, R):
    """
    Compare the number of non-zero values in the sparse matrix with the total number of elements in the original matrix.
    (Exercise placeholder, not implemented yet.)
    
    Parameters
        H_ - CSR Matrix
        R  - Original ratings matrix
        
    Returns
        savings - (float) percentage of CSR size in relation to original ratings matrix, rounded up to 2 decimal places.
                  The grading cell hashes str(savings), so the exact string form
                  of the returned value matters.

    Raises
        NotImplementedError - always, until the placeholder below is replaced
    """
    # YOUR CODE HERE
    raise NotImplementedError()

# Computed once at cell level; checked and reported in the next cells.
sparsity_representation_savings = get_sparsity_representation_savings(H_, R)

In [None]:
# Q2 check: the SHA-256 digest of str(sparsity_representation_savings) must equal the expected digest.
# NOTE(review): this is the same digest used to check sparsity_score earlier in the
# notebook, so both values must have an identical string form - confirm this is intended.
expected_hash = '41a05f83a0b06fc85074bed9caee5fcf753ffde064f211c8ff11f1a29edd2b72'
assert hashlib.sha256(str(sparsity_representation_savings).encode()).hexdigest() == expected_hash

In [None]:
# Report the relative size of the sparse representation; the value is formatted as-is.
f"The Sparse Representation size is {sparsity_representation_savings}% of the original matrix."

In [None]:
# Sanity check: densify the sparse matrix and make sure its shape matches the
# original ratings matrix. Only the shapes are compared, not the stored values.
assert H_.todense().shape == R.shape

## Q3: Popular Items - What are the Top-3 Most Rated items?
More ratings give us the current trends but not necessarily the best suggestions.

In [None]:
def top3items(R, n=3):
    """
    Find the items with the most ratings (exercise placeholder, not implemented yet).

    Parameters
        R - Ratings matrix
        n - Number of Top-n items to retrieve (defaults to 3)
        
    Returns
        most_rated - (numpy.ndarray) array for top-n most rated items.
                     The grading cell hashes str() of the returned value.

    Raises
        NotImplementedError - always, until the placeholder below is replaced
    """
    # YOUR CODE HERE
    raise NotImplementedError()

# Uses the default n=3; the result is checked in the next cell.
most_wanted = top3items(R)

In [None]:
# Q3 check: the SHA-256 digest of str(most_wanted) must equal the expected digest.
expected_hash = '41f87af66749260f2d7a04703f1069a37e58135d2e9f43e4c1f94e026cff1117'
assert hashlib.sha256(str(most_wanted).encode()).hexdigest() == expected_hash

## Q4: Influencers - What are the Top-5 Most Active Users?

In [None]:
def get_influencers(R, n=5):
    """
    Find the most active users (exercise placeholder, not implemented yet).

    Parameters
        R - Ratings matrix
        n - number of top-n most active users (defaults to 5)
        
    Returns
        influencers - (numpy.ndarray) array for top-n most active users.
                      The grading cell hashes str() of the returned value.

    Raises
        NotImplementedError - always, until the placeholder below is replaced
    """
    # YOUR CODE HERE
    raise NotImplementedError()

# Uses the default n=5; the result is checked in the next cell.
influencers = get_influencers(R)

In [None]:
# Q4 check: the SHA-256 digest of str(influencers) must equal the expected digest.
expected_hash = 'e2440e4dd3a85b37a5401b3d4e5c437aaa26de0620f19178bc782c82dd4c3432'
assert hashlib.sha256(str(influencers).encode()).hexdigest() == expected_hash

## Q5: Elite - What are the Top-7 Best Rated Items?
Since this can be biased by a low number of ratings, we need items to have at least 10 ratings.

In [None]:
def elite(R, n=7, k=10):
    """
    Find the items with the best mean rating among sufficiently rated items
    (exercise placeholder, not implemented yet).

    Parameters
        R - Ratings Matrix
        n - Top-n items (defaults to 7)
        k - Minimum number of ratings (defaults to 10)
        
    Returns
        best_items - (numpy.ndarray) array for top-n best mean rated items.
                     Your indices should refer only to items with more than k ratings (subset of original matrix).

    Raises
        NotImplementedError - always, until the placeholder below is replaced

    NOTE(review): the question text asks for items with "at least 10 ratings",
    while the Returns description says "more than k ratings" - confirm which
    threshold (>= k or > k) the expected hash was generated with.
    """
    
    # YOUR CODE HERE
    raise NotImplementedError()

    
# Uses the defaults n=7 and k=10; the result is checked in the next cell.
best_items = elite(R)

In [None]:
# Q5 check: the SHA-256 digest of str(best_items) must equal the expected digest.
expected_hash = '25af7ec79f15bf252f96440f68995619f7b85bdfc1f89d69a227cd9d4cf80e7d'
assert hashlib.sha256(str(best_items).encode()).hexdigest() == expected_hash

## Q6: Apriori - What are the 5 most common 3-piece itemsets?
We define "common itemsets" as sets of at least 3 different items that are bought together by at least 20% of the population.
Show your results sorted by support in descending order.

In [None]:
def getBundlesSolution(R, n=None, min_support=None, top=None):
    """
    Find the most common itemsets of a given size (exercise placeholder, not implemented yet).

    Parameters
        R: Ratings Matrix
        n: Number of items in commonset (the call below passes 3)
        min_support: Minimum percentage of users that contains the itemset
                     (the call below passes 0.2 for the 20% threshold)
        top: Number of most common itemsets (the call below passes 5)
        
    Return
        df: the return dataframe should have two columns ["support", "itemsets"],
            with the support percentage and the itemsets.
            The grading cell reads it positionally via df.iloc and df.shape.

    Raises
        NotImplementedError: always, until the placeholder below is replaced
    """
    # YOUR CODE HERE
    raise NotImplementedError()
    

# All three keyword arguments default to None, so they are passed explicitly here.
df = getBundlesSolution(R, n=3, min_support=0.2, top=5)

In [None]:
# Q6 check 1: the SHA-256 digest of str(df.shape) must equal the expected digest.
expected_hash = 'f7441550a0ca5274581d023417c99540e3a8a4cca68824a87cbe6d95c07742ea'
assert hashlib.sha256(str(df.shape).encode()).hexdigest() == expected_hash

# Q6 check 2: hash the string form of the cell at row 0, column 1
# (presumably the "itemsets" column, given the column order in the docstring).
expected_hash = '62572327fb0a94f45a9bc8604ccb94f8925a6883b7fb5cd4786c736824ccb8b8'
assert hashlib.sha256(str(df.iloc[0,1]).encode()).hexdigest() == expected_hash

# Q6 check 3: the value at row 2, column 0 (presumably "support") must match
# 0.253353 to 4 significant digits.
np.testing.assert_approx_equal(df.iloc[2, 0], 0.253353, significant=4)