In [None]:
# Import all the necessary dependencies
import os
import numpy as np
import scipy as sp
import pandas as pd
import scipy.sparse
from mlxtend.frequent_patterns import apriori
import hashlib # for grading purposes

# You will be working with data from an Online Book Store. 

### Every time a customer buys a book, the customer can rate the book and the Book Store uses that data to create recommendations to future customers.

### In this exercise you will have the opportunity to help the Book Store team to choose which books to display in different areas of the website.

## Task 1: Understanding the data

Data is available in the `./data/` folder. In this folder you will find 2 files:

* `BookRatings.csv` has the historical ratings given by the customers and represents all the books sold. 
* `BooksInfo.csv` has the information about the main genre of each book. 

In [None]:
# Load the two raw CSV files; the paths are relative to the notebook's working directory.
# `ratings` and `books_info` are reused by the tasks further down.
ratings = pd.read_csv('data/BookRatings.csv')
books_info = pd.read_csv('data/BooksInfo.csv')

Look at the raw files and print out the first rows of each file.

In [None]:
# BookRatings: preview the first rows of the ratings data
ratings.head()

In [None]:
# BooksInfo: preview the first rows of the book/genre data
books_info.head()

### Task 1.1 EDA (ungraded)
- Check for ratings with incomplete data.
- Check for duplicate records in the ratings.
- Check for books without a genre.

In [None]:
### Your answer

## Task 2: Rating Matrix

### Task 2.1: Create the ratings matrix

In [None]:
def make_ratings(data: pd.DataFrame) -> np.ndarray:
    """
    Build the dense ratings matrix from the ratings dataframe.

    Parameters
        data - (pandas.DataFrame) the ratings dataframe with ratings per ISBN and User-ID

    Returns:
        R - (numpy.ndarray) Ratings matrix with one row per User-ID, one column
            per ISBN and the Book-Rating as the cell value (the driver code below
            reports R.shape[0] as the number of users and R.shape[1] as the
            number of items)
        hint: don't forget to put zeros in places where you do not have ratings

    Extra Hint: Your input is a pandas DataFrame but you want to output an array (use .to_numpy)!
    """

    # YOUR CODE HERE
    raise NotImplementedError()

# Build the ratings matrix once; Tasks 2.2, 2.3 and 4 reuse R.
R = make_ratings(ratings)

# Last expression of the cell, so it is rendered as the cell output.
f"We have {R.shape[0]} users and {R.shape[1]} items."

In [None]:
# Grading checks for Task 2.1. Each assert compares the SHA-256 digest of
# str(<value>) with a precomputed digest, so the exact text form of the value
# matters (e.g. an integer sum prints as '5' while a float sum prints as '5.0').

# 1) Shape of the ratings matrix.
expected_hash = '226ef8abe773e3aceec1c057383c1628959c25882846e686412ef7e1ff96873d'
assert hashlib.sha256(str(R.shape).encode()).hexdigest() == expected_hash

# 2) Sum of the first row of R.
expected_hash_1 = '0729c13ebd725201c1445a00c825237d305ff650cd72f50e45259bd942a75ef4'
assert hashlib.sha256(str(R[0].sum()).encode()).hexdigest() == expected_hash_1

# 3) Sum of the first column of R.
expected_hash_2 = 'f1e42019aecc858ffbcca7fddec511b761b474916fde37b1a6ff321a9b459330'
assert hashlib.sha256(str(R[:,0].sum()).encode()).hexdigest() == expected_hash_2

### Task 2.2: Convert the Ratings Matrix to a Sparse Representation

In [None]:
from scipy.sparse import csr_matrix

def get_csr(matrix: np.ndarray) -> csr_matrix:
    """
    Convert the ratings matrix into a Compressed Sparse Row representation.

    Parameters
        matrix - The Ratings Matrix.
    
    Returns
        H - The Compressed Sparse Row Matrix
            (presumably a scipy.sparse.csr_matrix, which this cell imports)
        
    """
    
    # YOUR CODE HERE
    raise NotImplementedError()
    
# Sparse version of the ratings matrix; checked by the grading cell below.
sparse_mat = get_csr(R)

In [None]:
# Grading check for Task 2.2: hashes str(sparse_mat), i.e. the printed text form
# of the sparse matrix, and compares it with a precomputed digest.
# NOTE(review): that printed form may differ between SciPy versions — confirm the
# course environment pins the SciPy version the digest was generated with.
expected_hash = '3068469d4140f3f5fd47d88d14718db567a2ed03bf28240202061d61ea56147c'
assert hashlib.sha256(str(sparse_mat).encode()).hexdigest() == expected_hash

### Task 2.3: Calculate the density score of the matrix

In [None]:
def get_density_score(matrix: np.ndarray) -> float:
    """
    Compute the density score of the ratings matrix.

    Parameters
        matrix - Ratings Matrix
        
    Returns:
        dense_score - (float) Density score of the original ratings matrix
                      (presumably the fraction of entries that hold a rating,
                      i.e. are non-zero — TODO confirm against the course material)
    """
    # YOUR CODE HERE
    raise NotImplementedError()
    
dense_score = get_density_score(R)
# Last expression of the cell, so it is rendered as the cell output.
f"The Density Score is {dense_score}."

In [None]:
# Grading check for Task 2.3: dense_score must match 0.0004 to 4 decimals.
# With decimal=4 the accepted absolute error is below 1.5e-4, which is a loose
# bound relative to a target of 0.0004.
np.testing.assert_almost_equal(dense_score,0.0004,4)

## Task 3: Non-personalized Recommendations

### Task 3.0: Merge the 2 datasets (`ratings` and `books_info`)

Merge the dataframes `ratings` and `books_info` in order to have information about the genre of each book. Include only the books that have a rating.

Hint | You might need to use the function <a href="https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.merge.html">merge()</a>  and explore the parameter 'how'.

In [None]:
def get_book_ratings_df(ratings_: pd.DataFrame, books_info_: pd.DataFrame) -> pd.DataFrame:
    """
    Merge the ratings with the book information so that each rating carries
    the genre of its book. Only books that have a rating are kept.

    Parameters
        ratings_ - DataFrame with the ratings
        books_info_ - DataFrame with the genre of each book
        
    Returns:
        book_ratings - DataFrame; the grading cell reads its 'ISBN', 'User-ID',
                       'Book-Rating' and 'Genre' columns
                       (presumably the two inputs are joined on ISBN — confirm
                       against the column names in books_info_)
    """
    
    # YOUR CODE HERE
    raise NotImplementedError()

# Merged dataframe reused by the per-genre tasks (3.5 and 3.6).
book_ratings = get_book_ratings_df(ratings, books_info)
book_ratings.head()

In [None]:
# Grading checks for Task 3.0 (SHA-256 of the text form of each value).

# 1) Shape of the merged dataframe.
expected_hash = 'c1d3dbf9ef7fb86036e5c933ff8de7a66d67b7dd25508764451e3ac8c300f110'
assert hashlib.sha256(str(book_ratings.shape).encode()).hexdigest() == expected_hash

# 2) Sum of all ratings in the merged dataframe.
expected_hash_1 = '3c4340f3a5aa8a40da4f7a2dc2f3ef4645ba099b58e986d12bd5f65b709efb20'
assert hashlib.sha256(str(book_ratings['Book-Rating'].sum()).encode()).hexdigest() == expected_hash_1

# 3) Genre of one specific rating; ISBN is compared against a string and
#    User-ID against an integer, and the first matching row is used.
expected_hash_2 = '581cd6bccf7862e391ce07768616c8427d6cf9ddec881f6984e3cbd835379997'
assert hashlib.sha256((book_ratings[(book_ratings['ISBN']=='1558744150')&(book_ratings['User-ID']==48579)].reset_index()['Genre'][0]).encode()).hexdigest() == expected_hash_2

### Task 3.1: The most popular books in the store

The Book Store wants to display on the website a collection of the most popular books in the store. Since we don't have information on purchases we are going to use the ratings.

Create a function that returns a list with the ISBNs of the top N most popular books in the store - the N books that received the most ratings. The values in the list should be ordered from the most popular to the least popular book.

Hint | You might find it useful to use the following functions (other similar functions are also available):

- <a href="https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.groupby.html">groupby()</a> - to group the data
- <a href="https://pandas.pydata.org/docs/reference/api/pandas.core.groupby.DataFrameGroupBy.size.html">size()</a> - to get the number of rows in each group


In [None]:
def get_popular_books(df: pd.DataFrame, n: int) -> list:
    """
    Return the ISBNs of the n books that received the most ratings.

    Parameters
        df - DataFrame with the ratings
        n - Integer, number of books to return
        
    Returns:
        top_n_popular_books - list with the ISBNs of the top n popular books,
                              ordered from the most to the least popular
    """
    
    # YOUR CODE HERE
    raise NotImplementedError()

top_5_popular_books = get_popular_books(ratings, 5)

In [None]:
# Grading checks for Task 3.1 (SHA-256 of the text form of each value).

# 1) Number of returned books.
expected_hash = 'ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d'
assert hashlib.sha256(str(len(top_5_popular_books)).encode()).hexdigest() == expected_hash

# 2) Second most popular book.
expected_hash_1 = 'ecf0bb677736450811308765d0a80c698603dae939c42388f4f19880fa7dc704'
assert hashlib.sha256(str(top_5_popular_books[1]).encode()).hexdigest() == expected_hash_1

# 3) Fifth most popular book.
expected_hash_2 = '6cf1c4943f89becc6f4a3d7013d542d14082edcb7038bc38792f2045419a556e'
assert hashlib.sha256(str(top_5_popular_books[4]).encode()).hexdigest() == expected_hash_2


### Task 3.2: The best rated books

The Book Store also wants to display on the website a collection of the books with the best ratings in the store. 

Create a function that returns the top N best rated books with more than k ratings. Use the mean rating of each book for comparison. The list should be ordered from the best to the worst rated book.

In [None]:
def get_topn_rates(data: pd.DataFrame, n: int, k: int) -> list:
    """
    Return the ISBNs of the n books with the highest mean rating, considering
    only books that received more than k ratings.

    Parameters
        data - DataFrame with ratings
        n - Top-n books
        k - Rating-count threshold: a book qualifies only if it has
            more than k ratings
        
    Returns
        top_books - List of ISBNs of top-n best mean rated books, ordered
                    from the best to the worst rated book.
        Only consider books with more than k ratings.
    """
    # YOUR CODE HERE
    raise NotImplementedError()
    
top5_rates = get_topn_rates(ratings, 5, 10)

In [None]:
# Grading checks for Task 3.2 (SHA-256 of the text form of each value).

# 1) Number of returned books.
expected_hash = 'ef2d127de37b942baad06145e54b0c619a1f22327b2ebbcfbec78f5564afe39d'
assert hashlib.sha256(str(len(top5_rates)).encode()).hexdigest() == expected_hash

# 2) Second best rated book.
expected_hash_1 = '176e1ad48051114c46de83e1b5b55bf6bc21dbfce49a62ff352cfdef48ff6357'
assert hashlib.sha256(str(top5_rates[1]).encode()).hexdigest() == expected_hash_1

# 3) Fifth best rated book.
expected_hash_2 = '20865e898050bb593da47f242377658bf3653fe9931bb645e6b1bbf29440d9f0'
assert hashlib.sha256(str(top5_rates[4]).encode()).hexdigest() == expected_hash_2

### Task 3.3: Loyal customers

The Book Store wants to reward the customers that gave the most ratings on the website. 

Create a function that returns a list of the top N users that gave the most ratings. Order the list by the number of given ratings in descending order.

In [None]:
def get_loyal_customers(df: pd.DataFrame, n: int) -> list:
    """
    Return the n users that gave the most ratings.

    Parameters
        df - DataFrame with the ratings
        n - Integer, number of users to return
        
    Returns:
        top_n_loyal_customers - list with the top n loyal customers
                                (presumably their User-ID values — confirm),
                                ordered by number of given ratings, descending
    """
    
    # YOUR CODE HERE
    raise NotImplementedError()

top_10_loyal_customers = get_loyal_customers(ratings, 10)

In [None]:
# Grading checks for Task 3.3 (SHA-256 of the text form of each value).

# 1) Number of returned customers.
expected_hash = '4a44dc15364204a80fe80e9039455cc1608281820fe2b24f1e5233ade6af1dd5'
assert hashlib.sha256(str(len(top_10_loyal_customers)).encode()).hexdigest() == expected_hash

# 2) Second entry of the list.
expected_hash_1 = 'c182d826ceb2b42f749faf0dd41929c88dff7a57a6000e2e7d16e5229ca6640b'
assert hashlib.sha256(str(top_10_loyal_customers[1]).encode()).hexdigest() == expected_hash_1

# 3) Eighth entry of the list.
expected_hash_2 = '4df81fd140c781b33e9adde8d4bc1dbf520c4a2748f31f5abbe04182176580c6'
assert hashlib.sha256(str(top_10_loyal_customers[7]).encode()).hexdigest() == expected_hash_2

### Task 3.4: For which genre did the users give the most ratings?

Hint | You may find the following function useful (other similar functions are also available):

- <a href="https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sort_values.html">sort_values()</a> -  to sort the data by the number of ratings

In [None]:
# Task 3.4: find the genre for which the users gave the most ratings.
# The grading cell hashes str(genre_user_top_rating), so the result is presumably
# the genre name itself (a single value, not a DataFrame) — confirm.
# YOUR CODE HERE
raise NotImplementedError()

# genre_user_top_rating =  # Assign your solution to this variable

In [None]:
# Grading check for Task 3.4: SHA-256 of str(genre_user_top_rating).
expected_hash = 'efa9a3729d47c5c47c0c763107f82dbeb8ba63e479274b2661edf418850791fb'
assert hashlib.sha256(str(genre_user_top_rating).encode()).hexdigest() == expected_hash

### Task 3.5: The most popular books by genre

The Book Store wants to display the most popular book in each genre when the customer navigates to the genre tab.

Create a function that returns a dataframe with the most popular book of each genre, judging by the number of ratings each book received. The dataframe should have the columns `Genre`, `ISBN` and `count`, where `count` is the number of ratings of the most popular book.

Hint | You might find the following function useful (other similar functions are also available):

- <a href="https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.head.html">head()</a> - to get a restricted number of rows per group

In [None]:
def get_top1_popularity_genre(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return the most popular book of each genre, judged by the number of
    ratings each book received.

    Parameters
        df - merged DataFrame with book ratings and genres
        
    Returns:
        top_books_genre - DataFrame with three columns: Genre, ISBN, count
                          (count is the number of ratings of that book);
                          the grading cell filters on top_books_genre['Genre'],
                          so Genre must be a regular column
    """
    
    # YOUR CODE HERE
    raise NotImplementedError()

top_books_genre = get_top1_popularity_genre(book_ratings)

In [None]:
# Grading checks for Task 3.5: for three genres, take the first row of that
# genre and compare the SHA-256 of str(ISBN) with a precomputed digest.

# 1) Genre 'Fiction'.
expected_hash = '9a85c6d41062f7ba7fd7c7130eb5975156f0fd04f93d74fc27778a6726d7c1f3'
assert hashlib.sha256(str(top_books_genre[top_books_genre['Genre']=='Fiction'].reset_index().loc[:,'ISBN'][0]).encode()).hexdigest() == expected_hash

# 2) Genre 'poems'.
expected_hash_1 = '481b11af7b7f0cab7895d47507e7d85310dc49d4fc951117abecfbf7e23a28f2'
assert hashlib.sha256(str(top_books_genre[top_books_genre['Genre']=='poems'].reset_index().loc[:,'ISBN'][0]).encode()).hexdigest() == expected_hash_1

# 3) Genre 'Biography & Autobiography'.
expected_hash_2 = '3d89b8a0dd59309c672f7a1af89ba217cf9cba6213adecf1906d4f3992a85cc9'
assert hashlib.sha256(str(top_books_genre[top_books_genre['Genre']=='Biography & Autobiography'].reset_index().loc[:,'ISBN'][0]).encode()).hexdigest() == expected_hash_2

### Task 3.6: Top 3 best average rated books by genre

The Book Store also wants to display in the genre tab the "Best books to read". 

Create a function that returns a Dataframe with the top 3 books with the highest average rating in each genre. Don't forget to display the `Genre`, `ISBN` and average `Book-Rating`. Sort the dataframe by `Genre`.

In [None]:
def get_top3_rates_genre(df: pd.DataFrame) -> pd.DataFrame:
    
    """
    Return, for each genre, the 3 books with the highest average rating.

    Parameters
        df - merged DataFrame with ratings and genre
              
    Returns
        books - DataFrame sorted by genre with top 3 books with the highest average rating per genre
              - columns: Genre, ISBN, Book-Rating (Book-Rating holds the average rating)
    """
    
    # YOUR CODE HERE
    raise NotImplementedError()

top3_rates_genre = get_top3_rates_genre(book_ratings)
# Last expression of the cell, so the dataframe is rendered as the cell output.
top3_rates_genre

In [None]:
# Inspect the rows returned for the 'Fiction' genre only.
top3_rates_genre[top3_rates_genre['Genre']=='Fiction']

In [None]:
# Grading checks for Task 3.6 (SHA-256 of the text form of each value).

# 1) Shape of the returned dataframe.
expected_hash = '250302a44bedd984034e258ba47827a340db357e8553b4d85ff573d894329123'
assert hashlib.sha256(str(top3_rates_genre.shape).encode()).hexdigest() == expected_hash

# 2) Sum of the average ratings, rounded to 0 decimals before hashing.
expected_hash_1 = '1e56c660887ba75c099588c47bf90b565fe315821214b14f1255a73cab988ed5'
assert hashlib.sha256(str(round(top3_rates_genre['Book-Rating'].sum(),0)).encode()).hexdigest() == expected_hash_1

### Task 4: Most common groups of books

The Book Store wants to display groups of books that the users usually rate together.

Create a function that returns the N most frequent sets of M books that the users rate together for a given minimum support, ordered by support. The function should return a dataframe with two columns `support` and `itemsets`. The input of the function is the rating matrix that you created in Task 2.

In [None]:
def get_apriori_booksets(R: np.ndarray, min_support: float = 0.003, n: int = 3, m: int = 3) -> pd.DataFrame:
    
    """
    Return the n most frequent sets of m books that users rate together.

    Parameters
        R - rating matrix (the one created in Task 2)
        min_support - minimal support for the itemsets
        n - number of top n itemsets to return
        m - number of items in itemsets
              
    Returns
        booksets - dataframe with the top n itemsets, 
                   with columns support and itemsets,
                   ordered by support in descending order

    Note: the grading cell checks for integer members (e.g. 16018) in the
    itemsets, which suggests the items are identified by their column position
    in R rather than by ISBN — confirm.
    """

    
    # YOUR CODE HERE
    raise NotImplementedError()

get_3_booksets = get_apriori_booksets(R, min_support=0.003, n=3, m=3)

In [None]:
# Grading checks for Task 4.

# The result must have 3 rows (n=3) and 2 columns.
assert get_3_booksets.shape[0]==3, 'The returned dataframe does not have the correct shape.'
assert get_3_booksets.shape[1]==2, 'The returned dataframe does not have the correct shape.'

# Membership checks on the first and second itemsets after resetting the index
# to positional order; the expected members are integers.
assert 16018 in get_3_booksets.reset_index()['itemsets'][0]
assert 15979 in get_3_booksets.reset_index()['itemsets'][0]
assert 16130 in get_3_booksets.reset_index()['itemsets'][1]