# Part-Pre. Preparation 

## Pre 1. Setup Block


In [3]:
!rm -rf ratings* books* to_read* test*

!curl -o ratings.csv "http://www.dcs.gla.ac.uk/~craigm/recsysH/coursework/final-ratings.csv" 
!curl -o books.csv "http://www.dcs.gla.ac.uk/~craigm/recsysH/coursework/final-books.csv"
!curl -o to_read.csv "http://www.dcs.gla.ac.uk/~craigm/recsysH/coursework/final-to_read.csv"
!curl -o test.csv "http://www.dcs.gla.ac.uk/~craigm/recsysH/coursework/final-test.csv"

'rm' is not recognized as an internal or external command,
operable program or batch file.
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
 33 7631k   33 2587k    0     0  5033k      0  0:00:01 --:--:--  0:00:01 5023k
100 7631k  100 7631k    0     0  8516k      0 --:--:-- --:--:-- --:--:-- 8516k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 2366k  100 2366k    0     0  6130k      0 --:--:-- --:--:-- --:--:-- 6130k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      

In [1]:
#Standard setup
import pandas as pd
import numpy as np
import torch
!pip install git+https://github.com/cmacdonald/spotlight.git@master#egg=spotlight
from spotlight.interactions import Interactions
SEED=20
BPRMF=None



## Pre 2. Data Preparation

Let's load the dataset into dataframes.

In [14]:
#load in the csv files
ratings_df = pd.read_csv("ratings.csv")
books_df = pd.read_csv("books.csv")
to_read_df = pd.read_csv("to_read.csv")
test = pd.read_csv("test.csv")

In [15]:
to_read_df

Unnamed: 0.1,Unnamed: 0,user_id,book_id
0,560054,2278,4232
1,277900,2118,298
2,87083,2769,3934
3,77727,2540,293
4,240676,3142,147
...,...,...,...
489415,495742,694,3
489416,511918,3489,61
489417,151565,4296,316
489418,248285,2655,46


In [16]:
#cut down the number of items and users
counts=ratings_df[ratings_df["book_id"] < 2000].groupby(["book_id"]).count().reset_index()
valid_books=counts[counts["user_id"] >= 10][["book_id"]]

books_df = books_df.merge(valid_books, on="book_id")
ratings_df = ratings_df[ratings_df["user_id"] < 2000].merge(valid_books, on="book_id")
to_read_df = to_read_df[to_read_df["user_id"] < 2000].merge(valid_books, on="book_id")
test = test[test["user_id"] < 2000].merge(valid_books, on="book_id")


#stringify the id columns
def str_col(df):
  if "user_id" in df.columns:
    df["user_id"] = "u" + df.user_id.astype(str)
  if "book_id" in df.columns:
    df["book_id"] = "b" + df.book_id.astype(str)

str_col(books_df)
str_col(ratings_df)
str_col(to_read_df)
str_col(test)

In [18]:
from collections import defaultdict
from itertools import count

from spotlight.cross_validation import random_train_test_split

iid_map = defaultdict(count().__next__)


rating_iids = np.array([iid_map[iid] for iid in ratings_df["book_id"].values], dtype = np.int32)
test_iids = np.array([iid_map[iid] for iid in test["book_id"].values], dtype = np.int32)
toread_iids = np.array([iid_map[iid] for iid in to_read_df["book_id"].values], dtype = np.int32)


uid_map = defaultdict(count().__next__)
test_uids = np.array([uid_map[uid] for uid in test["user_id"].values], dtype = np.int32)
rating_uids = np.array([uid_map[uid] for uid in ratings_df["user_id"].values], dtype = np.int32)
toread_uids = np.array([uid_map[iid] for iid in to_read_df["user_id"].values], dtype = np.int32)


uid_rev_map = {v: k for k, v in uid_map.items()}
iid_rev_map = {v: k for k, v in iid_map.items()}


rating_dataset = Interactions(user_ids=rating_uids,
                               item_ids=rating_iids,
                               ratings=ratings_df["rating"].values,
                               num_users=len(uid_rev_map),
                               num_items=len(iid_rev_map))

toread_dataset = Interactions(user_ids=toread_uids,
                               item_ids=toread_iids,
                               num_users=len(uid_rev_map),
                               num_items=len(iid_rev_map))

test_dataset = Interactions(user_ids=test_uids,
                               item_ids=test_iids,
                               num_users=len(uid_rev_map),
                               num_items=len(iid_rev_map))

print(rating_dataset)
print(toread_dataset)
print(test_dataset)

# define the validation set
toread_dataset_train, validation = random_train_test_split(toread_dataset, random_state=np.random.RandomState(SEED))

num_items = test_dataset.num_items
num_users = test_dataset.num_users

<Interactions dataset (1999 users x 1826 items x 124762 interactions)>
<Interactions dataset (1999 users x 1826 items x 135615 interactions)>
<Interactions dataset (1999 users x 1826 items x 33917 interactions)>


In [26]:
def getAuthorTitle(iid):
  bookid = iid_rev_map[iid]
  row = books_df[books_df.book_id == bookid]
  return row.iloc[0]["authors"] + " / " + row.iloc[0]["title"]

print("iid 0: " + getAuthorTitle(0) )

iid 0: Carlos Ruiz Zafón, Lucia Graves / The Shadow of the Wind (The Cemetery of Forgotten Books,  #1)


## Pre 3. Example Code


In [27]:
from spotlight.evaluation import mrr_score, precision_recall_score

class dummymodel:
  
  def __init__(self, numitems):
    self.predictions=np.zeros(numitems)
  
  #uid is the user we are requesting recommendations for;
  #returns an array of scores, one for each item
  def predict(self, uid):
    #this model returns all zeros, regardless of userid
    return( self.predictions )

#lets evaluate how the effeciveness of dummymodel

print(mrr_score(dummymodel(num_items), test_dataset, train=rating_dataset, k=100).mean())
#as expected, a recommendation model that gives 0 scores for all items obtains a MRR score of 0

0.0


In [28]:
#note that mrr_score() displays a progress bar if you set verbose=True
print(mrr_score(dummymodel(num_items), test_dataset, train=rating_dataset, k=100, verbose=True).mean())


1999it [00:00, 4680.53it/s]

0.0





# Part-A. Combination of Recommendation Models

## 1. Explicit & Implicit Matrix Factorisation Models

Create and train three matrix factorisation systems:
 - "EMF": explicit MF, trained on the ratings Interactions object (`rating_dataset`)
 - "IMF": implicit MF, trained on the toread_dataset Interactions object (`toread_dataset_train`)
 - "BPRMF": implicit MF with the BPR loss function (`loss='bpr'`), trained on the toread_dataset Interactions object (`toread_dataset_train`)
  
In all cases, use the standard initialisation arguments,
`n_iter=10, embedding_dim=32, use_cuda=False, random_state=np.random.RandomState(SEED)`.
 
Evaluate each of these models in terms of Mean Reciprocal Rank on the test set.

In [29]:
# Add your solution here

#solution goes here

from spotlight.factorization.explicit import ExplicitFactorizationModel
from spotlight.factorization.implicit import ImplicitFactorizationModel
import time  

EMF = ExplicitFactorizationModel(n_iter=10, 
                                    embedding_dim=32, #this is Spotlight default
                                    use_cuda=False,
                                    random_state=np.random.RandomState(SEED) # ensure results are repeatable
)
current = time.time()

EMF.fit(rating_dataset, verbose=True)
end = time.time()
diff = end - current
print("Training took %d seconds" % (diff))

IMF = ImplicitFactorizationModel(n_iter=10, 
                                    embedding_dim=32, #this is Spotlight default
                                    use_cuda=False,
                                    random_state=np.random.RandomState(SEED) # ensure results are repeatable
)
current = time.time()

IMF.fit(toread_dataset, verbose=True)
end = time.time()
diff = end - current
print("Training took %d seconds" % (diff))

BPRMF = ImplicitFactorizationModel(n_iter=10, 
                                    embedding_dim=32, #this is Spotlight default
                                    use_cuda=False,
                                    loss='bpr',
                                    random_state=np.random.RandomState(SEED) # ensure results are repeatable
)
current = time.time()

BPRMF.fit(toread_dataset, verbose=True)
end = time.time()
diff = end - current
print("Training took %d seconds" % (diff))

Epoch 0: loss 3.8710271519471386
Epoch 1: loss 0.7940810433909541
Epoch 2: loss 0.6382512596176296
Epoch 3: loss 0.5217335244304822
Epoch 4: loss 0.4484485583837892
Epoch 5: loss 0.4054335061399663
Epoch 6: loss 0.38238631036193643
Epoch 7: loss 0.36336619852751983
Epoch 8: loss 0.351379360576145
Epoch 9: loss 0.33966925004344495
Training took 18 seconds
Epoch 0: loss 0.7356257028174851
Epoch 1: loss 0.5271942592454406
Epoch 2: loss 0.46312756476537237
Epoch 3: loss 0.42142992773146
Epoch 4: loss 0.3908996549979696
Epoch 5: loss 0.3679050632805195
Epoch 6: loss 0.34888975125438765
Epoch 7: loss 0.33679830007395656
Epoch 8: loss 0.3223336052219823
Epoch 9: loss 0.312420713564135
Training took 34 seconds
Epoch 0: loss 0.31877833036881575
Epoch 1: loss 0.18799531937770123
Epoch 2: loss 0.15685450027011474
Epoch 3: loss 0.14052333286348379
Epoch 4: loss 0.12967147759671482
Epoch 5: loss 0.12273261361526994
Epoch 6: loss 0.11806230488813148
Epoch 7: loss 0.1144794372033398
Epoch 8: loss 0.1

In [30]:
print("MRR Score of EMF:", mrr_score(EMF, test_dataset, train=rating_dataset, k=100, verbose=False).mean())
print("MRR Score of IMF:", mrr_score(IMF, test_dataset, train=rating_dataset, k=100, verbose=False).mean())
print("MRR Score of BPRMF:", mrr_score(BPRMF, test_dataset, train=rating_dataset, k=100, verbose=False).mean())

MRR Score of EMF: 0.05898399982013507
MRR Score of IMF: 0.32955393223971
MRR Score of BPRMF: 0.41691597413244963


## Task 2. Hybrid Model

(a) Linearly combine the *scores* from IMF and BPRMF.  Normalise both input scores into the range 0..1 using [sklearn's minmax_scale() function](
https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.minmax_scale.html) before combining them.

(b) Apply a pipelining recommender, where the top 100 items are obtained from IMF and re-ranked using the scores of BPRMF. Items not returned by IMF get a score of 0.

To implement these hybrid models, you should create new classes that abide by the Spotlight model contract (namely, it has a `predict(self, uid)` function that returns a score for *all* items). 

Evaluate each model in terms of MRR. How many users are improved, how many are degraded compared to the BPRMF baseline?

Finally, pass your instantiated model object to the `test_Hybrid_a()` (for (a)) or `test_Hybrid_b()` (for (b)) functions, as appropriate, and record the results in the quiz. For example, if your model for (b) is called `pipeline`, then you would run:
```python
test_Hybrid_b(pipeline)
``

In [31]:
def test_Hybrid_a(combsumObj):
  for i, u in enumerate([5, 20]):
    print("Hybrid a test case %d" % i)
    print(np.count_nonzero(combsumObj.predict(u) > 1))

def test_Hybrid_b(pipeObj):
  for i, iid in enumerate([3, 0]):
    print("Hybrid b test case %d" % i)
    print(pipeObj.predict(0)[iid])

In [32]:
from sklearn.preprocessing import minmax_scale

class HybridModelA: # Combine
  def __init__(self, model_1, model_2):
    self.model_1 = model_1 
    self.model_2 = model_2

  def predict(self, uid, iids=None):
    pre_norm_m1 = minmax_scale(self.model_1.predict(uid, iids)) # predict model_1 then normalize
    pre_norm_m2 = minmax_scale(self.model_2.predict(uid, iids)) # predict model_2 then normalize
    return pre_norm_m1 + pre_norm_m2 # combine result from two models

In [33]:
from scipy.stats import rankdata

class HybridModelB: # Pipeline
  def __init__(self, model_1, model_2):
    self.model_1 = model_1 
    self.model_2 = model_2

  def predict(self, uid):
    model_1_result = self.model_1.predict(uid) # Predict and get all value from IMF
    rank_model_1 = rankdata(-model_1_result) # Get position and ranking 
    
    #print(model_1_result[rank_model_1==rank_model_1.min()]) First rank high value
    #print(model_1_result[rank_model_1==rank_model_1.max()]) Last rank lowest value
    model_2_result = self.model_2.predict(uid) # Predict and get all value from BPRMF
    model_2_result[np.argwhere(rank_model_1 > 100)] = 0 # Replace item of BPRMF with 0 by using rank position that not in top100

    return model_2_result

In [34]:
BPRMF_mrr_score = mrr_score(BPRMF, test_dataset, train=rating_dataset, k=100, verbose=False)
print("MRR Score of BPRMF(Baseline):", BPRMF_mrr_score.mean())
print("--------------------------------------------------------")

Hybrid_A_mrr_score = mrr_score(HybridModelA(IMF, BPRMF), test_dataset, train=rating_dataset, k=100, verbose=False)
print("MRR Score of Hybrid(IMF, BPRMF):", Hybrid_A_mrr_score.mean())

print("Improved users in MRR score between Hybrid and BPRMF:", np.count_nonzero(BPRMF_mrr_score < Hybrid_A_mrr_score))
print("Degraded users in degraded in MRR score between Hybrid and to BPRMF:", np.count_nonzero(BPRMF_mrr_score > Hybrid_A_mrr_score))
print("Not changed users in MRR score between Hybrid and to BPRMF:", np.count_nonzero(BPRMF_mrr_score == Hybrid_A_mrr_score))

print("--------------------------------------------------------")

Hybrid_B_mrr_score = mrr_score(HybridModelB(IMF, BPRMF), test_dataset, train=rating_dataset, k=100, verbose=False)
print("MRR Score of Hybrid Pipeline(IMF, BPRMF):", Hybrid_B_mrr_score.mean())

print("Improved users in MRR score between Pipeline and BPRMF:", np.count_nonzero(BPRMF_mrr_score < Hybrid_B_mrr_score))
print("Degraded users in MRR score between Pipeline and to BPRMF:", np.count_nonzero(BPRMF_mrr_score > Hybrid_B_mrr_score))
print("Not changed users in MRR score between Pipeline and to BPRMF:", np.count_nonzero(BPRMF_mrr_score == Hybrid_B_mrr_score))

MRR Score of BPRMF(Baseline): 0.41691597413244963
--------------------------------------------------------
MRR Score of Hybrid(IMF, BPRMF): 0.4158716450378546
Improved users in MRR score between Hybrid and BPRMF: 731
Degraded users in degraded in MRR score between Hybrid and to BPRMF: 737
Not changed users in MRR score between Hybrid and to BPRMF: 531
--------------------------------------------------------
MRR Score of Hybrid Pipeline(IMF, BPRMF): 0.4205794627838617
Improved users in MRR score between Pipeline and BPRMF: 584
Degraded users in MRR score between Pipeline and to BPRMF: 184
Not changed users in MRR score between Pipeline and to BPRMF: 1231


# Part-B. Analysing Recommendation Models

## Utility methods

In [38]:
from typing import Sequence, Tuple

def get_top_K(model, uid : int, k : int) -> Tuple[ Sequence[int], Sequence[float],  np.ndarray ] :
  #returns iids, their (normalised) scores in descending order, and item emebddings for the top k predictions of the given uid.

  from sklearn.preprocessing import minmax_scale

  from scipy.stats import rankdata
  # get scores from model
  scores = model.predict(uid)

  # map scores into rank 0..1 over the entire item space
  scores = minmax_scale(scores)

  #compute their ranks  
  ranks = rankdata(-scores)
  
  # get and filter iids, scores and embeddings
  rtr_scores = scores[ranks <= k]
  rtr_iids = np.argwhere(ranks <= k).flatten()
  if hasattr(model, '_net'):
    embs = model._net.item_embeddings.weight[rtr_iids]
  else:
    # not a model that has any embeddings
    embs = np.zeros([k,1])
  
  # identify correct ordering using numpy.argsort()
  ordering = (-1*rtr_scores).argsort()
  
  #return iids, scores and their embeddings in descending order of score
  return rtr_iids[ordering], rtr_scores[ordering], embs[ordering]

if BPRMF is not None:
  iids, scores, embs = get_top_K(BPRMF, 0, 10)
  print("Returned iids: %s" % str(iids))
  print("Returned scores: %s" % str(scores))
  print("Returned embeddings: %s" % str(embs))
else:
  print("You need to define BPRMF")



Returned iids: [143 108  21  23  11  33 106  97  77  67]
Returned scores: [1.         0.97905004 0.9210978  0.862887   0.85853875 0.8585235
 0.8488022  0.84497577 0.83934355 0.83848405]
Returned embeddings: tensor([[-1.0811,  0.6025,  2.5012, -2.3210, -1.5985,  0.9085,  1.2390,  2.2919,
         -0.8870,  2.5550,  0.8409,  0.8943, -1.2516, -2.1674, -0.9994,  1.2998,
          2.2562,  0.8593, -0.6683,  0.3298,  3.2986,  0.4097,  1.2702,  0.2691,
         -2.6738,  0.8278,  0.2319,  1.3409,  1.5672, -1.2236,  0.5193,  0.0585],
        [ 0.1896, -0.1596,  1.0429, -1.5166, -0.4200,  2.2973,  2.1719,  0.7192,
         -0.5162,  2.4166,  1.8897,  0.4072, -1.3310, -1.9191, -1.9621,  0.8644,
          1.5880,  0.5864, -1.7263, -0.0420,  2.5340, -1.5941,  1.8585,  1.6045,
         -2.6654,  1.1989,  0.1617,  0.7946,  1.2572, -2.1432,  0.7005, -0.6581],
        [-0.3373,  0.0151,  1.9853, -2.3552, -1.2441,  0.6865,  0.9093,  1.2820,
         -0.4512,  1.7876,  1.0648, -0.1178, -1.2242, -2.3798,

## Task 3. Evaluation of Non-personalised Models
Implement the following four (non-personalised) baselines for ranking books based on their statistics:
 - Average rating, obtained from ratings_df, `ratings` column
 - Number of ratings, obtained from books_df (column `ratings_count`)
 - Number of 5* ratings, obtained from books_df (column `ratings_5`)
 - Fraction of 5* ratings, calculated from the two sources of evidence above, i.e (columns  `ratings_5` and `ratings_count`).

Evaluate these in terms of MRR using the provided test data. You may use the StaticModel class below. 


In [39]:
class StaticModel:
  
  def __init__(self, staticscores):
    self.numitems = len(staticscores)
    #print(self.numitems)
    assert isinstance(staticscores, np.ndarray), "Expected a numpy array"
    assert staticscores.dtype == np.float32 or staticscores.dtype == np.float64, "Expected a numpy array of floats"
    self.staticscores = staticscores
  
  def predict(self, uid):
    #this model returns the same scores for each user    
    return self.staticscores

In [41]:
#Average rating, obtained from ratings_df, ratings column

avg_book_rating = ratings_df.groupby(['book_id']).mean() # Average rating by group book_id
all_book_array = ratings_df['book_id'].unique() # Get all unique book_id
avg_rating_array = np.zeros_like(all_book_array).astype(np.float32) # Declare zeros array for input value

for bid in all_book_array:
    avg_rating_array[iid_map[bid]] = avg_book_rating.loc[bid]['rating'] # Get average rating value 

avg_rating_mrr = mrr_score(StaticModel(avg_rating_array), test_dataset, train=rating_dataset, k=100, verbose=False).mean()
print("MRR Score of Average rating:", avg_rating_mrr)

#Number of ratings, obtained from books_df (column ratings_count)

no_rating_array = np.zeros_like(all_book_array).astype(np.float32) # Declare zeros array for input value

for bid in all_book_array:
    no_rating_array[iid_map[bid]] = books_df[books_df['book_id']==bid]['ratings_count'] # Get rating count value 

no_rating_mrr = mrr_score(StaticModel(no_rating_array), test_dataset, train=rating_dataset, k=100, verbose=False).mean()
print("MRR Score of Number of ratings:", no_rating_mrr)

#Number of 5* ratings, obtained from books_df (column ratings_5)

no_5_rating_array = np.zeros_like(all_book_array).astype(np.float32) # Declare zeros array for input value

for bid in all_book_array:
    no_5_rating_array[iid_map[bid]] = books_df[books_df['book_id']==bid]['ratings_5'] # Get 5 star rating value 
  
no_5_rating_mrr = mrr_score(StaticModel(no_5_rating_array), test_dataset, train=rating_dataset, k=100, verbose=False).mean()
print("MRR Score of Number of 5 ratings:", no_5_rating_mrr)

#Fraction of 5* ratings, calculated from the two sources of evidence above, i.e (columns ratings_5 and ratings_count).

books_df['fraction_rating'] = 0 # Declare new columns in dataframe for input value

for item in range(len(all_book_array)): # Calculate fraction rating by rating_5 divided rating_count
    books_df.loc[item, 'fraction_rating'] = (books_df['ratings_5'][item] / books_df['ratings_count'][item])

frac_rating = books_df['fraction_rating'].values.astype(np.float32) # Get fraction rating value from dataframe

frac_rating_mrr = mrr_score(StaticModel(frac_rating), test_dataset, train=rating_dataset, k=100, verbose=False).mean()
print("MRR Score of Fraction(5*and no ratings):", frac_rating_mrr)

MRR Score of Average rating: 0.015052024168984034
MRR Score of Number of ratings: 0.2396001188245477
MRR Score of Number of 5 ratings: 0.2409670879930144
MRR Score of Fraction(5*and no ratings): 0.03138372904213526


## 4. Qualiatively Examining Recommendations


In Recommender Systems, the ground truth (i.e. our list of books that the user has added to their "to_read" shelf) can be very incomplete. For instance, this can be because the user is not aware of the book yet.

For this reason, it is important to "eyeball" the recommendations, to understand what the system is surfacing, and whether the recommendations make sense. In this way, we understand if the recommendations are reasonable, even if they are for books that the user has not actually read according to the test dataset.

First, write a function, which given a uid (int), prints the *title and authors* of:
 - (a) the books that the user has previously shelved (c.f. `toread_dataset`)
 - (b) the books that the user will read in the future (c.f. `test_dataset`)
 - (c) the top 10 books that the user were recommended by `BPRMF` - you can make use of `get_top_K()`.

You can use the previously defined `getAuthorTitle()` function in your solution.
You will also want to compare books in (c) with those in (a) and (b).

Then, we will examine two specific users, namely uid 1805 (u336) and uid 179 (user u1331), to analyse if their recommendations make sense.

In [50]:
def recommend_read(uid : int):
  user_prev_pos = np.argwhere(toread_dataset.user_ids==uid) # Get position of user previously read books by search uid in toread_dataset
  user_prev_book = [item[0] for item in toread_dataset.item_ids[user_prev_pos]] # Get list of book_id by using position of user previously read books
  prev_list = list()

  print("The books that the user uid " + str(uid) + " ("+ uid_rev_map.get(uid) +") has previously shelved.")
  for index, bid in enumerate(user_prev_book):
    print(str(index+1) + "." + getAuthorTitle(bid))
    prev_list.append(getAuthorTitle(bid)) # Store author and title of books that user previously read

  print("-------------------------------------------------------------")

  user_future_pos = np.argwhere(test_dataset.user_ids==uid) # Get position of user that will read books in the future by search uid in test_dataset
  user_future_book = [item[0] for item in test_dataset.item_ids[user_future_pos]] # Get list of book_id by using position of user that will read books in the future
  future_list = list()

  print("The books that the user uid " + str(uid) + " ("+ uid_rev_map.get(uid) +") will read in the future.")
  for index, bid in enumerate(user_future_book):
    print(str(index+1) + "." + getAuthorTitle(bid))
    future_list.append(getAuthorTitle(bid)) # Store author and title of books that user will read in the future

  print("-------------------------------------------------------------")
  iids, scores, embs = get_top_K(BPRMF, uid, 10) # Get top 10 books that user were recommend by BPRMF
  rec_list = list()

  print("The top 10 books that the user uid " + str(uid) + " ("+ uid_rev_map.get(uid) +") were recommended by BPRMF.")
  for index, bid in enumerate(iids):
    print(str(index+1) + "." + getAuthorTitle(bid))
    rec_list.append(getAuthorTitle(bid)) # Store author and title of books that recommend by BPRMF

  print("-------------------------------------------------------------")
  print("Comparing recommend books list with previous and future list")

  for index, book in enumerate(rec_list):
    if book in prev_list:
      print("[PREVIOUS] " + str(index+1) + "."+ book) # If recommended book already in previous list add [PREVIOUS]
    elif book in future_list:
      print("[FUTURE] " + str(index+1) + "."+ book) # If recommended book is in future list add [FUTURE]
    else:
      print("[RECOMMEND] " + str(index+1) + "."+ book) # If recommended book is not in previous or future list then add [FUTURE]

uid_test = 1805
uid_mrr = mrr_score(BPRMF, test_dataset, train=rating_dataset, k=100, verbose=False).mean()
print("MRR score of uid " + str(uid_test) + " ("+ uid_rev_map.get(uid_test) +")" + str(uid_mrr))
recommend_read(uid_test)

MRR score of uid 1805 (u336)0.41691597413244963
The books that the user uid 1805 (u336) has previously shelved.
1.Stieg Larsson, Reg Keeland / The Girl Who Kicked the Hornet's Nest (Millennium, #3)
2.Suzanne Collins / Mockingjay (The Hunger Games, #3)
3.Dennis Lehane / Shutter Island
4.Suzanne Collins / Catching Fire (The Hunger Games, #2)
5.Paula Hawkins / The Girl on the Train
6.Robert Ludlum / The Bourne Supremacy (Jason Bourne, #2)
7.John Grisham / The Client
8.Thomas Harris / The Silence of the Lambs  (Hannibal Lecter, #2)
9.Daphne du Maurier, Sally Beauman / Rebecca
10.Robert Ludlum / The Bourne Identity (Jason Bourne, #1)
11.Robert Galbraith, J.K. Rowling / The Cuckoo's Calling (Cormoran Strike, #1)
12.Stephen King / Misery
13.Michael Crichton / Jurassic Park (Jurassic Park, #1)
14.Robert Ludlum / The Bourne Ultimatum (Jason Bourne, #3)
15.Stephen King, Bernie Wrightson / The Stand
16.Michael Crichton / The Andromeda Strain
17.Thomas Harris / Red Dragon (Hannibal Lecter, #1)
18.

# Part-C. Diversity of Recommendations

## 5. Measuring Intra-List Diversity


For the BPR implicit factorisation model, implement the Intra-list diversity measure of the top 5 scored items based on their item embeddings in the `BPRMF` model. 

where:
 - `top_books` is a list or a Numpy array of iids that have been returned for a particular user. For instance, it can be obtained from `get_top_K()`.
 - `K` is the number of top-ranked items to consider from `top_books`. 
 - Your implementation should use the item emebddings stored in the `BPRMF` model.

In [54]:
import torch.nn as nn

def measure_ild(top_books : Sequence[int], K : int=5) -> float:
  distance_sum = 0
  # Calculate cosine similarity between top_books list
  for item1 in range(K-1):
    for item2 in range((item1),K): 
      cosim = nn.functional.cosine_similarity(
          BPRMF._net.item_embeddings.weight[top_books[item1]],
          BPRMF._net.item_embeddings.weight[top_books[item2]], dim=0)
      distance_sum += (1-cosim.item())
  # Calculate ILD with K and total distance for all top_books
  ILD = ( 2 / ( K * ( K - 1 ) ) ) * distance_sum
  return ILD

uid = 1805
k = 5
top_books, scores, embs = get_top_K(BPRMF, uid, k) # Get top_books with input uid and k

print("The top 5 books of user uid " + str(uid) + " ("+ uid_rev_map.get(uid) +") are.")

for index, bid in enumerate(top_books):
  print(str(index+1) + "." + getAuthorTitle(bid) + " iid " + str(bid) + " book_id (" + iid_rev_map.get(bid) +")")
user_ild = measure_ild(top_books)
print("The Intra-list diversity measure for user (ILD)" + str(uid) + " ("+ uid_rev_map.get(uid) +") is: " + str(user_ild))

The top 5 books of user uid 1805 (u336) are.
1.Scott Turow / Presumed Innocent iid 1078 book_id (b966)
2.Lee Child / Running Blind (Jack Reacher, #4) iid 1255 book_id (b1870)
3.Lee Child, Dick Hill / Without Fail (Jack Reacher, #6) iid 1136 book_id (b1617)
4.John Grisham / The Firm (Penguin Readers, Level 5) iid 51 book_id (b123)
5.Lee Child / Killing Floor (Jack Reacher, #1) iid 1251 book_id (b477)
The Intra-list diversity measure for user (ILD)1805 (u336) is: 0.5686542078852653


In [55]:
uid = 179
k = 5
top_books, scores, embs = get_top_K(BPRMF, uid, k) # Get top_books with input uid and k

print("The top 5 books of user uid " + str(uid) + " ("+ uid_rev_map.get(uid) +") are.")

for index, bid in enumerate(top_books):
  print(str(index+1) + "." + getAuthorTitle(bid) + " iid " + str(bid) + " book_id (" + iid_rev_map.get(bid) +")")
user_ild = measure_ild(top_books)
print("The Intra-list diversity measure for user (ILD)" + str(uid) + " ("+ uid_rev_map.get(uid) +") is: " + str(user_ild))

The top 5 books of user uid 179 (u1331) are.
1.J.K. Rowling, Mary GrandPré / Harry Potter and the Deathly Hallows (Harry Potter, #7) iid 581 book_id (b25)
2.J.K. Rowling, Mary GrandPré / Harry Potter and the Goblet of Fire (Harry Potter, #4) iid 11 book_id (b24)
3.J.K. Rowling, Mary GrandPré / Harry Potter and the Half-Blood Prince (Harry Potter, #6) iid 189 book_id (b27)
4.J.K. Rowling, Mary GrandPré / Harry Potter and the Chamber of Secrets (Harry Potter, #2) iid 10 book_id (b23)
5.J.K. Rowling, Mary GrandPré / Harry Potter and the Sorcerer's Stone (Harry Potter, #1) iid 9 book_id (b2)
The Intra-list diversity measure for user (ILD)179 (u1331) is: 0.1023072898387909


## 6. Implement MMR Diversification 

Develop an Maximal Marginal Relevance (M**M**R) diversification technique, to re-rank the top-ranked recommendations for a given user.

where iids is a list of iids, scores are their corresponding scores (in descending order), embs is their embeddings, and alpha controls the diversification tradeoff. The function returns a re-ordering of iids. As in previous Exercises, type hints are provided for clarity; a Sequence can be a list or numpy array. 

In [56]:
from typing import Sequence
def mmr(iids : Sequence[int], scores : Sequence[float], embs : np.ndarray, alpha : float) -> Sequence[int]:

  assert len(iids) == len(scores)
  assert len(iids) == embs.shape[0]
  assert len(embs.size()) == 2

  #input your solution here returns a re-ordering of iids, such that the first ranked item is first in the list
  
  # Get first position by finding maximum score
  get_max_pos = np.argwhere(scores==np.max(scores))[0] 
  # Input first position in set
  rtr_pos_set = {pos.item() for pos in get_max_pos}
  # Input first iids in list
  rtr_iids_list = [iids[pos.item()] for pos in get_max_pos]  

  for item1 in range(len(iids)-1):
    topmmr_pos = -5
    topmmr = -5
    for item2 in range(len(iids)):
      if item2 in rtr_pos_set:
        # If target compare item in the set then skip
        continue
        #print(item2)

      max_list = list()
      # List of compare between item in set with other item that not in set
      for pos in rtr_pos_set:
        max_list.append(nn.functional.cosine_similarity(embs[item2], embs[pos], dim=0))
      # Get maximum similarity value from list of compare item
      maxsim = np.max(np.asarray(max_list))
      
      # Calculate MMR score for target item
      mmr = ( alpha * scores[item2] ) - ( ( 1.0 - alpha ) * maxsim )  

      # If MMR score of target item more than current top MMR score then replace
      if mmr > topmmr:
        topmmr_pos = item2
        topmmr = mmr
        #print(topmmr_pos)
        #print(topmmr)

    # After find highest MMR score and their position then put in set and list
    if topmmr_pos != -5:
      rtr_pos_set.add(topmmr_pos)
      rtr_iids_list.append(iids[topmmr_pos])  

  #print(rtr_pos_set)
  #print(rtr_iids_list)
  return rtr_iids_list

Now we can analyse the impact of our MMR implementation. Let's consider again uid 179 (user u1331). 

Apply MMR on the top 10 results obtained from the BPRMF model using `get_top_K()`, with an alpha value of 0.5. The following code should help:
```python
mmr( *get_top_K(bprmodel, 179, 10), 0.5)
```

Finally, anayse the returned books. Calculate the ILD (with `k=5`), and examine the authors and titles (using `getAuthorTitle()`). 

In [580]:
#add your solution here

top_k = get_top_K(BPRMF, 179, 10)
print("Top 10 result: ", top_k[0])

ild_top_k = measure_ild(top_k[0], K=5)
print("ILD of TOP_K: ", ild_top_k)

for index,mid in enumerate(top_k[0][0:5]):
  print(str(index+1) + "." + getAuthorTitle(mid))

print("------------------------------------------------------------------")

top_mmr = mmr( *get_top_K(BPRMF, 179, 10), 0.5)
print("Top 10 result after MMR", top_mmr)

ild_top_mmr = measure_ild(top_mmr, K=5)
print("ILD of MMR: ", ild_top_mmr)

for index,mid in enumerate(top_mmr[0:5]):
  print(str(index+1) + "." + getAuthorTitle(mid) + " iid " + str(mid))

Top 10 result:  [581  11 189  10   9 143   8  33  75  21]
ILD of TOP_K:  0.10230576992034912
1.J.K. Rowling, Mary GrandPré / Harry Potter and the Deathly Hallows (Harry Potter, #7)
2.J.K. Rowling, Mary GrandPré / Harry Potter and the Goblet of Fire (Harry Potter, #4)
3.J.K. Rowling, Mary GrandPré / Harry Potter and the Half-Blood Prince (Harry Potter, #6)
4.J.K. Rowling, Mary GrandPré / Harry Potter and the Chamber of Secrets (Harry Potter, #2)
5.J.K. Rowling, Mary GrandPré / Harry Potter and the Sorcerer's Stone (Harry Potter, #1)
------------------------------------------------------------------
Top 10 result after MMR [581, 33, 75, 11, 143, 10, 189, 9, 8, 21]
ILD of MMR:  0.2888532817363739
1.J.K. Rowling, Mary GrandPré / Harry Potter and the Deathly Hallows (Harry Potter, #7) iid 581
2.George Orwell / Animal Farm iid 33
3.Dan Brown / Angels & Demons  (Robert Langdon, #1) iid 75
4.J.K. Rowling, Mary GrandPré / Harry Potter and the Goblet of Fire (Harry Potter, #4) iid 11
5.George Or