## Amazon and IMDB recommendation systems
**Adapted from UBC DSCI 563 (Unsupervised Learning), Lab 4**

In [7]:
import os
import json
import pickle
import numpy as np
import random
import altair as alt
import pandas as pd
import matplotlib.pyplot as plt
from IPython.core.display import display, HTML
from scipy.sparse import csr_matrix as sparse_matrix

from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
# Altair data-transformer configuration.
# NOTE(review): per the Altair docs, disable_max_rows() lifts the default cap on
# the number of rows a chart may embed -- confirm for the installed version.
alt.data_transformers.disable_max_rows()
# Alternative transformer, kept for reference but not enabled:
#alt.data_transformers.enable('data_server')
# Select the 'json' transformer; being the last expression of the cell, the
# returned registry object is what the cell displays.
alt.data_transformers.enable('json')

DataTransformerRegistry.enable('json')

## Load and Wrangle Data

In [3]:
# Amazon "Patio, Lawn and Garden" ratings. Column names are supplied through
# `names=`, so every line of the file is read as a data row.
filename = "ratings_Patio_Lawn_and_Garden.csv"
ratings_path = os.path.join("../data", filename)
ratings_columns = ("user", "item", "rating", "timestamp")

with open(ratings_path, "rb") as f:
    ratings = pd.read_csv(f, names=ratings_columns)

ratings.head()

Unnamed: 0,user,item,rating,timestamp
0,A2VNYWOPJ13AFP,981850006,5.0,1259798400
1,A20DWVV8HML3AW,981850006,5.0,1371081600
2,A3RVP3YBYYOPRH,981850006,5.0,1257984000
3,A28XY55TP3Q90O,981850006,5.0,1314144000
4,A3VZW1BGUQO0V3,981850006,5.0,1308268800


In [5]:
def get_stats(ratings, item_key="item", user_key="user"):
    """
    Print summary statistics for a ratings table and return its dimensions.

    Parameters:
    -----------
    ratings: pd.DataFrame
        one row per rating; must contain a "rating" column plus the
        columns named by `item_key` and `user_key`;
    item_key: string
        the column in ratings that contains the items id
    user_key: string
        the column in ratings that contains the users id

    Returns: (n, d)
    --------
    n: int
        the number of distinct items;
    d: int
        the number of distinct users;
    """
    num_ratings = len(ratings)
    print("Number of ratings:", num_ratings)
    print("The average rating:", np.mean(ratings["rating"]))

    # distinct ids on each axis of the (items x users) ratings matrix
    distinct_items = set(ratings[item_key])
    distinct_users = set(ratings[user_key])
    n, d = len(distinct_items), len(distinct_users)
    print("Number of users:", d)
    print("Number of items:", n)

    # a dense n-by-d matrix would hold n*d cells at 8 bytes each
    dense_cells = n * d
    print("Fraction nonzero:", num_ratings / dense_cells)
    print("Size of full X matrix (GB):", dense_cells * 8 / 1e9)

    return n, d


# get_stats returns (number of distinct items, number of distinct users)
n, d = get_stats(ratings)

def create_X(ratings, n, d, user_key="user", item_key="item"):
    """
    Creates a sparse item-by-user ratings matrix (scipy.sparse.csr_matrix) and
    the mappers that relate matrix indexes to user/item ids.

    Parameters:
    -----------
    ratings: pd.DataFrame
        the ratings to be stored in the matrix; must contain a "rating"
        column plus the columns named by `user_key` and `item_key`;
    n: int
        the number of items (rows of X); expected to equal the number of
        distinct ids in ratings[item_key]
    d: int
        the number of users (columns of X); expected to equal the number of
        distinct ids in ratings[user_key]
    user_key: string
        the column in ratings that contains the users id
    item_key: string
        the column in ratings that contains the items id

    Returns: (X, user_mapper, item_mapper, user_inverse_mapper, item_inverse_mapper, user_ind, item_ind)
    --------
    X: scipy.sparse.csr_matrix of shape (n, d)
        the sparse matrix containing the ratings; rows are items and
        columns are users.
    user_mapper: dict
        stores the indexes of the users - the user_id is the key;
    item_mapper: dict
        stores the indexes of the items - the item_id is the key;
    user_inverse_mapper: dict
        stores the user id - the user index is the key;
    item_inverse_mapper: dict
        stores the item id - the item index is the key;
    user_ind: list
        indexes of the users (in the order they are in ratings);
    item_ind: list
        indexes of the items (in the order they are in ratings);

    Notes:
    ------
    If n (or d) is smaller than the number of distinct ids, the surplus ids
    are left out of the mappers and the index lookup raises KeyError.
    """

    # np.unique sorts the whole column, so compute it once per key and reuse
    # the result for both the forward and the inverse mapper.
    unique_users = np.unique(ratings[user_key])
    unique_items = np.unique(ratings[item_key])

    user_mapper = dict(zip(unique_users, range(d)))
    item_mapper = dict(zip(unique_items, range(n)))

    user_inverse_mapper = dict(zip(range(d), unique_users))
    item_inverse_mapper = dict(zip(range(n), unique_items))

    # translate every rating's ids into matrix coordinates
    user_ind = [user_mapper[i] for i in ratings[user_key]]
    item_ind = [item_mapper[i] for i in ratings[item_key]]

    # (row, col) = (item, user): each row of X is one item's ratings vector
    X = sparse_matrix((ratings["rating"], (item_ind, user_ind)), shape=(n, d))

    return X, user_mapper, item_mapper, user_inverse_mapper, item_inverse_mapper, user_ind, item_ind

# Build the sparse ratings matrix (items as rows, users as columns) and the
# id <-> index lookup tables for the ratings loaded from `filename`.
X, user_mapper, item_mapper, user_inverse_mapper, item_inverse_mapper, user_ind, item_ind = create_X(ratings, n, d)

Number of ratings: 993490
The average rating: 4.006400668350965
Number of users: 714791
Number of items: 105984
Fraction nonzero: 1.3114269915944552e-05
Size of full X matrix (GB): 606.051274752


In [6]:
# sanity check
print(X.shape) # should be number of items by number of users
print(X.nnz)   # number of nonzero elements -- should equal number of ratings
print(f"Using sparse matrix data structure, the size of X is: {X.data.nbytes/1e6}mb")

(105984, 714791)
993490
Using sparse matrix data structure, the size of X is: 7.94792mb


In [23]:
# MovieLens "ml-latest-small" ratings: one row per (userId, movieId) rating.
movie_ratings = pd.read_csv(os.path.join("../data", "ml-latest-small", "ratings.csv"))
movie_n, movie_d = get_stats(movie_ratings, user_key="userId", item_key="movieId")

# NOTE: user_mapper, user_inverse_mapper and user_ind are re-bound here and
# from this point on refer to the movie users, not to the Amazon users.
movie_X, user_mapper, movie_mapper, user_inverse_mapper, movie_inverse_mapper, user_ind, movie_ind = create_X(movie_ratings, movie_n, movie_d, user_key="userId", item_key="movieId")

# Look the row up through the mapper rather than hard-coding index 0, so the
# selection stays correct even if the set of rated movies changes.
TOY_STORY_MOVIE_ID = 1  # movieId 1 is "Toy Story (1995)" in movies.csv
toy_story_ind = movie_mapper[TOY_STORY_MOVIE_ID]
toy_story_vec = movie_X[toy_story_ind]

# Movie metadata (title, genres) indexed by movieId.
movie_info = pd.read_csv(os.path.join("../data", "ml-latest-small", "movies.csv"),index_col=0)

Number of ratings: 100836
The average rating: 3.501556983616962
Number of users: 610
Number of items: 9724
Fraction nonzero: 0.016999683055613623
Size of full X matrix (GB): 0.04745312


## Amazon
### Mr Grill Euclidean distance
Use Euclidean distance to find the products most similar to the Mr Grill spatula ([product link](https://www.amazon.com/dp/B00IJB5MCS)).

In [8]:
# Amazon product id (ASIN) of the query item
grill_spatula = "B00IJB5MCS"
# row of X that holds this item's ratings (item_mapper: item id -> row index)
grill_spatula_ind = item_mapper[grill_spatula]
# the item's ratings across all users, used as the query vector
grill_spatula_vec = X[grill_spatula_ind]

In [9]:
# Nearest-neighbour model over the rows of X (one row per item), returning 7
# neighbours per query. No metric is passed; the estimator repr shown by this
# cell reports metric='minkowski' with p=2, i.e. Euclidean distance.
knn = NearestNeighbors(n_neighbors=7)
knn.fit(X)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=7, p=2,
                 radius=1.0)

In [10]:
# Query the fitted model with the spatula's ratings row.
# NOTE(review): per the scikit-learn docs kneighbors returns (distances, indices);
# the query row is part of the fitted data, so it presumably comes back as its
# own nearest neighbour -- confirm.
grill_spatular_knn = knn.kneighbors(grill_spatula_vec)

In [12]:
# Second element of the kneighbors result, first (and only) query row,
# converted to a plain Python list of row indices into X.
grill_spatular_knn_idx = grill_spatular_knn[1][0].tolist()

#### Product recommendations

In [13]:
# Turn each neighbour's row index back into its item id and show a clickable
# product link for it.
for neighbor_ind in grill_spatular_knn_idx:
    item_name = item_inverse_mapper[neighbor_ind]
    link = "https://www.amazon.com/dp/" + item_name
    display(HTML('<a href="%s">%s</a>' % (link, link)))

**Conclusion**

Euclidean distance does not do a great job of recommending similar products.

### Mr Grill Cosine Similarity

In [16]:
# Same 7-neighbour model over the rows of X, now using cosine distance.
# This re-binds `knn`, replacing the earlier default-metric model.
knn = NearestNeighbors(n_neighbors=7, metric="cosine")
knn.fit(X)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=7, p=2,
                 radius=1.0)

In [17]:
# Query with the spatula's ratings row; `knn` must be the cosine-metric model
# fitted in the preceding cell for this result to be cosine-based.
grill_spatular_knn_cosine = knn.kneighbors(grill_spatula_vec)

In [18]:
# Second element of the kneighbors result, first (and only) query row,
# converted to a plain Python list of row indices into X.
grill_spatular_knn_cosine_idx = grill_spatular_knn_cosine[1][0].tolist()

**Product Recommendation**

In [19]:
# Turn each cosine neighbour's row index back into its item id and show a
# clickable product link for it.
for neighbor_ind in grill_spatular_knn_cosine_idx:
    item_name = item_inverse_mapper[neighbor_ind]
    link = "https://www.amazon.com/dp/" + item_name
    display(HTML('<a href="%s">%s</a>' % (link, link)))

**Conclusion**

Cosine similarity does a better job of recommending similar items than Euclidean distance.

## IMDB movie recommendations
### Toy Story Euclidean Distance 

In [25]:
# Preview the movie metadata: indexed by movieId, with title and genres columns
movie_info.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [26]:
# 7-neighbour model over the rows of movie_X (one row per movie); no metric is
# passed, so the estimator's default is used.
knn = NearestNeighbors(n_neighbors=7)
knn.fit(movie_X)
# Query with Toy Story's ratings row; the result is indexed as [1][0] in the
# next cell to obtain the neighbours' row indices.
toy_story_knn = knn.kneighbors(toy_story_vec)

In [29]:
# Second element of the kneighbors result, first (and only) query row,
# converted to a plain Python list of row indices into movie_X.
toy_story_knn_idx = toy_story_knn[1][0].tolist()

In [30]:
# Map each neighbour's row index back to its movieId, keeping neighbour order.
movie_id = [movie_inverse_mapper[neighbor_ind] for neighbor_ind in toy_story_knn_idx]

**Movie Recommendation**

In [31]:
# Title and genres of the neighbour movies, listed in the order of movie_id
movie_info.loc[movie_id]

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy
648,Mission: Impossible (1996),Action|Adventure|Mystery|Thriller
780,Independence Day (a.k.a. ID4) (1996),Action|Adventure|Sci-Fi|Thriller
2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy
788,"Nutty Professor, The (1996)",Comedy|Fantasy|Romance|Sci-Fi
1073,Willy Wonka & the Chocolate Factory (1971),Children|Comedy|Fantasy|Musical


### Toy Story Cosine Similarity

In [32]:
# Same 7-neighbour model over the rows of movie_X, now using cosine distance.
# This re-binds `knn`, replacing the earlier default-metric model.
knn = NearestNeighbors(n_neighbors=7, metric="cosine")
knn.fit(movie_X)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=7, p=2,
                 radius=1.0)

In [33]:
# Query the cosine model with Toy Story's ratings row, then keep the
# neighbours' row indices (second element, first query row) as a plain list.
toy_story_knn_cosine = knn.kneighbors(toy_story_vec)
toy_story_knn_cosine_idx = toy_story_knn_cosine[1][0].tolist()

In [34]:
# Map each cosine neighbour's row index back to its movieId, keeping neighbour order.
movie_id = [movie_inverse_mapper[neighbor_ind] for neighbor_ind in toy_story_knn_cosine_idx]

**Movie Recommendation**

In [37]:
# Title and genres of the cosine-neighbour movies, listed in the order of movie_id
movie_info.loc[movie_id]

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy
480,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller
780,Independence Day (a.k.a. ID4) (1996),Action|Adventure|Sci-Fi|Thriller
260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
356,Forrest Gump (1994),Comedy|Drama|Romance|War
364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX


**Conclusion**

Cosine similarity tends to recommend movies that are more critically acclaimed rather than child-friendly, because it takes the star ratings into consideration. Euclidean distance does the best job here.