In [1]:
# Imported for our sanity
import warnings
warnings.filterwarnings('ignore')

# Our regular old heroes 
import numpy as np
import pandas as pd
import scipy as sp # <-- The sister of Numpy, used in our code for numerical efficientcy. 
import matplotlib.pyplot as plt
import seaborn as sns

# Entity featurization and similarity computation
from sklearn.metrics.pairwise import cosine_similarity 
from sklearn.feature_extraction.text import TfidfVectorizer

# Libraries used during sorting procedures.
import operator # <-- Convienient item retrieval during iteration 
import heapq # <-- Efficient sorting of large lists

## Content-based

In [4]:
def data_preprocessing(subset_size):
    """Prepare data for use within Content filtering algorithm.
    Parameters
    ----------
    subset_size : int
        Number of movies to use within the algorithm.
    Returns
    -------
    Pandas Dataframe
        Subset of movies selected for content-based filtering.
    """
    # Split genre data into individual words.
    movies['keyWords'] = movies['genres'].str.replace('|', ' ')
    # Subset of the data
    movies_subset = movies[:subset_size]
    return movies_subset

movies = pd.read_csv('edsa-recommender-system-predict/movies.csv')
movies = data_preprocessing(10000)

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres,keyWords
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy,Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance,Comedy Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy,Comedy


In [5]:
titles = movies['title']
indices = pd.Series(movies.index, index=movies['title'])

In [6]:
tf = TfidfVectorizer(analyzer='word', ngram_range=(1,2), min_df=0, stop_words='english')
tf_keyWords_matrix = tf.fit_transform(movies['keyWords'])

In [7]:
cosine_sim_keyWords = cosine_similarity(tf_keyWords_matrix, tf_keyWords_matrix)
print (cosine_sim_keyWords.shape)

(10000, 10000)


In [8]:
def content_generate_top_N_recommendations(movie_title, N=10):
    # Convert the string movie title to a numeric index for our similarity matrix
    b_idx = indices[movie_title]
    # Extract all similarity values computed with the reference movie title
    sim_scores = list(enumerate(cosine_sim_keyWords[b_idx]))
    # Sort the values, keeping a copy of the original index of each value
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # Select the top-N values for recommendation
    sim_scores = sim_scores[1:N]
    # Collect indexes 
    movie_indices = [i[0] for i in sim_scores]
    # Convert the indexes back into titles 
    return titles.iloc[movie_indices]

In [10]:
content_generate_top_N_recommendations('Toy Story (1995)', N=10)

2203                                          Antz (1998)
3021                                   Toy Story 2 (1999)
3653       Adventures of Rocky and Bullwinkle, The (2000)
3912                     Emperor's New Groove, The (2000)
4780                                Monsters, Inc. (2001)
9949    DuckTales: The Movie - Treasure of the Lost La...
8748    Twelve Tasks of Asterix, The (Les douze travau...
4201                                         Shrek (2001)
2051                             American Tail, An (1986)
Name: title, dtype: object

In [23]:
def content_generate_rating_estimate(movie_title, user, rating_data, k=20, threshold=0.0):
    # Convert the book title to a numeric index for our 
    # similarity matrix
    b_idx = indices[movie_title]
    neighbors = [] # <-- Stores our collection of similarity values 
     
    # Gather the similarity ratings between each book the user has rated
    # and the reference book 
    for index, row in rating_data[rating_data['userId']==user].iterrows():
        sim = cosine_sim_keyWords[b_idx-1, indices[row['title']]-1]
        neighbors.append((sim, row['rating']))
    # Select the top-N values from our collection
    k_neighbors = heapq.nlargest(k, neighbors, key=lambda t: t[0])

    # Compute the weighted average using similarity scoress and 
    # user item ratings. 
    simTotal, weightedSum = 0, 0
    for (simScore, rating) in k_neighbors:
        # Ensure that similarity ratings are above a given threshold
        if (simScore > threshold):
            simTotal += simScore
            weightedSum += simScore * rating
    try:
        predictedRating = weightedSum / simTotal
    except ZeroDivisionError:
        # Cold-start problem - No ratings given by user. 
        # We use the average rating for the reference item as a proxy in this case 
        predictedRating = np.mean(rating_data[rating_data['title']==movie_title]['rating'])
    return predictedRating

In [100]:
movie_ratings = pd.read_csv('edsa-recommender-system-predict/train.csv')
movie_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,5163,57669,4.0,1518349992
1,106343,5,4.5,1206238739
2,146790,5459,5.0,1076215539
3,106362,32296,2.0,1423042565
4,9041,366,3.0,833375837


In [101]:
# Subset of ratings from user 314
movie_ratings = movie_ratings.merge(movies,on='movieId')
test_set = movie_ratings[movie_ratings['userId'] == 314][3:10]
test_set

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,keyWords
1432459,314,28,5.0,843578031,Persuasion (1995),Drama|Romance,Drama Romance
1727322,314,552,3.0,843575928,"Three Musketeers, The (1993)",Action|Adventure|Comedy|Romance,Action Adventure Comedy Romance
2256597,314,497,5.0,843574174,Much Ado About Nothing (1993),Comedy|Romance,Comedy Romance
2973449,314,648,3.0,848713107,Mission: Impossible (1996),Action|Adventure|Mystery|Thriller,Action Adventure Mystery Thriller
3336050,314,586,3.0,843573190,Home Alone (1990),Children|Comedy,Children Comedy
3397160,314,17,5.0,843573345,Sense and Sensibility (1995),Drama|Romance,Drama Romance
3671516,314,252,4.0,843573423,I.Q. (1994),Comedy|Romance,Comedy Romance


In [30]:
title = "Persuasion (1995)"
actual_rating = test_set[(test_set['userId'] == 314) & (test_set['title'] == title)]['rating'].values[0]
pred_rating = content_generate_rating_estimate(movie_title=title, user=314, rating_data=movie_ratings)
print (f"Title - {title}")
print ("---")
print (f"Actual rating: \t\t {actual_rating}")
print (f"Predicted rating: \t {pred_rating}")

Title - Persuasion (1995)
---
Actual rating: 		 5.0
Predicted rating: 	 4.236621545287473


In [31]:
title = "Home Alone (1990)"
actual_rating = test_set[(test_set['userId'] == 314) & (test_set['title'] == title)]['rating'].values[0]
pred_rating = content_generate_rating_estimate(movie_title=title, user=314, rating_data=movie_ratings)
print (f"Title - {title}")
print ("---")
print (f"Actual rating: \t\t {actual_rating}")
print (f"Predicted rating: \t {pred_rating}")

Title - Home Alone (1990)
---
Actual rating: 		 3.0
Predicted rating: 	 3.338192124908175


## Collaborative

In [102]:
movie_ratings= movie_ratings[:20000]
movie_ratings

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,keyWords
0,106343,5,4.5,1206238739,Father of the Bride Part II (1995),Comedy,Comedy
1,113437,5,4.0,834660257,Father of the Bride Part II (1995),Comedy,Comedy
2,29201,5,3.0,852207199,Father of the Bride Part II (1995),Comedy,Comedy
3,85599,5,3.0,866033371,Father of the Bride Part II (1995),Comedy,Comedy
4,139773,5,3.0,1463968262,Father of the Bride Part II (1995),Comedy,Comedy
...,...,...,...,...,...,...,...
19995,45602,33493,2.0,1238862687,Star Wars: Episode III - Revenge of the Sith (...,Action|Adventure|Sci-Fi,Action Adventure Sci-Fi
19996,22769,33493,4.5,1507405951,Star Wars: Episode III - Revenge of the Sith (...,Action|Adventure|Sci-Fi,Action Adventure Sci-Fi
19997,45960,33493,2.5,1120078301,Star Wars: Episode III - Revenge of the Sith (...,Action|Adventure|Sci-Fi,Action Adventure Sci-Fi
19998,27085,33493,3.0,1490300998,Star Wars: Episode III - Revenge of the Sith (...,Action|Adventure|Sci-Fi,Action Adventure Sci-Fi


In [103]:
util_matrix = movie_ratings.pivot_table(index=['userId'], columns=['title'], values='rating') 
util_matrix.shape

(17612, 8)

In [104]:
# Normalize each row (a given user's ratings) of the utility matrix
util_matrix_norm = util_matrix.apply(lambda x: (x-np.mean(x))/(np.max(x)-np.min(x)), axis=1)
# Fill Nan values with 0's, transpose matrix, and drop users with no ratings
util_matrix_norm.fillna(0, inplace=True)
util_matrix_norm = util_matrix_norm.T
util_matrix_norm = util_matrix_norm.loc[:, (util_matrix_norm != 0).any(axis=0)]
# Save the utility matrix in scipy's sparse matrix format
util_matrix_sparse = sp.sparse.csr_matrix(util_matrix_norm.values)

In [105]:
# Compute the similarity matrix using the cosine similarity metric
user_similarity = cosine_similarity(util_matrix_sparse.T)
# Save the matrix as a dataframe to allow for easier indexing  
user_sim_df = pd.DataFrame(user_similarity, index = util_matrix_norm.columns, columns = util_matrix_norm.columns)
# Review a small portion of the constructed similartiy matrix  
user_sim_df[:5]

userId,125,431,440,695,865,891,939,1067,1228,1401,...,160886,160896,160951,161342,161367,161583,162245,162387,162484,162516
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
125,1.0,0.5,0.5,0.0,0.5,-0.5,-0.5,-0.5,0.5,0.20739,...,0.5,0.5,-0.1066,-0.5,0.0,0.5,0.5,0.0,0.0,-0.5
431,0.5,1.0,-0.5,0.0,0.5,0.5,0.5,0.0,-0.5,-0.570323,...,-0.5,-0.5,-0.1066,0.0,-0.5,-0.5,-0.5,-0.5,0.0,-1.0
440,0.5,-0.5,1.0,0.0,0.0,-1.0,-1.0,-0.5,1.0,0.777714,...,1.0,1.0,0.0,-0.5,0.5,1.0,1.0,0.5,0.0,0.5
695,0.0,0.0,0.0,1.0,-0.5,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0
865,0.5,0.5,0.0,-0.5,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.426401,0.0,0.0,0.0,0.0,0.0,-0.5,-0.5


In [106]:
def collab_generate_top_N_recommendations(user, N=10, k=20):
    # Cold-start problem - no ratings given by the reference user. 
    # With no further user data, we solve this by simply recommending
    # the top-N most popular books in the item catalog. 
    if user not in user_sim_df.columns:
        return movie_ratings.groupby('title').mean().sort_values(by='rating',ascending=False).index[:N].to_list()
    
    # Gather the k users which are most similar to the reference user 
    sim_users = user_sim_df.sort_values(by=user, ascending=False).index[1:k+1]
    favorite_user_items = [] # <-- List of highest rated items gathered from the k users  
    most_common_favorites = {} # <-- Dictionary of highest rated items in common for the k users
    
    for i in sim_users:
        # Maximum rating given by the current user to an item 
        max_score = util_matrix_norm.loc[:, i].max()
        # Save the names of items maximally rated by the current user   
        favorite_user_items.append(util_matrix_norm[util_matrix_norm.loc[:, i]==max_score].index.tolist())
        
    # Loop over each user's favorite items and tally which ones are 
    # most popular overall.
    for item_collection in range(len(favorite_user_items)):
        for item in favorite_user_items[item_collection]: 
            if item in most_common_favorites:
                most_common_favorites[item] += 1
            else:
                most_common_favorites[item] = 1
    # Sort the overall most popular items and return the top-N instances
    sorted_list = sorted(most_common_favorites.items(), key=operator.itemgetter(1), reverse=True)[:N]
    top_N = [x[0] for x in sorted_list]
    return top_N  

In [107]:
# Our recommended list for user 314
collab_generate_top_N_recommendations(106343)

['Exorcist, The (1973)',
 'Star Wars: Episode III - Revenge of the Sith (2005)',
 'Pecker (1998)',
 'Orange County (2002)',
 'Father of the Bride Part II (1995)',
 'Men in Black II (a.k.a. MIIB) (a.k.a. MIB 2) (2002)',
 "Wes Craven's New Nightmare (Nightmare on Elm Street Part 7: Freddy's Finale, A) (1994)",
 'Miss Congeniality 2: Armed and Fabulous (2005)']

In [108]:
movie_ratings[movie_ratings['userId'] == 106343][:][['title','rating']]

Unnamed: 0,title,rating
0,Father of the Bride Part II (1995),4.5


In [112]:
def collab_generate_rating_estimate(movie_title, user, k=20, threshold=0.0):
    # Gather the k users which are most similar to the reference user 
    sim_users = user_sim_df.sort_values(by=user, ascending=False).index[1:k+1]
    # Store the corresponding user's similarity values 
    user_values = user_sim_df.sort_values(by=user, ascending=False).loc[:,user].tolist()[1:k+1]
    rating_list = [] # <-- List of k user's ratings for the reference item
    weight_list = [] # <-- List of k user's similarities to the reference user
    
    # Create a weighted sum for each of the k users who have rated the 
    # reference item (book).
    for sim_idx, user_id in enumerate(sim_users):
        # User's rating of the item
        rating = util_matrix.loc[user_id, movie_title]
        # User's similarity to the reference user 
        similarity = user_values[sim_idx]
        # Skip the user if they have not rated the item, or are too dissimilar to 
        # the reference user
        if (np.isnan(rating)) or (similarity < threshold):
            continue
        elif not np.isnan(rating):
            rating_list.append(rating*similarity)
            weight_list.append(similarity)
    try:
        # Return the weighted sum as the predicted rating for the reference item
        predicted_rating = sum(rating_list)/sum(weight_list) 
    except ZeroDivisionError:
        # If no ratings for the reference item can be collected, return the average 
        # rating given by all users for the item.  
        predicted_rating = np.mean(util_matrix[movie_title])
    return predicted_rating

In [121]:
title = "Pecker (1998)"
actual_rating = movie_ratings[(movie_ratings['userId'] == 162484) & (movie_ratings['title'] == title)]['rating'].values[0]
pred_rating = collab_generate_rating_estimate(movie_title = title, user = 162484)
print (f"Title - {title}")
print ("---")
print (f"Actual rating: \t\t {actual_rating}")
print (f"Predicted rating: \t {pred_rating}")

Title - Pecker (1998)
---
Actual rating: 		 3.5
Predicted rating: 	 3.5128756097151235


In [123]:
test = pd.read_csv('edsa-recommender-system-predict/test.csv')
test = pd.merge(test,movies[['movieId','title']],on='movieId', how='left')
test.head()

Unnamed: 0,userId,movieId
0,1,2011
1,1,4144
2,1,5767
3,1,6711
4,1,7318


In [None]:
ls = []
for i in subset.index:
    ls.append(collab_generate_rating_estimate(movie_title = subset.loc[i]['title'], user = subset.loc[i]['userId']))
test['rating'] = ls
test.head()