In [1]:
"""
Contains various recommendation implementations
all algorithms return a list of movieids
"""

import pandas as pd
import numpy as np
from utils import lookup_movieId, match_movie_title



In [2]:
# Example ratings entered by a new user: free-text movie name -> rating.
# The keys are resolved to catalogue titles with match_movie_title inside
# the recommender functions below.
user_rating = {
    'the lion king': 5,
    'terminator': 5,
    'star wars': 2
}

In [3]:
# User-item rating matrix; the first CSV column becomes the index.
# As displayed further down, rows are userIds, columns are movie ids and
# missing ratings are NaN.
user_mat = pd.read_csv('data/cleaned_user_item_matrix.csv', index_col=[0])

In [4]:
# Movie lookup table indexed by its first CSV column; it is joined on the
# integer movie id below and supplies the 'title' column.
dictionary = pd.read_csv('data/cleaned_movies_dictionary.csv', index_col=[0])

In [5]:
# Movie table indexed by 'movieid'; the functions below read its 'title'
# and 'cluster_no' columns.
movies = pd.read_csv('data/movies_clusters_ratings.csv', index_col='movieid')  

In [6]:
def unseen_movies(movies, user_rating):
    """
    Return the movies the user has not rated yet.

    Parameters
    ----------
    movies : pandas.DataFrame
        Movie table with a "title" column. Its current index is kept as a
        regular column in the result.
    user_rating : dict
        Maps user-entered movie names to ratings. Only the keys are used;
        each one is resolved to a catalogue title with ``match_movie_title``.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by "title" that lacks the rows whose title
        matched one of the user's entries. ``movies`` is not modified.

    Raises
    ------
    KeyError
        Raised by ``DataFrame.drop`` when a resolved title is not present
        among the titles of ``movies``.
    """
    catalogue_titles = list(movies["title"])
    # Iterating a dict yields its keys, so no DataFrame round trip is needed.
    rated_titles = [match_movie_title(entry, catalogue_titles) for entry in user_rating]

    # reset_index/set_index/drop each return a new frame, so the caller's
    # `movies` stays untouched without an explicit copy.
    by_title = movies.reset_index().set_index("title")
    return by_title.drop(rated_titles)

In [7]:
def recommend_random(movies, user_rating, k=5):
    """
    Return up to k random movies the user has not rated.

    Parameters
    ----------
    movies : pandas.DataFrame
        Movie table passed on to ``unseen_movies``.
    user_rating : dict
        User-entered movie names mapped to ratings.
    k : int, default 5
        Number of titles requested.

    Returns
    -------
    numpy.ndarray
        Distinct titles drawn without replacement. Fewer than ``k`` titles
        are returned when fewer unseen movies are available.
    """
    candidates = list(unseen_movies(movies, user_rating).index)
    # Sampling without replacement raises ValueError when the requested size
    # exceeds the population, so cap it at the number of candidates.
    n_picks = min(k, len(candidates))
    return np.random.choice(candidates, replace=False, size=n_picks)

In [8]:
def get_popularity(user_mat, dictionary):
    """
    Build a per-movie popularity table from the user-item matrix.

    Parameters
    ----------
    user_mat : pandas.DataFrame
        Ratings with one row per user and one column per movie. The column
        labels must be convertible to int; missing ratings are NaN.
    dictionary : pandas.DataFrame
        Movie information indexed by the integer movie id. All of its
        columns are appended to the result.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'movieid' with the columns
        'null_count' (number of users without a rating for the movie) and
        'total_top_rating' (number of ratings strictly above 3.5),
        followed by the columns of ``dictionary``.
    """
    # One row per movie, one column per user.
    ratings = user_mat.T.astype(float)

    # Both statistics are derived from the raw ratings only. Computing the
    # top-rating count after adding 'null_count' as a column would count
    # that helper column as one more "rating above 3.5".
    popular = pd.DataFrame(
        {
            'null_count': ratings.isna().sum(axis=1),
            'total_top_rating': ratings.gt(3.5).sum(axis=1),
        }
    )
    # Movie ids arrive as column labels (strings when read from CSV); cast
    # them so the join matches the integer index of `dictionary`.
    popular.index = pd.Index(popular.index.astype(int), name='movieid')

    return popular.join(dictionary)

In [9]:
def recommend_most_popular(user_rating, movies, k=5):
    """
    Return the k unseen movies with the highest 'total_top_rating'.

    The popularity columns produced by ``get_popularity`` are joined onto
    ``movies``, the user's rated titles are removed with ``unseen_movies``
    and the remainder is ranked in descending order.

    Reads the module-level ``user_mat`` and ``dictionary``.
    Returns a list of movie titles.
    """
    # Popularity is derived from the user-item matrix, so it has to be
    # joined onto the movie table before ranking.
    popularity_cols = get_popularity(user_mat, dictionary)[['null_count', 'total_top_rating']]
    candidates = unseen_movies(movies.join(popularity_cols), user_rating)

    ranked = candidates.sort_values('total_top_rating', ascending=False)
    return list(ranked.head(k).index)

In [10]:
# Demo: most popular unseen titles for the sample user (default k=5).
recommend_most_popular(user_rating, movies)

['Shawshank Redemption, The',
 'Forrest Gump',
 'Pulp Fiction',
 'Silence of the Lambs, The',
 'Matrix, The']

In [11]:
def get_cluster(movies, user_rating):
    """
    Return the cluster numbers of the movies the user has rated.

    Parameters
    ----------
    movies : pandas.DataFrame
        Movie table with "title" and "cluster_no" columns.
    user_rating : dict
        Maps user-entered movie names to ratings. Only the keys are used;
        each one is resolved to a catalogue title with ``match_movie_title``.

    Returns
    -------
    list
        One 'cluster_no' value per row of ``movies`` whose title matched a
        user entry, so the same cluster can appear more than once.
    """
    catalogue_titles = list(movies["title"])
    # Iterating a dict yields its keys, so no DataFrame round trip is needed.
    rated_titles = [match_movie_title(entry, catalogue_titles) for entry in user_rating]

    # Column-wise membership test instead of a Python lambda per row.
    is_rated = movies["title"].isin(rated_titles)
    return list(movies.loc[is_rated, "cluster_no"])

In [12]:
def recommend_from_same_cluster(user_rating, movies, k=3):
    """
    Recommend unseen movies from the clusters of the movies the user rated.

    For each of those clusters the 20 unseen movies with the highest
    'total_top_rating' are shortlisted; the shortlist is shuffled and up to
    5 titles per cluster are returned as a list.

    Reads the module-level ``user_mat`` and ``dictionary``.

    NOTE(review): ``k`` is accepted but never read; the per-cluster count is
    the literal 5 below. Confirm whether ``k`` should be the per-cluster or
    the total number of titles before wiring it in.
    """
    rated_clusters = get_cluster(movies, user_rating)
    popularity_cols = get_popularity(user_mat, dictionary)[['null_count', 'total_top_rating']]
    candidates = unseen_movies(movies.join(popularity_cols), user_rating)

    # Keep only the unseen movies that share a cluster with a rated movie.
    in_rated_cluster = candidates.apply(lambda row: row.cluster_no in rated_clusters, axis=1)
    same_cluster = candidates.loc[in_rated_cluster]

    shortlist = (
        same_cluster
        .sort_values('total_top_rating', ascending=False)
        .groupby('cluster_no')
        .head(20)
    )
    # Shuffle the shortlist so repeated calls can surface different titles.
    shuffled = shortlist.sample(frac=1.0)
    picks = shuffled.groupby('cluster_no').head(5)

    return list(picks.index.values)

In [13]:
# Demo: cluster-based recommendations for the sample user.
recommend_from_same_cluster(user_rating, movies)

['Guardians of the Galaxy',
 'Spirited Away (Sen to Chihiro no kamikakushi)',
 'Serenity',
 'Who Framed Roger Rabbit?',
 'Up',
 'Monsters, Inc.',
 'Star Wars: Episode III - Revenge of the Sith',
 'Star Wars: Episode V - The Empire Strikes Back',
 'Aliens',
 'Toy Story']

In [14]:
from sklearn.impute import SimpleImputer

In [76]:
from sklearn.decomposition import NMF

In [16]:
# Inspect the user-item matrix (rows: userId, columns: movie ids).
user_mat

Unnamed: 0_level_0,1,2,3,4,5,6,915,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


In [137]:
# Prototype of the NMF recommender, executed step by step at module level.
# NOTE(review): the same steps are wrapped in `recommend_with_NMF`, which
# uses n_components=5 instead of the 20 used here.

# process user-item matrix
# Transpose so each row is a movie, cast the movie-id labels to int, join
# `dictionary` on that id and re-index the rows by the joined 'title' column.
user_r = user_mat.T.reset_index()
user_r['movieid'] = user_r['index'].astype(int)
user_r.set_index('movieid', inplace=True)
user_title = user_r.join(dictionary)
user_title.set_index('title', inplace=True)
user_title.drop('index', axis=1, inplace=True)

# sync user rating with user-item matrix
# Replace the user-entered names with the titles returned by
# match_movie_title, then transpose so the ratings form a single column
# labelled 0 with the titles as index.
user_df = pd.DataFrame(user_rating, index=[0])
user_t = user_df.T.reset_index()
user_movie_entries = list(user_t['index'])
movie_titles = list(user_title.index.values)
parsed_title = [match_movie_title(title, movie_titles) for title in user_movie_entries]
user_df.columns = parsed_title
user_df = user_df.T
user_df.rename_axis(index='title', inplace=True)

# remove null values
# Column 0 of the joined frame holds the new user's ratings.
all_user_info = user_title.join(user_df)
user = all_user_info[0]  # not referenced again in this cell
# Impute on the transpose (one column per title) with the most frequent
# value of each column, then transpose back to one row per title.
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputer.fit(all_user_info.T)
impute_transform = imputer.transform(all_user_info.T)
new_df = pd.DataFrame(impute_transform.T, columns=all_user_info.columns, index=all_user_info.index.values)

# run nmf
# P has one row per title and Q one column per user; their product is the
# reconstructed rating matrix.
nmf = NMF(n_components=20, max_iter=1000)
nmf.fit(new_df)
P = nmf.transform(new_df)
Q = nmf.components_
Q_df = pd.DataFrame(Q, columns=new_df.columns)  # not referenced again in this cell
PQ = P.dot(Q)
PQ_df = pd.DataFrame(PQ, columns=new_df.columns, index=new_df.index.values)

# recommendation
# Take the 20 highest reconstructed scores in column 0 (the new user),
# shuffle them and keep 5 titles.
# NOTE(review): titles the user already rated are not filtered out here.
top_20 = PQ_df[0].sort_values(ascending = False).head(20)
recommend_5 = top_20.sample(frac = 1.0).head(5)
recommend_nmf = list(recommend_5.index.values)




['Rules of Attraction, The',
 'Highlander',
 'Hard-Boiled (Lat sau san taam)',
 'Batman: Mask of the Phantasm',
 'Rosencrantz and Guildenstern Are Dead']

In [138]:
def recommend_with_NMF(user_mat, user_rating, dictionary, k=5):
    """
    Recommend movies for a new user with non-negative matrix factorisation.

    The new user's ratings are appended to the user-item matrix as column 0,
    missing ratings are imputed, an NMF model is fitted and the
    reconstructed scores of the new user are ranked.

    Parameters
    ----------
    user_mat : pandas.DataFrame
        User-item rating matrix with one row per user and one column per
        movie; the column labels must be convertible to int.
    user_rating : dict
        Maps user-entered movie names to ratings. The names are resolved to
        catalogue titles with ``match_movie_title``.
    dictionary : pandas.DataFrame
        Movie lookup table indexed by integer movie id with a 'title' column.
    k : int, default 5
        Number of titles to return.

    Returns
    -------
    list
        Up to ``k`` movie titles, drawn at random from the highest-scoring
        titles the user has not rated.
    """
    # process user-item matrix: one row per movie, re-indexed by title
    by_movie = user_mat.T.reset_index()
    by_movie['movieid'] = by_movie['index'].astype(int)
    ratings_by_title = (
        by_movie
        .set_index('movieid')
        .join(dictionary)
        .set_index('title')
        .drop('index', axis=1)
    )

    # sync user rating with user-item matrix: a single column labelled 0
    # whose index holds the matched catalogue titles
    catalogue_titles = list(ratings_by_title.index.values)
    rated_titles = [match_movie_title(entry, catalogue_titles) for entry in user_rating]
    new_user = pd.DataFrame(
        {0: list(user_rating.values())},
        index=pd.Index(rated_titles, name='title'),
    )

    # remove null values: impute on the transpose (one column per title)
    # with each column's most frequent value, then transpose back
    all_user_info = ratings_by_title.join(new_user)
    imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    imputed = imputer.fit_transform(all_user_info.T)
    filled = pd.DataFrame(imputed.T, columns=all_user_info.columns, index=all_user_info.index.values)

    # run nmf: P (titles x components) times Q (components x users) is the
    # reconstructed rating matrix
    nmf = NMF(n_components=5, max_iter=1000)
    nmf.fit(filled)
    P = nmf.transform(filled)
    Q = nmf.components_
    scores = pd.DataFrame(P.dot(Q), columns=filled.columns, index=filled.index.values)

    # recommendation: column 0 is the new user. Titles the user already
    # rated are removed so they cannot be recommended back.
    unseen_scores = scores[0].drop(labels=rated_titles, errors='ignore')
    # Shortlist at least 20 candidates (more when k is larger), shuffle
    # them, and honour the requested k instead of a hard-coded 5.
    shortlist = unseen_scores.sort_values(ascending=False).head(max(20, k))
    picks = shortlist.sample(frac=1.0).head(k)

    return list(picks.index.values)

In [139]:
# Demo: NMF-based recommendations for the sample user.
recommend_with_NMF(user_mat, user_rating, dictionary, k=5)



['Center Stage',
 'Hedwig and the Angry Inch',
 'Planet Terror',
 'River Runs Through It, A',
 'Highlander']