In [1]:
import pandas as pd
import numpy as np
import h5py
import random
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
def common_scale(val, new_range, pd_series):
    '''
    Linearly rescale one element of a pandas series onto a new range.

    The series minimum maps to new_range[0] and the series maximum maps to
    new_range[1]. The result is rounded to 2 decimal places.

    arg1 (number) : value to rescale
    arg2 (list)   : [Min, Max] of the target range
    arg3 (series) : Pandas series the value comes from; supplies the source Min/Max

    Return: the new scaled value. When every element of the series is identical
            (zero spread) the lower bound of the target range is returned.
    '''
    src_min = pd_series.min()
    src_span = pd_series.max() - src_min
    new_min = new_range[0]
    new_span = new_range[1] - new_min

    # A constant series has no spread; dividing by it would give NaN/inf.
    if src_span == 0:
        return new_min

    # Offset by the target minimum. A fixed "+ 1" is only correct for ranges
    # that happen to start at 1.
    return round(new_span * ((val - src_min) / src_span) + new_min, 2)

In [3]:
# Mean Centering

def mean_center_rows(df):
    '''
    Subtract each row's own mean from every value in that row.

    arg1 (df): Dataframe

    Return: Dataframe with the same index and columns as the input
    '''
    row_means = df.mean(axis=1)

    # Align the per-row means on the index so each row is shifted by its own mean
    return df.sub(row_means, axis='index')

In [4]:
# # Import retained title series
# with pd.HDFStore('datasets/movie.hdf5') as store:
#     rt = store['subset_title_ids']
#     index_title_dict = store['titles_series'].to_dict()['title']

# # Randomly generate 5 titles for user to rate
# user_test_list = random.sample(rt.to_list(),5)
# user_test_name = [index_title_dict[i] for i in user_test_list]

# # Ask for user input and convert input into a list
# user_rate = input(f'Please rate the following 5 titles with comma: {user_test_name}')
# user_rate = [int(x) for x in user_rate.split(",")]

In [5]:
def gen_new_user_rating(n_titles=5, hdf5_loc='datasets/movie.hdf5'):
    '''
    Function to draw a random sample of titles from the stored subset of titles

    Reads the following from the HDF5 store:
    1. subset_title_ids => title ids that may be sampled
    2. titles_series    => title lookup; its 'title' entry is used as a
                           title_id -> title name mapping (assumed from how it
                           is indexed here; confirm against the store)

    Arguments
    arg1 (int) : number of titles to sample (default 5)
    arg2 (str) : HDF5 file location (default 'datasets/movie.hdf5')

    Return:
    DataFrame with one row per sampled title and columns title_id, title

    Raises:
    ValueError (from random.sample) when more titles are requested than the
    store holds
    '''

    # Pull in datasets
    with pd.HDFStore(hdf5_loc) as store:
        candidate_ids = store['subset_title_ids'].to_list()
        index_title_dict = store['titles_series'].to_dict()['title']

    # Randomly pick the titles for the user to rate (sampling without replacement)
    sampled_ids = random.sample(candidate_ids, n_titles)

    # Create dataframe with title_id & title name
    st_df = pd.DataFrame(sampled_ids, columns=['title_id'])
    st_df['title'] = st_df['title_id'].map(index_title_dict)

    return st_df

In [6]:
# Ask for user input and convert input into a list
sample_df = gen_new_user_rating()
sample_titles = sample_df['title'].to_list()

# Keep the raw text and the parsed ratings under separate names
user_rate_raw = input(f'Please rate the following 5 titles with comma: {sample_titles}')
user_rate = [int(x) for x in user_rate_raw.split(",")]

# zip() stops at the shorter input, so a missing or extra rating would
# otherwise give an incomplete user_dict without any error.
if len(user_rate) != len(sample_titles):
    raise ValueError(f'Expected {len(sample_titles)} ratings but got {len(user_rate)}')

# Create a user_dict from title and user's input
user_dict = dict(zip(sample_df['title_id'], user_rate))

Please rate the following 5 titles with comma: ['patch adams', "st. elmo's fire", 'toy story', 'winning london', 'flying leathernecks'] 5,4,3,2,1


In [7]:
# Title ids the user was asked to rate (iterating a dict yields its keys)
list(user_dict)

[4123, 6050, 13673, 5896, 11075]

In [None]:
def find_next_closest_user(user_res, user_res_loc, users, users_loc):
    '''
    Function to find next closest user

    The new user's ratings are added to the stored user/title matrix as a row
    labelled -1, restricted to the titles the new user rated. Rows are mean
    centered, missing ratings are filled with 0, and every existing user is
    compared to the new user by cosine similarity.

    Arguments
    arg1 (dict) : user_res     => {title_id: rating} given by the new user
    arg2        : user_res_loc => not used by this function; kept in the
                                  signature so existing calls keep working
    arg3 (str)  : users        => key of the user/title matrix in the HDF5 store
    arg4 (str)  : users_loc    => HDF5 file location

    Return:
    Index label of the existing user most similar to the new user

    Raises:
    KeyError when a rated title id is not a column of the stored matrix
    ValueError when no existing user has rated any of the given titles
    '''
    new_user_label = -1

    with pd.HDFStore(users_loc) as store:
        users_df = store[users]

    # Only the titles the new user rated take part in the comparison
    title_ids = list(user_res)

    # One-row frame holding the new user's ratings
    new_user_row = pd.Series(data=user_res, name=new_user_label).to_frame().T

    # Column selection returns a new frame, so the stored matrix is not modified.
    # pd.concat is used because DataFrame.append was removed in pandas 2.0.
    ratings = pd.concat([users_df[title_ids], new_user_row])

    # Drop users that have no ratings for any of the chosen titles
    ratings = ratings.dropna(axis=0, how='all')

    # Mean center each row; unrated titles become 0 (neutral) afterwards
    centered = mean_center_rows(ratings).fillna(0)

    # Similarity of every row against the new user only (n x 1), which avoids
    # building the full n x n matrix
    sim_to_new = cosine_similarity(centered, centered.loc[[new_user_label]])
    sim_to_new = pd.Series(sim_to_new[:, 0], index=centered.index)

    # Remove the new user explicitly rather than assuming it sorts to the top
    return sim_to_new.drop(new_user_label).idxmax()

    

In [85]:
%%time

with pd.HDFStore('datasets/movie.hdf5') as store:
    users = store['users']

# Titles the new user rated (user_dict maps title_id -> rating)
rated_title_ids = list(user_dict)

# Create a user series with index as -1 to add to the user-title matrix
to_append = pd.Series(data=user_dict, name=-1)

# Column selection returns a new frame, so the original dataframe is not
# modified. pd.concat is used because DataFrame.append was removed in pandas 2.0.
user1 = pd.concat([users[rated_title_ids], to_append.to_frame().T])

# Drop users that have no ratings for the chosen nth titles
user1 = user1.dropna(axis=0, how='all')

# Mean Centering the ratings
user1_mc = mean_center_rows(user1)
user1_mc = user1_mc.fillna(0)

# Create User sim matrix
user1_sim = cosine_similarity(user1_mc)
user1_sim = pd.DataFrame(user1_sim, columns=user1_mc.index, index=user1_mc.index)

# Find the most similar user other than the new user; the new user's own row is
# dropped explicitly instead of assuming it sorts to the first position
next_sim = user1_sim[-1].drop(-1).idxmax()

print (next_sim)

2188622
Wall time: 13.7 s


## Collaborative Filtering Recommendation System

### Collaborative Filtering Function

In [7]:
def cf_rec(hdf5_loc, user_id, top_rec=5):
    '''
    Function will pull the following stored dataframe/series from HDF5; the keys must have these names:
    1. users         => dataframe containing the user/title utility matrix
    2. users_sim     => dataframe containing the user cosine similarity matrix
    3. titles_series => title lookup; its 'title' entry maps movie_id -> title

    Each title the user has not rated gets a composite score: the ratings of
    users with positive similarity to user_id, weighted by that similarity
    (a missing rating counts as 0). The top titles are then rescaled to the
    range [1, 5] with common_scale.

    Arguments
    arg1 (str) : HDF5 File location
    arg2 (int) : User_id
    arg3 (int) : Top nth number of recommendation

    Return:
    DataFrame with movie_id, title, composite_rating. The frame is empty when
    no other user has a positive similarity to user_id.
    '''
    out_cols = ['movie_id', 'title', 'composite_rating']

    # Import user rating file
    with pd.HDFStore(hdf5_loc) as store:
        users_sim = store['users_sim']
        users = store['users']
        index_title_dict = store['titles_series'].to_dict()['title']

    # Find user's similarity scores against users that have positive similarity
    neighbour_sim = users_sim[user_id].drop(user_id)
    neighbour_sim = neighbour_sim[neighbour_sim > 0]

    # Without any positively similar user the weights below would be 0/0
    if neighbour_sim.empty:
        return pd.DataFrame(columns=out_cols)

    # Convert user's score to weight; A 1-D array of weights summing to 1
    user_weight = neighbour_sim.values / np.sum(neighbour_sim.values)

    # Ratings of titles the user has not rated; A 2-D array of
    # (unrated titles) x (similar users). user_id itself is not among the
    # selected columns because it was dropped from neighbour_sim above.
    titles_rating = users.T
    titles_rating = titles_rating[titles_rating[user_id].isnull()]
    titles_rating = titles_rating[neighbour_sim.index]

    # Find the score of titles which user has not rated; A 1-D array of composite ratings
    user_rating = np.dot(titles_rating.fillna(0).values, user_weight)

    # Build the result with an explicit movie_id column so it does not depend
    # on the name carried by the stored index
    user_rec_df = pd.DataFrame({'movie_id': titles_rating.index,
                                'composite_rating': user_rating})
    user_rec_df = (user_rec_df
                   .sort_values(by='composite_rating', ascending=False)
                   .head(top_rec)
                   .reset_index(drop=True))

    # Map to find the title; copy() so the assignment below works on an
    # independent frame rather than a slice
    user_rec_df['title'] = user_rec_df['movie_id'].map(index_title_dict)
    user_rec_df = user_rec_df[out_cols].copy()

    # Apply Standard Rating Scaling
    user_rec_df['composite_rating'] = user_rec_df['composite_rating'].apply(
        common_scale, args=([1, 5], user_rec_df['composite_rating']))

    # Return a dataframe of the titles recommendation
    return user_rec_df

In [48]:
%%time
cf_rec('datasets/movie.hdf5', user_id=2637886, top_rec=5)

Wall time: 14.4 s


Unnamed: 0,movie_id,title,composite_rating
0,11521,lord of the rings: the two towers,5.0
1,14240,lord of the rings: the return of the king,4.28
2,17157,saving private ryan,3.31
3,9960,die hard,1.64
4,5862,memento,1.0


### Title Based Recommendation Functions

In [100]:
def title_weighter(users_df, title_sim, user_id, title_id):
    '''
    Function: Return the title id and its composite rating for one user

    The composite rating is the user's ratings of the titles that are
    positively similar to title_id, weighted by those similarities
    (weights sum to 1; titles the user has not rated count as 0).

    Arguments
    arg1 (df)  : user/title matrix (rows = users, columns = titles)
    arg2 (df)  : title/title similarity matrix. For backward compatibility any
                 other value (e.g. a similarity Series) is ignored and the
                 notebook-level name `movies_sim` is used instead, which must
                 then already exist.
    arg3 (int) : user id (row label of arg1)
    arg4 (int) : title id to score

    Return:
    Tuple of (title_id, composite rating)
    '''
    if isinstance(title_sim, pd.DataFrame):
        sim_matrix = title_sim
    else:
        # Legacy call style: the matrix was never passed in and had to be
        # present as a global. Raises NameError when it is not defined.
        sim_matrix = movies_sim

    # Similarity of every other title to title_id, positive values only
    sim_to_title = sim_matrix[title_id].drop(title_id)
    sim_to_title = sim_to_title[sim_to_title > 0]

    # Find the weight of each title similarity
    title_weights = sim_to_title.values / np.sum(sim_to_title.values)

    # User's ratings for the similar titles; row lookup avoids transposing the
    # whole matrix on every call
    user_ratings = users_df.loc[user_id].loc[sim_to_title.index].fillna(0)

    # Find the composite rating of the single unwatched title
    return (title_id, np.dot(user_ratings.values, title_weights))

In [109]:
def _title_score(user_ratings, movies_sim, title_id):
    '''Return (title_id, similarity-weighted average of user_ratings) for one title.'''
    sim_to_title = movies_sim[title_id].drop(title_id)
    sim_to_title = sim_to_title[sim_to_title > 0]
    weights = sim_to_title.values / np.sum(sim_to_title.values)
    return (title_id, np.dot(user_ratings.loc[sim_to_title.index].values, weights))


def title_based_recommender(hdf5_loc, user_res, user_id, top_rec=5):
    '''
    Function to recommend titles based on title-to-title similarity

    Pulls the following from HDF5; the keys must have these names:
    1. users         => dataframe containing the user/title utility matrix
    2. movies_sim    => dataframe containing the title similarity matrix
    3. titles_series => title lookup; its 'title' entry maps movie_id -> title

    For every title in user_res, the titles positively similar to it that
    user_id has not rated are scored with user_id's stored ratings, and the
    top_rec best are kept and weighted by rating / len(user_res). All kept
    titles are then rescaled together to [1, 5] with common_scale.

    Arguments
    arg1 (str)  : HDF5 File location
    arg2 (dict) : {movie_id: rating} given by the user
    arg3 (int)  : User_id whose stored ratings are used for scoring
    arg4 (int)  : Top nth number of recommendation kept PER rated title

    Return:
    DataFrame with title, composite_rating, sorted by rating and with
    duplicate titles removed. It can hold up to top_rec * len(user_res) rows
    and keeps the row labels from the concatenated frame. Empty when
    user_res is empty.
    '''
    out_cols = ['title', 'composite_rating']

    # Nothing to base a recommendation on (pd.concat would fail on an empty list)
    if not user_res:
        return pd.DataFrame(columns=out_cols)

    # Import user rating file
    with pd.HDFStore(hdf5_loc) as store:
        movies_sim = store['movies_sim']
        users = store['users']
        index_title_dict = store['titles_series'].to_dict()['title']

    # The user's ratings are loop-invariant: fetch the row once instead of
    # transposing the whole matrix for every title
    user_row = users.loc[user_id]
    user_row_filled = user_row.fillna(0)

    # Top_rec dataframe containers
    chunks = []

    # Loop through user_response
    for mv_id, rating in user_res.items():

        # Find list of titles that are positively similar to the title
        title_sim = movies_sim[mv_id].drop(mv_id)
        title_sim = title_sim[title_sim > 0]

        # Find list of titles that are not watched by user
        similar_ratings = user_row.loc[title_sim.index]
        n_watch = list(similar_ratings[similar_ratings.isnull()].index)

        # Score each unwatched title; the similarity matrix is passed
        # explicitly so nothing depends on a notebook-level global
        ls_rec = [_title_score(user_row_filled, movies_sim, title) for title in n_watch]

        # Create a dataframe of recommendations
        com_rec_df = (pd.DataFrame(data=ls_rec, columns=['movie_id', 'composite_rating'])
                      .sort_values(by='composite_rating', ascending=False)
                      .head(top_rec))

        # Apply User's rating weight
        com_rec_df['composite_rating'] = com_rec_df['composite_rating'] * rating/len(user_res)

        # Append to rec_df container
        chunks.append(com_rec_df)

    # Concat dataframe together
    user_rec_df = pd.concat(chunks, ignore_index=True)

    # Map to find the title; copy() so later assignments work on an
    # independent frame rather than a slice
    user_rec_df['title'] = user_rec_df['movie_id'].map(index_title_dict)
    user_rec_df = user_rec_df[out_cols].copy()

    # Apply standard scaling
    user_rec_df['composite_rating'] = user_rec_df['composite_rating'].apply(
        common_scale, args=([1, 5], user_rec_df['composite_rating']))

    # Sort & Remove Duplicates (the highest rated occurrence of a title is kept)
    user_rec_df = user_rec_df.sort_values(by='composite_rating', ascending=False)
    user_rec_df = user_rec_df.drop_duplicates(subset='title', keep='first')

    # Return a dataframe of the titles recommendation
    return user_rec_df

In [110]:
%%time
title_based_recommender('datasets/movie.hdf5', user_res=user_dict, user_id=2637886, top_rec=5)

Wall time: 43.3 s


Unnamed: 0,title,composite_rating
0,lost in translation,5.0
1,memento,4.94
2,saving private ryan,4.76
3,apollo 13,4.69
4,old school,4.69
9,national lampoon's vacation,3.75
24,misery,1.0


In [106]:
%%time
# Same call as the previous cell, kept with the output of a separate run
title_based_recommender('datasets/movie.hdf5', user_res=user_dict, user_id=2637886, top_rec=5)

Wall time: 40.7 s


Unnamed: 0,title,composite_rating
0,lost in translation,5.0
5,memento,5.0
20,saving private ryan,5.0
21,apollo 13,2.75
22,old school,2.53
23,national lampoon's vacation,1.58
24,misery,1.0


<span style= 'color:magenta'>Remark:</span>
Comparing the results from both Collaborative Filtering RS & Item-Based RS, the recommended lists seem similar. This may indicate that most systems are able to identify what the user wants quite similarly, OR it could be that the sampled user has a very skewed watch-list (e.g. only watches a particular genre and nothing else), which would also result in similar recommendations. I will be creating a Neural Network Content-Based RS to see if we can get similar results.

##  Keyword Based Recommendation System

In [21]:
def find_similar(name, n = 10):
    """Find the n titles most similar to `name` based on embeddings.

    Similarity is the dot product between the stored embedding of `name` and
    the stored embedding of every other title.

    Arguments
    name (int) : title id to look up (a key of the inverted 'idx_title_series')
    n (int)    : number of similar titles to return

    Return:
    DataFrame with columns title (title name) and composite_rating (dot
    product), most similar first. Prints a message and returns None when
    `name` is not a known title id.
    """

    # Retrieve pre-calculated title weights
    with h5py.File('datasets/movie_NN.hdf5','r') as hf:
        weights = hf['title_weights'][:]

    with pd.HDFStore('datasets/movie.hdf5') as store:
        idx_t_ids_dict = store['idx_title_series'].to_dict()
        index_title_dict = store['titles_series'].to_dict()['title']

    # Reverse lookup: title id -> row position in the weights array
    t_ids_idx_dict = {t_id: idx for idx, t_id in idx_t_ids_dict.items()}

    # Check to make sure `name` is in index
    try:
        query_idx = t_ids_idx_dict[name]
    except KeyError:
        print(f'{name} Not Found.')
        return

    # Calculate dot product between the title and all others
    dists = np.dot(weights, weights[query_idx])

    # Rank from most to least similar, then remove the title itself by
    # position. This does not rely on the title having the single largest dot
    # product with itself, which only holds for normalised, tie-free embeddings.
    ranked = np.argsort(dists)[::-1]
    closest = ranked[ranked != query_idx][:n]

    user_rec_df = pd.DataFrame({'title': [idx_t_ids_dict[c] for c in closest],
                                'composite_rating': dists[closest]})

    # Map title id to title name
    user_rec_df['title'] = user_rec_df['title'].map(index_title_dict)

    return user_rec_df

In [113]:
def keyword_based_recommender(user_res, top_rec=10):
    '''
    Function to recommend titles from embedding similarity

    For every title in user_res the 10 most similar titles are fetched with
    find_similar and weighted by rating / len(user_res). All candidates are
    then rescaled together to [1, 5] with common_scale, sorted, and
    de-duplicated by title.

    Arguments
    arg1 (dict) : {movie_id: rating} given by the user
    arg2 (int)  : Top nth number of recommendation

    Return:
    DataFrame with title, composite_rating holding at most top_rec rows.
    Empty when none of the rated titles is known to find_similar.
    '''
    out_cols = ['title', 'composite_rating']

    chunks = []
    for mv_id, rating in user_res.items():
        similar_df = find_similar(mv_id, n = 10)

        # find_similar returns None for an unknown title id; skip it instead
        # of failing on the subscript below
        if similar_df is None:
            continue

        # Apply weights based on user's rating
        similar_df['composite_rating'] = similar_df['composite_rating'] * rating/len(user_res)

        # Append dataframe
        chunks.append(similar_df)

    # Nothing found for any rated title (pd.concat would fail on an empty list)
    if not chunks:
        return pd.DataFrame(columns=out_cols)

    # Concat dataframe together
    user_rec_df = pd.concat(chunks, ignore_index=True)

    # Apply standard scaling
    user_rec_df['composite_rating'] = user_rec_df['composite_rating'].apply(
        common_scale, args=([1, 5], user_rec_df['composite_rating']))

    # Sort, keep the best-rated occurrence of each title, renumber the rows
    user_rec_df = (user_rec_df
                   .sort_values(by='composite_rating', ascending=False)
                   .drop_duplicates(subset='title', keep='first')
                   .reset_index(drop=True))

    return user_rec_df.head(top_rec)

In [115]:
keyword_based_recommender(user_res=user_dict, top_rec=5)

Unnamed: 0,title,composite_rating
0,enigma: mcmxc a.d,5.0
1,dragon drive,4.17
2,iron monkey 2,3.33
3,classic albums: meat loaf: bat out of hell,3.18
4,walking with cavemen,3.08


In [139]:
# List the top-level keys stored in movie.hdf5; the context manager closes the
# file even if printing fails
with h5py.File('datasets/movie.hdf5','r') as hf:
    print(f'Dataframe in movie.hdf5 {hf.keys()}')

Dataframe in movie.hdf5 <KeysViewHDF5 ['genre_series', 'idx_title_series', 'keywords_series', 'movies_sim', 'subset_title_ids', 'titles_series', 'users', 'users_sim']>


In [138]:
# List the top-level keys stored in movie_raw.hdf5; the context manager closes
# the file even if printing fails. The label names the file actually opened.
with h5py.File('datasets/movie_raw.hdf5','r') as hf:
    print(f'Dataframe in movie_raw.hdf5 {hf.keys()}')

Dataframe in movie.hdf5 <KeysViewHDF5 ['full_ratings', 'titles_cleaned', 'titles_search_mvid']>
