In [1]:
import pandas as pd
import numpy as np
import h5py
import random
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
def common_scale(val, new_range, pd_series):
    '''
    Function will scale element of pandas series to new range
    
    arg1 (int)    : value
    arg2 (list)  : [Min, Max]
    arg3 (series) : Pandas series to apply function to
    
    Return: the new scaled valued
    '''
    s_range = [pd_series.min(),pd_series.max()]
    return round(np.diff(new_range)[0] * ((val - s_range[0])/(np.diff(s_range)[0])) + 1,2)

In [3]:
# Mean Centering

def mean_center_rows(df):
    '''
    Function to center a dataset based on mean
    
    arg1 (df): Dataframe
    '''
    return (df.T - df.mean(axis=1)).T

In [4]:
def gen_new_user_rating():
    '''
    Function to generate 5 sample titles from subset titles
    
    '''
    
    # Pull in datasets
    with pd.HDFStore('datasets/movie.hdf5') as store:
        rt = store['subset_title_ids']
        titles = store['titles_series']

    # Randomly genreate 5 titles for user to rate
    sampling_title_list = random.sample(rt.to_list(),5)
    
    # Filter dataframe
    st_df = titles.loc[sampling_title_list,:]
    
    # Rename index header name
    st_df.index.name = 'title_id'
    
    st_df.reset_index(inplace=True)
    
    return st_df

In [6]:
# Ask for user input and convert input into a list
sample_df = gen_new_user_rating()
sample_titles = sample_df['title'].to_list()

user_rate = input(f'Please rate the following 5 titles with comma: {sample_titles}')
user_rate = [int(x) for x in user_rate.split(",")]

# Create a user_dict from title and user's input
user_res = dict(zip(sample_df['title_id'],user_rate))

Please rate the following 5 titles with comma: ['real genius', 'the long', 'embrace of the vampire', 'friday the 13th: part 6: jason lives', 'best of the best'] 5,3,2,1,4


In [7]:
user_res

{7483: 5, 8511: 3, 11908: 2, 15362: 1, 6372: 4}

In [8]:
def find_next_closest_user(user_res, user_res_loc, users_df_name, users_loc):
    '''
    Function to find next closest user
    '''
    with pd.HDFStore(users_loc) as store:
        users = store[users_df_name]
    
    # Create a user series with index as -1 and append to user-title matrix
    to_append = pd.Series(data=user_res, index=user_res,name=-1)
    
    # Make a copy so as not to contiminate original dataframe; append to user-title matrix
    user1 = users.copy()
    user1 = user1[list(user_res.keys())]
    user1 = user1.append(to_append)
    
    # Drop users that have no ratings for the chosen nth titles
    user1 = user1[list(user_res.keys())].dropna(axis=0, how='all')

        # Mean Centering the ratings
    user1_mc = mean_center_rows(user1)
    user1_mc = user1_mc.fillna(0)

    # Create User sim matrix
    user1_sim = cosine_similarity(user1_mc)
    user1_sim = pd.DataFrame(user1_sim, columns=user1_mc.index, index = user1_mc.index)

    # Find the next most similar user to user other than user
    next_sim = user1_sim[[-1]].sort_values(by=-1, ascending=False).iloc[1].name

    return next_sim

In [9]:
%%time
next_user = find_next_closest_user(user_res, "",'users','datasets/movie.hdf5')
next_user

Wall time: 5.9 s


2152273

## Collaborative Filtering Recommendation System

### Collaborative Filtering Function

In [10]:
def cf_rec(hdf5_loc, user_id, top_rec=5):
    '''
    Function will put the following stored dataframe/series from HDF5, series must be in the same name: 
    1. users         => dataframe containing the user/title utility Matrix
    2. user_sim      => dataframe containing the user cosine similiarity matrix
    3. titles_series => Series of title and movie_id
    
    Arguments
    arg1 (str) : HDF5 File location
    arg2 (int) : User_id
    arg3 (int) : Top nth number of recommendation
    
    retun:
    DataFrame with movie_id, title, composite score
    '''
    
    # Import user rating file
    with pd.HDFStore(hdf5_loc) as store:
        users_sim = store['users_sim']
        users = store['users']
        index_title_dict = store['titles_series'].to_dict()['title']
    
    # Find user's similarity scores against users have positive similarity
    users_sim = users_sim[user_id].drop(user_id)
    users_sim = users_sim[users_sim > 0]
    
    # Convert user's score to weight; A 1-D array of weights
    user_weight = users_sim.values/np.sum(users_sim.values)
    
    # Find title ratings of titles which user has not rate; A 2-D array of ratings bounded by user-unwatch titles & users that are similar
    titles_rating = users.T
    titles_rating = titles_rating[titles_rating[user_id].isnull()]
    titles_rating = titles_rating.drop(user_id, axis=1)
    titles_rating = titles_rating[users_sim.index]
    
    # Find the score of titles which user has not rate; A 1-D array of composite ratings
    user_rating = np.dot(titles_rating.fillna(0).values, user_weight)
    
    # convert to a dataframe
    user_rec_df = pd.DataFrame(user_rating, index=titles_rating.index,
                               columns=['composite_rating']).sort_values(by='composite_rating', ascending=False).head(top_rec)
    
    # reset index to get movie_id as 1 of the column
    user_rec_df.reset_index(inplace=True)
    
    # Map to find the title
    user_rec_df['title'] = user_rec_df['movie_id'].map(index_title_dict)
    user_rec_df = user_rec_df[['movie_id', 'title','composite_rating']]
    user_rec_df.rename({'movie_id':'title_id'}, axis=1, inplace=True)
    
    # Apply Standard Rating Scaling
    user_rec_df['composite_rating'] = user_rec_df['composite_rating'].apply(common_scale, args=([1,5],user_rec_df['composite_rating']))
    
    # Return a dataframe of the titles recommendation
    return user_rec_df

In [11]:
%%time
cf_rec('datasets/movie.hdf5', next_user, 5)

Wall time: 13.5 s


Unnamed: 0,title_id,title,composite_rating
0,6037,the bourne identity,5.0
1,2372,the bourne supremacy,2.51
2,3605,the wizard of oz: collector's edition,1.98
3,571,american beauty,1.62
4,1180,a beautiful mind,1.0


### Title Based Recommendation Functions

In [12]:
def title_weighter(users_df, title_sim, movies_sim, user_id, title_id):
    
    '''
    Function: Return the title name and its weights to other similar title
    
    '''
    title_sim = movies_sim[title_id].drop(title_id)
    title_sim = title_sim[title_sim > 0]
        
    # Find the weight of each title similarity
    title_weights = title_sim.values/np.sum(title_sim.values)

    # Filter by title similarity then but those that have user's rating
    user_ratings = users_df.T[user_id].loc[title_sim.index].fillna(0)

    # Find the composite rating of the single unwatch title
    return (title_id, np.dot(user_ratings.values, title_weights))  

In [13]:
def title_based_recommender(hdf5_loc, user_res, user_id, top_rec=5):
    '''
    Function
    
    '''
    # Top_rec dataframe containers
    chunks = []
    
    # Import user rating file
    with pd.HDFStore(hdf5_loc) as store:
        movies_sim = store['movies_sim']
        users = store['users']
        index_title_dict = store['titles_series'].to_dict()['title']

    # Loop through user_response
    for mv_id, rating in user_res.items():
        
        # Find list of titles that are positively similar to the title
        title_sim = movies_sim[mv_id].drop(mv_id)
        title_sim = title_sim[title_sim > 0]
        
        # create a list of titles to iterate 
        n_watch = list(title_sim.sort_values(ascending=False).head(20).index)
        
        # Return a list of recommendation for each title that user have not watch
        ls_rec = [title_weighter(users, title_sim, movies_sim, user_id, title) for title in n_watch]
                        
        # Create a dataframe of recommendations
        com_rec_df = pd.DataFrame(data=ls_rec, columns=['movie_id', 'composite_rating']).sort_values(by='composite_rating', ascending=False).head(top_rec)
        
        # Apply User's rating weight
        com_rec_df['composite_rating'] = com_rec_df['composite_rating'] * rating/len(user_res)
        
        # Append to rec_df container
        chunks.append(com_rec_df)
    
    # Concat dataframe together
    user_rec_df = pd.concat(chunks,ignore_index=True)
    
    # Map to find the title
    user_rec_df['title'] = user_rec_df['movie_id'].map(index_title_dict)
    user_rec_df = user_rec_df[['movie_id', 'title','composite_rating']]
    user_rec_df.rename({'movie_id':'title_id'}, axis=1, inplace=True)
    
    # Apply standard scaling
    user_rec_df['composite_rating'] = user_rec_df['composite_rating'].apply(common_scale, args=([1,5],user_rec_df['composite_rating']))
    
    # Sort & Remove Duplicates
    user_rec_df.sort_values(by='composite_rating', ascending=False, inplace=True)
    user_rec_df.drop_duplicates(subset='title', keep='first', inplace=True)
    
    # Return a dataframe of the titles recommendation
    return user_rec_df.head(top_rec)

In [14]:
%%time
title_based_recommender('datasets/movie.hdf5', user_res,next_user, 5)

Wall time: 1.77 s


Unnamed: 0,title_id,title,composite_rating
0,705,major league,5.0
1,6716,spies like us,4.94
2,10212,weird science,4.93
3,7779,willow,4.91
4,4761,the cannonball run,4.89


<span style= 'color:magenta'>Remark:</span>
Comparing the results from both Collaborative Filtering RS & Item-Based RS, the recommendated list seems similar. This may indicate that most system are able to identify quite similarily what the user wants OR it could be that the sampled user has a very skewed watch-list eg. only watch a particular genre and nothing else hence resulting in similar recommendation. I will be creating a Neural Network Content-Based RS and see if we can get similar results.

##  Keyword Based Recommendation System

In [15]:
def find_similar(name, n = 10):
    """Find n most similar items (or least) to name based on embeddings. Option to also plot the results"""
    
    
    # Retrieve pre-calculated title weights
    with h5py.File('datasets/movie_NN.hdf5','r') as hf:
        weights = hf['title_weights'][:]
        
    with pd.HDFStore('datasets/movie.hdf5') as store:
        idx_t_ids_dict = store['idx_title_series'].to_dict()
        index_title_dict = store['titles_series'].to_dict()['title']
    
    t_ids_idx_dict = {item : key for key,item in idx_t_ids_dict.items()}
    
    # Create a list container to hold calculated dictionary
    ls_of_dicts = []
    
    # Select index and reverse index
    index_name = 'title'
    index = t_ids_idx_dict
    rindex = idx_t_ids_dict

    # Check to make sure `name` is in index
    try:
        # Calculate dot product between book and all others
        dists = np.dot(weights, weights[index[name]])
    except KeyError:
        print(f'{name} Not Found.')
        return
    
    # Sort distance indexes from smallest to largest
    sorted_dists = np.argsort(dists)
    
    # Take the last n sorted distances; remove the title itself
    closest = sorted_dists[-(n+1):][:-1]
    
    ls_of_dicts = [{'title': rindex[c], 'composite_rating': dists[c]} for c in reversed(closest)]
    
    user_rec_df = pd.DataFrame(ls_of_dicts)
    
    # Map title name to title id
    user_rec_df['title'] = user_rec_df['title'].map(index_title_dict)
    
    return user_rec_df

In [16]:
def keyword_based_recommender(user_res, top_rec=10):
    '''
    Function to run rec algorithms 
    
    '''
    chunks = []
    for mv_id, rating in user_res.items():
        df = find_similar(mv_id, n = 10)
        
        # Apply weights based on user's rating
        df['composite_rating'] = df['composite_rating'] * rating/len(user_res)
        
        # Append dataframe
        chunks.append(df)

    # Concat dataframe together
    user_rec_df = pd.concat(chunks,ignore_index=True)
    
    # Apply standard scaling
    user_rec_df['composite_rating'] = user_rec_df['composite_rating'].apply(common_scale, args=([1,5],user_rec_df['composite_rating']))
    
    user_rec_df.sort_values(by='composite_rating', ascending=False, inplace=True)
    user_rec_df.drop_duplicates(subset='title', keep='first', inplace=True)
    user_rec_df.reset_index(drop=True, inplace=True)
    
    
    return user_rec_df.head(top_rec)

In [17]:
keyword_based_recommender(user_res, 5)

Unnamed: 0,title,composite_rating
0,play it to the bone,5.0
1,bobby jones,4.91
2,shaolin soccer,4.85
3,the best of times,4.84
4,rookie of the year,4.8


## Base Recommendation based on popularity

In [99]:
hf = h5py.File('datasets/movie.hdf5','r')
print(f'Dataframe in movie.hdf5 {hf.keys()}')
hf.close()

Dataframe in movie.hdf5 <KeysViewHDF5 ['genre_series', 'idx_title_series', 'keywords_series', 'movies_sim', 'subset_title_ids', 'titles_series', 'users', 'users_sim']>


In [100]:
hf = h5py.File('datasets/movie_raw.hdf5','r')
print(f'Dataframe in movie.hdf5 {hf.keys()}')
hf.close()

Dataframe in movie.hdf5 <KeysViewHDF5 ['full_ratings', 'titles_cleaned', 'titles_search_mvid']>


In [101]:
# Import rating datafile
with pd.HDFStore('datasets/movie_raw.hdf5') as store:
    ratings = store['full_ratings']

# Import subset_title_ids
with pd.HDFStore('datasets/movie.hdf5') as store:
    rt = store['subset_title_ids']
    sub_title = store['titles_series']

In [61]:
rt.head()

0     3
1    16
2    17
3    18
4    26
dtype: int16

In [62]:
ratings.head()

Unnamed: 0,customer_id,rating,date,movie_id
0,1488844,3,2005-09-06,1
1,822109,5,2005-05-13,1
2,885013,4,2005-10-19,1
3,30878,4,2005-12-26,1
4,823519,3,2004-05-03,1


In [63]:
# Find ratings for titles that are in my subset
ratings = ratings[ratings['movie_id'].isin(rt)]

# Find titles in subsets that have ratings >= 3
ratings = ratings[ratings['rating'] >=3]

ratings.head()

Unnamed: 0,customer_id,rating,date,movie_id
692,1025579,4,2003-03-29,3
693,712664,5,2004-02-01,3
694,1331154,4,2004-07-03,3
695,2632461,3,2005-07-22,3
696,44937,5,2004-06-22,3


In [64]:
#  Find the count of positive ratings for each movie
ratings_series = ratings.groupby('movie_id')['customer_id'].count()

In [70]:
# Create a dataframe of the popular titles
popular_titles = pd.DataFrame(ratings_series).reset_index().rename({'movie_id': 'title_id', 'customer_id':'num_ratings'}, axis=1)

# merge popular titles with additional information
popular_titles = pd.merge(popular_titles, sub_title, how='inner',left_on='title_id', right_on='movie_id')
popular_titles.head()

Unnamed: 0,title_id,num_ratings,title,genre
0,3,1786,character,"[History, Drama]"
1,16,2030,screamers,"[Horror, Science Fiction]"
2,17,4843,7 seconds,"[Action, Crime, Thriller]"
3,18,9830,immortal beloved,"[Drama, Music, Romance]"
4,26,3645,never die alone,"[Action, Crime, Drama, Thriller]"


## Conclusion

From the above, we can see that all recommendations systems provide different titles based on different aspect of the understanding the user. An ensemble model was created to find the best title recommendation from each of the sub-recommendating system. This will give a more generalized titles recommendations to user. 

Comparing to the the base recommendations - popularity titles, we see that our recommendations systems pushes titles that are more relating to the user's preference. In addition to this, the systems in its entirety has achieved to eliminate cold start problem where new users may be recommended something that are not to their taste.

## Limitations
Limitation of data available. Initially API-calls to MovieDB was supposed to be the panacea to the limited information available. However, MoiveDB do not have an exact match function but give the best movie hence I have to discards some of my data. In total, I have to reduce close to about 16% of my data.

The traditional machine learning models based recommendation systems - Collaborative Filtering and Title-Based are computationally expensive. This limits me to use a subset in order to generate the user matrix. Hence the recommendations may not be as comprehensive in terms of the titles to push to users. In additional, for every user, it need to re-calculate the user matrix again hence reducing the user experience when they need to wait longer time to find titles that are suitable to their taste.

*Neural Network (Meta-data rec system) although eliminate the problems experienced by traditional ML but it suffers from longer training time when we runs more epochs and more embedded layers. However, it do not suffers the long calculation time required to generate recommendations to users.

Each systems have their pros and cons and the bottlenecks occurs at different sections. However, storing preference is for Neutral Networks because:
- Heavy lifting is during the model training phase hence the on the user-front, there is close to little lag in pushing titles to users.
- Additional of inputs/variables are an additional layers into the Neural Network. It is relative easier to perform features "top-up" compared to traditional MLs.

## Recommendations

Given the short turn-around time, evaluation of the recommendation system is limited to referencing to the popular titles in the database. The next steps is to explore deeper into evaluation metrics using confusion matrix metric to find how each system are recommending titles correctly using created training datasets (created as part of training for neural network)

Another direction is to look into TDIDF of meta-data. This will give a aggregated weight matrix of meta-data across all the titles. Using this matrix in the Neural Network might give better recommendation as it takes into account of weight of each keyword the title contains. Currently, each keywords is assumed to have the same weight.