## Content Recommendations
`keep_movies`: restricts the recommendations to a subset of movies, e.g. only movies with tags or only movies without tags.  
User profiles are still built from ALL movies the user rated, so they reflect the user's full set of feature preferences.


### Version 2
Do not normalize the movie vectors. Normalize only the user profile, then multiply the matrices.  
Previously we took the cosine similarity, which normalizes both vectors. This penalized longer movie vectors and therefore prioritized movies without any actor and/or director dummies, because their vectors were shorter.  

Normalize the profile to account for users having different numbers of ratings.  
There is no need to normalize the movie vectors: their values are 0/1, and having more features should not count against a movie.  

In [None]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as datetime
import operator
import scipy.spatial.distance as distance
from sklearn import metrics 
import random
import time
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import sklearn
import fastparquet
import scipy

In [None]:
def user_content_recommendations(user_id, df, ratings, movieIds, keep_movies=None):
    """
    Rank the movies a user has not rated yet using a content-based profile.

    Steps:
      1. Select the user's ratings and mean-center them. A rating below the
         user's mean becomes negative, i.e. worse than not rating at all.
      2. Build the user profile: the sum of the feature rows of the rated
         movies, each weighted by its centered rating.
      3. L2-normalize the profile so users with different numbers of ratings
         are comparable. Movie rows are deliberately NOT normalized, so this
         is not a full cosine similarity: a movie is not penalized for
         having more features set.
      4. Score every movie as dot(movie features, profile), drop the movies
         the user already rated (and, optionally, those outside
         ``keep_movies``) and sort by descending score.

    Parameters
    ----------
    user_id : value compared against ``ratings.userId``.
    df : 2-D movie-by-feature matrix (scipy sparse or numpy array). Row ``i``
        holds the features of ``movieIds[i]``.
    ratings : DataFrame with ``userId``, ``movieId`` and ``rating`` columns.
    movieIds : sequence of movie ids, aligned with the rows of ``df``.
    keep_movies : optional collection of movie ids. When non-empty, only
        these movies are returned. The profile is still built from every
        movie the user rated.

    Returns
    -------
    DataFrame with columns ``prediction`` and ``movieId``, sorted by
    ``prediction`` descending. The index is the movie's row position in
    ``df``. A user with no ratings (or with identical ratings everywhere)
    has an all-zero profile, so every prediction is 0.

    Raises
    ------
    ValueError
        If ``movieIds`` does not have one entry per row of ``df``, or if the
        user rated a movie that is not in ``movieIds``.
    """
    movie_ids = list(movieIds)
    if len(movie_ids) != df.shape[0]:
        raise ValueError(
            "movieIds has %d entries but df has %d rows"
            % (len(movie_ids), df.shape[0])
        )

    user_ratings = ratings[ratings.userId == user_id]
    rated_ids = user_ratings.movieId.to_numpy()

    if len(rated_ids) == 0:
        # Nothing to learn from: neutral profile, every score ends up 0.
        profile = np.zeros(df.shape[1])
    else:
        # One pass over movieIds instead of a list scan per rated movie.
        # setdefault keeps the first position, matching list.index.
        row_of = {}
        for row, movie_id in enumerate(movie_ids):
            row_of.setdefault(movie_id, row)
        try:
            # One row per *rating* (not per unique movie) so the feature rows
            # stay aligned with the ratings even if a movie was rated twice.
            rated_rows = np.asarray(
                [row_of[movie_id] for movie_id in rated_ids], dtype=np.intp
            )
        except KeyError as err:
            raise ValueError("%r is not in movieIds" % (err.args[0],)) from err

        # Center on a new array; the caller's ratings frame is left untouched.
        centered = (user_ratings.rating - user_ratings.rating.mean()).to_numpy(
            dtype=float
        )

        profile = np.asarray(df[rated_rows, :].T.dot(centered), dtype=float).ravel()

        # L2-normalize; an all-zero profile is left as is.
        norm = np.linalg.norm(profile)
        if norm > 0:
            profile = profile / norm

    # Similarity between the unit-length profile and the raw movie rows.
    scores = np.asarray(df.dot(profile), dtype=float).ravel()

    recommendations = pd.DataFrame({'prediction': scores, 'movieId': movie_ids})

    # Remove watched movies and, if requested, anything outside keep_movies.
    keep = ~recommendations.movieId.isin(rated_ids)
    if keep_movies is not None and len(keep_movies) > 0:
        keep &= recommendations.movieId.isin(keep_movies)

    return recommendations[keep].sort_values('prediction', ascending=False)