In [190]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as datetime
import operator
import scipy.spatial.distance as distance
import sklearn.metrics
import random

In [191]:
df = pd.read_pickle('processed_df.pkl')
ratings = pd.read_pickle('ratings_sample.pkl')
ratings = ratings.reset_index()

In [192]:
movies = df.drop(columns = ['movieId', 'title_eng', 'year'])

## Build Recommendations for 1 User
To Do:
- Extend to all users (function) -done
- Test/train split so hold out some recommendations to evaluate against --> evaluate

Extensions:
- Include year (decade)?
- Include text from description or genome tags
- Downweight older ratings 

In [89]:
# limit to one user
ratings_user = ratings[ratings.userId == 100]
movies_user = df[df.movieId.isin(ratings_user.movieId.unique())]

# keep track of movies already watched
watched = movies_user.movieId.unique()

movies_user = movies_user.drop(columns = ['movieId', 'title_eng', 'year'])

In [90]:
# normalize ratings: subtract mean rating  from ratings
# if rating < mean, normalized rating will be negative. Which is worse than 0 aka not rating movie at all.

#### should this be averaged??

mean_rating = np.mean(ratings_user.rating)
ratings_user.rating = ratings_user.rating - mean_rating

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [73]:
# create user profile: multiply item profile by user ratings --> sum of ratings for each attribute 
profile = movies_user.T.dot(ratings_user.rating.values)

In [74]:
# cosine similarity between movie and user profile 
movies = df.drop(columns = ['movieId', 'title_eng', 'year'])
recommendations = sklearn.metrics.pairwise.cosine_similarity(movies, np.asmatrix(profile.values))

In [75]:
recommendations = pd.DataFrame(recommendations)
recommendations.columns = ['prediction']

# merge to get title
recommendations = pd.merge(recommendations, df[['title_eng', 'movieId']], left_index = True, right_index = True, how = 'left')

# sort 
recommendations = recommendations.sort_values('prediction', ascending = False)

In [76]:
# remove recommendations already watched
recommendations = recommendations[~recommendations.movieId.isin(watched)]

In [77]:
# movies with the same prediction -- should we secondarily sort by overall popularity of the movie?
recommendations[:10]

Unnamed: 0,prediction,title_eng,movieId
13773,0.390358,Cargo (2009),77795.0
41581,0.390358,Magellan (2017),191217.0
33414,0.390358,Reality XL (2012),161139.0
41529,0.390358,7 Splinters in Time (2018),190953.0
42148,0.390358,Alien Code (2017),193683.0
28470,0.390358,Narcopolis (2014),143255.0
26577,0.390358,Moebius (1996),136712.0
41847,0.388752,Iru Mugan (2016),192335.0
35694,0.36264,Pornography: A Thriller (2009),169162.0
36599,0.36264,The Man Who Was Thursday (2016),172389.0


In [154]:
def user_content_recommendations(user_id):   
    """
    ratings_user: limit to one user
    
    movies_user: movies rated by that user
    
    watched: keep track of movies already watched
    
    normalize ratings: subtract mean rating  from ratings
                       if rating < mean, normalized rating will be negative. Which is worse than 0 aka not rating movie at all.
    
    profile:create user profile: multiply item profile by user ratings --> sum of ratings for each attribute 
    
    recommendations: cosine similarity between movie and user profile 
                     merge to get title
                     sort
                     remove recommendations already watched
    """

    ratings_user = ratings[ratings.userId == user_id]
    movies_user = df[df.movieId.isin(ratings_user.movieId.unique())]
    watched = movies_user.movieId.unique()
    movies_user = movies_user.drop(columns = ['movieId', 'title_eng', 'year'])
    
    mean_rating = np.mean(ratings_user.rating)
    ratings_user.rating = ratings_user.rating - mean_rating

    profile = movies_user.T.dot(ratings_user.rating.values)
    
    movies = df.drop(columns = ['movieId', 'title_eng', 'year'])
   
    recommendations = sklearn.metrics.pairwise.cosine_similarity(movies, np.asmatrix(profile.values))
    recommendations = pd.DataFrame(recommendations)
    recommendations.columns = ['prediction']
    recommendations = pd.merge(recommendations, df[['title_eng', 'movieId']], left_index = True, right_index = True, how = 'left')
    recommendations = recommendations.sort_values('prediction', ascending = False)
    recommendations = recommendations[~recommendations.movieId.isin(watched)]
    return recommendations

def user_top(movies,n):
    return movies['title_eng'][:n]

def user_movie_id(movies,n):
    return movies['movieId'][:n]

### Getting top 10 and top 5 recommendation for user_id = 100

In [101]:
userId=100
recommendation = user_content_recommendations(userId)

In [116]:
# Get the top 10 Movie recommendation for the user
user_top(recommendation,10)

13773                       Cargo (2009)
41581                    Magellan (2017)
33414                  Reality XL (2012)
41529         7 Splinters in Time (2018)
42148                  Alien Code (2017)
28470                  Narcopolis (2014)
26577                     Moebius (1996)
41847                   Iru Mugan (2016)
35694     Pornography: A Thriller (2009)
36599    The Man Who Was Thursday (2016)
Name: title_eng, dtype: object

In [118]:
# Get the top 5 Movie recommendation for the user
user_top(recommendation,5)

13773                  Cargo (2009)
41581               Magellan (2017)
33414             Reality XL (2012)
41529    7 Splinters in Time (2018)
42148             Alien Code (2017)
Name: title_eng, dtype: object

### Personalization based Evaluation



In [280]:
"""
    unique_users: getting unique user set 
    n: number of users
    users_list: List of random n users
    top_n: top number of recommendations
    columns: top_n recommendations
    users_prediction:  top_n recommendations for n users
"""
unique_users = set(ratings.userId)
n = 5
users_list = random.sample(unique_users, n)
top_n=10
column_names = list(range(top_n))
users_prediction = pd.DataFrame(columns = column_names)

In [286]:
"""
Function retuens users top_n predictions
"""
def get_users_prediction(users_list,top_n,users_prediction):
    for i in users_list:
        recommendation = user_content_recommendations(i)
        prediction = user_movie_id(recommendation,top_n).astype(int).values
        users_prediction = users_prediction.append(pd.Series(prediction),ignore_index=True)
        
    return users_prediction

def user_matrix(df):
    data_melt= pd.DataFrame(data=users_pred).reset_index().melt(id_vars='index', value_name='movieId',)
    data_melt = data_melt[['index', 'movieId']].pivot(index='index', columns='movieId', values='movieId')
    cols = data_melt.columns
    for i in cols:
        data_melt[i] = np.where(data_melt[i].isna(), 0, 1)
    return data_melt

def personalization(users_matrix):
    
    users_sim = sklearn.metrics.pairwise.cosine_similarity(users_matrix)
    return users_sim

In [283]:
import time
start_time = time.time()

# getting predictions for sampled users
users_pred = get_users_prediction(users_list,top_n,users_prediction)

# getting user by movies matrix with binary indicators 0: movie not recommended, 1: movie got recommended
users_matrix = user_matrix(users_pred)
end_time = time.time() - start_time
print(f'Time take {end_time} seconds')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docu

Time take 49.98124980926514 seconds


In [289]:
personalization(users_matrix)

array([[1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.]])