    
    MOVIE RECOMMENDATION ENGINE
    
        Objective: 
            * Given a user 'x' and an unrated movie 'y', can we predict how the user will rate this movie?
            * Using the rating, can we recommend similar movies?
        
        Data:
            * Movie dataset (details about the movie)
            * Ratings dataset (movie ratings)

In [1]:
import numpy as np # linear algebra
import pandas as pd # data manipulation

In [2]:
# Load the movie metadata and the user ratings from the ml-latest-small
# directory (paths are relative to the notebook's working directory)
movies = pd.read_csv('../ml-latest-small/movies.csv')
ratings = pd.read_csv('../ml-latest-small/ratings.csv')

In [3]:
# Preview the first rows of the movies table
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Column dtypes and non-null counts of the movies table
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [5]:
# Preview the first rows of the ratings table
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
# Column dtypes and non-null counts of the ratings table
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [71]:
# Join every rating to its movie's metadata on 'movieId';
# the 'timestamp' column is discarded from the result
movie_ratings = (
    pd.merge(left=movies, right=ratings, on='movieId')
    .drop(columns='timestamp')
)

In [16]:
# Preview the merged movie/rating table
movie_ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5


    # FEATURE EXTRACTION
    
        * Global Average Rating: Average rating of all movies by all users.
        * Movie Average Rating: Average rating of a particular movie across all users.
        * User Average Rating: Average rating of all movies by a particular user.

In [40]:
# Mean rating for every (movieId, userId) pair.
# Only the numeric 'rating' column is selected before aggregating: 'title' and
# 'genres' hold strings, and averaging them raises a TypeError on pandas >= 2.0
# (older versions silently dropped such columns).
movie_user_Id_Rating = movie_ratings.groupby(by=['movieId', 'userId'])[['rating']].mean()

# Global average rating: mean over all (movie, user) pairs
avg_global_rating = movie_user_Id_Rating['rating'].mean()

print("Average rating of all movies by all users: {}".format(np.round(avg_global_rating, decimals=1)))

Average rating of all movies by all users: 3.5


In [65]:
# Average rating each movie received, indexed by 'movieId'.
# 'rating' is selected before aggregating so the string columns ('title',
# 'genres') are never averaged, which fails on pandas >= 2.0.
movie_avg = movie_ratings.groupby('movieId')[['rating']].mean()
movie_avg.columns = ['movie_avg_rating']

In [67]:
# Average rating each user gave, indexed by 'userId'.
# 'rating' is selected before aggregating so the string columns ('title',
# 'genres') are never averaged, which fails on pandas >= 2.0.
user_avg = movie_ratings.groupby('userId')[['rating']].mean()
user_avg.columns = ['user_avg_rating']

In [72]:
# Global AVG column: the single global average is written to every row
movie_ratings['global_avg_rating'] = avg_global_rating

In [74]:
# Attach each movie's average rating: 'movieId' on the left is matched
# against the index of 'movie_avg'.
# The result is stored back under the same name, so run this cell only once.
movie_ratings = pd.merge(
    left=movie_ratings,
    right=movie_avg,
    left_on='movieId',
    right_index=True,
)

In [78]:
# Attach each user's average rating: 'userId' on the left is matched
# against the index of 'user_avg'.
# The result is stored back under the same name, so run this cell only once.
movie_ratings = pd.merge(
    left=movie_ratings,
    right=user_avg,
    left_on='userId',
    right_index=True,
)

In [121]:
# Preview the table with the global, per-movie and per-user average columns added
movie_ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating,global_avg_rating,movie_avg_rating,user_avg_rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,3.501557,3.92093,4.366379
325,3,Grumpier Old Men (1995),Comedy|Romance,1,4.0,3.501557,3.259615,4.366379
433,6,Heat (1995),Action|Crime|Thriller,1,4.0,3.501557,3.946078,4.366379
2107,47,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,1,5.0,3.501557,3.975369,4.366379
2379,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,1,5.0,3.501557,4.237745,4.366379


        * User-to-user Cosine Similarity to find top 5 similar users
        * Item-to-item Cosine Similarity to find top 5 similar movies

In [94]:
# User x title matrix of ratings (one row per userId, one column per title);
# user/title combinations without a rating are filled with 0
movie_user_id_pvt_table = pd.pivot_table(
    data=movie_ratings,
    index='userId',
    columns='title',
    values='rating',
).fillna(value=0)

In [95]:
# Display the user x title rating matrix
movie_user_id_pvt_table

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.5,3.5,0.0,0.0,0.0
609,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [109]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Instantiate StandardScaler
scalar = StandardScaler()

# Fit the scaler on the user x title pivot table and transform it in one step
movie_ratings_std = scalar.fit_transform(movie_user_id_pvt_table)

In [112]:
# User-to-user collaborative filtering: rows of the scaled matrix are users
cosine_sim_users = cosine_similarity(movie_ratings_std)

# Movie-to-movie collaborative filtering: transpose so that rows are movies
cosine_sim_items = cosine_similarity(movie_ratings_std.transpose())

In [115]:
# Movie-to-movie similarities, labelled by title on both axes
cosine_sim_items_df = pd.DataFrame(
    data=cosine_sim_items,
    index=movie_user_id_pvt_table.columns,
    columns=movie_user_id_pvt_table.columns,
)

# User-to-user similarities, labelled by userId on both axes
cosine_sim_users_df = pd.DataFrame(
    data=cosine_sim_users,
    index=movie_user_id_pvt_table.index,
    columns=movie_user_id_pvt_table.index,
)

In [118]:
# Preview the movie-to-movie similarity matrix
cosine_sim_items_df.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),1.0,-0.001642,-0.002324,-0.001642,-0.002254,-0.001642,-0.006407,-0.001642,0.135943,-0.004325,...,-0.001642,0.339935,0.542247,0.706526,-0.001642,-0.007675,0.134327,0.325287,-0.008185,-0.001642
'Hellboy': The Seeds of Creation (2004),-0.001642,1.0,0.706526,-0.001642,-0.002254,-0.001642,-0.006407,-0.001642,-0.010568,-0.004325,...,-0.001642,-0.004589,-0.002808,-0.002324,-0.001642,-0.007675,-0.007744,-0.003594,-0.008185,-0.001642
'Round Midnight (1986),-0.002324,0.706526,1.0,-0.002324,-0.003191,-0.002324,0.170199,-0.002324,-0.014958,-0.006121,...,-0.002324,-0.006495,-0.003975,-0.003289,-0.002324,-0.010863,-0.010961,-0.005087,-0.011585,-0.002324
'Salem's Lot (2004),-0.001642,-0.001642,-0.002324,1.0,0.857269,-0.001642,-0.006407,-0.001642,-0.010568,-0.004325,...,-0.001642,-0.004589,-0.002808,-0.002324,-0.001642,-0.007675,-0.007744,-0.003594,-0.008185,-0.001642
'Til There Was You (1997),-0.002254,-0.002254,-0.003191,0.857269,1.0,-0.002254,-0.008797,-0.002254,-0.01451,-0.005938,...,-0.002254,-0.006301,-0.003856,-0.003191,-0.002254,-0.010538,-0.010632,-0.004935,-0.011238,-0.002254


In [268]:
# Keys of the similar-movie accumulator: one slot per similar movie,
# followed by the id of the movie the row belongs to
similar_movies_title = ['sim_movie_1', 'sim_movie_2', 'sim_movie_3', 'sim_movie_4', 'sim_movie_5', 'movieId']

# Accumulator for the similar-movie ratings: every key starts with its own empty list
similar_movies_ratings = {column: [] for column in similar_movies_title}

In [331]:
# Keys of the similar-user accumulator: one slot per similar user,
# followed by the id of the user the row belongs to
similar_users_title = ['sim_user_1', 'sim_user_2', 'sim_user_3', 'sim_user_4', 'sim_user_5', 'userId']

# Accumulator for the similar-user ratings: every key starts with its own empty list
similar_user_ratings = {column: [] for column in similar_users_title}

In [310]:
# HELPER FUNCTIONS

def _representative_ratings(data, main_feature):
    """Map every distinct ``main_feature`` value to the one rating that represents it.

    An item with more than one rating row is represented by the mean of its
    ratings passed through ``round()`` (i.e. a whole number, returned as float);
    an item with exactly one row keeps that rating unchanged.
    """
    grouped = data.groupby(main_feature, sort=False)['rating']
    means, sizes, firsts = grouped.mean(), grouped.size(), grouped.first()
    return {
        label: float(round(means.at[label])) if sizes.at[label] > 1 else firsts.at[label]
        for label in means.index
    }


def get_ratings_similar_movies(data, main_feature, cosine_sim_dataset, ratings_dict, dict_keys):
    """
    Collect, for every item in ``data``, the ratings of its five most similar items.

    Despite the name, the function works for movies (``main_feature='title'``)
    and for users (``main_feature='userId'``) alike.

    :param data: DataFrame containing movie ratings; must provide the
                 ``main_feature`` column and 'rating' (plus 'movieId' when
                 ``main_feature`` is 'title')
    :param main_feature: Name of the item column... Either 'title' or 'userId'
    :param cosine_sim_dataset: Square DataFrame of cosine similarity scores whose
                               index and columns are the item labels
    :param ratings_dict: Dictionary of lists to store ratings. It is modified
                         IN PLACE (values are appended), so pass freshly created
                         empty lists unless accumulation is wanted
    :param dict_keys: List of dictionary keys; entry ``i`` receives the rating of
                      the ``i``-th most similar item
    :return: the same ``ratings_dict`` object, with one value appended per item
             to each similarity key and to the 'movieId' / 'userId' key
    :raises KeyError: if an item has no column in ``cosine_sim_dataset`` or a
                      similar item does not occur in ``data``
    """
    n_neighbours = 5

    # Computed once up front instead of filtering the whole frame for every neighbour
    rating_of = _representative_ratings(data, main_feature)

    if main_feature == 'title':
        # If a title occurs with several movieIds, the first one in row order is used
        first_movie_id = data.drop_duplicates(subset='title').set_index('title')['movieId']

    # Unique items, in order of first appearance in the data
    for item in data[main_feature].unique():

        scores = cosine_sim_dataset[item]

        # Rank the other items by similarity, highest first. The item itself is
        # excluded by label rather than by assuming it sorts first: another item
        # with an identical similarity of 1.0 could otherwise take its place and
        # the item would be reported as its own neighbour. sorted() is stable,
        # so tied scores keep the order of the similarity matrix.
        neighbours = sorted(
            ((label, score) for label, score in scores.items() if label != item),
            key=lambda pair: pair[1],
            reverse=True,
        )[:n_neighbours]

        # Record which movie / user this row of ratings belongs to
        if main_feature == 'title':
            ratings_dict['movieId'].append(first_movie_id.at[item])
        elif main_feature == 'userId':
            ratings_dict['userId'].append(item)

        # The i-th most similar item's rating goes under the i-th key
        for position, (label, _score) in enumerate(neighbours):
            ratings_dict[dict_keys[position]].append(rating_of[label])

    return ratings_dict

In [277]:
# Fill the similar-movie accumulator with the ratings of each title's nearest titles.
# NOTE: get_ratings_similar_movies appends to the lists already stored in the
# dict it is given, so re-create the empty dict before running this cell again.
similar_movies_ratings = get_ratings_similar_movies(
    data=movie_ratings,
    main_feature='title',
    cosine_sim_dataset=cosine_sim_items_df, 
    ratings_dict=similar_movies_ratings, 
    dict_keys=similar_movies_title
)

In [332]:
# Fill the similar-user accumulator with the ratings of each user's nearest users.
# NOTE: get_ratings_similar_movies appends to the lists already stored in the
# dict it is given, so re-create the empty dict before running this cell again.
similar_user_ratings = get_ratings_similar_movies(
    data=movie_ratings, 
    main_feature='userId', 
    cosine_sim_dataset=cosine_sim_users_df, 
    ratings_dict=similar_user_ratings, 
    dict_keys=similar_users_title
)

In [318]:
# Merge "similar movies rating" with "movie ratings"
# NOTE(review): the similar-movie table holds a single movieId per title (the
# first one found for that title). If a title occurs with several movieIds,
# rows carrying the other ids presumably find no match here and are dropped
# by the join — confirm this is intended.
movie_ratings = pd.merge(movie_ratings, pd.DataFrame(similar_movies_ratings), on='movieId')

In [342]:
# Merge "similar users rating" with "movie ratings".
# The join key is named explicitly (as in the similar-movies merge) instead of
# relying on whichever column names the two frames happen to share; this keeps
# the join on 'userId' even if other columns overlap.
movie_ratings = pd.merge(movie_ratings, pd.DataFrame(similar_user_ratings), on='userId')

In [345]:
# Saving DataFrame as csv file in the current working directory
# (index=False: the row index is not written as a column)
movie_ratings.to_csv("movie_ratings_features.csv", index=False)