    Task:
    
    Given a dataset of movies and user ratings for those movies, find similar users and recommend
    new movies watched by one user to another user.

In [1]:
# Import libraries
import numpy as np  # linear algebra
import pandas as pd  # data manipulation & analysis

In [2]:
# Load the movie metadata and the user ratings from the dataset folder.
# DATA_DIR is the single place to edit if the CSV files live somewhere else.
DATA_DIR = '../ml-latest-small'
movies = pd.read_csv(f'{DATA_DIR}/movies.csv')
ratings = pd.read_csv(f'{DATA_DIR}/ratings.csv')

In [16]:
# Attach every rating to its movie's metadata by joining the two tables on movieId.
# This is an inner join, so only movieIds present in both tables are kept.
movies_data = movies.merge(ratings, how='inner', on='movieId')

In [20]:
# Preview the first five rows.
# NOTE(review): the saved preview lists only title, userId and rating, so it appears to have been
# produced after the column-drop cell; a fresh top-to-bottom run would show every merged column here.
movies_data.head(n=5)

Unnamed: 0,title,userId,rating
0,Toy Story (1995),1,4.0
1,Toy Story (1995),5,4.0
2,Toy Story (1995),7,4.5
3,Toy Story (1995),15,2.5
4,Toy Story (1995),17,4.5


In [18]:
# Print the column names, dtypes, non-null counts and memory usage of the merged frame
movies_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100836 entries, 0 to 100835
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   movieId    100836 non-null  int64  
 1   title      100836 non-null  object 
 2   genres     100836 non-null  object 
 3   userId     100836 non-null  int64  
 4   rating     100836 non-null  float64
 5   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3), object(2)
memory usage: 5.4+ MB


    # Data Preprocessing
    
        * Data cleaning

In [54]:
# Helper Functions

def get_similar_movies(movie_title, rating, similarity=None, neutral_rating=2.5):
    """
    Score every movie by its similarity to one rated movie.

    Each similarity value is multiplied by ``rating - neutral_rating``, so a rating
    above the neutral point ranks similar movies first, while a rating below it
    flips the sign and pushes similar movies to the bottom.

    :param movie_title: Title of movie in engine (a column label of the similarity matrix)
    :param rating: Rating given to movie
    :param similarity: Optional movie-by-movie similarity DataFrame; when omitted,
        the notebook-level ``cosine_sim_data`` is used
    :param neutral_rating: Rating treated as neither liked nor disliked (default 2.5)
    :return: Series of scores indexed by movie title, sorted from highest to lowest
    :raises KeyError: If ``movie_title`` is not a column of the similarity matrix
    """

    # Fall back to the notebook's global matrix so existing two-argument calls keep working.
    if similarity is None:
        similarity = cosine_sim_data

    if movie_title not in similarity.columns:
        raise KeyError(f"Movie {movie_title!r} is not in the similarity matrix")

    similar_score = similarity[movie_title] * (rating - neutral_rating)
    return similar_score.sort_values(ascending=False)

In [19]:
# Collaborative filtering only needs the movie title, the user, and the rating that user
# gave the movie, so "movieId", "genres" and "timestamp" are discarded.
# Selecting the wanted columns (rather than dropping in place) keeps this cell re-runnable:
# a second execution is a no-op instead of a KeyError for already-removed columns.
movies_data = movies_data[['title', 'userId', 'rating']]

In [22]:
# Reshape into a user x movie matrix: one row per userId, one column per title,
# each cell holding that user's rating for that title.
movies_data = movies_data.pivot_table(index='userId', columns='title', values='rating')

In [25]:
# Keep only the movies (columns) that have at least N ratings, i.e. N non-missing entries.
# N is 15 here.
movies_data = movies_data.dropna(axis='columns', thresh=15)

In [30]:
# Replace every remaining missing rating with 0
movies_data = movies_data.fillna(0)

In [31]:
# Preview the first five users of the user x movie rating matrix
movies_data.head(n=5)

title,"'burbs, The (1989)",(500) Days of Summer (2009),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),12 Angry Men (1957),12 Years a Slave (2013),127 Hours (2010),13 Going on 30 (2004),...,Young Guns (1988),Zack and Miri Make a Porno (2008),Zero Effect (1998),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
from sklearn.metrics.pairwise import cosine_similarity

# Standardize the ratings
def standardize(row):
    """
    Mean-centre a Series and scale it by its range (max - min).

    :param row: pandas Series of ratings; ``DataFrame.apply`` passes one column
        at a time by default, or one row when called with ``axis=1``
    :return: Series with the same index holding ``(value - mean) / (max - min)``;
        all zeros when every value is identical
    """

    centered = row - row.mean()
    value_range = row.max() - row.min()

    # A constant Series has zero range; dividing would yield NaN/inf and break
    # the downstream similarity computation, so report "no deviation" instead.
    if value_range == 0:
        return centered * 0.0

    return centered / value_range
    

# Standardize the ratings. DataFrame.apply works column by column by default (axis=0),
# so each movie's ratings are centred and scaled independently of the other movies.
movie_ratings_std = movies_data.apply(standardize, axis=0)

# cosine_similarity compares rows, so transpose to put one movie per row;
# the result is a movie x movie similarity matrix.
cosine_sim = cosine_similarity(movie_ratings_std.transpose())

In [46]:
# Wrap the similarity matrix in a DataFrame labelled by movie title on both axes,
# so the scores between any two movies can be looked up by title.
cosine_sim_data = pd.DataFrame(
    data=cosine_sim,
    index=movies_data.columns,
    columns=movies_data.columns,
)

In [47]:
# Preview the first five rows of the movie x movie similarity matrix
cosine_sim_data.head(n=5)

title,"'burbs, The (1989)",(500) Days of Summer (2009),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),12 Angry Men (1957),12 Years a Slave (2013),127 Hours (2010),13 Going on 30 (2004),...,Young Guns (1988),Zack and Miri Make a Porno (2008),Zero Effect (1998),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"'burbs, The (1989)",1.0,0.063117,0.143482,0.011998,0.087931,0.224052,0.034223,0.009277,0.008331,0.0497,...,0.248535,0.017477,0.134701,0.153158,0.101301,0.049897,0.003233,0.187953,0.062174,0.353194
(500) Days of Summer (2009),0.063117,1.0,0.273989,0.19396,0.148903,0.142141,0.159756,0.135486,0.200135,0.297152,...,0.073476,0.374515,0.068407,0.414585,0.355723,0.252226,0.216007,0.053614,0.241092,0.125905
10 Things I Hate About You (1999),0.143482,0.273989,1.0,0.24467,0.223481,0.211473,0.011784,0.091964,0.043383,0.321071,...,0.152333,0.243118,0.13246,0.091853,0.158637,0.281934,0.050031,0.121029,0.130813,0.110612
"10,000 BC (2008)",0.011998,0.19396,0.24467,1.0,0.234459,0.119132,0.059187,-0.025882,0.089328,0.167098,...,0.065201,0.260261,0.094913,0.184521,0.242299,0.240231,0.094773,0.088045,0.203002,0.083518
101 Dalmatians (1996),0.087931,0.148903,0.223481,0.234459,1.0,0.285112,0.119843,0.072399,0.029967,0.188467,...,0.033582,0.114968,0.096294,0.067134,0.113224,0.184324,0.054024,0.047804,0.156932,0.078734


In [59]:
print("TOP 10 RECOMMENDATIONS")
print("")

# Score every movie against a 5-star rating of the seed title and keep the ten highest scores.
# NOTE(review): the seed movie itself ranks first (its self-similarity is 1.0),
# so only nine of the ten entries are other titles.
get_similar_movies(movie_title="'burbs, The (1989)", rating=5).head(10)

TOP 10 RECOMMENDATIONS



title
'burbs, The (1989)                                                    2.500000
Back to School (1986)                                                 1.252169
Money Pit, The (1986)                                                 1.245261
Scrooged (1988)                                                       1.060256
Adventures of Buckaroo Banzai Across the 8th Dimension, The (1984)    1.005313
Rambo: First Blood Part II (1985)                                     0.994945
Weird Science (1985)                                                  0.981114
Christmas Vacation (National Lampoon's Christmas Vacation) (1989)     0.930469
¡Three Amigos! (1986)                                                 0.882986
Army of Darkness (1993)                                               0.874216
Name: 'burbs, The (1989), dtype: float64