In [2]:
import os
#garbage collection interface
import gc

# data science imports
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

#Fuzzy string matching like a boss. 
#It uses Levenshtein Distance to calculate the differences between sequences in a simple-to-use package.
from fuzzywuzzy import fuzz

## Data Preparation

In [6]:
#Read in the data
df_movies = pd.read_csv("ml-latest-small/movies.csv",
                        usecols=['movieId', 'title'],
                        dtype={'movieId': 'int32', 'title': 'str'})
df_ratings = pd.read_csv("ml-latest-small/ratings.csv",
            usecols=['userId', 'movieId', 'rating'],
            dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'})

In [7]:
df_movies.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [8]:
df_ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [11]:
df_movies_cnt = pd.DataFrame(
            df_ratings.groupby('movieId').size(),
            columns=['count'])

popular_movies = list(set(df_movies_cnt.query('count >= 2').index))  #less than 2 would not be considered
movies_filter = df_ratings.movieId.isin(popular_movies).values

In [12]:
df_users_cnt = pd.DataFrame(
            df_ratings.groupby('userId').size(),
            columns=['count'])
active_users = list(set(df_users_cnt.query('count >= 2').index))  # noqa
users_filter = df_ratings.userId.isin(active_users).values

In [14]:
df_ratings_filtered = df_ratings[movies_filter & users_filter]
df_ratings_filtered.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [16]:
#Creating pivot table of userid * movieid
movie_user_mat = df_ratings_filtered.pivot(
            index='movieId', columns='userId', values='rating').fillna(0)
movie_user_mat.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# create mapper from movie title to index
hashmap = {
            movie: i for i, movie in
            enumerate(list(df_movies.set_index('movieId').loc[movie_user_mat.index].title))
        }

In [19]:
# transform matrix to scipy sparse matrix
movie_user_mat_sparse = csr_matrix(movie_user_mat.values)

In [20]:
# clean up
del df_movies, df_movies_cnt, df_users_cnt
del df_ratings, df_ratings_filtered, movie_user_mat
gc.collect()

28253

## Utility Functions

In [24]:
def fuzzy_matching(hashmap, fav_movie):
        """
        return the closest match via fuzzy ratio.
        If no match found, return None
        Parameters
        ----------
        hashmap: dict, map movie title name to index of the movie in data
        fav_movie: str, name of user input movie
        Return
        ------
        index of the closest match
        """
        match_tuple = []
        # get match
        for title, idx in hashmap.items():
            ratio = fuzz.ratio(title.lower(), fav_movie.lower())
            if ratio >= 60:
                match_tuple.append((title, idx, ratio))
        # sort
        match_tuple = sorted(match_tuple, key=lambda x: x[2])[::-1]
        if not match_tuple:
            print('Oops! No match is found')
        else:
            print('Found possible matches in our database: '
                  '{0}\n'.format([x[0] for x in match_tuple]))
            return match_tuple[0][1]

## Data Modeling

In [108]:
#Recommending 5 movies based on the movie 'iron man'
fav_movie = 'beauty and beast'
idx = fuzzy_matching(hashmap, fav_movie)

Found possible matches in our database: ['Beauty and the Beast (2017)', 'Beauty and the Beast (1991)']



In [109]:
model = NearestNeighbors(algorithm='auto', metric = 'cosine')

In [110]:
model.fit(movie_user_mat_sparse)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=None, n_neighbors=5, p=2, radius=1.0)

## Making Recommendation

In [111]:
distances, indices = model.kneighbors(
            movie_user_mat_sparse[idx],
            n_neighbors=10)

In [112]:
raw_recommends = \
            sorted(
                list(
                    zip(
                        indices.squeeze().tolist(),
                        distances.squeeze().tolist()
                    )
                ),
                key=lambda x: x[1]
            )[:0:-1]

In [113]:
reverse_hashmap = {v: k for k, v in hashmap.items()}
print('Recommendations for {}:'.format(fav_movie))
for i, (idx, dist) in enumerate(raw_recommends):
    print('{0}: {1}, with distance '
        'of {2}'.format(i+1, reverse_hashmap[idx], dist))

Recommendations for beauty and beast:
1: Isle of Dogs (2018), with distance of 0.514034628868103
2: Way, Way Back, The (2013), with distance of 0.5017606616020203
3: The Greatest Showman (2017), with distance of 0.4983522295951843
4: Bright (2017), with distance of 0.4825906753540039
5: Marvel One-Shot: Item 47 (2012), with distance of 0.4790540933609009
6: The Edge of Seventeen (2016), with distance of 0.39259469509124756
7: Jumanji: Welcome to the Jungle (2017), with distance of 0.38346028327941895
8: Oldboy (2013), with distance of 0.34901320934295654
9: Art of Getting By, The (2011), with distance of 0.2844582200050354
