# Movie Recommendations


Load the necessary libraries

In [240]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

## Preprocessing data
Load the data

In [229]:
movies_df = pd.read_csv('data/movies.csv')
ratings_df = pd.read_csv('data/ratings.csv')

## Preprocess Movies Data

In [230]:
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [231]:
movies_df['genres'] = movies_df.apply(lambda row: row.genres.split('|'), axis = 1)
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy]


## Make list of genres

In [232]:
genres = "Action* Adventure* Animation* Children's* Comedy* Crime* Documentary* Drama* Fantasy* Film-Noir* Horror* Musical* Mystery* Romance* Sci-Fi* Thriller* War* Western* (no genres listed)".split("* ")
genres

['Action',
 'Adventure',
 'Animation',
 "Children's",
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western',
 '(no genres listed)']

## Check ratings

In [233]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [236]:
ratings_df = ratings_df.merge(movies_df, how = 'left', on = 'movieId')
ratings_df.head(15)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,1,3,4.0,964981247,Grumpier Old Men (1995),"[Comedy, Romance]"
2,1,6,4.0,964982224,Heat (1995),"[Action, Crime, Thriller]"
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),"[Mystery, Thriller]"
4,1,50,5.0,964982931,"Usual Suspects, The (1995)","[Crime, Mystery, Thriller]"
5,1,70,3.0,964982400,From Dusk Till Dawn (1996),"[Action, Comedy, Horror, Thriller]"
6,1,101,5.0,964980868,Bottle Rocket (1996),"[Adventure, Comedy, Crime, Romance]"
7,1,110,4.0,964982176,Braveheart (1995),"[Action, Drama, War]"
8,1,151,5.0,964984041,Rob Roy (1995),"[Action, Drama, Romance, War]"
9,1,157,5.0,964984100,Canadian Bacon (1995),"[Comedy, War]"


# Knowledge based Recommender System
This is used if we have no user data. This is based on popularity

In [145]:
avg_highly_rated_movies = ratings_df.groupby(['title']).agg({'rating': 'mean'})['rating'].sort_values(ascending=False)
avg_highly_rated_movies

title
Gena the Crocodile (1969)                  5.0
True Stories (1986)                        5.0
Cosmic Scrat-tastrophe (2015)              5.0
Love and Pigeons (1985)                    5.0
Red Sorghum (Hong gao liang) (1987)        5.0
                                          ... 
Don't Look Now (1973)                      0.5
Journey 2: The Mysterious Island (2012)    0.5
Joe Dirt 2: Beautiful Loser (2015)         0.5
Jesus Christ Vampire Hunter (2001)         0.5
Fullmetal Alchemist 2018 (2017)            0.5
Name: rating, Length: 9719, dtype: float64

In [158]:
popular_movies = ratings_df.groupby(['title']).agg({"rating":"count"})['rating'].sort_values(ascending=False)
popular_movies

title
Forrest Gump (1994)                          329
Shawshank Redemption, The (1994)             317
Pulp Fiction (1994)                          307
Silence of the Lambs, The (1991)             279
Matrix, The (1999)                           278
                                            ... 
King Solomon's Mines (1950)                    1
King Solomon's Mines (1937)                    1
King Ralph (1991)                              1
King Kong Lives (1986)                         1
À nous la liberté (Freedom for Us) (1931)      1
Name: rating, Length: 9719, dtype: int64

In [206]:
popular_movies = popular_movies.where(popular_movies > 10).dropna()
popular_movies.head(15)

title
Forrest Gump (1994)                                      329.0
Shawshank Redemption, The (1994)                         317.0
Pulp Fiction (1994)                                      307.0
Silence of the Lambs, The (1991)                         279.0
Matrix, The (1999)                                       278.0
Star Wars: Episode IV - A New Hope (1977)                251.0
Jurassic Park (1993)                                     238.0
Braveheart (1995)                                        237.0
Terminator 2: Judgment Day (1991)                        224.0
Schindler's List (1993)                                  220.0
Fight Club (1999)                                        218.0
Toy Story (1995)                                         215.0
Star Wars: Episode V - The Empire Strikes Back (1980)    211.0
Usual Suspects, The (1995)                               204.0
American Beauty (1999)                                   204.0
Name: rating, dtype: float64

In [207]:
popularity_recommendations = pd.merge(popular_movies, avg_highly_rated_movies, how = 'inner', on = 'title')
popularity_recommendations.head(15)

Unnamed: 0_level_0,rating_x,rating_y
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Forrest Gump (1994),329.0,4.164134
"Shawshank Redemption, The (1994)",317.0,4.429022
Pulp Fiction (1994),307.0,4.197068
"Silence of the Lambs, The (1991)",279.0,4.16129
"Matrix, The (1999)",278.0,4.192446
Star Wars: Episode IV - A New Hope (1977),251.0,4.231076
Jurassic Park (1993),238.0,3.75
Braveheart (1995),237.0,4.031646
Terminator 2: Judgment Day (1991),224.0,3.970982
Schindler's List (1993),220.0,4.225


In [208]:
popularity_recommendations.rename({'rating_x': 'Number of users', 'rating_y': 'Average rating'}, axis = 1, inplace = True)
popularity_recommendations.sort_values(by = 'Average rating', ascending = False, inplace = True)
popularity_recommendations.head(10)

Unnamed: 0_level_0,Number of users,Average rating
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"Shawshank Redemption, The (1994)",317.0,4.429022
"Godfather, The (1972)",192.0,4.289062
Fight Club (1999),218.0,4.272936
Cool Hand Luke (1967),57.0,4.27193
Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964),97.0,4.268041
Rear Window (1954),84.0,4.261905
"Godfather: Part II, The (1974)",129.0,4.25969
"Departed, The (2006)",107.0,4.252336
Goodfellas (1990),126.0,4.25
Casablanca (1942),100.0,4.24


## Recommending based on Genre
sorting by genre

In [211]:
genre_recommendations = popularity_recommendations.merge(movies_df, how = 'left', on = 'title')
genre_recommendations

Unnamed: 0,title,Number of users,Average rating,movieId,genres
0,"Shawshank Redemption, The (1994)",317.0,4.429022,318,"[Crime, Drama]"
1,"Godfather, The (1972)",192.0,4.289062,858,"[Crime, Drama]"
2,Fight Club (1999),218.0,4.272936,2959,"[Action, Crime, Drama, Thriller]"
3,Cool Hand Luke (1967),57.0,4.271930,1276,[Drama]
4,Dr. Strangelove or: How I Learned to Stop Worr...,97.0,4.268041,750,"[Comedy, War]"
...,...,...,...,...,...
433,Johnny Mnemonic (1995),53.0,2.679245,172,"[Action, Sci-Fi, Thriller]"
434,Judge Dredd (1995),62.0,2.669355,173,"[Action, Crime, Sci-Fi]"
435,City Slickers II: The Legend of Curly's Gold (...,55.0,2.645455,432,"[Adventure, Comedy, Western]"
436,Coneheads (1993),63.0,2.420635,435,"[Comedy, Sci-Fi]"


In [212]:
genre_recommendations.drop('movieId', inplace = True, axis = 1)
genre_recommendations.head(15)

Unnamed: 0,title,Number of users,Average rating,genres
0,"Shawshank Redemption, The (1994)",317.0,4.429022,"[Crime, Drama]"
1,"Godfather, The (1972)",192.0,4.289062,"[Crime, Drama]"
2,Fight Club (1999),218.0,4.272936,"[Action, Crime, Drama, Thriller]"
3,Cool Hand Luke (1967),57.0,4.27193,[Drama]
4,Dr. Strangelove or: How I Learned to Stop Worr...,97.0,4.268041,"[Comedy, War]"
5,Rear Window (1954),84.0,4.261905,"[Mystery, Thriller]"
6,"Godfather: Part II, The (1974)",129.0,4.25969,"[Crime, Drama]"
7,"Departed, The (2006)",107.0,4.252336,"[Crime, Drama, Thriller]"
8,Goodfellas (1990),126.0,4.25,"[Crime, Drama]"
9,Casablanca (1942),100.0,4.24,"[Drama, Romance]"


In [193]:
genres

['Action',
 'Adventure',
 'Animation',
 "Children's",
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western',
 '(no genres listed)']

In [209]:
# genre_recommendations['Year Released'] = genre_recommendations.apply(lambda row: row.genres.split('|'), axis = 1)

In [213]:
genre_recommendations[genre_recommendations['genres'].apply(lambda x: 'Adventure' in x)].head(15)

Unnamed: 0,title,Number of users,Average rating,genres
12,"Princess Bride, The (1987)",142.0,4.232394,"[Action, Adventure, Comedy, Fantasy, Romance]"
13,Star Wars: Episode IV - A New Hope (1977),251.0,4.231076,"[Action, Adventure, Sci-Fi]"
17,Star Wars: Episode V - The Empire Strikes Back...,211.0,4.21564,"[Action, Adventure, Sci-Fi]"
19,Raiders of the Lost Ark (Indiana Jones and the...,200.0,4.2075,"[Action, Adventure]"
24,North by Northwest (1959),57.0,4.184211,"[Action, Adventure, Mystery, Romance, Thriller]"
29,Monty Python and the Holy Grail (1975),136.0,4.161765,"[Adventure, Comedy, Fantasy]"
33,Spirited Away (Sen to Chihiro no kamikakushi) ...,87.0,4.155172,"[Adventure, Animation, Fantasy]"
37,City of God (Cidade de Deus) (2002),75.0,4.146667,"[Action, Adventure, Crime, Drama, Thriller]"
39,"Good, the Bad and the Ugly, The (Buono, il bru...",72.0,4.145833,"[Action, Adventure, Western]"
40,Star Wars: Episode VI - Return of the Jedi (1983),196.0,4.137755,"[Action, Adventure, Sci-Fi]"


# Recommendations based on User (KNN)
This will be taking into account what movies the user has watched and will recommend movies based on what similar users have enjoyed watching.

In [214]:
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,1,3,4.0,964981247,Grumpier Old Men (1995),"[Comedy, Romance]"
2,1,6,4.0,964982224,Heat (1995),"[Action, Crime, Thriller]"
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),"[Mystery, Thriller]"
4,1,50,5.0,964982931,"Usual Suspects, The (1995)","[Crime, Mystery, Thriller]"
...,...,...,...,...,...,...
100831,610,166534,4.0,1493848402,Split (2017),"[Drama, Horror, Thriller]"
100832,610,168248,5.0,1493850091,John Wick: Chapter Two (2017),"[Action, Crime, Thriller]"
100833,610,168250,5.0,1494273047,Get Out (2017),[Horror]
100834,610,168252,5.0,1493846352,Logan (2017),"[Action, Sci-Fi]"


In [218]:
refined_df = ratings_df.groupby(by=['userId', 'title'], as_index = False).agg({'rating': 'mean'})
refined_df.head(15)

Unnamed: 0,userId,title,rating
0,1,"13th Warrior, The (1999)",4.0
1,1,20 Dates (1998),4.0
2,1,"Abyss, The (1989)",4.0
3,1,"Adventures of Robin Hood, The (1938)",5.0
4,1,Alice in Wonderland (1951),5.0
5,1,Alien (1979),4.0
6,1,All Quiet on the Western Front (1930),5.0
7,1,American Beauty (1999),5.0
8,1,American History X (1998),5.0
9,1,"American Tail, An (1986)",5.0


In [221]:
user_to_movie_df = refined_df.pivot(index = 'userId', columns = 'title', values = 'rating').fillna(0)
user_to_movie_df.head(15)

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [237]:
user_to_movie_sparse_df = csr_matrix(user_to_movie_df.values)
knn_model = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
knn_model.fit(user_to_movie_sparse_df)

NearestNeighbors(algorithm='brute', metric='cosine')

In [238]:
def get_similar_users(user, n = 5):
    knn_input = np.asarray([user_to_movie_df.values[user-1]])
    distances, indices = knn_model.kneighbors(knn_input, n_neighbors=n+1)
    
    print(f"Top {n} users who are very similar to User-{user} are: ")
    for i in range(1,len(distances[0])):
        print(i,". User:", indices[0][i]+1, "separated by distance of",distances[0][i])
    return indices.flatten()[1:] + 1, distances.flatten()[1:]

In [249]:
user = 5

similar_user_list, distance_list = get_similar_users(user)

Top 5 users who are very similar to User-5 are: 
1 . User: 470 separated by distance of 0.4793292042473569
2 . User: 229 separated by distance of 0.48266710413766245
3 . User: 565 separated by distance of 0.4886746437839188
4 . User: 235 separated by distance of 0.4923618395241436
5 . User: 142 separated by distance of 0.5258758000611337


In [250]:
movies_list = user_to_movie_df.columns
movies_list

Index([''71 (2014)', ''Hellboy': The Seeds of Creation (2004)',
       ''Round Midnight (1986)', ''Salem's Lot (2004)',
       ''Til There Was You (1997)', ''Tis the Season for Love (2015)',
       ''burbs, The (1989)', ''night Mother (1986)',
       '(500) Days of Summer (2009)', '*batteries not included (1987)',
       ...
       'Zulu (2013)', '[REC] (2007)', '[REC]² (2009)',
       '[REC]³ 3 Génesis (2012)',
       'anohana: The Flower We Saw That Day - The Movie (2013)',
       'eXistenZ (1999)', 'xXx (2002)', 'xXx: State of the Union (2005)',
       '¡Three Amigos! (1986)', 'À nous la liberté (Freedom for Us) (1931)'],
      dtype='object', name='title', length=9719)

In [251]:
len(movies_list)

9719

In [252]:
weightage_list = distance_list/np.sum(distance_list)
weightage_list

array([0.19414619, 0.19549817, 0.19793144, 0.1994249 , 0.2129993 ])

In [253]:
mov_rtngs_sim_users = user_to_movie_df.values[similar_user_list]
mov_rtngs_sim_users

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [254]:
weightage_list = weightage_list[:,np.newaxis] + np.zeros(len(movies_list))
weightage_list.shape

(5, 9719)

In [255]:
new_rating_matrix = weightage_list*mov_rtngs_sim_users
mean_rating_list = new_rating_matrix.sum(axis =0)
mean_rating_list

array([0., 0., 0., ..., 0., 0., 0.])

In [256]:
def recommend_movies(n):
  n = min(len(mean_rating_list),n)
  # print(np.argsort(mean_rating_list)[::-1][:n])
  print(list(movies_list[np.argsort(mean_rating_list)[::-1][:n]]))

In [257]:
print("Movies recommended based on similar users are: ")
recommend_movies(10)

Movies recommended based on similar users are: 
['Forrest Gump (1994)', 'Finding Nemo (2003)', 'Pirates of the Caribbean: The Curse of the Black Pearl (2003)', "Schindler's List (1993)", 'Pretty Woman (1990)', 'Interview with the Vampire: The Vampire Chronicles (1994)', 'Toy Story 3 (2010)', 'Alien (1979)', 'Shawshank Redemption, The (1994)', 'Twister (1996)']


In [262]:
ratings_df.loc[ratings_df['userId'] == 5].sort_values('rating', ascending = False)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
553,5,590,5.0,847434747,Dances with Wolves (1990),"[Adventure, Drama, Western]"
546,5,475,5.0,847435311,In the Name of the Father (1993),[Drama]
548,5,527,5.0,847434960,Schindler's List (1993),"[Drama, War]"
522,5,58,5.0,847435238,"Postman, The (Postino, Il) (1994)","[Comedy, Drama, Romance]"
557,5,596,5.0,847435292,Pinocchio (1940),"[Animation, Children, Fantasy, Musical]"
556,5,595,5.0,847434832,Beauty and the Beast (1991),"[Animation, Children, Fantasy, Musical, Romanc..."
527,5,247,5.0,847435337,Heavenly Creatures (1994),"[Crime, Drama]"
555,5,594,5.0,847435238,Snow White and the Seven Dwarfs (1937),"[Animation, Children, Drama, Fantasy, Musical]"
533,5,296,5.0,847434748,Pulp Fiction (1994),"[Comedy, Crime, Drama, Thriller]"
532,5,290,5.0,847435311,Once Were Warriors (1994),"[Crime, Drama]"
