# Movie Recommender System

Recommender system that allows users to input a movie and then recommends ten other movies to watch. 

In [1]:
# import modules
import numpy as np
import pandas as pd

# importing dataset
movies= pd.read_csv('movies.csv')
ratings= pd.read_csv('ratings.csv')

In [2]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
ratings.groupby('userId')['rating'].count()


userId
1       232
2        29
3        39
4       216
5        44
       ... 
606    1115
607     187
608     831
609      37
610    1302
Name: rating, Length: 610, dtype: int64

In [5]:
ratings = ratings.drop(['timestamp'], axis=1)

In [6]:
#combine ratings for each movie into one average
grouped = ratings.groupby('movieId')['rating'].mean().reset_index()
ratings=ratings.merge(grouped, on = 'movieId')
ratings.rename(columns={
    'rating_y': 'avg_rating', 
    'rating_x': 'user_rating'},inplace=True)
print(ratings)

        userId  movieId  user_rating  avg_rating
0            1        1          4.0     3.92093
1            5        1          4.0     3.92093
2            7        1          4.5     3.92093
3           15        1          2.5     3.92093
4           17        1          4.5     3.92093
...        ...      ...          ...         ...
100831     610   160341          2.5     2.50000
100832     610   160527          4.5     4.50000
100833     610   160836          3.0     3.00000
100834     610   163937          3.5     3.50000
100835     610   163981          3.5     3.50000

[100836 rows x 4 columns]


In [7]:
ratings = ratings.groupby('movieId').first().reset_index()

In [8]:
ratings = ratings.drop(columns=['userId', 'user_rating'])

In [9]:
ratings.head()

Unnamed: 0,movieId,avg_rating
0,1,3.92093
1,2,3.431818
2,3,3.259615
3,4,2.357143
4,5,3.071429


In [10]:
df = movies.merge(ratings, on = 'movieId', how='inner')

In [11]:
df.head()

Unnamed: 0,movieId,title,genres,avg_rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.92093
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.431818
2,3,Grumpier Old Men (1995),Comedy|Romance,3.259615
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,2.357143
4,5,Father of the Bride Part II (1995),Comedy,3.071429


In [14]:
df = df[df['genres']!= '(no genres listed)']

In [15]:
df.head(5)

Unnamed: 0,movieId,title,genres,avg_rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.92093
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.431818
2,3,Grumpier Old Men (1995),Comedy|Romance,3.259615
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,2.357143
4,5,Father of the Bride Part II (1995),Comedy,3.071429


In [17]:
# import module and split genres with tfidfVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer

def token(text):
    return text.split('|')
vectorizer1 = TfidfVectorizer(tokenizer = token, lowercase = False)

In [18]:
# fit genres data to vectorizer and turn into a dataframe
tfidf_matrix = vectorizer1.fit_transform(df['genres'])
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns= vectorizer1.get_feature_names_out())

genre_list = tfidf_df.columns.tolist()




In [19]:
# joining dataframes
data = pd.concat([df, tfidf_df], axis=1)
data.head()

Unnamed: 0,movieId,title,genres,avg_rating,Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.92093,0.0,0.416775,0.516403,0.504783,0.267318,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2.0,Jumanji (1995),Adventure|Children|Fantasy,3.431818,0.0,0.51229,0.0,0.620467,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3.0,Grumpier Old Men (1995),Comedy|Romance,3.259615,0.0,0.0,0.0,0.0,0.570321,0.0,...,0.0,0.0,0.0,0.0,0.0,0.821422,0.0,0.0,0.0,0.0
3,4.0,Waiting to Exhale (1995),Comedy|Drama|Romance,2.357143,0.0,0.0,0.0,0.0,0.504506,0.0,...,0.0,0.0,0.0,0.0,0.0,0.72663,0.0,0.0,0.0,0.0
4,5.0,Father of the Bride Part II (1995),Comedy,3.071429,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
# cosine similarity for movie genres
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [65]:
# creating function to retrieve top 10 movie recommendations
def movie_recommender(moviename, rev_weight): 
    idx = data[data['title']== moviename].index[0]
    sim_scorer = list(enumerate(cosine_sim[idx])) # applying cosine similiarity algorithm to movie recommendation input
    sim_scorer = sorted(sim_scorer, key = lambda x: x[1], reverse = True) # sort list in descending order 
    
    
    sim_scorer = sim_scorer[1:40+1] # +1 excludes movie input from list
    movie_indices = [i[0] for i in sim_scorer] # retrieve movie index in df
    work_data = data.iloc[movie_indices] # subsetting data based on index
   
    weighted_results1 = []
    for x in sim_scorer:
        index = x[0]
        rating = data.loc[index, 'avg_rating']
        final_rating = (rating*rev_weight)+ sim_scorer[1][1] #rev_weight is weight applying to user ratings
        weighted_results1.append((data.loc[index, 'title'], final_rating)) 
    weighted_results1= sorted(weighted_results1, key = lambda x: x[1], reverse = True) # lambda x: x[1] = selecting which column to sort by | Reverse = descending order
    weighted_results1 = weighted_results1[0:10]
    recommended_movies = [x[0] for x in weighted_results1]# x[0]] first of the tuple
    
    
    return print('Based on your movie:', moviename, 'We found these 10 movie recommendations: \n', recommended_movies)

In [67]:
moviename = input('Enter movie name for similar recommendations ')

Enter movie name for similar recommendations  Jumanji (1995)


In [68]:
movie_recommender(moviename, 0.1) # rev_weight = 0.1 (10% importance)

Based on your movie: Jumanji (1995) We found these 10 movie recommendations: 
 ["Last Year's Snow Was Falling (1983)", 'Casper Meets Wendy (1998)', 'Bill Burr: You People Are All the Same (2012)', 'Water Horse: Legend of the Deep, The (2007)', 'Alice in Wonderland (1933)', 'Cinderella (2015)', 'Asterix & Obelix vs. Caesar (Astérix et Obélix contre César) (1999)', 'Asterix at the Olympic Games (Astérix aux jeux olympiques) (2008)', 'Chronicles of Narnia: The Voyage of the Dawn Treader, The (2010)', "Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)"]


References:
https://towardsdatascience.com/introduction-to-recommender-systems-6c66cf15ada 