# Movie Recommendation System
* To build a recommendation system, we need the data which we will download from [MovieLens](https://grouplens.org/datasets/movielens/)
* We will extract & load to colab

## Loading the libraries

In [3]:
import pandas as pd

## Loading the dataset

In [4]:
movies_df = pd.read_csv('movies.csv')
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


## Data Preprocessing

Handling Missing Values
1. check if all the records have a title
2. Observe the genre column which dont have a genres

In [5]:
movies_df.isnull().sum() # Gives columns where data is null

Unnamed: 0,0
movieId,0
title,0
genres,0


There are no empty values in any column

The genre column which is a string should be broken using '|'

In [6]:
movies_df['genres'] = movies_df['genres'].str.split('|')
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy]


To see the list of unique genres using frequency

In [7]:
from collections import Counter
genres_counts = Counter(g for genres in movies_df['genres']
                          for g in genres
                        )
print(genres_counts)

Counter({'Drama': 4361, 'Comedy': 3756, 'Thriller': 1894, 'Action': 1828, 'Romance': 1596, 'Adventure': 1263, 'Crime': 1199, 'Sci-Fi': 980, 'Horror': 978, 'Fantasy': 779, 'Children': 664, 'Animation': 611, 'Mystery': 573, 'Documentary': 440, 'War': 382, 'Musical': 334, 'Western': 167, 'IMAX': 158, 'Film-Noir': 87, '(no genres listed)': 34})


*Remove Movies with (no genres listed)*      
*Update the Genre Counts*

In [8]:
# Remove movies with '(no genres listed)'
movies_df = movies_df[movies_df['genres']!='(no genres listed)']
# Remove movies with '(no genres listed)'
movies = movies_df[movies_df['genres']!='(no genres listed)']

# Remove '(no genres listed)' from the genre counts dictionary
del genres_counts['(no genres listed)']

print(genres_counts)


Counter({'Drama': 4361, 'Comedy': 3756, 'Thriller': 1894, 'Action': 1828, 'Romance': 1596, 'Adventure': 1263, 'Crime': 1199, 'Sci-Fi': 980, 'Horror': 978, 'Fantasy': 779, 'Children': 664, 'Animation': 611, 'Mystery': 573, 'Documentary': 440, 'War': 382, 'Musical': 334, 'Western': 167, 'IMAX': 158, 'Film-Noir': 87})


## Feature Engineering

We create the genres column. Each genre is represented as a separate binary feature. "1" indicates that the movie falls under a given genre, while "0" does not.

In [9]:
genres = list(genres_counts.keys())
for genre in genres:
    movies_df[genre] = movies_df['genres'].apply(lambda x: 1 if genre in x else 0)
movies_df.head()

Unnamed: 0,movieId,title,genres,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,...,Thriller,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]",1,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),"[Comedy, Romance]",0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]",0,0,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II (1995),[Comedy],0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Cosine Similarity Scores

In [10]:
# Extract movie features
movie_features = movies_df.drop(columns=['movieId', 'title', 'genres'])
movie_features.head()

Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir
0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [11]:
!pip install scikit-learn



In [12]:
from sklearn.metrics.pairwise import cosine_similarity
sim_scores = cosine_similarity(movie_features, movie_features)
sim_scores

array([[1.        , 0.77459667, 0.31622777, ..., 0.        , 0.31622777,
        0.4472136 ],
       [0.77459667, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.31622777, 0.        , 1.        , ..., 0.        , 0.        ,
        0.70710678],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.31622777, 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.4472136 , 0.        , 0.70710678, ..., 0.        , 0.        ,
        1.        ]])

create a map of movie title and their index

In [13]:
movie_idx = dict(zip(movies_df['title'], list(movies_df.index)))

In [14]:
#test movie index
title = 'Father of the Bride Part II (1995)'
idx = movie_idx[title] #expecting idx to be 4
idx

4

Let's get the top 5 most similar movies

In [15]:
n_similar_movies = 5
# Recalculate sim_scores after filtering
movie_features = movies_df.drop(columns=['movieId', 'title', 'genres'])
sim_scores = cosine_similarity(movie_features, movie_features)
sim_scores_list = [i for i in enumerate(sim_scores[idx])]
sim_scores_list = sorted(sim_scores_list, key=lambda x: x[1], reverse=True)
sim_scores_list = sim_scores_list[1:n_similar_movies+1]
sim_scores_list

[(17, np.float64(1.0)),
 (18, np.float64(1.0)),
 (58, np.float64(1.0)),
 (61, np.float64(1.0)),
 (79, np.float64(1.0))]

In [16]:
similar_movies = [i[0] for i in sim_scores_list]

In [17]:
print(f"Because you watched {title}:")
movies['title'].iloc[similar_movies]

Because you watched Father of the Bride Part II (1995):


Unnamed: 0,title
17,Four Rooms (1995)
18,Ace Ventura: When Nature Calls (1995)
58,Bio-Dome (1996)
61,Friday (1995)
79,Black Sheep (1996)


we will define function for the same

In [18]:
def get_content_based_recommendations(title_string, n_recommendations=5):
    idx = movie_idx.get(title_string)
    if idx is None:
        print(f"No movie found with title '{title_string}'")
        return
    sim_scores_list = list(enumerate(sim_scores[idx]))
    sim_scores_list = sorted(sim_scores_list, key=lambda x: x[1], reverse=True)
    sim_scores_list = sim_scores_list[1:(n_recommendations+1)]
    similar_movies = [i[0] for i in sim_scores_list]
    print(f"Recommendations for {title}:")
    print(movies['title'].iloc[similar_movies])

In [19]:
get_content_based_recommendations('Jumanji (1995)', 5)

Recommendations for Father of the Bride Part II (1995):
53             Indian in the Cupboard, The (1995)
109             NeverEnding Story III, The (1994)
767               Escape to Witch Mountain (1975)
1514    Darby O'Gill and the Little People (1959)
1556                          Return to Oz (1985)
Name: title, dtype: object


In [20]:
get_content_based_recommendations('Iron Man(1995)', 5)

No movie found with title 'Iron Man(1995)'


# Load the data

In [22]:
ratings_df = pd.read_csv('ratings.csv')
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [23]:
ratings_df.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


In [25]:
mean_ratings = ratings_df.groupby('movieId')[['rating']].mean()
lowest_rated = mean_ratings['rating'].idxmin()

# find movie Id with lowest average rating
movies[movies['movieId']==lowest_rated]

# retrieve ratings_df of the Lowest Rated Movie
ratings_df[ratings_df['movieId']==lowest_rated]

movies_df[movies_df['movieId']==lowest_rated]

Unnamed: 0,movieId,title,genres,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,...,Thriller,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir
2689,3604,Gypsy (1962),[Musical],0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [28]:
highest_rated = mean_ratings['rating'].idxmax()
movies_df[movies_df['movieId'] == highest_rated]


Unnamed: 0,movieId,title,genres,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,...,Thriller,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir
48,53,Lamerica (1994),"[Adventure, Drama]",1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [29]:
# retrieve ratings of the highestRated Movie
ratings_df[ratings_df['movieId']==highest_rated]

Unnamed: 0,userId,movieId,rating,timestamp
13368,85,53,5.0,889468268
96115,603,53,5.0,963180003


In [31]:
# Group Ratings by Movie and Compute Statistics:count and mean
movie_stats = ratings_df.groupby('movieId')['rating'].agg(['count', 'mean'])
movie_stats.head()

Unnamed: 0_level_0,count,mean
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,215,3.92093
2,110,3.431818
3,52,3.259615
4,7,2.357143
5,49,3.071429


define a function for average bayseian rating

In [34]:
# Average number of ratings for a given movie
C = movie_stats['count'].mean()
# Average rating for a given movie
m = movie_stats['mean'].mean()
def bayesian_avg(ratings):
    bayesian_avg = (C*m+ratings.sum())/(C+ratings.count())
    return round(bayesian_avg, 3)

In [35]:
# Groups the ratings by movieId.
# Applies the bayesian_avg function to each group of ratings
bayesian_avg_ratings = ratings_df.groupby('movieId')['rating'].agg(bayesian_avg).reset_index()

# renames columns
bayesian_avg_ratings.columns = ['movieId', 'bayesian_avg']
movie_stats = movie_stats.merge(bayesian_avg_ratings, on='movieId')

In [36]:
# Adds movie titles to the movie_stats DataFrame
movie_stats = movie_stats.merge(movies[['movieId', 'title']])

# orders movies by their Bayesian average rating from highest to lowest.
movie_stats.sort_values('bayesian_avg', ascending=False).head()

Unnamed: 0,movieId,count,mean,bayesian_avg,title
277,318,317,4.429022,4.392,"Shawshank Redemption, The (1994)"
659,858,192,4.289062,4.236,"Godfather, The (1972)"
2224,2959,218,4.272936,4.227,Fight Club (1999)
224,260,251,4.231076,4.193,Star Wars: Episode IV - A New Hope (1977)
46,50,204,4.237745,4.191,"Usual Suspects, The (1995)"


In [37]:
# orders movies by their Bayesian average rating from lowest to highest
movie_stats.sort_values('bayesian_avg', ascending=True).head()

Unnamed: 0,movieId,count,mean,bayesian_avg,title
1172,1556,19,1.605263,2.19,Speed 2: Cruise Control (1997)
2679,3593,19,1.657895,2.224,Battlefield Earth (2000)
1372,1882,33,1.954545,2.267,Godzilla (1998)
1144,1499,27,1.925926,2.297,Anaconda (1997)
1988,2643,16,1.6875,2.307,Superman IV: The Quest for Peace (1987)


In [40]:
from scipy.sparse import csr_matrix
import numpy as np

def create_X(df):
    M = df['userId'].nunique()
    N = df['movieId'].nunique()

    user_mapper = dict(zip(np.unique(df["userId"]), list(range(M))))
    movie_mapper = dict(zip(np.unique(df["movieId"]), list(range(N))))

    user_inv_mapper = dict(zip(list(range(M)), np.unique(df["userId"])))
    movie_inv_mapper = dict(zip(list(range(N)), np.unique(df["movieId"])))

    user_index = [user_mapper[i] for i in df['userId']]
    item_index = [movie_mapper[i] for i in df['movieId']]

    X = csr_matrix((df["rating"], (user_index,item_index)), shape=(M,N))

    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper

In [41]:
X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_X(ratings_df)

In [42]:
from sklearn.neighbors import NearestNeighbors

def find_similar_movies(movie_id, X, movie_mapper, movie_inv_mapper, k, metric='cosine'):
    X = X.T
    neighbour_ids = []

    movie_ind = movie_mapper[movie_id]
    movie_vec = X[movie_ind]
    if isinstance(movie_vec, (np.ndarray)):
        movie_vec = movie_vec.reshape(1,-1)
    # use k+1 since kNN output includes the movieId of interest
    kNN = NearestNeighbors(n_neighbors=k+1, algorithm="brute", metric=metric)
    kNN.fit(X)
    neighbour = kNN.kneighbors(movie_vec, return_distance=False)
    for i in range(0,k):
        n = neighbour.item(i)
        neighbour_ids.append(movie_inv_mapper[n])
    neighbour_ids.pop(0)
    return neighbour_ids

In [43]:
#  create a dictionary that maps movie IDs to titles:
movie_titles = dict(zip(movies_df['movieId'], movies_df['title']))

movie_id = 1

# find_similar_movies function to get similar movies for a specific movie ID using k-nearest neighbours
similar_movies = find_similar_movies(movie_id, X, movie_mapper, movie_inv_mapper, metric='cosine', k=10)
movie_title = movie_titles[movie_id]

print(f"Because you watched {movie_title}:")
for i in similar_movies:
    print(movie_titles[i])

Because you watched Toy Story (1995):
Toy Story 2 (1999)
Jurassic Park (1993)
Independence Day (a.k.a. ID4) (1996)
Star Wars: Episode IV - A New Hope (1977)
Forrest Gump (1994)
Lion King, The (1994)
Star Wars: Episode VI - Return of the Jedi (1983)
Mission: Impossible (1996)
Groundhog Day (1993)
