
## Movie Recommender System - Collaborative filtering vs Content - Based Filtering
Objective: Implement and compare collaborative filtering and content based filtering approaches for building a simple movie recommendation system.

Requirements:
1. Obtain a movie rating dataset with user-item interactions and movie attributes.
2. Implement:
3. Collaborative filtering using a recommendation technique like matrix factorization
4. Content - Based filtering using item similarity based on movie attributes (eg. genre, director, actors)
5. Evaluate and compare the recommendation accuracy of both approaches using appropiate metrics like precision - recall or mean squared error.

### Loading Dataset (movies.csv, and ratings.csv)

In [None]:
import pandas as pd
import numpy as np

ratings_data = pd.read_csv('ratings.csv')
movies_data = pd.read_csv('movies.csv')

In [None]:
ratings_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523


In [None]:
movies_data.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


### Shape of Dataset (Number of Rows and columns)

In [None]:
movies_data.shape

(10329, 3)

In [None]:
ratings_data.shape

(105339, 4)

### Checking for null values

In [None]:
movies_data.isnull().sum()

Unnamed: 0,0
movieId,0
title,0
genres,0


In [None]:
ratings_data.isnull().sum()

Unnamed: 0,0
userId,0
movieId,0
rating,0
timestamp,0


### Merging both datasets

In [None]:
movie_ratings = pd.merge(ratings_data, movies_data, on='movieId')

In [None]:
movie_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,16,4.0,1217897793,Casino (1995),Crime|Drama
1,1,24,1.5,1217895807,Powder (1995),Drama|Sci-Fi
2,1,32,4.0,1217896246,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
3,1,47,4.0,1217896556,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,4.0,1217896523,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


### Droping unnecessary column 'TimeStamp'

In [None]:
movie_ratings = movie_ratings.drop('timestamp', axis=1)

In [None]:
movie_ratings.head()

Unnamed: 0,userId,movieId,rating,title,genres
0,1,16,4.0,Casino (1995),Crime|Drama
1,1,24,1.5,Powder (1995),Drama|Sci-Fi
2,1,32,4.0,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
3,1,47,4.0,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,4.0,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [None]:
movie_ratings.shape

(105339, 5)

In [None]:
movie_ratings.isnull().sum()

Unnamed: 0,0
userId,0
movieId,0
rating,0
title,0
genres,0


In [None]:
movie_ratings['rating'].value_counts()

Unnamed: 0_level_0,count
rating,Unnamed: 1_level_1
4.0,28880
3.0,21729
5.0,14856
3.5,12237
4.5,8187
2.0,7943
2.5,5484
1.0,3258
1.5,1567
0.5,1198


### Content - Based Filtering

In [None]:
# Create a TF-IDF matrix based on movie genres

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

In [None]:
movies_data['genres'] = movies_data['genres'].fillna('')  # Handle missing values

In [None]:
tfidf_matrix = tfidf_vectorizer.fit_transform(movies_data['genres'])

In [None]:
tfidf_matrix

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 24176 stored elements and shape (10329, 23)>

#### Cosine - Similarity

In [None]:
# Calculate cosine similarity between movies based on TF-IDF matrix
from sklearn.metrics.pairwise import cosine_similarity
movie_similarity = cosine_similarity(tfidf_matrix)

In [None]:
movie_similarity

array([[1.        , 0.79977247, 0.1589222 , ..., 0.2638368 , 0.        ,
        0.        ],
       [0.79977247, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.1589222 , 0.        , 1.        , ..., 0.60235038, 0.        ,
        0.        ],
       ...,
       [0.2638368 , 0.        , 0.60235038, ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

#### Creating a recommender for given Movie Title

In [None]:
movie_title = 'Toy Story (1995)'
top_n = 5

In [None]:
# Get the index of the movie that matches the title
movie_idx = movies_data[movies_data['title'] == movie_title].index[0]
movie_idx

np.int64(0)

In [None]:
# Get the pairwise similarity scores of all movies with the input movie
sim_scores = list(enumerate(movie_similarity[movie_idx]))
sim_scores

[(0, np.float64(1.0000000000000002)),
 (1, np.float64(0.7997724667187793)),
 (2, np.float64(0.15892219564040727)),
 (3, np.float64(0.14283842105881417)),
 (4, np.float64(0.26383679738827265)),
 (5, np.float64(0.0)),
 (6, np.float64(0.15892219564040727)),
 (7, np.float64(0.6440557403962359)),
 (8, np.float64(0.0)),
 (9, np.float64(0.26043939215663503)),
 (10, np.float64(0.14283842105881417)),
 (11, np.float64(0.13957079840778813)),
 (12, np.float64(0.8399773464563209)),
 (13, np.float64(0.0)),
 (14, np.float64(0.2549153353119111)),
 (15, np.float64(0.0)),
 (16, np.float64(0.0)),
 (17, np.float64(0.26383679738827265)),
 (18, np.float64(0.26383679738827265)),
 (19, np.float64(0.09971739549091323)),
 (20, np.float64(0.12366640781124245)),
 (21, np.float64(0.0)),
 (22, np.float64(0.0)),
 (23, np.float64(0.0)),
 (24, np.float64(0.0)),
 (25, np.float64(0.0)),
 (26, np.float64(0.4613887581541079)),
 (27, np.float64(0.0)),
 (28, np.float64(0.37889727525454875)),
 (29, np.float64(0.0)),
 (30, np

In [None]:
len(sim_scores)

10329

In [None]:
# Sort the movies based on the similarity scores in descending order
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
sim_scores

[(0, np.float64(1.0000000000000002)),
 (1815, np.float64(1.0000000000000002)),
 (2496, np.float64(1.0000000000000002)),
 (2967, np.float64(1.0000000000000002)),
 (3166, np.float64(1.0000000000000002)),
 (3811, np.float64(1.0000000000000002)),
 (6617, np.float64(1.0000000000000002)),
 (6997, np.float64(1.0000000000000002)),
 (7382, np.float64(1.0000000000000002)),
 (7987, np.float64(1.0000000000000002)),
 (9215, np.float64(1.0000000000000002)),
 (9732, np.float64(1.0000000000000002)),
 (10052, np.float64(1.0000000000000002)),
 (1595, np.float64(0.9645673353083754)),
 (1675, np.float64(0.9645673353083754)),
 (2696, np.float64(0.9645673353083754)),
 (3420, np.float64(0.9645673353083754)),
 (3535, np.float64(0.9645673353083754)),
 (4314, np.float64(0.9645673353083754)),
 (4799, np.float64(0.9645673353083754)),
 (5539, np.float64(0.9645673353083754)),
 (6361, np.float64(0.9645673353083754)),
 (6526, np.float64(0.9645673353083754)),
 (7978, np.float64(0.9645673353083754)),
 (3379, np.float64

In [None]:
# Exclude the input movie itself from the recommendations and select the top N movies
sim_scores = sim_scores[1:top_n+1]
sim_scores

[(1815, np.float64(1.0000000000000002)),
 (2496, np.float64(1.0000000000000002)),
 (2967, np.float64(1.0000000000000002)),
 (3166, np.float64(1.0000000000000002)),
 (3811, np.float64(1.0000000000000002))]

In [None]:
# Get the indices of the recommended movies
movie_indices = []
for i in sim_scores:
  movie_indices.append(i[0])
movie_indices

[1815, 2496, 2967, 3166, 3811]

### Recommended Movies

In [None]:
# the titles of the recommended movies
movies_data.iloc[movie_indices]['title']

Unnamed: 0,title
1815,Antz (1998)
2496,Toy Story 2 (1999)
2967,"Adventures of Rocky and Bullwinkle, The (2000)"
3166,"Emperor's New Groove, The (2000)"
3811,"Monsters, Inc. (2001)"


In [None]:
movies_data.iloc[movie_indices]['genres']

Unnamed: 0,genres
1815,Adventure|Animation|Children|Comedy|Fantasy
2496,Adventure|Animation|Children|Comedy|Fantasy
2967,Adventure|Animation|Children|Comedy|Fantasy
3166,Adventure|Animation|Children|Comedy|Fantasy
3811,Adventure|Animation|Children|Comedy|Fantasy


### Collaborative Filtering

In [None]:
# Create a user-item matrix for collaborative filtering
user_item_matrix = movie_ratings.pivot_table(index='userId', columns='movieId', values='rating', fill_value=0)

In [None]:
user_item_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,144482,144656,144976,146344,146656,146684,146878,148238,148626,149532
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,3.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
664,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
665,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
666,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Apply Singular Value Decomposition (SVD)

from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=50, random_state=42)
user_factors = svd.fit_transform(user_item_matrix)
item_factors = svd.components_

In [None]:
user_factors

array([[ 1.99369387e+01, -1.91344385e+00,  9.13071106e+00, ...,
         1.26679322e+00,  2.06422064e+00, -1.28298604e+00],
       [ 3.55568280e+00, -2.95063913e+00,  5.61051779e-01, ...,
        -5.30200480e-01,  9.27299979e-01, -8.90894935e-02],
       [ 9.49540034e+00, -9.11391478e+00,  2.51772047e+00, ...,
        -4.75406008e-01,  1.08282982e-01, -4.27929264e-01],
       ...,
       [ 1.94055490e+01, -5.18626933e+00, -1.12641738e+00, ...,
        -4.06932038e-01,  1.53731035e+00,  2.72452156e+00],
       [ 1.17486184e+01, -1.23050007e+00,  5.42955643e+00, ...,
         3.32708259e-01, -3.99612941e-01, -2.28589614e+00],
       [ 1.20270997e+02,  5.69096162e+01, -1.51143410e+02, ...,
         3.39758736e-01, -9.08679504e-01,  3.46453035e-03]])

In [None]:
item_factors

array([[ 0.0730054 ,  0.03205883,  0.01316214, ...,  0.00048837,
         0.00375772,  0.00065116],
       [-0.03885783, -0.00679116, -0.01269436, ...,  0.00092091,
         0.00730768,  0.00122787],
       [ 0.04536831,  0.02020447, -0.0022826 , ..., -0.00088961,
        -0.01818232, -0.00118614],
       ...,
       [-0.0149857 ,  0.00890833,  0.00562226, ..., -0.00042573,
         0.00163354, -0.00056764],
       [-0.03057327,  0.01004794,  0.00278025, ..., -0.00082147,
         0.00718283, -0.0010953 ],
       [ 0.05543382,  0.02275147,  0.00483261, ...,  0.00118367,
        -0.00397919,  0.00157822]])

In [None]:
# Predict ratings using dot product of user and item factors
predicted_ratings = np.dot(user_factors, item_factors)

In [None]:
predicted_ratings

array([[ 2.18023730e+00,  4.61894837e-02,  1.82759606e-01, ...,
        -2.79656836e-02,  2.56916596e-02, -3.72875781e-02],
       [ 3.10247746e+00,  1.81900631e-01,  1.65498270e+00, ...,
        -7.43570376e-03, -3.72670081e-02, -9.91427168e-03],
       [ 8.26596108e-01,  8.33583179e-01,  5.78086597e-01, ...,
         4.16981102e-03, -2.25687480e-02,  5.55974803e-03],
       ...,
       [ 1.32683594e+00,  2.09840682e-02,  2.31963114e-01, ...,
         3.32827601e-02, -8.21598831e-02,  4.43770135e-02],
       [ 7.40233177e-01,  4.58245828e-01,  1.91050956e-01, ...,
        -3.56912125e-03,  1.04601203e-01, -4.75882833e-03],
       [ 2.89113923e+00,  3.06496505e+00,  2.03766814e+00, ...,
         1.23126617e-03,  4.49263327e+00,  1.64168822e-03]])

In [None]:
# Calculate MSE for predicted ratings

from sklearn.metrics import mean_squared_error
mse = mean_squared_error(user_item_matrix.values, predicted_ratings)
print("Mean Squared Error (MSE) for Collaborative Filtering:", mse)

Mean Squared Error (MSE) for Collaborative Filtering: 0.08567955034997762
