### Movie Recommender

Import Libraries

In [36]:
import pandas as pd
import numpy as np

from surprise import Dataset, Reader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import IncrementalPCA
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns


Import Dataset

https://grouplens.org/datasets/movielens/20m/

In [2]:
movies = pd.read_csv('Data/movies.csv')
ratings = pd.read_csv('Data/ratings.csv')
tags = pd.read_csv('Data/tags.csv')

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [5]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,18,4141,Mark Waters,1240597180
1,65,208,dark hero,1368150078
2,65,353,dark hero,1368150079
3,65,521,noir thriller,1368149983
4,65,592,dark hero,1368150078


Merge Movies and Tags

In [6]:
# Merging movies and tags DataFrames on 'movieId'
metadata = pd.merge(movies, tags, on='movieId')
metadata.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1644,Watched,1417736680
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1741,computer animation,1183903155
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1741,Disney animated feature,1183933307
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1741,Pixar animation,1183934770
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1741,TÃ©a Leoni does not star in this movie,1245093573


In [10]:
# Combine all tags for each movie into a single string
metadata['all_tags'] = metadata.groupby('movieId')['tag'].transform(lambda x: ' '.join(x.astype(str)))
metadata = metadata.drop_duplicates(subset='movieId')

Build Tfidf Vectorizer and TruncatedSVD for Content Filter - Latent Matrix 1

In [11]:
# Initialize TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english')

# Fit and transform the data
tfidf_matrix = tfidf.fit_transform(metadata['all_tags'])

# Initialize TruncatedSVD
svd = TruncatedSVD(n_components=50)

# Fit and transform the tfidf matrix
latent_matrix_1 = svd.fit_transform(tfidf_matrix)

# Display the shape of the latent matrix
latent_matrix_1.shape

(19545, 50)

Using Tfidfvectorizer and TruncatedSVD as a recommender system

In [16]:
cosine_sim = cosine_similarity(latent_matrix_1, latent_matrix_1)

In [59]:
def get_movie_recommendations_content(movie_title, metadata, cosine_sim, num_recommendations=5):
    # Get the index of the movie that matches the title
    idx = metadata[metadata['title'] == movie_title].index[0]
    
    # Get the pairwise similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the scores of the most similar movies
    sim_scores = sim_scores[1:num_recommendations+1]
    
    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]
    
    # Return only movie titles as a list
    return metadata['title'].iloc[movie_indices].tolist()


In [60]:
try:
    recommendations = get_movie_recommendations_content('Toy Story (1995)', metadata, cosine_sim)
    print(recommendations)
except IndexError as e:
    print(e)

['Aladdin (1992)', 'Alien³ (a.k.a. Alien 3) (1992)', 'Hercules (1997)', 'Mulan (1998)', 'Thing, The (1982)']


Create Collaborative Filter on User-Movie Matrix

In [22]:
print(ratings.isnull().sum())
ratings['userId'] = ratings['userId'].astype(int)
ratings['movieId'] = ratings['movieId'].astype(int)
ratings['rating'] = ratings['rating'].astype(float)

userId     0
movieId    0
rating     0
dtype: int64


In [None]:
# Drop any rows with missing values
ratings = ratings.dropna(subset=['userId', 'movieId', 'rating'])

In [40]:
# Get unique users and movies
user_ids = ratings['userId'].unique()
movie_ids = ratings['movieId'].unique()

# Create a mapping for users and movies to a 0-based index (for efficient matrix operations)
user_to_index = {user_id: i for i, user_id in enumerate(user_ids)}
movie_to_index = {movie_id: i for i, movie_id in enumerate(movie_ids)}
index_to_movie = {i: movie_id for movie_id, i in movie_to_index.items()}

In [43]:
# Map the userId and movieId to indices
ratings['user_idx'] = ratings['userId'].map(user_to_index)
ratings['movie_idx'] = ratings['movieId'].map(movie_to_index)
movie_id_to_title = dict(zip(movies['movieId'], movies['title']))

In [34]:
# Create a sparse matrix (rows = users, columns = movies)
user_movie_matrix = csr_matrix(
    (ratings['rating'], (ratings['user_idx'], ratings['movie_idx'])),
    shape=(len(user_ids), len(movie_ids))
)

# Print sparse matrix shape
print("Sparse Matrix Shape:", user_movie_matrix.shape)

Sparse Matrix Shape: (138493, 26744)


In [37]:
# Fit NearestNeighbors model on sparse user-movie matrix
knn = NearestNeighbors(metric='cosine', algorithm='brute')
knn.fit(user_movie_matrix)

In [55]:
def get_movie_recommendations_collab(user_id, num_recommendations=5):
    if user_id not in user_to_index:
        return "User not found."
    
    user_index = user_to_index[user_id]
    
    # Find similar users
    distances, indices = knn.kneighbors(user_movie_matrix[user_index], n_neighbors=6)  # Include the user itself

    similar_users = indices.flatten()[1:]  # Exclude the user itself
    recommended_movies = set()

    # Collect movies watched by similar users
    for sim_user in similar_users:
        user_ratings = user_movie_matrix[sim_user].toarray().flatten()
        recommended_movie_indices = np.where(user_ratings > 0)[0]  # Movies rated by the similar user
        recommended_movies.update(recommended_movie_indices)

    # Remove movies the user has already seen
    user_rated_movies = set(np.where(user_movie_matrix[user_index].toarray().flatten() > 0)[0])
    recommended_movies -= user_rated_movies

    # Get the top `num_recommendations` movies
    recommended_movies = list(recommended_movies)[:num_recommendations]

    # Convert movie indices to movie titles
    return [movie_id_to_title[index_to_movie[movie_idx]] for movie_idx in recommended_movies if index_to_movie[movie_idx] in movie_id_to_title]

In [None]:
print(get_movie_recommendations_collab(1))

['Porco Rosso (Crimson Pig) (Kurenai no buta) (1992)', 'Charly (1968)', 'Monty Python Live at the Hollywood Bowl (1982)', 'Batman & Robin (1997)', 'Silent Hill (2006)']


In [46]:
k = 20  

# Apply Truncated SVD
svd = TruncatedSVD(n_components=k)
latent_matrix_2 = svd.fit_transform(user_movie_matrix)

# Print shape of the new latent matrix
print("Latent Matrix 2 Shape:", latent_matrix_2.shape)

Latent Matrix 2 Shape: (138493, 20)


### Hybrid Recommendation System

1. Popularity Based
2. Content Filtering
3. Collabrative Filtering
4. Matrix Factorization

In [48]:
def get_popular_movies(movies, ratings, num_recommendations=5):
    popular_movies = ratings.groupby('movieId').agg({'rating': 'mean', 'userId': 'count'}).reset_index()
    popular_movies = popular_movies[popular_movies['userId'] > 50]  # Filter out movies with few ratings
    popular_movies = popular_movies.sort_values(by='rating', ascending=False)

    return movies[movies['movieId'].isin(popular_movies['movieId'].head(num_recommendations))]


In [50]:
# Ensure both matrices have the same number of rows (movies)
min_rows = min(latent_matrix_1.shape[0], latent_matrix_2.shape[0])

# Combine both latent representations
latent_matrix_combined = np.concatenate(
    (latent_matrix_1[:min_rows], latent_matrix_2[:min_rows]), 
    axis=1
)

# Compute cosine similarity on the combined latent matrix
cosine_sim_combined = cosine_similarity(latent_matrix_combined)

In [51]:
def get_matrix_factorization_recommendations(movie_title, metadata, num_recommendations=5):
    if movie_title not in metadata['title'].values:
        return []

    movie_idx = metadata[metadata['title'] == movie_title].index[0]
    
    # Compute similarity scores
    sim_scores = list(enumerate(cosine_sim_combined[movie_idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:num_recommendations+1]
    
    # Get the top similar movies
    movie_indices = [i[0] for i in sim_scores]
    
    return metadata['title'].iloc[movie_indices].tolist()

Popularity Recommender

In [65]:
popular_movies = get_popular_movies(movies, ratings, num_recommendations=5)
print(popular_movies['title'].tolist())

['Usual Suspects, The (1995)', 'Shawshank Redemption, The (1994)', "Schindler's List (1993)", 'Godfather, The (1972)', 'Godfather: Part II, The (1974)']


Content-based filtering

In [None]:
# Example: Get 5 movies similar to "Toy Story"
content_recommendations = get_movie_recommendations_content("Toy Story (1995)", metadata, cosine_sim, num_recommendations=5)
print(content_recommendations)

['Aladdin (1992)', 'Alien³ (a.k.a. Alien 3) (1992)', 'Hercules (1997)', 'Mulan (1998)', 'Thing, The (1982)']


In [70]:
# Example: Get 5 movies similar to "Schindler's List (1993)"
content_recommendations = get_movie_recommendations_content("Shawshank Redemption, The (1994)", metadata, cosine_sim, num_recommendations=5)
print(content_recommendations)

['Merry Gentleman, The (2008)', 'Revenge of Frankenstein, The (1958)', 'Vampire Lovers, The (1970)', 'Dracula: Prince of Darkness (1966)', 'Dracula Has Risen from the Grave (1968)']


Collaborative Filtering

In [None]:
# Example: Get 5 recommendations for user with ID 1
collab_recommendations = get_movie_recommendations_collab(1, num_recommendations=5)
print(collab_recommendations)

['Porco Rosso (Crimson Pig) (Kurenai no buta) (1992)', 'Charly (1968)', 'Monty Python Live at the Hollywood Bowl (1982)', 'Batman & Robin (1997)', 'Silent Hill (2006)']


In [71]:
# Example: Get 5 recommendations for user with ID 10
collab_recommendations = get_movie_recommendations_collab(10, num_recommendations=5)
print(collab_recommendations)

['Ruthless People (1986)', 'Peggy Sue Got Married (1986)', 'Terminator 2: Judgment Day (1991)', 'Wizard of Oz, The (1939)', 'Platoon (1986)']


Matrix Factorization

In [None]:
# Example: Get 5 recommendations for "Toy Story" using Matrix Factorization
mf_recommendations = get_matrix_factorization_recommendations("Toy Story (1995)", metadata, num_recommendations=5)
print(mf_recommendations)

['Warped Ones, The (Kyonetsu no kisetsu) (1960)', 'Salt of the Earth (1954)', 'Ornamental Hairpin (Kanzashi) (1941)', "Wesley Willis: The Daddy of Rock 'n' Roll (2003)", 'Made in Jamaica (2006)']


In [72]:
# Example: Get 5 recommendations for "Usual Suspects, The (1995)" using Matrix Factorization
mf_recommendations = get_matrix_factorization_recommendations("Usual Suspects, The (1995)", metadata, num_recommendations=5)
print(mf_recommendations)

['Italian, The (Italianetz) (2005)', 'Macbeth (1948)', 'People That Time Forgot, The (1977)', 'Stormy Monday (1988)', 'Sexy Beast (2000)']
