In [1]:
# Setup imports and sys.path
import sys
import os

# The parent of the current working directory is put on sys.path so that the
# `src` package imported below can be resolved when the kernel is started
# from this notebook's folder.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Guard the append so the cell is idempotent: re-running it must not stack
# duplicate copies of the same directory onto sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

import pandas as pd
import numpy as np
import plotly.express as px
from src.utils import load_data
from src.content_based import ContentBasedRecommender

In [2]:
# Read the four tables returned by load_data from the ../data directory
movies, ratings, tags, links = load_data('../data')

# Preview every table the same way: its shape first, then its first rows
for shape_label, frame in (
    ("Movies shape:", movies),
    ("Ratings shape:", ratings),
    ("Tags shape:", tags),
    ("Links shape:", links),
):
    print(shape_label, frame.shape)
    display(frame.head())

Movies shape: (9742, 3)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy


Ratings shape: (100836, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


Tags shape: (3683, 4)


Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


Links shape: (9742, 3)


Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [3]:
# Create the content-based recommender and fit it on the movies table
recommender = ContentBasedRecommender()
recommender.fit(movies)

# Print the dimensions of both fitted matrices as a quick sanity check
tfidf_shape = recommender.tfidf_matrix.shape
print("Model trained. TF-IDF matrix shape:", tfidf_shape)
similarity_shape = recommender.similarity_matrix.shape
print("Similarity matrix shape:", similarity_shape)

Model trained. TF-IDF matrix shape: (9742, 24)
Similarity matrix shape: (9742, 9742)


In [4]:
# Seed movie and number of recommendations to request
movie_title = "Toy Story (1995)"
top_n = 10

# Ask the fitted recommender for its top matches for the seed movie
scored_matches = recommender.recommend_movies(movie_title, top_n=top_n)

# Attach title and genres to each match by joining on movieId
movie_details = movies[['movieId', 'title', 'genres']]
recommendations = scored_matches.merge(movie_details, on='movieId')

print(f"Top {top_n} recommendations for '{movie_title}':")
display(recommendations[['title', 'genres', 'content_score']])

Top 10 recommendations for 'Toy Story (1995)':


Unnamed: 0,title,genres,content_score
0,Antz (1998),Adventure Animation Children Comedy Fantasy,1.0
1,Toy Story 2 (1999),Adventure Animation Children Comedy Fantasy,1.0
2,"Adventures of Rocky and Bullwinkle, The (2000)",Adventure Animation Children Comedy Fantasy,1.0
3,"Emperor's New Groove, The (2000)",Adventure Animation Children Comedy Fantasy,1.0
4,"Monsters, Inc. (2001)",Adventure Animation Children Comedy Fantasy,1.0
5,"Wild, The (2006)",Adventure Animation Children Comedy Fantasy,1.0
6,Shrek the Third (2007),Adventure Animation Children Comedy Fantasy,1.0
7,"Tale of Despereaux, The (2008)",Adventure Animation Children Comedy Fantasy,1.0
8,Asterix and the Vikings (Astérix et les Viking...,Adventure Animation Children Comedy Fantasy,1.0
9,Turbo (2013),Adventure Animation Children Comedy Fantasy,1.0


In [5]:
# Visualization 1: bar chart of the similarity score of each recommended title
axis_labels = {'title': 'Movie Title', 'content_score': 'Similarity Score'}
fig_scores = px.bar(
    recommendations,
    x='title',
    y='content_score',
    color='content_score',
    color_continuous_scale='Viridis',
    labels=axis_labels,
    title=f"Recommendation Scores for '{movie_title}'",
)

# Rotate the x tick labels so long movie titles do not overlap.
# NOTE(review): showlegend=False only toggles the legend; with a continuous
# color scale the colorbar is governed by coloraxis_showscale instead.
# Confirm whether hiding the colorbar was the actual intent.
fig_scores.update_layout(showlegend=False, xaxis_tickangle=45)
fig_scores.show()



In [6]:
# Visualization 2: pie chart of how often each genre appears in the recommendations
genre_tokens = []
for genre_string in recommendations['genres'].str.split(' '):
    # Splitting on a single space can yield empty strings; skip those
    genre_tokens.extend(token for token in genre_string if token)

# Tally occurrences per genre across all recommended movies
genre_counts = pd.Series(genre_tokens).value_counts()

fig_genres = px.pie(
    values=genre_counts.values,
    names=genre_counts.index,
    color_discrete_sequence=px.colors.qualitative.Pastel,
    title="Genre Distribution in Recommendations",
)
fig_genres.show()