In [None]:
import pandas as pd

# Load TMDB datasets
tmdb_movies = pd.read_csv("../data/tmdb_5000_movies.csv")
tmdb_credits = pd.read_csv("../data/tmdb_5000_credits.csv")

# Load MovieLens ratings
ratings = pd.read_csv("../data/ratings.csv")

print("TMDB Movies:", tmdb_movies.shape)
print("TMDB Credits:", tmdb_credits.shape)
print("Ratings:", ratings.shape)

# Merge TMDB movies and credits on movie ID. merge() defaults to an inner
# join, so only movies that have a matching credits row are kept.
movies = tmdb_movies.merge(tmdb_credits, left_on="id", right_on="movie_id")

movies.head(2)

# Select relevant columns.
# .copy() makes `movies` an independent frame: later steps assign new values
# into its columns, and assigning into a bare column selection triggers
# pandas' SettingWithCopyWarning (pandas < 3).
movies = movies[['id','original_title','overview','genres','keywords','cast','crew']].copy()
movies.head(2)

# Convert JSON-like text to lists
import ast

def extract_names(obj):
    """Return the normalized ``name`` of every entry in a list-like field.

    Parameters
    ----------
    obj : str or list
        Either the raw text of a Python/JSON-style list of dicts (e.g.
        ``'[{"id": 28, "name": "Action"}]'``) or a list that has already been
        parsed. Already-normalized strings inside a list are accepted too, so
        applying this function twice to the same column does not wipe it.

    Returns
    -------
    list of str
        Each name lower-cased with spaces removed. Unparseable input
        (including NaN/None) yields ``[]``; entries without a string name
        are skipped.
    """
    if isinstance(obj, str):
        try:
            items = ast.literal_eval(obj)
        # The exception types ast.literal_eval documents for malformed input.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            return []
    else:
        items = obj

    if not isinstance(items, (list, tuple)):
        return []

    names = []
    for item in items:
        name = item.get('name') if isinstance(item, dict) else item
        if isinstance(name, str):
            names.append(name.lower().replace(" ", ""))
    return names
# Parse the genre and keyword columns into lists of normalized names
for column in ('genres', 'keywords'):
    movies[column] = movies[column].apply(extract_names)

movies.loc[:, ['original_title', 'genres', 'keywords']].head(2)

#   Extract top 3 cast members
def get_top3_cast(obj, limit=3):
    """Return the normalized names of the first ``limit`` cast entries.

    Parameters
    ----------
    obj : str or list
        Either the raw text of a Python/JSON-style list of dicts with a
        ``name`` key, or an already-parsed list. Already-normalized strings
        inside a list are accepted too, so applying this function twice to
        the same column does not wipe it.
    limit : int, optional
        Number of leading entries to consider (default 3). Expected to be
        non-negative.

    Returns
    -------
    list of str
        Names lower-cased with spaces removed, in the order they appear in
        the input. Unparseable input (including NaN/None) yields ``[]``;
        entries without a string name are skipped.
    """
    if isinstance(obj, str):
        try:
            items = ast.literal_eval(obj)
        # The exception types ast.literal_eval documents for malformed input.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            return []
    else:
        items = obj

    if not isinstance(items, (list, tuple)):
        return []

    names = []
    for item in items[:limit]:
        name = item.get('name') if isinstance(item, dict) else item
        if isinstance(name, str):
            names.append(name.lower().replace(" ", ""))
    return names
    
# Replace the raw cast field with the names of its leading cast entries
top_billed = movies['cast'].apply(get_top3_cast)
movies['cast'] = top_billed
movies.loc[:, ['original_title', 'cast']].head(2)

#   Extract director from crew
def get_director(obj):
    """Return the normalized name of the first crew entry whose job is 'Director'.

    Parameters
    ----------
    obj : str or list
        Either the raw text of a Python/JSON-style list of dicts with ``job``
        and ``name`` keys, or an already-parsed list of such dicts.

    Returns
    -------
    list of str
        A one-element list holding the director's name, lower-cased with
        spaces removed, or ``[]`` when the input cannot be parsed (including
        NaN/None) or contains no director with a string name.
    """
    if isinstance(obj, str):
        try:
            items = ast.literal_eval(obj)
        # The exception types ast.literal_eval documents for malformed input.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            return []
    else:
        items = obj

    if not isinstance(items, (list, tuple)):
        return []

    for member in items:
        # .get() keeps one crew entry without a 'job' key from hiding a
        # director that appears later in the list.
        if not isinstance(member, dict) or member.get('job') != 'Director':
            continue
        name = member.get('name')
        if isinstance(name, str):
            return [name.lower().replace(" ", "")]
    return []

# Director column: get_director returns a list per movie (empty when no
# director entry is found)
director_names = movies['crew'].apply(get_director)
movies['director'] = director_names
movies.loc[:, ['original_title', 'director']].head(2)

#   Process overview text: missing overviews become "", then each overview is
#   lower-cased and split on whitespace into a list of tokens
overview_text = movies['overview'].fillna("")
movies['overview'] = overview_text.apply(lambda text: text.lower().split())
movies.loc[:, ['original_title', 'overview']].head(2)

#   Create movie profile by combining all features: the per-movie token lists
#   are concatenated in this column order and joined into one string
profile_columns = ['overview', 'genres', 'keywords', 'cast', 'director']
profile_tokens = movies[profile_columns[0]]
for column in profile_columns[1:]:
    profile_tokens = profile_tokens + movies[column]
movies['movie_profile'] = profile_tokens.apply(" ".join)
movies.loc[:, ['original_title', 'movie_profile']].head(2)

#   Final selection of columns
final_movies = movies[['id', 'original_title', 'movie_profile']]
final_movies.head()

# Vectorize movie profiles
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts: English stop words removed, vocabulary capped at the
# 5000 most frequent terms
cv = CountVectorizer(stop_words='english', max_features=5000)
term_counts = cv.fit_transform(final_movies['movie_profile'])
# NOTE(review): the dense copy is kept because `vectors` may be used as an
# ndarray elsewhere; cosine_similarity itself also accepts the sparse matrix.
vectors = term_counts.toarray()

print(vectors.shape)

# Calculate cosine similarity between every pair of movie vectors
from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity(vectors)
print(similarity.shape)

#   Create a mapping from movie titles to row positions
# Positions (not index labels) are stored because the lookup result is used
# to index the similarity matrix and with .iloc.
movie_index = pd.Series(range(len(final_movies)),
                        index=final_movies['original_title'])
# Keep only the first row for each title. Series.drop_duplicates() would
# compare the stored positions, which are all distinct, so repeated titles
# would stay in the index and a lookup would return a Series instead of a
# single position.
movie_index = movie_index[~movie_index.index.duplicated(keep='first')]
movie_index["Inception"]

#   Recommendation function
def recommend(movie, top_n=10):
    """Return the titles of the movies most similar to ``movie``.

    Reads the module-level ``movie_index`` (title -> row position),
    ``similarity`` (square matrix of pairwise scores) and ``final_movies``.

    Parameters
    ----------
    movie : str
        Title to look up in ``movie_index``.
    top_n : int, optional
        Maximum number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series or None
        The ``original_title`` values of the ``top_n`` highest-scoring other
        movies, best first. When the title is unknown a message is printed
        and ``None`` is returned.
    """
    # Get index of the movie
    idx = movie_index.get(movie)

    if idx is None:
        print("Movie not found in dataset")
        return

    # A title that occurs more than once makes .get() return a Series of
    # positions; use the first occurrence.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Similarity scores of this movie against every movie
    scores = similarity[idx]

    # Exclude the query movie by position rather than assuming it sorts
    # first: a tie at the top score (or an all-zero profile) would otherwise
    # drop the wrong row and recommend the movie itself.
    candidates = [i for i in range(len(scores)) if i != idx]

    # sorted() is stable, so equal scores keep ascending row order
    ranked = sorted(candidates, key=lambda i: scores[i], reverse=True)[:top_n]

    return final_movies['original_title'].iloc[ranked]

# Sample look-ups; the two printed results are separated by one blank line
recommend("Inception")
print(recommend("The Dark Knight"), end="\n\n")
print(recommend("The Godfather"))



In [4]:
import pandas as pd

# Load the ratings table
ratings = pd.read_csv("../data/ratings.csv")
ratings.head()

# Basic size statistics
n_ratings = len(ratings)
n_users = ratings['userId'].nunique()
n_movies = ratings['movieId'].nunique()

print("Total ratings:", n_ratings)
print("Unique users:", n_users)
print("Unique movies:", n_movies)
print("Average ratings per user:", n_ratings / n_users)
print("Average ratings per movie:", n_ratings / n_movies)


# Users as rows, movies as columns. Each cell holds the mean rating
# (pivot_table's default aggregation); unrated pairs are NaN.
user_movie_matrix = pd.pivot_table(
    ratings,
    values='rating',
    index='userId',
    columns='movieId'
)

print(user_movie_matrix.shape)
user_movie_matrix.head()

# Percentage of cells in the matrix that have no rating
missing = user_movie_matrix.isna().sum().sum()
total = user_movie_matrix.size
sparsity_pct = round((missing/total) * 100, 2)
print(f"Sparsity: {sparsity_pct}%")


Total ratings: 100836
Unique users: 610
Unique movies: 9724
Average ratings per user: 165.30491803278687
Average ratings per movie: 10.369806663924312
(610, 9724)
Sparsity: 98.3%


In [5]:
# Load the id-mapping table (movieId, imdbId and tmdbId columns).
# NOTE(review): tmdbId is displayed as a float (e.g. 862.0), presumably
# because some rows have no TMDB id - confirm before joining on it.
links = pd.read_csv(filepath_or_buffer="../data/links.csv")
links.head(n=5)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0
