In [40]:
# Upload your local movies.csv and ratings.csv files
import re
from pathlib import Path

from google.colab import files

uploaded = files.upload()

# Colab does not overwrite an existing file: uploading "ratings.csv" again is
# saved under a suffixed name such as "ratings (3).csv", so anything that reads
# "ratings.csv" would keep loading the older copy. Write every uploaded payload
# back under its un-suffixed name so the fixed filenames hold the latest upload.
for saved_name, payload in uploaded.items():
    canonical_name = re.sub(r" \(\d+\)(?=\.[^.]+$)", "", saved_name)
    Path(canonical_name).write_bytes(payload)


Saving ratings.csv to ratings (3).csv
Saving movies.csv to movies (4).csv


In [41]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer


In [42]:
# Read both CSV files from the working directory into DataFrames
movies = pd.read_csv("movies.csv")
ratings = pd.read_csv("ratings.csv")

# Print the first rows of each table as a quick sanity check
print(f"Movies:\n {movies.head()}")
print(f"\nRatings:\n {ratings.head()}")


Movies:
    movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  

Ratings:
    userId  movieId  rating   timestamp
0       1       16     4.0  1217897793
1       1       24     1.5  1217895807
2       1       32     4.0  1217896246
3       1       47     4.0  1217896556
4       1       50     4.0  1217896523


In [43]:
# Reshape the ratings into a table with one row per userId and one column per
# movieId; user/movie pairs without a rating are filled with 0
user_movie_matrix = (
    ratings
    .pivot_table(values='rating', index='userId', columns='movieId')
    .fillna(0)
)

print(f"User-Movie Matrix shape: {user_movie_matrix.shape}")


User-Movie Matrix shape: (668, 10325)


In [44]:
# Pairwise cosine similarity between the rating rows of all users
# (the basis of the collaborative-filtering step)
user_similarity = cosine_similarity(user_movie_matrix)

# Label both axes with the user ids so a similarity can be looked up by id
user_similarity_df = pd.DataFrame(
    data=user_similarity,
    index=user_movie_matrix.index,
    columns=user_movie_matrix.index,
)

print(f"User similarity matrix (sample):\n {user_similarity_df.iloc[:5, :5]}")


User similarity matrix (sample):
 userId         1         2         3         4         5
userId                                                  
1       1.000000  0.101113  0.210044  0.128766  0.057896
2       0.101113  1.000000  0.115559  0.034610  0.032705
3       0.210044  0.115559  1.000000  0.058208  0.044426
4       0.128766  0.034610  0.058208  1.000000  0.019298
5       0.057896  0.032705  0.044426  0.019298  1.000000


In [45]:
# Prepare genres for vectorization: turn the '|' separators into spaces.
# regex=False forces a literal match; as a regular expression a bare '|' is an
# alternation of two empty patterns, and the default of `regex` differs between
# pandas versions.
movies['genres'] = movies['genres'].str.replace('|', ' ', regex=False)

# Convert genres to numerical vectors using Bag-of-Words.
# The default token pattern keeps only runs of 2+ word characters, so a
# hyphenated genre such as "Sci-Fi" would be split into two tokens and weigh
# twice as much as any other genre. Treat each whitespace-separated chunk as
# one token instead.
vectorizer = CountVectorizer(token_pattern=r"\S+")
genre_matrix = vectorizer.fit_transform(movies['genres'])

# Compute cosine similarity between movies (row/column i <-> i-th row of movies)
genre_similarity = cosine_similarity(genre_matrix)

print("Genre similarity shape:", genre_similarity.shape)


Genre similarity shape: (10329, 10329)


In [46]:
# Recommend similar movies based on genres
def get_content_based_recommendations(movie_id, top_n=5):
    """Return the ids of the movies whose genres are most similar to a movie.

    Reads the module-level ``movies`` frame and ``genre_similarity`` matrix;
    row ``i`` of the matrix is taken to describe the ``i``-th row of ``movies``.

    Args:
        movie_id: Value to look up in ``movies['movieId']``.
        top_n: Maximum number of recommendations to return.

    Returns:
        A list of at most ``top_n`` movieId values ordered from most to least
        similar (ties keep the row order of ``movies``). The queried movie's
        own row is never part of the result.

    Raises:
        IndexError: If ``movie_id`` does not occur in ``movies['movieId']``.
    """
    # Work with the row *position* rather than the index label, so the lookup
    # into genre_similarity stays correct even if movies has a non-default index.
    row_positions = np.flatnonzero((movies['movieId'] == movie_id).to_numpy())
    if row_positions.size == 0:
        raise IndexError(f"movie_id {movie_id!r} not found in movies['movieId']")
    movie_pos = row_positions[0]

    similarities = np.asarray(genre_similarity[movie_pos])
    # Stable descending order: equal scores stay in ascending row order.
    ranked = np.argsort(-similarities, kind='stable')

    # Remove the movie itself explicitly. Dropping "the first entry" is wrong
    # whenever an earlier row has identical genres (similarity 1.0 as well):
    # that row sorts first and the queried movie would be recommended to itself.
    ranked = ranked[ranked != movie_pos][:max(top_n, 0)]
    return movies['movieId'].iloc[ranked].tolist()


In [47]:
def get_collaborative_recommendations(user_id, top_n=5):
    """Recommend unseen movies for a user from the ratings of similar users.

    Reads the module-level ``user_similarity_df`` and ``user_movie_matrix``.
    Each movie is scored as ``sum(similarity * rating) / sum(similarity)`` over
    all other users, using the matrix values as they are stored.

    Args:
        user_id: Label of the user in ``user_similarity_df`` /
            ``user_movie_matrix``.
        top_n: Maximum number of movie ids to return.

    Returns:
        A list of at most ``top_n`` movie ids (column labels of
        ``user_movie_matrix``) with the highest scores, excluding movies the
        user already has a rating > 0 for. Returns ``[]`` when the user has no
        neighbours or the similarities sum to 0, because no score can be
        computed in that case.

    Raises:
        KeyError: If ``user_id`` is not present in the similarity matrix.
    """
    # Similarity to every other user (the user's own entry is removed)
    neighbours = user_similarity_df[user_id].drop(user_id)

    # Only keep users that actually have a row in the rating matrix
    neighbours = neighbours[neighbours.index.isin(user_movie_matrix.index)]

    # Without any similarity mass the division below would be 0/0 and every
    # score NaN, which would yield arbitrary "recommendations".
    total_similarity = neighbours.sum()
    if total_similarity == 0:
        return []

    # Similarity-weighted sum of the neighbours' ratings, per movie
    weighted_ratings = user_movie_matrix.loc[neighbours.index].T.dot(neighbours)
    scores = weighted_ratings / total_similarity

    # Remove movies the user has already rated
    own_ratings = user_movie_matrix.loc[user_id]
    scores = scores.drop(own_ratings[own_ratings > 0].index, errors='ignore')

    # Return top N movie IDs
    return scores.sort_values(ascending=False).head(top_n).index.tolist()


In [48]:
# Demo: produce hybrid recommendations for a single user and display them
# NOTE(review): hybrid_recommendation is not defined in any cell visible here;
# confirm it is defined earlier in the notebook so Restart & Run All succeeds.
user_id = 1  # Change this to test other users
recommendations = hybrid_recommendation(user_id)

print(f"Top movie recommendations for user {user_id}")
print(recommendations)


Top movie recommendations for user 1
                       title                                       genres
70          Screamers (1995)                       Action Sci-Fi Thriller
147   Johnny Mnemonic (1995)                       Action Sci-Fi Thriller
637      Phantom, The (1996)                             Action Adventure
1815             Antz (1998)  Adventure Animation Children Comedy Fantasy
2496      Toy Story 2 (1999)  Adventure Animation Children Comedy Fantasy
