In [2]:
# Day 1: Data Loading & Exploration
import pandas as pd

# Load datasets
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

# Display basic info
print("Movies Data:")
print(movies.head())
print("\nRatings Data:")
print(ratings.head())

# # Merge both datasets
data = pd.merge(ratings, movies, on='movieId')
print("\nMerged Data:")
print(data.head())

# Check nulls
print("\nMissing Values:\n", data.isnull().sum())


Movies Data:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  

Ratings Data:
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931

Merged Data:
   userId  movieId  rating  timestamp                        title  \
0       1        1     4.0  964982703             Toy Story (1995

In [3]:
# Day 2: Data Preprocessing

# Remove missing values
data.dropna(inplace=True)

# Remove duplicates
data.drop_duplicates(inplace=True)

# Filter data for active users and popular movies
min_movie_ratings = 50
min_user_ratings = 50

filtered_movies = data['movieId'].value_counts() > min_movie_ratings
filtered_users = data['userId'].value_counts() > min_user_ratings

data = data[data['movieId'].isin(filtered_movies[filtered_movies].index)]
data = data[data['userId'].isin(filtered_users[filtered_users].index)]

print("Filtered data shape:", data.shape)


Filtered data shape: (36214, 6)


In [5]:
# Day 3: Collaborative Filtering Model

from sklearn.metrics.pairwise import cosine_similarity

# Create user-item matrix
user_movie_matrix = data.pivot_table(index='userId', columns='title', values='rating').fillna(0)

# Compute cosine similarity
movie_similarity = cosine_similarity(user_movie_matrix.T)
movie_similarity_df = pd.DataFrame(movie_similarity, index=user_movie_matrix.columns, columns=user_movie_matrix.columns)

# Function to recommend movies
def recommend_movies(movie_name, n=5):
    if movie_name not in movie_similarity_df:
        return "Movie not found!"
    similar_scores = movie_similarity_df[movie_name].sort_values(ascending=False)[1:n+1]
    return list(similar_scores.index)

# Example
print(recommend_movies("Toy Story (1995)"))


['Jurassic Park (1993)', 'Forrest Gump (1994)', 'Toy Story 2 (1999)', 'Star Wars: Episode IV - A New Hope (1977)', 'Shrek (2001)']


In [6]:
# Day 4: Content-Based Filtering

from sklearn.feature_extraction.text import CountVectorizer

# Handle missing genres
movies['genres'] = movies['genres'].fillna('')

# Vectorize genres
vectorizer = CountVectorizer(token_pattern='[a-zA-Z0-9\-]+')
genre_matrix = vectorizer.fit_transform(movies['genres'])

# Compute cosine similarity
genre_similarity = cosine_similarity(genre_matrix)

# Recommend by genre
genre_sim_df = pd.DataFrame(genre_similarity, index=movies['title'], columns=movies['title'])

def recommend_by_genre(movie_name, n=5):
    if movie_name not in genre_sim_df:
        return "Movie not found!"
    similar_movies = genre_sim_df[movie_name].sort_values(ascending=False)[1:n+1]
    return list(similar_movies.index)

print(recommend_by_genre("Toy Story (1995)"))


['Turbo (2013)', 'Monsters, Inc. (2001)', 'Asterix and the Vikings (Astérix et les Vikings) (2006)', 'The Good Dinosaur (2015)', 'Shrek the Third (2007)']
