In [6]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Location of the MovieLens-style movies table (columns: movieId, title, genres).
# NOTE(review): this is an absolute, machine-specific path; adjust it when running elsewhere.
file_path = "C:/TYBCS/archive/movies.csv"

# Read the CSV into a DataFrame and preview its first five rows.
movies = pd.read_csv(filepath_or_buffer=file_path)
movies.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


We need to handle missing data before processing.


In [4]:
# Count the missing entries in each column (axis=0 reduces over rows, one total per column).
movies.isnull().sum(axis=0)


movieId    0
title      0
genres     0
dtype: int64

The check above found no missing values, but as a safeguard we still fill any missing values in the 'genres' column with an empty string.

In [19]:
# Replace any NaN genre with '' so every row holds a string before tokenization.
# This overwrites the 'genres' column of `movies` in place.
movies['genres'] = movies['genres'].fillna('')
# Bare last expression: the notebook renders the frame as this cell's output.
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


We will use `CountVectorizer` to tokenize genres and convert them into a numerical format.

In [24]:
# Each movie's genres arrive as a single '|'-separated string, so tokenize by splitting
# on '|' rather than with the default word regex; token_pattern=None marks that regex
# as intentionally unused.
# Empty pieces are discarded: ''.split('|') yields [''], which would otherwise turn the
# '' placeholder used for missing genres into a vocabulary entry of its own and make
# every genre-less movie look identical to every other one.
count_vectorizer = CountVectorizer(
    tokenizer=lambda genres: [genre for genre in genres.split('|') if genre],
    token_pattern=None,
)

# Rows are movies, columns are distinct genre tokens, values are token counts.
genre_matrix = count_vectorizer.fit_transform(movies['genres'])
genre_matrix.shape

(9742, 20)

We calculate cosine similarity to measure how similar each movie is based on genres.

In [27]:
# Pairwise cosine similarity between every pair of rows (movies) in genre_matrix.
# Passing only one matrix compares it against itself, giving a square movie-by-movie table.
cosine_sim = cosine_similarity(genre_matrix)

This function takes a movie title as input and returns the top 5 most similar movies based on genres.

In [30]:
def get_recommendations(title, cosine_sim=cosine_sim, top_n=5):
    """Return the titles of the movies most similar to `title`.

    Parameters
    ----------
    title : str
        Exact title to look up in ``movies['title']``. If several rows share
        the title, the first one is used.
    cosine_sim : 2-D array-like, optional
        Square similarity table indexed by row position in ``movies``.
        Defaults to the table bound when this function was defined.
    top_n : int, optional
        Maximum number of recommendations to return (default 5).

    Returns
    -------
    list of str
        Up to `top_n` titles ordered from most to least similar; movies with
        equal scores keep their dataset order. The queried movie itself is
        never included.
    str
        The message "Movie not found in dataset." when `title` is absent.
    """
    title_matches = (movies['title'] == title).values
    if not title_matches.any():
        return "Movie not found in dataset."

    # Row *position* of the first match. cosine_sim and .iloc are positional, so
    # an index label would only be correct while the frame has a default index.
    idx = int(title_matches.argmax())

    # Exclude the queried movie explicitly instead of assuming it sorts first:
    # movies with an identical genre set tie with it at the top score, and the
    # stable sort would place an earlier twin ahead of it.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    movie_indices = [position for position, _ in sim_scores[:max(top_n, 0)]]

    return movies['title'].iloc[movie_indices].tolist()


We will test the system by getting recommendations for "Toy Story (1995)".

In [33]:
# Smoke-test the recommender on a well-known title.
movie_name = "Toy Story (1995)"
recommended_movies = get_recommendations(movie_name)

# Header line followed by the list of titles, one per output line.
print(f"Recommended movies similar to '{movie_name}':", recommended_movies, sep="\n")

Recommended movies similar to 'Toy Story (1995)':
['Antz (1998)', 'Toy Story 2 (1999)', 'Adventures of Rocky and Bullwinkle, The (2000)', "Emperor's New Groove, The (2000)", 'Monsters, Inc. (2001)']
