In [63]:
#import libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity



In [69]:

# Read the movie catalogue into a DataFrame.
# The path is relative, so movies.csv is looked up in the notebook's working directory.
movie_data = pd.read_csv(filepath_or_buffer="movies.csv")


In [70]:

# Split the release year out of titles such as "Toy Story (1995)".
# Raw strings keep the regex backslashes intact; in a plain string literal
# '\(' , '\d' and '\s' are invalid escape sequences that newer Python versions warn about.
YEAR_CAPTURE = r'\((\d{4})\)'   # captures the four digits inside the parentheses
YEAR_SUFFIX = r'\s*\(\d{4}\)'   # the parenthesised year plus any whitespace before it

# Extract the year from the "title" column and convert it to a numeric dtype.
# Titles without a "(dddd)" group produce NaN.
extracted_year = pd.to_numeric(
    movie_data['title'].str.extract(YEAR_CAPTURE, expand=False),
    errors='coerce',
)

# If this cell is re-run, the titles have already been stripped and nothing is
# extracted; fall back to the existing 'year' values instead of replacing them with NaN.
if 'year' in movie_data.columns:
    extracted_year = extracted_year.fillna(movie_data['year'])
movie_data['year'] = extracted_year

# Remove the year from the "title" column
movie_data['title'] = movie_data['title'].str.replace(YEAR_SUFFIX, '', regex=True)

# Show the first 5 rows of the updated DataFrame
display(movie_data.head())


Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995.0
1,2,Jumanji,Adventure|Children|Fantasy,1995.0
2,3,Grumpier Old Men,Comedy|Romance,1995.0
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995.0
4,5,Father of the Bride Part II,Comedy,1995.0


In [73]:
## clean the data
# DataFrame methods only take effect when they are called (with parentheses)
# and, for dropna/drop_duplicates, when the returned frame is kept.

# Inspect: missing values per column and number of fully duplicated rows
print(movie_data.isna().sum())
print(f"duplicate rows: {movie_data.duplicated().sum()}")

# Clean:
#  - drop rows lacking a title or genres (they cannot be looked up or vectorized);
#    'year' is deliberately not part of the subset because titles without a
#    "(dddd)" suffix legitimately have a NaN year
#  - drop exact duplicate rows
#  - reset the index so index labels equal row positions, keeping the frame
#    aligned row-for-row with any matrix built from it
movie_data = (
    movie_data
    .dropna(subset=['title', 'genres'])
    .drop_duplicates()
    .reset_index(drop=True)
)
display(movie_data.head())


Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995.0
1,2,Jumanji,Adventure|Children|Fantasy,1995.0
2,3,Grumpier Old Men,Comedy|Romance,1995.0
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995.0
4,5,Father of the Bride Part II,Comedy,1995.0


In [74]:
## function that calculates the cosine similarity between two movies
def cosine_sim(movie1, movie2, tfidf_matrix):
    """Return the cosine similarity between two movies identified by title.

    Parameters
    ----------
    movie1, movie2 : str
        Titles to look up in the global ``movie_data['title']`` column. When a
        title occurs more than once, the first matching row is used.
    tfidf_matrix : matrix
        Feature matrix whose i-th row describes the i-th row of ``movie_data``.
        # assumes a 2-D (sparse) matrix as produced by TfidfVectorizer — confirm

    Returns
    -------
    The single similarity value between the two selected rows.

    Raises
    ------
    IndexError
        If either title is not present in ``movie_data``.
    """
    def _row_position(title):
        # Use the row *position* rather than the index label, so the lookup stays
        # aligned with tfidf_matrix even if movie_data's index is not 0..n-1.
        hits = (movie_data['title'] == title).to_numpy().nonzero()[0]
        if hits.size == 0:
            raise IndexError(f"movie title not found in movie_data: {title!r}")
        return int(hits[0])

    row1 = _row_position(movie1)
    row2 = _row_position(movie2)
    # cosine_similarity returns a 1x1 matrix for two single rows; unwrap it
    return cosine_similarity(tfidf_matrix[row1], tfidf_matrix[row2])[0][0]


In [55]:
# convert genres to TF-IDF vectors
# The genres column is pipe-delimited (e.g. "Comedy|Romance"), so every run of
# characters between '|' separators is treated as one token. The vectorizer's
# default token pattern splits on all punctuation and whitespace, which would
# break a genre label containing a hyphen or a space into several unrelated tokens.
tfidf = TfidfVectorizer(token_pattern=r'[^|]+')

# Missing genres become an empty document; NaN is not a valid input document.
tfidf_matrix = tfidf.fit_transform(movie_data['genres'].fillna(''))

In [75]:
# define function to get similar movies
def get_similar_movies(movie_title, tfidf_matrix, top_n=5):
    """Return the movies most similar to ``movie_title`` by cosine similarity.

    Parameters
    ----------
    movie_title : str
        Title to look up in the global ``movie_data['title']`` column. When the
        title occurs more than once, the first matching row is used.
    tfidf_matrix : matrix
        Feature matrix whose i-th row describes the i-th row of ``movie_data``.
        # assumes a 2-D (sparse) matrix as produced by TfidfVectorizer — confirm
    top_n : int, default 5
        Maximum number of neighbours to return (the query movie is never included).

    Returns
    -------
    list of (title, similarity) tuples, ordered from most to least similar.

    Raises
    ------
    IndexError
        If ``movie_title`` is not present in ``movie_data``.
    """
    # Row position (not index label) keeps the lookup aligned with tfidf_matrix.
    hits = (movie_data['title'] == movie_title).to_numpy().nonzero()[0]
    if hits.size == 0:
        raise IndexError(f"movie title not found in movie_data: {movie_title!r}")
    idx = int(hits[0])
    top_n = max(int(top_n), 0)

    cosine_similarities = cosine_similarity(tfidf_matrix[idx], tfidf_matrix).flatten()

    # Stable descending order: equal scores keep their row order, so results are
    # reproducible even when many movies share an identical genre vector.
    ranked = (-cosine_similarities).argsort(kind='stable')

    # Take one extra candidate, drop the query movie itself, then truncate, so the
    # result length does not depend on where the query movie lands among ties.
    neighbours = [i for i in ranked[:top_n + 1] if i != idx][:top_n]

    titles = movie_data['title'].to_numpy()  # positional access to titles
    return [(titles[i], cosine_similarities[i]) for i in neighbours]

In [76]:
# Build one record per (movie, neighbour) pair, keeping only neighbours whose
# similarity score is strictly positive.
# NOTE: get_similar_movies looks movies up by title, so rows that share a title
# all resolve to the first matching row.
similarities = [
    {
        'movie': title,
        'similar_movie': neighbour[0],
        'similarity_score': neighbour[1],
    }
    for title in movie_data['title']
    for neighbour in get_similar_movies(title, tfidf_matrix)
    if neighbour[1] > 0.0
]
