In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load the movie catalogue and keep only the first row for each distinct
# title, renumbering the index from 0 afterwards.
movies = pd.read_csv("movies.csv").drop_duplicates(
    subset="title",
    keep="first",
    ignore_index=True,
)

In [None]:
# Count the missing values in each column.
print(movies.isna().sum())

movieId    0
title      0
genres     0
dtype: int64


In [None]:
# Collect rows that repeat an earlier row across every column.
duplicates = movies.loc[movies.duplicated()]
print(duplicates)

Empty DataFrame
Columns: [movieId, title, genres]
Index: []


In [None]:
# Prepare the data
# NOTE: this cell rewrites columns of `movies` in place, so it is meant to run
# once per load of the CSV (the genres column stops being a string column).

# Genres arrive as one pipe-delimited string; turn them into a list of names.
movies['genres'] = movies['genres'].str.split('|')

# Capture the four-digit year written in parentheses inside the title.
# Raw strings keep the regex escapes intact; expand=False yields a Series so a
# single column (not a one-column DataFrame) is assigned.
movies['year'] = movies['title'].str.extract(r'\((\d{4})\)', expand=False)

# Remove the "(YYYY)" marker from the title and trim surrounding whitespace.
# regex=True is spelled out because Series.str.replace treats the pattern as a
# literal string by default in pandas >= 2.0, which would leave the year in
# every title and break exact-title lookups later on.
movies['title'] = (
    movies['title']
    .str.replace(r'\(\d{4}\)', '', regex=True)
    .str.strip()
)

  movies['title'] = movies['title'].str.replace('(\(\d{4}\))', '').str.strip()


In [None]:
# Build one text document per movie: the title followed by its genre names,
# all separated by single spaces.
genre_text = movies['genres'].apply(' '.join)
movies['plot'] = movies['title'] + ' ' + genre_text

# Turn each document into a TF-IDF vector, ignoring English stop words.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['plot'])

# Pairwise cosine similarity between every pair of movies (rows of the matrix).
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Recommend
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Print the movies most similar to ``title``, best match first.

    Parameters
    ----------
    title : str
        Exact value to look up in ``movies['title']``. When several rows
        share the title, the first one is used.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix whose rows/columns follow the row order of
        ``movies``. Defaults to the module-level ``cosine_sim`` as it was
        when this function was defined.
    top_n : int, optional
        Maximum number of recommendations to print (default 10, expected to
        be non-negative).

    Returns
    -------
    None
        Results are printed, one numbered line per recommendation.

    Raises
    ------
    IndexError
        If no row of ``movies`` has the requested title.
    """
    # Resolve the ROW POSITION of the title: both cosine_sim[...] and .iloc
    # below are positional, so an index label must not be used here.
    matches = (movies['title'] == title).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"Title not found in movies: {title!r}")
    idx = int(matches[0])

    # Exclude the query movie by position instead of assuming it sorts first:
    # another movie with an identical feature vector ties with it, and with a
    # stable sort a lower-positioned twin would otherwise be dropped while the
    # query movie itself got recommended.
    sim_scores = [
        (pos, score)
        for pos, score in enumerate(cosine_sim[idx])
        if pos != idx
    ]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    sim_scores = sim_scores[:top_n]

    movie_indices = [pos for pos, _ in sim_scores]
    similarity_scores = [score for _, score in sim_scores]
    recommended_movies = movies['title'].iloc[movie_indices]

    for i, (movie, score) in enumerate(zip(recommended_movies, similarity_scores), 1):
        print(f"{i}. {movie} - Similarity Score: {score}")


In [None]:
# Example: print the recommendations for 'Toy Story'.
get_recommendations('Toy Story')

1. Toy Story 2 - Similarity Score: 1.0000000000000002
2. Toy Story 3 - Similarity Score: 0.9257843096519454
3. Toy, The - Similarity Score: 0.6878953284363422
4. We're Back! A Dinosaur's Story - Similarity Score: 0.5215787140340161
5. Story of Us, The - Similarity Score: 0.46783588558980754
6. Toy Soldiers - Similarity Score: 0.4645431133821312
7. Up - Similarity Score: 0.4636231223205045
8. NeverEnding Story, The - Similarity Score: 0.45255255094980057
9. L.A. Story - Similarity Score: 0.44322257439767243
10. Wild, The - Similarity Score: 0.42772117992407177


In [None]:
# Example: a title containing apostrophes and punctuation; it must match the
# stored title exactly.
get_recommendations("We're Back! A Dinosaur's Story")

1. Dinosaur - Similarity Score: 0.8443716254779753
2. The Good Dinosaur - Similarity Score: 0.7531563059227538
3. Toy Story - Similarity Score: 0.5215787140340161
4. Toy Story 2 - Similarity Score: 0.5215787140340161
5. Toy Story 3 - Similarity Score: 0.48286938970113097
6. Up - Similarity Score: 0.45902798800282885
7. NeverEnding Story, The - Similarity Score: 0.44806714101810274
8. Story of Us, The - Similarity Score: 0.4158861823561897
9. NeverEnding Story III, The - Similarity Score: 0.39680926108802594
10. L.A. Story - Similarity Score: 0.39400599671387443


In [None]:
# Example: print the recommendations for 'Jumanji'.
get_recommendations('Jumanji')

1. Jumanji: Welcome to the Jungle - Similarity Score: 0.6397980067921399
2. Up - Similarity Score: 0.32865673001937645
3. Wild, The - Similarity Score: 0.3261995718086005
4. Pan - Similarity Score: 0.3179415839897905
5. G-Force - Similarity Score: 0.31581550971274724
6. D.A.R.Y.L. - Similarity Score: 0.31228070345575076
7. Monsters, Inc. - Similarity Score: 0.30431360066367635
8. Now and Then - Similarity Score: 0.3014817101012759
9. Yours, Mine and Ours - Similarity Score: 0.29649666784503564
10. Are We There Yet? - Similarity Score: 0.29649666784503564


In [None]:
# Example: a long, multi-word title, looked up by exact match.
get_recommendations('Homeward Bound II: Lost in San Francisco')

1. San Francisco - Similarity Score: 0.5487160258836883
2. Homeward Bound: The Incredible Journey - Similarity Score: 0.5228354414648512
3. Bound - Similarity Score: 0.37628112972115657
4. All Is Lost - Similarity Score: 0.32961219747195314
5. Lost & Found - Similarity Score: 0.29053112225129263
6. San Andreas - Similarity Score: 0.24290276515785228
7. Wizards of the Lost Kingdom II - Similarity Score: 0.24150787999930767
8. Lost in Space - Similarity Score: 0.22592062354406844
9. Land of the Lost - Similarity Score: 0.22213329275361268
10. Lost in America - Similarity Score: 0.2208821770526824
