### Import Necessary Libraries

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Loading Dataset

In [2]:
# Read the TMDB movie metadata CSV from the working directory and preview
# the first record to see which columns are available.
movies = pd.read_csv(filepath_or_buffer="tmdb_5000_movies.csv")
movies.head(n=1)


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


### Select Columns and Handle Missing Values

In [3]:
# Keep only the columns the recommender uses. The explicit .copy() makes the
# result an independent frame, so the column assignment below does not trigger
# pandas' SettingWithCopyWarning (assigning into a selection of another frame).
movies = movies[["title", "overview"]].copy()

# Replace missing overviews with empty strings so every entry in the column
# is text before it is vectorized.
movies["overview"] = movies["overview"].fillna("")

### Check Shape of Dataset

In [4]:
# Sanity check after trimming: report (rows, columns) and show the first five rows.
print("Movies Shape:", movies.shape)
print(movies.iloc[:5])

Movies Shape: (4803, 2)
                                      title  \
0                                    Avatar   
1  Pirates of the Caribbean: At World's End   
2                                   Spectre   
3                     The Dark Knight Rises   
4                               John Carter   

                                            overview  
0  In the 22nd century, a paraplegic Marine is di...  
1  Captain Barbossa, long believed to be dead, ha...  
2  A cryptic message from Bond’s past sends him o...  
3  Following the death of District Attorney Harve...  
4  John Carter is a war-weary, former military ca...  


### TF-IDF Vectorization

In [5]:
# Encode every overview as a TF-IDF weighted term vector, ignoring
# scikit-learn's built-in English stop-word list.
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(raw_documents=movies["overview"])

# Report the matrix dimensions: (number of overviews, vocabulary size).
print("TF-IDF Matrix shape:", tfidf_matrix.shape)

TF-IDF Matrix shape: (4803, 20978)


### Cosine Similarity

In [6]:
# Pairwise cosine similarity of every overview vector against every other one.
# With a single argument, cosine_similarity compares the matrix with itself,
# so entry [i, j] is the similarity between movie i and movie j.
cosine_sim = cosine_similarity(tfidf_matrix)


### Building the Recommendation Function

In [7]:
# Index mapping: lowercased movie title -> row index in `movies`.
indices = pd.Series(movies.index, index=movies["title"].str.lower())

# Keep only the first row for titles that occur more than once.
# Series.drop_duplicates() would compare the *values* (the row indices, which
# are already unique) and leave repeated titles in place; a repeated title
# would then make `indices[title]` return a Series instead of a single index.
indices = indices[~indices.index.duplicated(keep="first")]

def recommend_movie(title, top_n=5):
    """Return the titles of the movies most similar to ``title``.

    Similarity is read from the module-level ``cosine_sim`` matrix; the row to
    read is looked up through the module-level ``indices`` mapping
    (lowercased title -> row index), and titles come from ``movies``.

    Parameters
    ----------
    title : str
        Movie title to look up. Matching is case-insensitive.
    top_n : int, default 5
        Maximum number of recommendations to return. Values <= 0 yield an
        empty list.

    Returns
    -------
    list of str or str
        Up to ``top_n`` titles ordered from most to least similar, never
        including the queried movie itself. If the title is not present in
        ``indices``, a "not found" message string is returned instead.
    """
    title = title.lower()

    if title not in indices:
        return f"Movie '{title}' not found in dataset!"

    idx = indices[title]
    # If the mapping still holds several rows for one title, the lookup yields
    # a Series rather than a scalar; fall back to the first matching row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    if top_n <= 0:
        return []

    # Exclude the queried movie by position instead of assuming it sorts first:
    # a movie whose similarity row is all zeros (e.g. an empty overview), or
    # one that ties with an earlier identical overview, is not guaranteed to be
    # the top entry after sorting.
    candidates = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Highest similarity first; the sort is stable, so ties keep dataset order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    best_positions = [position for position, _ in candidates[:top_n]]
    return movies["title"].iloc[best_positions].tolist()


### Testing

In [8]:
# Smoke-test the recommender on three well-known titles. Each call prints the
# header and the result list on separate lines (sep="\n").
print("🎬 Recommendations for 'Avatar':", recommend_movie("Avatar"), sep="\n")

print(
    "\n🎬 Recommendations for 'The Dark Knight Rises':",
    recommend_movie("The Dark Knight Rises"),
    sep="\n",
)

print(
    "\n🎬 Recommendations for 'The Avengers':",
    recommend_movie("The Avengers"),
    sep="\n",
)


🎬 Recommendations for 'Avatar':
['Apollo 18', 'The American', 'The Matrix', 'The Inhabited Island', 'Tears of the Sun']

🎬 Recommendations for 'The Dark Knight Rises':
['The Dark Knight', 'Batman Forever', 'Batman Returns', 'Batman', 'Batman: The Dark Knight Returns, Part 2']

🎬 Recommendations for 'The Avengers':
['Avengers: Age of Ultron', 'Plastic', 'Timecop', 'This Thing of Ours', 'Thank You for Smoking']
