In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Load the movie metadata from CSV and preview the first five rows.
DATA_PATH = 'imdb_movies.csv'
df = pd.read_csv(DATA_PATH)
print(df.head())

                         names       date_x  ...       revenue country
0                    Creed III  03/02/2023   ...  2.716167e+08      AU
1     Avatar: The Way of Water  12/15/2022   ...  2.316795e+09      AU
2  The Super Mario Bros. Movie  04/05/2023   ...  7.244590e+08      AU
3                      Mummies  01/05/2023   ...  3.420000e+07      AU
4                    Supercell  03/17/2023   ...  3.409420e+08      US

[5 rows x 12 columns]


In [4]:
# Print a summary of the frame: column names, non-null counts, and dtypes.
# Useful here to spot which columns contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10178 entries, 0 to 10177
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   names       10178 non-null  object 
 1   date_x      10178 non-null  object 
 2   score       10178 non-null  float64
 3   genre       10093 non-null  object 
 4   overview    10178 non-null  object 
 5   crew        10122 non-null  object 
 6   orig_title  10178 non-null  object 
 7   status      10178 non-null  object 
 8   orig_lang   10178 non-null  object 
 9   budget_x    10178 non-null  float64
 10  revenue     10178 non-null  float64
 11  country     10178 non-null  object 
dtypes: float64(3), object(9)
memory usage: 954.3+ KB


In [5]:
# Handling missing values: replace any null overview with an empty string
# so that every row holds text before it is vectorized.
overview_filled = df['overview'].fillna('')
df['overview'] = overview_filled

In [6]:
# Feature extraction using TF-IDF.
# stop_words='english' removes common words like 'the', 'a', 'an', etc.
overviews = df['overview']
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(overviews)

In [7]:
# Computing cosine similarity between every pair of overview vectors.
# With a single argument, cosine_similarity compares the rows of the matrix
# against themselves, giving one row and one column per movie.
cosine_sim = cosine_similarity(tfidf_matrix)

In [8]:
#Recommendation Function
def get_recommendations(title, df, cosine_sim, top_n=10):
    """Return the names of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact value to look up in ``df['names']``. If several rows share the
        title, the first matching row is used.
    df : pandas.DataFrame
        Frame with a ``'names'`` column; row ``i`` (by position) must
        correspond to row ``i`` of ``cosine_sim``.
    cosine_sim : 2-D array-like
        Pairwise similarity scores, indexed by row position.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` entries of ``df['names']``, ordered from most to
        least similar. The queried row itself is never included.

    Raises
    ------
    IndexError
        If ``title`` does not occur in ``df['names']``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    # Locate the movie by *position* rather than by index label, so the lookup
    # stays aligned with cosine_sim even when df does not use a default index.
    positions = (df['names'] == title).to_numpy().nonzero()[0]
    if len(positions) == 0:
        raise IndexError(f"Movie title {title!r} not found in df['names']")
    idx = int(positions[0])

    # Pair every other movie with its similarity score. The queried movie is
    # excluded explicitly: it is not guaranteed to sort first when another row
    # ties with it (e.g. a duplicate overview, or an all-zero TF-IDF row).
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]
    # Stable sort: equally scored movies keep their original row order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    movies_indices = [position for position, _ in sim_scores[:top_n]]
    return df['names'].iloc[movies_indices]

In [9]:
# Try the recommender on a sample title and show the result.
movie_name = 'The Dark Knight'
recommendations = get_recommendations(movie_name, df, cosine_sim)
# One print call with a newline separator emits the header line followed by
# the Series, exactly as two consecutive print calls would.
print(f"Movie similar to {movie_name} are:", recommendations, sep="\n")

Movie similar to The Dark Knight are:
2109                  Batman: The Long Halloween, Part One
2340                  Batman: The Long Halloween, Part Two
861                                  The Dark Knight Rises
2131                                                Batman
10137                                  Batman vs. Two-Face
3807                              Batman: The Killing Joke
7864     Batman Unmasked: The Psychology of 'The Dark K...
2668               Batman: The Dark Knight Returns, Part 2
3913                            Batman: Under the Red Hood
3053                                        Batman Forever
Name: names, dtype: object
