In [1]:
# Importing necessary libraries
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os

In [2]:
# Location of the movies dataset. Set the MOVIES_CSV_PATH environment variable
# to run this notebook on another machine; when it is unset, the original
# local path is used, so existing behaviour is unchanged.
MOVIES_CSV_PATH = os.environ.get('MOVIES_CSV_PATH', 'G:/Movie Recommendation System/movies.csv')

# Load the movies data into a DataFrame
movies_data = pd.read_csv(MOVIES_CSV_PATH)

# Display the first few rows of the dataset
print("Movies Data Head:\n", movies_data.head())

# Display the shape of the dataset as (rows, columns)
print("Shape of the Dataset:", movies_data.shape)

Movies Data Head:
    index     budget                                    genres  \
0      0  237000000  Action Adventure Fantasy Science Fiction   
1      1  300000000                  Adventure Fantasy Action   
2      2  245000000                    Action Adventure Crime   
3      3  250000000               Action Crime Drama Thriller   
4      4  260000000          Action Adventure Science Fiction   

                                       homepage      id  \
0                   http://www.avatarmovie.com/   19995   
1  http://disney.go.com/disneypictures/pirates/     285   
2   http://www.sonypictures.com/movies/spectre/  206647   
3            http://www.thedarkknightrises.com/   49026   
4          http://movies.disney.com/john-carter   49529   

                                            keywords original_language  \
0  culture clash future space war space colony so...                en   
1  ocean drug abuse exotic island east india trad...                en   
2         spy

In [3]:
# Text columns whose content describes a movie; these are the only columns
# the recommender looks at when comparing movies with each other.
selected_features = [
    'genres',
    'keywords',
    'tagline',
    'cast',
    'director',
]
print("Selected Features:", selected_features)

Selected Features: ['genres', 'keywords', 'tagline', 'cast', 'director']


In [4]:
# Display information about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must be called as a statement: wrapping it in print() emits the report
# *before* the label and then prints a stray "None".
print("Movies Data Info:")
movies_data.info()

# Count the null values in every column of the dataset
print("Null Values in Dataset:\n", movies_data.isna().sum())

# Preview only the columns that feed the recommender
print("Selected Columns Head:\n", movies_data[selected_features].head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   index                 4803 non-null   int64  
 1   budget                4803 non-null   int64  
 2   genres                4775 non-null   object 
 3   homepage              1712 non-null   object 
 4   id                    4803 non-null   int64  
 5   keywords              4391 non-null   object 
 6   original_language     4803 non-null   object 
 7   original_title        4803 non-null   object 
 8   overview              4800 non-null   object 
 9   popularity            4803 non-null   float64
 10  production_companies  4803 non-null   object 
 11  production_countries  4803 non-null   object 
 12  release_date          4802 non-null   object 
 13  revenue               4803 non-null   int64  
 14  runtime               4801 non-null   float64
 15  spoken_languages     

In [5]:
# How many entries are missing in each of the selected feature columns?
null_counts_selected = movies_data.loc[:, selected_features].isna().sum()
print("Null Values in Selected Columns:\n", null_counts_selected)

# Swap every missing value in those columns for an empty string so the
# columns hold text only; the columns are overwritten on movies_data itself.
for feature in selected_features:
    filled_column = movies_data[feature].fillna('')
    movies_data[feature] = filled_column

# Preview the dataset after the replacement
updated_head = movies_data.head()
print("Updated Movies Data Head:\n", updated_head)


Null Values in Selected Columns:
 genres       28
keywords    412
tagline     844
cast         43
director     30
dtype: int64
Updated Movies Data Head:
    index     budget                                    genres  \
0      0  237000000  Action Adventure Fantasy Science Fiction   
1      1  300000000                  Adventure Fantasy Action   
2      2  245000000                    Action Adventure Crime   
3      3  250000000               Action Crime Drama Thriller   
4      4  260000000          Action Adventure Science Fiction   

                                       homepage      id  \
0                   http://www.avatarmovie.com/   19995   
1  http://disney.go.com/disneypictures/pirates/     285   
2   http://www.sonypictures.com/movies/spectre/  206647   
3            http://www.thedarkknightrises.com/   49026   
4          http://movies.disney.com/john-carter   49529   

                                            keywords original_language  \
0  culture clash future sp

In [6]:
# Combine all the selected features into a single space-separated string per
# movie. The columns come from `selected_features` (rather than a second
# hard-coded list) so that editing the feature list in one place changes what
# the model sees. Concatenation runs left to right in list order, which for
# the current list is exactly genres + keywords + tagline + cast + director.
combined_features = movies_data[selected_features[0]]
for feature in selected_features[1:]:
    combined_features = combined_features + ' ' + movies_data[feature]
print("Combined Features:\n", combined_features)


Combined Features:
 0       Action Adventure Fantasy Science Fiction cultu...
1       Adventure Fantasy Action ocean drug abuse exot...
2       Action Adventure Crime spy based on novel secr...
3       Action Crime Drama Thriller dc comics crime fi...
4       Action Adventure Science Fiction based on nove...
                              ...                        
4798    Action Crime Thriller united states\u2013mexic...
4799    Comedy Romance  A newlywed couple's honeymoon ...
4800    Comedy Drama Romance TV Movie date love at fir...
4801      A New Yorker in Shanghai Daniel Henney Eliza...
4802    Documentary obsession camcorder crush dream gi...
Length: 4803, dtype: object


In [7]:
# Turn each movie's combined text into a TF-IDF weighted term vector.
# The vectorizer is created with its default settings, then fitted on and
# applied to the combined text in a single step.
vectorizer = TfidfVectorizer()
feature_vectors = vectorizer.fit_transform(raw_documents=combined_features)

In [8]:
# Report the dimensions of the TF-IDF matrix produced above
feature_vectors_shape = feature_vectors.shape
print("Shape of Feature Vectors:", feature_vectors_shape)

Shape of Feature Vectors: (4803, 17318)


In [9]:
# Pairwise cosine similarity of every feature vector against every other one
# (only X is supplied, so the matrix is compared with itself).
similarity = cosine_similarity(X=feature_vectors)

In [10]:
# Report the dimensions of the pairwise similarity matrix
similarity_shape = similarity.shape
print("Shape of Cosine Similarity Matrix:", similarity_shape)


Shape of Cosine Similarity Matrix: (4803, 4803)


In [11]:
# Ask the user which movie the recommendations should be based on.
# The raw text typed by the user is stored unchanged.
movie_name = input(
    'Enter your favourite movie name: '
)

Enter your favourite movie name: Avatar


In [12]:
# Every title in the dataset, in row order
list_of_all_titles = movies_data['title'].tolist()

# Fuzzy-match the user's text against those titles. n and cutoff are spelled
# out at difflib's default values: at most 3 candidates, each scoring at
# least 0.6.
find_close_match = difflib.get_close_matches(
    word=movie_name,
    possibilities=list_of_all_titles,
    n=3,
    cutoff=0.6,
)

In [13]:
# Recommend the movies most similar to the closest title match.
NUM_RECOMMENDATIONS = 30  # how many suggestions to print

if find_close_match:
    # Use the first candidate returned by the fuzzy title search
    close_match = find_close_match[0]
    print("Close Match Found:", close_match)

    # Row position of the matched movie. `similarity` is indexed by row
    # position, so the position is taken from the positional title list
    # rather than from a DataFrame index label (the two only coincide while
    # the frame keeps its default 0..n-1 index).
    index_of_the_movie = list_of_all_titles.index(close_match)
    print("Index of the Movie:", index_of_the_movie)

    # (row position, similarity score) pairs for every movie in the dataset
    similarity_score = list(enumerate(similarity[index_of_the_movie]))

    # Sort the movies by similarity score, most similar first
    sorted_similar_movies = sorted(similarity_score, key=lambda x: x[1], reverse=True)

    # Drop the queried movie by position instead of assuming it sorts first:
    # on tied scores (e.g. another row with identical features, or a row whose
    # features are all empty) it is not guaranteed to be the first entry, and
    # slicing [1:] would then recommend the movie to itself.
    recommended_movies = [
        movie for movie in sorted_similar_movies if movie[0] != index_of_the_movie
    ][:NUM_RECOMMENDATIONS]

    # Print the top similar movies as a numbered list
    print('Movies suggested for you:\n')
    for i, movie in enumerate(recommended_movies, start=1):
        index = movie[0]
        title_from_index = list_of_all_titles[index]
        print(f"{i}. {title_from_index}")
else:
    print("No close match found. Please try another movie name.")


Close Match Found: Avatar
Index of the Movie: 0
Movies suggested for you:

1. Alien
2. Aliens
3. Guardians of the Galaxy
4. Star Trek Beyond
5. Star Trek Into Darkness
6. Galaxy Quest
7. Alien³
8. Cargo
9. Trekkies
10. Gravity
11. Moonraker
12. Jason X
13. Pocahontas
14. Space Cowboys
15. The Helix... Loaded
16. Lockout
17. Event Horizon
18. Space Dogs
19. Machete Kills
20. Gettysburg
21. Clash of the Titans
22. Star Wars: Clone Wars: Volume 1
23. The Right Stuff
24. Terminator Salvation
25. The Astronaut's Wife
26. Planet of the Apes
27. Star Trek
28. Wing Commander
29. Sunshine
30. The Terminator
