In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import pickle

In [55]:
# Load the movie metadata from a CSV file resolved relative to the current working directory.
data = pd.read_csv('movies-themoviedb.csv')

In [59]:
data.head()

Unnamed: 0,id,name,rating,genre,release_date,runtime,director,overview,url
0,1,Leo,75.82,"Animation, Comedy, Family",11/21/2023,1h 42m,Adam Sandler,Jaded 74-year-old lizard Leo has been stuck in...,https://www.themoviedb.org/movie/1075794
1,2,Trolls Band Together,71.96,"Animation, Family, Music, Fantasy, Comedy",10/19/2023,1h 32m,Anna Kendrick,"When Branch’s brother, Floyd, is kidnapped for...",https://www.themoviedb.org/movie/901362
2,3,Oppenheimer,81.57,"Drama, History",07/20/2023,3h 1m,Cillian Murphy,The story of J. Robert Oppenheimer's role in t...,https://www.themoviedb.org/movie/872585
3,4,Five Nights at Freddy's,78.49,"Horror, Mystery",10/26/2023,1h 50m,Josh Hutcherson,"Recently fired and desperate for work, a troub...",https://www.themoviedb.org/movie/507089
4,5,The Creator,71.29,"Science Fiction, Action, Thriller",09/28/2023,2h 14m,John David Washington,Amid a future war between the human race and t...,https://www.themoviedb.org/movie/670292


In [57]:
# Number the rows 1..N in an 'id' column (overwrites any existing 'id' values)
data['id'] = range(1, len(data) + 1)

# Reorder so 'id' leads, keeping every other column in its current order
remaining_columns = [name for name in data.columns if name != 'id']
data = data[['id', *remaining_columns]]

In [58]:
# Strip embedded newlines from every overview. The vectorised .str accessor leaves
# missing overviews as NaN, whereas calling str.replace on a NaN (a float) through
# apply() would raise AttributeError.
# NOTE(review): newlines are removed rather than replaced with a space, so words
# separated only by a line break end up joined - confirm this is intended.
data['overview'] = data['overview'].str.replace('\n', '', regex=False)

In [60]:
# Concatenate name, director, genre, and overview into a single text feature.

def get_important_features(data):
    """Build one combined text string per row of ``data``.

    For every row the ``name``, ``director``, ``genre`` and ``overview`` values
    are converted to strings and joined with single spaces, in that order.
    Missing values (as detected by ``pd.isnull``) contribute an empty string,
    so the separating spaces are still present.

    Rows are walked by position, so the result is correct for any index
    (filtered, shuffled, non-integer), not only the default 0..n-1 range.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``name``, ``director``, ``genre`` and
        ``overview``.

    Returns
    -------
    list of str
        One combined string per row, in row order.

    Raises
    ------
    KeyError
        If any of the four required columns is missing.
    """
    feature_columns = ('name', 'director', 'genre', 'overview')

    # Clean each column once, by value, instead of doing a label-based
    # data[col][i] lookup for every single cell.
    cleaned_columns = [
        ['' if pd.isnull(value) else str(value) for value in data[column].tolist()]
        for column in feature_columns
    ]

    # zip(*...) regroups the per-column lists into per-row tuples.
    return [' '.join(row_parts) for row_parts in zip(*cleaned_columns)]


In [61]:
# Store each movie's combined text (name, director, genre, overview) in a new column
data['important_features'] = get_important_features(data)

In [62]:
data.head()

Unnamed: 0,id,name,rating,genre,release_date,runtime,director,overview,url,important_features
0,1,Leo,75.82,"Animation, Comedy, Family",11/21/2023,1h 42m,Adam Sandler,Jaded 74-year-old lizard Leo has been stuck in...,https://www.themoviedb.org/movie/1075794,"Leo Adam Sandler Animation, Comedy, Family Jad..."
1,2,Trolls Band Together,71.96,"Animation, Family, Music, Fantasy, Comedy",10/19/2023,1h 32m,Anna Kendrick,"When Branch’s brother, Floyd, is kidnapped for...",https://www.themoviedb.org/movie/901362,"Trolls Band Together Anna Kendrick Animation, ..."
2,3,Oppenheimer,81.57,"Drama, History",07/20/2023,3h 1m,Cillian Murphy,The story of J. Robert Oppenheimer's role in t...,https://www.themoviedb.org/movie/872585,"Oppenheimer Cillian Murphy Drama, History The ..."
3,4,Five Nights at Freddy's,78.49,"Horror, Mystery",10/26/2023,1h 50m,Josh Hutcherson,"Recently fired and desperate for work, a troub...",https://www.themoviedb.org/movie/507089,Five Nights at Freddy's Josh Hutcherson Horror...
4,5,The Creator,71.29,"Science Fiction, Action, Thriller",09/28/2023,2h 14m,John David Washington,Amid a future war between the human race and t...,https://www.themoviedb.org/movie/670292,The Creator John David Washington Science Fict...


In [17]:
data.shape

(200, 9)

In [63]:
# Vectorise the combined text with TF-IDF, ignoring common English stop words.
tfidf = TfidfVectorizer(stop_words='english')
# Learn the vocabulary and build the document-term matrix in one step.
tfidf_matrix = tfidf.fit_transform(data['important_features'])
tfidf_matrix.shape

(200, 3296)

In [18]:
# Unpack the matrix shape: rows are documents (movies), columns are vocabulary terms.
num_documents , num_words = tfidf_matrix.shape
print(f"Number of documents: {num_documents}")
print(f"Number of unique words: {num_words}")

Number of documents: 200
Number of unique words: 3296


In [64]:
# TfidfVectorizer L2-normalises each row by default (norm='l2' is not overridden above),
# so the plain dot product computed by linear_kernel equals the cosine similarity
# between every pair of movies.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [20]:
cosine_sim

array([[1.        , 0.01830929, 0.        , ..., 0.00535817, 0.02326138,
        0.        ],
       [0.01830929, 1.        , 0.        , ..., 0.00525136, 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.023673  , 0.        ,
        0.        ],
       ...,
       [0.00535817, 0.00525136, 0.023673  , ..., 1.        , 0.00779854,
        0.01333483],
       [0.02326138, 0.        , 0.        , ..., 0.00779854, 1.        ,
        0.01173302],
       [0.        , 0.        , 0.        , ..., 0.01333483, 0.01173302,
        1.        ]])

In [69]:
# logic that takes a movie title as input and returns the top 5 similar movies based on cosine similarity.

# Map each movie name to its row *position* (cosine_sim is addressed by position,
# not by index label). Series.drop_duplicates() would only de-duplicate the values,
# which are always unique here, so duplicate names are removed via the index
# instead, keeping the first occurrence of each name.
indices = pd.Series(range(len(data)), index=data['name'])
indices = indices[~indices.index.duplicated(keep='first')]


def get_recommendations(title, cosine_sim=cosine_sim, top_n=5):
    """Return the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie name exactly as it appears in ``data['name']``.
    cosine_sim : array-like, optional
        Square pairwise similarity matrix aligned with the row order of
        ``data``. The default is bound when this cell is executed, so re-run
        the cell after recomputing ``cosine_sim``.
    top_n : int, optional
        Maximum number of recommendations to return (default 5).

    Returns
    -------
    pandas.DataFrame
        Columns ``Movies`` and ``id``, ordered from most to least similar,
        with a fresh 0..k-1 index. The queried movie is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in the dataset.
    """
    if title not in indices.index:
        raise KeyError(f"Movie title not found: {title!r}")
    idx = int(indices.loc[title])

    # Pairwise similarity of every other movie with the requested one. The
    # query row is excluded explicitly rather than assumed to sort first,
    # which would fail when another movie ties with it at the top score.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Highest similarity first; the sort is stable, so ties keep row order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    movie_indices = [position for position, _ in sim_scores[:top_n]]

    selected = data.iloc[movie_indices]
    final_df = pd.DataFrame({"Movies": selected['name'], "id": selected['id']})
    return final_df.reset_index(drop=True)

In [71]:
# Test our recommendation engine
get_recommendations('Oppenheimer')


Unnamed: 0,Movies,id
0,Princess Khutulun,15
1,Napoleon,11
2,Avengers: Infinity War,53
3,The Nun II,40
4,Jurassic World Dominion,127


In [66]:
data.head()

Unnamed: 0,id,name,rating,genre,release_date,runtime,director,overview,url,important_features
0,1,Leo,75.82,"Animation, Comedy, Family",11/21/2023,1h 42m,Adam Sandler,Jaded 74-year-old lizard Leo has been stuck in...,https://www.themoviedb.org/movie/1075794,"Leo Adam Sandler Animation, Comedy, Family Jad..."
1,2,Trolls Band Together,71.96,"Animation, Family, Music, Fantasy, Comedy",10/19/2023,1h 32m,Anna Kendrick,"When Branch’s brother, Floyd, is kidnapped for...",https://www.themoviedb.org/movie/901362,"Trolls Band Together Anna Kendrick Animation, ..."
2,3,Oppenheimer,81.57,"Drama, History",07/20/2023,3h 1m,Cillian Murphy,The story of J. Robert Oppenheimer's role in t...,https://www.themoviedb.org/movie/872585,"Oppenheimer Cillian Murphy Drama, History The ..."
3,4,Five Nights at Freddy's,78.49,"Horror, Mystery",10/26/2023,1h 50m,Josh Hutcherson,"Recently fired and desperate for work, a troub...",https://www.themoviedb.org/movie/507089,Five Nights at Freddy's Josh Hutcherson Horror...
4,5,The Creator,71.29,"Science Fiction, Action, Thriller",09/28/2023,2h 14m,John David Washington,Amid a future war between the human race and t...,https://www.themoviedb.org/movie/670292,The Creator John David Washington Science Fict...


In [72]:
# Drop the raw metadata columns that are not carried over into the slimmed-down table.
new_data = data.drop(columns=['release_date','runtime','genre','director','url','rating','overview'])

In [73]:
new_data.head()

Unnamed: 0,id,name,important_features
0,1,Leo,"Leo Adam Sandler Animation, Comedy, Family Jad..."
1,2,Trolls Band Together,"Trolls Band Together Anna Kendrick Animation, ..."
2,3,Oppenheimer,"Oppenheimer Cillian Murphy Drama, History The ..."
3,4,Five Nights at Freddy's,Five Nights at Freddy's Josh Hutcherson Horror...
4,5,The Creator,The Creator John David Washington Science Fict...


In [74]:
new_data.shape

(200, 3)

In [75]:
# Persist the slimmed-down movie table and the similarity matrix.
# The context managers guarantee each file is flushed and closed once written,
# instead of leaving the handle open until garbage collection.
with open('movie_list.pkl', 'wb') as movie_list_file:
    pickle.dump(new_data, movie_list_file)
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(cosine_sim, similarity_file)