In [5]:
!mv kaggle.json /root/.kaggle/

In [6]:
!kaggle datasets download -d disham993/9000-movies-dataset

Downloading 9000-movies-dataset.zip to /content
  0% 0.00/1.70M [00:00<?, ?B/s]
100% 1.70M/1.70M [00:00<00:00, 60.9MB/s]


In [7]:
!unzip 9000-movies-dataset.zip

Archive:  9000-movies-dataset.zip
  inflating: mymoviedb.csv           


In [8]:
import pandas as pd

dataset = pd.read_csv("mymoviedb.csv",lineterminator="\n")
dataset.head()

Unnamed: 0,Release_Date,Title,Overview,Popularity,Vote_Count,Vote_Average,Original_Language,Genre,Poster_Url
0,2021-12-15,Spider-Man: No Way Home,Peter Parker is unmasked and no longer able to...,5083.954,8940,8.3,en,"Action, Adventure, Science Fiction",https://image.tmdb.org/t/p/original/1g0dhYtq4i...
1,2022-03-01,The Batman,"In his second year of fighting crime, Batman u...",3827.658,1151,8.1,en,"Crime, Mystery, Thriller",https://image.tmdb.org/t/p/original/74xTEgt7R3...
2,2022-02-25,No Exit,Stranded at a rest stop in the mountains durin...,2618.087,122,6.3,en,Thriller,https://image.tmdb.org/t/p/original/vDHsLnOWKl...
3,2021-11-24,Encanto,"The tale of an extraordinary family, the Madri...",2402.201,5076,7.7,en,"Animation, Comedy, Family, Fantasy",https://image.tmdb.org/t/p/original/4j0PNHkMr5...
4,2021-12-22,The King's Man,As a collection of history's worst tyrants and...,1895.511,1793,7.0,en,"Action, Adventure, Thriller, War",https://image.tmdb.org/t/p/original/aq4Pwv5Xeu...


In [None]:
data = dataset[['Title','Overview','Genre']]
data.head()

In [10]:
data.shape

(9827, 3)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english')
data['Overview'] = data['Overview'].fillna('')
tfidf_matrix = tfidf.fit_transform(data['Overview'])
tfidf_matrix.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Overview'] = data['Overview'].fillna('')


(9827, 28483)

In [13]:
# model learning
from sklearn.metrics.pairwise import linear_kernel

model = linear_kernel(tfidf_matrix, tfidf_matrix)

In [14]:
# mapping movie name to the indices
indices = pd.Series(data.index,index=data['Title']).drop_duplicates()
indices[0:10]

Title
Spider-Man: No Way Home    0
The Batman                 1
No Exit                    2
Encanto                    3
The King's Man             4
The Commando               5
Scream                     6
Kimi                       7
Fistful of Vengeance       8
Eternals                   9
dtype: int64

In [15]:
def get_recommendations(title,model=model):
    idx = indices[title]
    sim_scores = list(enumerate(model[idx]))
    sim_scores.sort(key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:4]
    movie_indices = [i[0] for i in sim_scores]
    similar_movies = data['Title'].iloc[movie_indices].tolist()

    return similar_movies

In [17]:
get_recommendations("Tiempo")

['Little Nemo: Adventures in Slumberland',
 'Coming Home in the Dark',
 'Naruto Shippuden the Movie: Bonds']

In [18]:
# saving the model
model_name = "recommendation_model"

import joblib

joblib.dump(model,model_name)

['recommendation_model']

In [19]:
# trying out the saved model
saved_model = joblib.load(model_name)
get_recommendations("Tiempo",model=saved_model)

['Little Nemo: Adventures in Slumberland',
 'Coming Home in the Dark',
 'Naruto Shippuden the Movie: Bonds']