In [938]:
# Install the Kaggle API client.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel,
# so the package is importable/usable from this notebook.
%pip install kaggle

In [939]:
# Set up the Kaggle credentials
# Expects the API token file `kaggle.json` in the current working directory:
# create ~/.kaggle if needed, copy the token there, and restrict it to
# owner read/write (mode 600).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [940]:
# Download the dataset from Kaggle
# Fetches the `tmdb/tmdb-movie-metadata` dataset with the Kaggle CLI; the later
# cells expect the archive as `tmdb-movie-metadata.zip` in the working directory.
!kaggle datasets download -d tmdb/tmdb-movie-metadata

In [941]:
# Python code to extract the dataset
from zipfile import ZipFile

In [942]:
# Path to the downloaded ZIP archive, relative to the working directory
dataset = 'tmdb-movie-metadata.zip'

In [943]:
# Extract every member of the archive into the current working directory.
# The handle is named `archive` instead of `zip` so the built-in zip() is not
# shadowed for the rest of the kernel session.
with ZipFile(dataset, 'r') as archive:
    archive.extractall()
    print('The dataset is extracted')

In [944]:
import numpy as np
import pandas as pd

In [945]:
# Load the two CSV tables (presumably the files extracted from the archive):
# `movies` holds the movie metadata, `credits` the per-movie cast/crew data.
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [946]:
# Preview the first rows of the freshly loaded movies table
movies.head()

Unnamed: 0,id,title,overview,genres,keywords,cast,crew,vote_average,vote_count,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weave...",James Cameron,7.2,11800,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley, ...",Gore Verbinski,6.9,4500,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux, R...",Sam Mendes,6.3,4466,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman, A...",Christopher Nolan,7.6,9106,following the death of district attorney harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton,...",Andrew Stanton,6.1,2124,"john carter is a war-weary, former military ca..."


In [947]:
# Preview the first rows of the freshly loaded credits table
credits.head()

In [948]:
# Join the credits columns onto the movies table, matching rows by `title`.
# NOTE(review): `movies` is overwritten, so re-running this cell merges the
# already-merged frame again. Joining on `title` also cross-matches any films
# that share a title — confirm whether a join on the movie id was intended.
movies = movies.merge(credits, on='title')

In [949]:
# Preview the first rows of the merged table
movies.head()

ValueError: operands could not be broadcast together with shapes (4806,4806) (480,480) 

In [950]:
# Columns kept for the recommender: movie_id, title, overview, genres, keywords, cast, crew

In [951]:
# Print a summary of the merged table's columns
movies.info()

In [952]:
# Keep only the columns used to build the recommender.
# `.copy()` makes an independent frame, so the column assignments in later
# cells do not operate on a slice of the merged table (avoids
# SettingWithCopyWarning and ambiguous write-through behaviour).
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']].copy()
movies.head()

In [953]:
# Missing values per column (not rendered: only a cell's last expression is displayed).
movies.isnull().sum()
# Drop rows with missing values and rebuild a contiguous 0..n-1 index.
# Without the reset, the removed rows leave gaps in the index, and row labels
# stop matching row positions — the similarity matrix built later is positional,
# so label-based lookups into it would point at the wrong movie.
movies = movies.dropna().reset_index(drop=True)
# Number of fully duplicated rows (this is the value the cell displays).
movies.duplicated().sum()

In [954]:
# Inspect the `genres` value of the first row
movies.iloc[0].genres

In [955]:
# Example of the raw string stored in `genres` for the first row:
# '[{"id": 28, "name": "Action"},
# {"id": 12, "name": "Adventure"},
# {"id": 14, "name": "Fantasy"},
# {"id": 878, "name": "Science Fiction"}]'

In [956]:
import ast 

In [957]:
def convert(obj):
    """Parse a string holding a literal list of dicts and return their 'name' values.

    Args:
        obj: Text such as '[{"id": 28, "name": "Action"}, ...]'.

    Returns:
        A list with the value under the 'name' key of each entry, in order.

    Raises:
        ValueError / SyntaxError: if `obj` is not a valid Python literal.
        KeyError: if an entry has no 'name' key.
    """
    entries = ast.literal_eval(obj)
    return [entry['name'] for entry in entries]

In [958]:
# Replace the serialized genre/keyword strings by plain lists of names.
for _column in ('genres', 'keywords'):
    movies[_column] = movies[_column].apply(convert)

In [959]:
def convertCast(obj, limit=5):
    """Parse a serialized cast list and return the names of its first entries.

    Args:
        obj: Text holding a literal list of dicts, each with a 'name' key,
            e.g. '[{"name": "Sam Worthington", ...}, ...]'.
        limit: Maximum number of names to return, taken from the start of the
            list. Defaults to 5 (the previous hard-coded value); pass None to
            return every name.

    Returns:
        A list of at most `limit` names, in their original order.

    Raises:
        ValueError / SyntaxError: if `obj` is not a valid Python literal.
        KeyError: if one of the selected entries has no 'name' key.
    """
    # Slicing replaces the manual counter/break loop; entries past `limit`
    # are never inspected, exactly as before.
    return [member['name'] for member in ast.literal_eval(obj)[:limit]]

In [960]:
# Replace each serialized cast string by the list of names convertCast returns for it
movies['cast'] = movies['cast'].apply(convertCast)

In [961]:
def extractDirector(obj):
    """Parse a serialized crew list and return the names of all directors.

    Args:
        obj: Text holding a literal list of dicts, each with 'job' and 'name' keys.

    Returns:
        The 'name' of every entry whose 'job' equals "Director", in order
        (an empty list when there is none).
    """
    crew_members = ast.literal_eval(obj)
    return [member['name'] for member in crew_members if member['job'] == "Director"]

In [962]:
# Replace each serialized crew string by the list of director names
movies['crew'] = movies['crew'].apply(extractDirector)

In [963]:
# Turn each overview text into a list of whitespace-separated words.
movies['overview'] = movies['overview'].apply(str.split)

In [964]:
def _strip_spaces(values):
    """Return `values` with every space removed from each string."""
    return [value.replace(" ", "") for value in values]


# Collapse multi-word entries (e.g. "Science Fiction" -> "ScienceFiction") in
# every list-valued column so each entry becomes a single token.
for _column in ('genres', 'keywords', 'cast', 'crew'):
    movies[_column] = movies[_column].apply(_strip_spaces)

In [965]:
# Preview the table after the list columns were cleaned
movies.head()

In [966]:
# Concatenate the five list columns row by row into one combined token list per movie
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [967]:
# Keep only the identifier, the title and the combined tags.
# `.copy()` makes an independent frame, so the `tags` assignments in the
# following cells do not write into a slice of the wider table (avoids
# SettingWithCopyWarning).
movies = movies[['movie_id', 'title', 'tags']].copy()
movies.head()

In [968]:
# Collapse each token list into a single space-separated string.
movies['tags'] = movies['tags'].apply(" ".join)

In [969]:
# Normalise the tag strings to lower case, then preview the first few.
movies['tags'] = movies['tags'].str.lower()
movies['tags'].head()

In [970]:
import nltk
from nltk.stem.porter import PorterStemmer
# Create the stemmer instance used by `stem`.
# NOTE(review): this rebinds the imported class name `PorterStemmer` to an
# instance; the cell only works because the import directly above restores the
# class each time it runs. A distinct instance name would be clearer.
PorterStemmer = PorterStemmer()

In [971]:
def stem(text):
    """Stem every whitespace-separated token of `text`.

    Each token is passed through the module-level `PorterStemmer.stem` and the
    results are joined back together with single spaces.

    Args:
        text: The string to process.

    Returns:
        The stemmed tokens joined by single spaces ("" for blank input).
    """
    return " ".join(PorterStemmer.stem(token) for token in text.split())

In [972]:
movies['tags'] = movies['tags'].apply(stem)

In [973]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer: vocabulary capped at 5000 features, with the
# built-in English stop-word list removed.
cv = CountVectorizer(max_features=5000,stop_words='english')

In [974]:
# Fit the vectorizer on the tag strings and convert the resulting count matrix
# to a dense array (one row per movie, in the row order of `movies`).
vectors = cv.fit_transform(movies['tags']).toarray()
# Show the count vector of the first movie
vectors[0]

In [975]:
# Vocabulary learned by the vectorizer. It gets its own name so that `cv` keeps
# referring to the fitted CountVectorizer; rebinding `cv` to this array would
# make the fit_transform cell fail when it is run again.
feature_names = cv.get_feature_names_out()
# Spot-check the stemmer on the first movie's tags. `.iloc[0]` selects by
# position, so it works whatever the index labels are.
stem(movies['tags'].iloc[0])

In [976]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between all rows of `vectors`:
# similarity[i][j] compares the movie at row position i with the one at position j.
similarity = cosine_similarity(vectors)

In [977]:
def recommend(movie, top_n=5):
    """Print the titles of the movies most similar to `movie`.

    Relies on the module-level `movies` frame (with a 'title' column) and the
    `similarity` matrix, whose rows/columns follow the row order of `movies`.

    Args:
        movie: Exact title to look up. If several rows share the title, the
            first one is used.
        top_n: Number of recommendations to print. Defaults to 5.

    Raises:
        IndexError: if no row of `movies` has the given title.
    """
    # `similarity` is positional, so find the row *position* of the title.
    # Using the index label here is wrong whenever the index is not a clean
    # 0..n-1 range (e.g. after rows were dropped).
    positions = (movies['title'] == movie).to_numpy().nonzero()[0]
    if len(positions) == 0:
        raise IndexError(f"Movie {movie!r} not found in the dataset")
    movie_pos = int(positions[0])

    # Rank all other movies by similarity, highest first. The query movie is
    # excluded by position rather than by assuming it sorts first, which would
    # fail when another row ties with it at the top score.
    ranked = sorted(
        (pair for pair in enumerate(similarity[movie_pos]) if pair[0] != movie_pos),
        key=lambda pair: pair[1],
        reverse=True,
    )[:top_n]

    for position, _score in ranked:
        print(movies['title'].iloc[position])

In [978]:
# Example: print the recommendations for 'Avatar'
recommend('Avatar')

In [979]:
import pickle

In [980]:
# Persist the movies table as a plain dict. The `with` block guarantees the
# file is flushed and closed, which the bare open(...) never did.
with open('movies_dict.pkl', 'wb') as movies_file:
    pickle.dump(movies.to_dict(), movies_file)

In [981]:
# Persist the similarity matrix. The `with` block guarantees the file is
# flushed and closed, which the bare open(...) never did.
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)