# **Importing Libraries**

In [None]:
import numpy as np
import pandas as pd
import os
import json

# **Downloading Dataset**

In [None]:
# Set Environment Variables
# Credentials must never be hardcoded in a notebook: anything saved here is
# visible to everyone the file is shared with. Values already exported in the
# environment are reused; missing ones are prompted for without echo.
# NOTE: the API key that used to be stored in this cell should be revoked.
from getpass import getpass

for _name in ("KAGGLE_USERNAME", "KAGGLE_KEY"):
  if not os.environ.get(_name):
    os.environ[_name] = getpass(f"{_name}: ")

In [None]:
!kaggle datasets download tmdb/tmdb-movie-metadata

In [None]:
!unzip "/content/tmdb-movie-metadata.zip"

Archive:  /content/tmdb-movie-metadata.zip
replace tmdb_5000_credits.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

# **Data Preprocessing**

Loading Dataset

In [None]:
credits = pd.read_csv("/content/tmdb_5000_credits.csv")
credits.head(2)

In [None]:
credits.shape

In [None]:
movies = pd.read_csv("/content/tmdb_5000_movies.csv")
movies.head(2)

In [None]:
movies.shape

Merging Datasets

In [None]:
# Join on the TMDB id rather than on the title: titles are not unique, so a
# title join pairs a movie with every credits row that shares its name and
# produces spurious extra rows. `title` is dropped from `credits` first so the
# merged frame keeps a single, unsuffixed `title` column for the cells below.
# NOTE(review): assumes `movies` has an `id` column and `credits` a `movie_id`
# column — confirm against the `.head(2)` output of the loading cells.
movies = movies.merge(
    credits.drop(columns="title"),
    left_on="id",
    right_on="movie_id",
)
movies.shape

Removing Unnecessary Columns

In [None]:
movies.columns

In [None]:
# Keep only the columns used to build the tags. .copy() makes this an
# independent frame, so the column assignments in later cells modify it
# directly instead of a view of the merged frame (SettingWithCopyWarning).
movies = movies[["movie_id", "title", "overview", "genres", "keywords", "cast", "crew"]].copy()
movies.head(2)

In [None]:
movies.columns

In [None]:
movies.shape

Handling Missing Values

In [None]:
movies.isnull().sum()

In [None]:
# Drop incomplete rows and renumber the index from 0. Without the reset the
# index labels keep gaps where rows were removed, so label-based lookups
# (e.g. movies["overview"][0], or .index[...] in recommend) stop lining up with
# the row positions used by the similarity matrix and .iloc.
movies = movies.dropna().reset_index(drop=True)

In [None]:
movies.isnull().sum()

Handling Duplicates

In [None]:
movies.duplicated().sum()

Creating Tags

In [None]:
import ast

In [None]:
# genres
movies["genres"].head(1).values

In [None]:
def convert(obj):
  """Parse a string holding a Python-literal list of dicts and return their "name" values.

  Args:
    obj: Text that `ast.literal_eval` can turn into an iterable of dicts,
      each of which has a "name" key.

  Returns:
    A list with the "name" value of every entry, in the original order.
  """
  return [entry["name"] for entry in ast.literal_eval(obj)]

In [None]:
movies["genres"] = movies["genres"].apply(convert)

In [None]:
movies["genres"].iloc[0]

In [None]:
# keywords
movies["keywords"].head(1).values

In [None]:
movies["keywords"] = movies["keywords"].apply(convert)

In [None]:
movies.head(3)

In [None]:
# cast
movies["cast"].head(1).values

In [None]:
def cast_name(obj):
  """Return the "name" of the first three entries in a literal list of dicts.

  Args:
    obj: Text that `ast.literal_eval` can turn into an iterable of dicts
      carrying a "name" key.

  Returns:
    A list of at most three names, in the order they appear in `obj`.
  """
  # zip() stops as soon as range(3) is exhausted, so entries after the third
  # are never touched.
  return [member["name"] for _, member in zip(range(3), ast.literal_eval(obj))]

In [None]:
movies["cast"] = movies["cast"].apply(cast_name)

In [None]:
movies.head(3)

In [None]:
# crew
movies["crew"].head(1).values

In [None]:
def fetch_director(obj):
  """Return the name of the first entry whose "job" is "Director".

  Args:
    obj: Text that `ast.literal_eval` can turn into an iterable of dicts
      with "job" and "name" keys.

  Returns:
    A one-element list with that entry's name, or an empty list when no entry
    has the job "Director".
  """
  for member in ast.literal_eval(obj):
    if member["job"] == "Director":
      # Only the first match is wanted, so stop scanning here.
      return [member["name"]]
  return []

In [None]:
movies["crew"] = movies["crew"].apply(fetch_director)

In [None]:
movies.head(3)

In [None]:
# overview
movies["overview"][0]

In [None]:
movies["overview"] = movies["overview"].apply(lambda x:x.split())

In [None]:
movies.head(3)

In [None]:
# Collapse multi-word entries into single tokens ("Science Fiction" ->
# "ScienceFiction") so each name survives tokenization as one feature.
# - `crew` is included so director names are treated the same way as cast
#   names instead of being split into separate first/last-name tokens.
# - `overview` is left out: it was produced by str.split(), so its tokens
#   cannot contain a space and the replace would be a no-op.
for _col in ["genres", "keywords", "cast", "crew"]:
  movies[_col] = movies[_col].apply(lambda names: [name.replace(" ", "") for name in names])

In [None]:
movies.head(3)

In [None]:
movies["tags"] = movies["overview"] + movies["genres"] + movies["keywords"] + movies["cast"] + movies["crew"]

In [None]:
movies.head(3)

In [None]:
# .copy() detaches the three-column frame from the wider one, so the `tags`
# reassignments in the following cells do not raise SettingWithCopyWarning.
movies = movies[["movie_id", "title", "tags"]].copy()

In [None]:
movies.head(3)

In [None]:
movies["tags"] = movies["tags"].apply(lambda x : " ".join(x))

In [None]:
movies["tags"][0]

In [None]:
movies["tags"] = movies["tags"].apply(lambda x: x.lower())

In [None]:
movies.head()

Stemming

In [None]:
!pip install nltk

In [None]:
import nltk
from nltk.stem.porter import PorterStemmer

In [None]:
ps = PorterStemmer()

In [None]:
def stem(text):
  """Stem each whitespace-separated token of `text` with the notebook-level `ps` stemmer.

  Args:
    text: The string to process.

  Returns:
    The stemmed tokens joined back together with single spaces.
  """
  return " ".join(ps.stem(token) for token in text.split())

In [None]:
movies["tags"] = movies["tags"].apply(stem)

In [None]:
movies.shape

In [None]:
movies.columns

In [None]:
movies.info()

In [None]:
movies.describe()

# **Vectorization**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
cv = CountVectorizer(max_features=5000, stop_words='english')
vectors = cv.fit_transform(movies["tags"]).toarray()
vectors

In [None]:
vectors.shape

In [None]:
features = cv.get_feature_names_out()
# Show a sample only: printing every feature floods the notebook with
# thousands of output lines and buries the narrative.
features[:50]

In [None]:
len(features)

Calculate Similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
similarity = cosine_similarity(vectors)
similarity

In [None]:
similarity.shape

# **Main Function**

In [None]:
def recommend(movie, top_n=5):
  """Print the titles of the movies most similar to `movie`.

  Rows of the notebook-level `similarity` matrix are addressed by row
  *position* in `movies`, so the title is resolved to a position rather than to
  an index label: the two differ as soon as rows have been dropped from
  `movies` without resetting its index.

  Args:
    movie: Exact title to look up in `movies["title"]`. If several rows share
      the title, the first one is used.
    top_n: Number of recommendations to print (default 5).

  Raises:
    IndexError: if no row of `movies` has the given title.
  """
  matches = np.flatnonzero((movies["title"] == movie).to_numpy())
  if matches.size == 0:
    raise IndexError(f"movie {movie!r} not found in movies['title']")
  movie_pos = matches[0]

  scores = similarity[movie_pos]
  ranked = sorted(enumerate(scores), reverse=True, key=lambda pair: pair[1])
  # Exclude the query movie explicitly instead of assuming it sorts first:
  # another movie with an identical tag vector can tie with it.
  picks = [pos for pos, _ in ranked if pos != movie_pos][:top_n]

  for pos in picks:
    print(movies["title"].iloc[pos])

In [None]:
recommend("Avatar")

In [None]:
import pickle as pkl

In [None]:
# The context manager flushes and closes the file even if dump() raises; a
# bare open() leaves the handle to be closed whenever it is garbage-collected.
with open("movies_dict.pkl", "wb") as movies_file:
  pkl.dump(movies.to_dict(), movies_file)

In [None]:
movies["title"].values

In [None]:
# The context manager flushes and closes the file even if dump() raises; a
# bare open() leaves the handle to be closed whenever it is garbage-collected.
with open("similarity.pkl", "wb") as similarity_file:
  pkl.dump(similarity, similarity_file)

In [None]:
def get_poster(movie_id, token=None, timeout=10):
  """Request the TMDB image listing for a movie and return the raw response body.

  Args:
    movie_id: TMDB movie id, interpolated into the `/3/movie/{id}/images` URL.
    token: Bearer token sent in the Authorization header. Defaults to the
      `TMDB_API_TOKEN` environment variable so the secret is never stored in
      the notebook. (The token that used to be hardcoded here should be revoked.)
    timeout: Seconds to wait for the HTTP request before giving up.

  Returns:
    The response body as text.

  Raises:
    RuntimeError: if no token is passed and `TMDB_API_TOKEN` is not set.
    requests.HTTPError: if the server answers with an error status.
  """
  token = token or os.environ.get("TMDB_API_TOKEN")
  if not token:
    raise RuntimeError(
      "TMDB token missing: pass token=... or set the TMDB_API_TOKEN environment variable"
    )

  import requests

  url = f"https://api.themoviedb.org/3/movie/{movie_id}/images"

  headers = {
    "accept": "application/json",
    "Authorization": f"Bearer {token}"
  }

  # Without a timeout a stalled connection would block the notebook forever.
  response = requests.get(url, headers=headers, timeout=timeout)
  # Fail here on 4xx/5xx instead of handing an error payload to the caller.
  response.raise_for_status()

  return response.text

In [None]:
response = get_poster(movie_id=19995)
response

In [None]:
import json
# A movie may have no backdrops at all; fail with a clear message instead of
# an opaque IndexError/KeyError from blind indexing.
backdrops = json.loads(response).get("backdrops") or []
if not backdrops:
  raise ValueError("the image listing contains no backdrops for this movie")
image_id = backdrops[0]["file_path"]
image_id

In [None]:
# Strip any leading "/" from the file path so the URL never ends up with a
# double slash after the size segment.
f"https://image.tmdb.org/t/p/w500/{image_id.lstrip('/')}"