In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

In [2]:
movie_df = pd.read_csv("movies.csv")
movie_keywords = pd.read_csv("keywords.csv")
movie_credits = pd.read_csv("credits.csv")

In [3]:
movie_df = movie_df[['id','original_title','overview','genres']]
movie_df.head(5)

Unnamed: 0,id,original_title,overview,genres
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[{'id': 16, 'name': 'Animation'}, {'id': 35, '..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '..."
2,949,Heat,"Obsessive master thief, Neil McCauley leads a ...","[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam..."
3,710,GoldenEye,James Bond must unmask the mysterious head of ...,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '..."
4,524,Casino,The life of the gambling paradise ‚Äì Las Vega...,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name..."


In [4]:
movie_df["title"] = movie_df["original_title"].copy()
movie_df.reset_index(inplace=True, drop=True)

In [5]:
movie_keywords.head(5)

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [6]:
movie_credits.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 25, 'character': 'Lt. Vincent Han...","[{'credit_id': '52fe4292c3a36847f802916d', 'de...",949
3,"[{'cast_id': 1, 'character': 'James Bond', 'cr...","[{'credit_id': '52fe426ec3a36847f801e14b', 'de...",710
4,"[{'cast_id': 4, 'character': ""Sam 'Ace' Rothst...","[{'credit_id': '52fe424dc3a36847f80139cd', 'de...",524


In [7]:
movie_df["id"] = movie_df["id"].astype(int)
df = pd.merge(movie_df, movie_keywords, on="id")

In [8]:
df = pd.merge(df, movie_credits, on="id")

In [9]:
df.isna().sum()

id                0
original_title    0
overview          0
genres            0
title             0
keywords          0
cast              0
crew              0
dtype: int64

In [10]:
df["overview"] = df["overview"].fillna("[]")

In [11]:
df["genres"] = df["genres"].apply(lambda x: [i["name"] for i in eval(x)])
df["genres"] = df["genres"].apply(lambda x: ' '.join([i.replace(" ","") for i in x]))

In [12]:
df["keywords"] = df["keywords"].apply(lambda x: [i["name"] for i in eval(x)])
df["keywords"] = df["keywords"].apply(lambda x: ' '.join([i.replace(" ","") for i in x]))

In [13]:
df.head(5)

Unnamed: 0,id,original_title,overview,genres,title,keywords,cast,crew
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...",Animation Comedy Family,Toy Story,jealousy toy boy friendship friends rivalry bo...,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,Adventure Fantasy Family,Jumanji,boardgame disappearance basedonchildren'sbook ...,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."
2,949,Heat,"Obsessive master thief, Neil McCauley leads a ...",Action Crime Drama Thriller,Heat,robbery detective bank obsession chase shootin...,"[{'cast_id': 25, 'character': 'Lt. Vincent Han...","[{'credit_id': '52fe4292c3a36847f802916d', 'de..."
3,710,GoldenEye,James Bond must unmask the mysterious head of ...,Adventure Action Thriller,GoldenEye,cuba falselyaccused secretidentity computervir...,"[{'cast_id': 1, 'character': 'James Bond', 'cr...","[{'credit_id': '52fe426ec3a36847f801e14b', 'de..."
4,524,Casino,The life of the gambling paradise ‚Äì Las Vega...,Drama Crime,Casino,poker drugabuse 1970s overdose illegalprostitu...,"[{'cast_id': 4, 'character': ""Sam 'Ace' Rothst...","[{'credit_id': '52fe424dc3a36847f80139cd', 'de..."


In [14]:
from ast import literal_eval

In [15]:
df["director"] = df["crew"].apply(literal_eval).apply(lambda x: [i["name"] for i in x if i["job"]=="Director"])
df["director"] = df["director"].apply(lambda x: ' '.join([i.replace(" ","") for i in x]))

In [16]:
df.drop("crew", axis=1, inplace=True)

In [17]:
df["cast"] = df["cast"].apply(lambda x: [i["name"][:5] for i in eval(x)])
df["cast"] = df["cast"].apply(lambda x: ' '.join([i.replace(" ","") for i in x]))

In [18]:
#add overview, genres, original_title, keywords, cast , and director column 3 times to put more emphasis on director's role
df["tags"] = df["overview"] + " " + df["genres"] + " " + df["original_title"] + " " + df["keywords"] + " " + df["cast"] + " " + df["director"] + " " +  df["director"] + " " + df["director"]

In [19]:
df["tags"][0]

"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences. Animation Comedy Family Toy Story jealousy toy boy friendship friends rivalry boynextdoor newtoy toycomestolife TomH TimA DonR JimV Walla John Annie John Erik Lauri R.Le Sarah Penn JohnLasseter JohnLasseter JohnLasseter"

In [20]:
df.drop(columns=["overview", "original_title", "keywords", "cast"], axis=1, inplace=True)
df.head()

Unnamed: 0,id,genres,title,director,tags
0,862,Animation Comedy Family,Toy Story,JohnLasseter,"Led by Woody, Andy's toys live happily in his ..."
1,8844,Adventure Fantasy Family,Jumanji,JoeJohnston,When siblings Judy and Peter discover an encha...
2,949,Action Crime Drama Thriller,Heat,MichaelMann,"Obsessive master thief, Neil McCauley leads a ..."
3,710,Adventure Action Thriller,GoldenEye,MartinCampbell,James Bond must unmask the mysterious head of ...
4,524,Drama Crime,Casino,MartinScorsese,The life of the gambling paradise ‚Äì Las Vega...


In [21]:
df.isnull().sum()

id          0
genres      0
title       0
director    0
tags        0
dtype: int64

In [22]:
df.drop_duplicates(inplace=True)

In [23]:
tfidf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
vectorized_df = tfidf.fit_transform(df["tags"])

In [24]:
vectorized_df.shape

(1699, 139064)

In [25]:
cosine_sim = cosine_similarity(vectorized_df)

In [26]:
cosine_sim 

array([[1.        , 0.00778013, 0.        , ..., 0.        , 0.0025239 ,
        0.01334829],
       [0.00778013, 1.        , 0.00977925, ..., 0.01320638, 0.0024184 ,
        0.00442256],
       [0.        , 0.00977925, 1.        , ..., 0.00485414, 0.00985989,
        0.01010123],
       ...,
       [0.        , 0.01320638, 0.00485414, ..., 1.        , 0.02800013,
        0.00699531],
       [0.0025239 , 0.0024184 , 0.00985989, ..., 0.02800013, 1.        ,
        0.00905847],
       [0.01334829, 0.00442256, 0.01010123, ..., 0.00699531, 0.00905847,
        1.        ]])

In [27]:
def recommendation_title(title):
    id_recom = df[df["title"]==title].index[0]
    distances = cosine_sim[id_recom]
    top_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x:x[1])[1:6]
    movie_list = []
    ids = []
    for i in top_list:
        movie_list.append(df.iloc[i[0]].title)
        ids.append(df.iloc[i[0]].id)
    return movie_list

In [28]:
recommendation_title("Toy Story")

['Toy Story 2', 'Toy Story 3', "A Bug's Life", 'Cars', 'Cars 2']

In [29]:
def recommendation_genre(genre):
    id_recom = df[df["genres"]==genre].index[0]
    distances = cosine_sim[id_recom]
    top_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x:x[1])[1:6]
    movie_list = []
    ids = []
    for i in top_list:
        movie_list.append(df.iloc[i[0]].title)
        ids.append(df.iloc[i[0]].id)
    return movie_list

In [30]:
recommendation_genre("Adventure")

['Waterworld',
 'Robin Hood',
 'Robin Hood',
 'Tomorrow Never Dies',
 'Rogue One: A Star Wars Story']

In [31]:
def recommendation_director(director):
    id_recom = df[df["director"]==director].index[0]
    distances = cosine_sim[id_recom]
    top_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x:x[1])[1:6]
    movie_list = []
    ids = []
    for i in top_list:
        movie_list.append(df.iloc[i[0]].title)
        ids.append(df.iloc[i[0]].id)
    return movie_list

In [32]:
recommendation_director("SimonWells")

['The Prince of Egypt',
 'Hot Tub Time Machine',
 'Cloud Atlas',
 'Back to the Future Part III',
 'Men in Black 3']

In [33]:
import pickle

In [34]:
pickle.dump(df.to_dict(),open('movies.pkl','wb'))

In [35]:
pickle.dump(cosine_sim,open('similarity.pkl','wb'))

In [36]:
recommendation_title("Toy Story")

['Toy Story 2', 'Toy Story 3', "A Bug's Life", 'Cars', 'Cars 2']

In [38]:
recommendation_genre("Comedy")

['Grown Ups',
 'Big Daddy',
 'Jack and Jill',
 'Just Go with It',
 'I Now Pronounce You Chuck & Larry']

In [39]:
recommendation_director("SimonWells")

['The Prince of Egypt',
 'Hot Tub Time Machine',
 'Cloud Atlas',
 'Back to the Future Part III',
 'Men in Black 3']