# Importing usefull libraries

In [191]:
import numpy as np # For numerical computation
import pandas as pd # For table related tasks
import ast # For cleaning some columns 

import difflib # For finding similar titles

# For string vector conversion and similarity calculation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# For filtering unnecesary warnings
import warnings

warnings.filterwarnings('ignore')

import pickle # For model saving

# Reading the data

In [52]:
# Let us read the two data using pandas

movie_data = pd.read_csv('../data/tmdb_5000_movies.csv')
credit_data = pd.read_csv('../data/tmdb_5000_credits.csv')

In [53]:
# Let us see the movie data

movie_data.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [54]:
# Let us see what are the columns of movie data

movie_data.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')

In [55]:
# Let us see the credits data

credit_data.head()

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


# EDA

In [56]:
# So now let us merge the credit data and the movie data based on the movie title

movies = movie_data.merge(credit_data,on='title')
movies.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count', 'movie_id', 'cast', 'crew'],
      dtype='object')

In [57]:
# Now let us see how many missing values we have in our dataset

movies.isna().sum()

budget                     0
genres                     0
homepage                3096
id                         0
keywords                   0
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
movie_id                   0
cast                       0
crew                       0
dtype: int64

In [58]:
# Since we have a lot of missing values on the 'homepage' and 'tagline' column lets drop them

movies.drop(columns=['homepage','tagline'],inplace=True)
movies.isna().sum()

budget                  0
genres                  0
id                      0
keywords                0
original_language       0
original_title          0
overview                3
popularity              0
production_companies    0
production_countries    0
release_date            1
revenue                 0
runtime                 2
spoken_languages        0
status                  0
title                   0
vote_average            0
vote_count              0
movie_id                0
cast                    0
crew                    0
dtype: int64

In [59]:
# Now let us drop all rows containing missing values

movies.dropna(inplace=True)
movies.isna().sum()

budget                  0
genres                  0
id                      0
keywords                0
original_language       0
original_title          0
overview                0
popularity              0
production_companies    0
production_countries    0
release_date            0
revenue                 0
runtime                 0
spoken_languages        0
status                  0
title                   0
vote_average            0
vote_count              0
movie_id                0
cast                    0
crew                    0
dtype: int64

In [60]:
# Let us see how many columns we have in our dataset now

len(movies.columns)

21

In [61]:
# Let us take only some of the columns (Feature Selection)

usefull_columns = ['movie_id','title','overview','genres','keywords','cast','crew']
usefull_columns

['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']

In [62]:
# Now let us create a new dataframe with the usefull columns

movies_df = movies[usefull_columns]
movies_df.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [63]:
# Let us see the type of each column data

for col in movies_df.columns:
    print(f'The type of values in {col} is \
{type(movies_df[col][0])}')

The type of values in movie_id is <class 'numpy.int64'>
The type of values in title is <class 'str'>
The type of values in overview is <class 'str'>
The type of values in genres is <class 'str'>
The type of values in keywords is <class 'str'>
The type of values in cast is <class 'str'>
The type of values in crew is <class 'str'>


In [64]:
# Let us do some modification to the keywords column
# It is a list but in the table it is treated as a column
# So let use the ast(Abstract Syntax Tree) module to clean the data
# First let us see how the ast module works

print(type(movies_df.loc[0]['keywords']))
cleaned = ast.literal_eval(movies_df.loc[0]['keywords'])
print(type(cleaned))

<class 'str'>
<class 'list'>


In [65]:
# Let us see the effect of ast

movies_df.loc[0]['keywords']

'[{"id": 1463, "name": "culture clash"}, {"id": 2964, "name": "future"}, {"id": 3386, "name": "space war"}, {"id": 3388, "name": "space colony"}, {"id": 3679, "name": "society"}, {"id": 3801, "name": "space travel"}, {"id": 9685, "name": "futuristic"}, {"id": 9840, "name": "romance"}, {"id": 9882, "name": "space"}, {"id": 9951, "name": "alien"}, {"id": 10148, "name": "tribe"}, {"id": 10158, "name": "alien planet"}, {"id": 10987, "name": "cgi"}, {"id": 11399, "name": "marine"}, {"id": 13065, "name": "soldier"}, {"id": 14643, "name": "battle"}, {"id": 14720, "name": "love affair"}, {"id": 165431, "name": "anti war"}, {"id": 193554, "name": "power relations"}, {"id": 206690, "name": "mind and soul"}, {"id": 209714, "name": "3d"}]'

In [66]:
# Let us the how the ast cleaned the above string

cleaned

[{'id': 1463, 'name': 'culture clash'},
 {'id': 2964, 'name': 'future'},
 {'id': 3386, 'name': 'space war'},
 {'id': 3388, 'name': 'space colony'},
 {'id': 3679, 'name': 'society'},
 {'id': 3801, 'name': 'space travel'},
 {'id': 9685, 'name': 'futuristic'},
 {'id': 9840, 'name': 'romance'},
 {'id': 9882, 'name': 'space'},
 {'id': 9951, 'name': 'alien'},
 {'id': 10148, 'name': 'tribe'},
 {'id': 10158, 'name': 'alien planet'},
 {'id': 10987, 'name': 'cgi'},
 {'id': 11399, 'name': 'marine'},
 {'id': 13065, 'name': 'soldier'},
 {'id': 14643, 'name': 'battle'},
 {'id': 14720, 'name': 'love affair'},
 {'id': 165431, 'name': 'anti war'},
 {'id': 193554, 'name': 'power relations'},
 {'id': 206690, 'name': 'mind and soul'},
 {'id': 209714, 'name': '3d'}]

In [67]:
# So now that we see the effect of the ast module let us do it for our dataframe
# We don't need the id on the keyword we only need the name
# Let us create a function to take all the names and put them in a list

def convert_and_take_name(text):
    
    names = []
    for value in ast.literal_eval(text):
        names.append(value['name'])
    
    return names

In [68]:
# Now let us apply the above function into the keywords column

movies_df['keywords'] = movies_df['keywords'].apply(convert_and_take_name)
movies_df.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[culture clash, future, space war, space colon...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[ocean, drug abuse, exotic island, east india ...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[spy, based on novel, secret agent, sequel, mi...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[dc comics, crime fighter, terrorist, secret i...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[based on novel, mars, medallion, space travel...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [70]:
# For the genres column we also need only the name column so let us do the same

movies_df['genres'] = movies_df['genres'].apply(convert_and_take_name)
movies_df.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [71]:
# Let us also do the same for the cast column

movies_df['cast'] = movies_df['cast'].apply(convert_and_take_name)
movies_df.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weave...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley, ...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux, R...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman, A...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton,...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [74]:
# Let us see how many casts are in our top 10 movies in the data

number_of_casts = []
for i in range(10):
    number_of_casts.append(len(movies_df.loc[i]['cast']))
number_of_casts

[83, 34, 83, 158, 27, 143, 13, 72, 49, 152]

In [75]:
# Since we have different number of casts let us take only the top 4 casts

movies_df['cast'] = movies_df['cast'].apply(lambda x:x[:4])
movies_df.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weave...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley, ...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux, R...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman, A...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton,...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [76]:
# Now let us see if we have common number of casts

number_of_casts = []
for i in range(10):
    number_of_casts.append(len(movies_df.loc[i]['cast']))
number_of_casts

[4, 4, 4, 4, 4, 4, 4, 4, 4, 4]

In [79]:
# Let us see the crew column

movies_df.loc[0]['crew']

'[{"credit_id": "52fe48009251416c750aca23", "department": "Editing", "gender": 0, "id": 1721, "job": "Editor", "name": "Stephen E. Rivkin"}, {"credit_id": "539c47ecc3a36810e3001f87", "department": "Art", "gender": 2, "id": 496, "job": "Production Design", "name": "Rick Carter"}, {"credit_id": "54491c89c3a3680fb4001cf7", "department": "Sound", "gender": 0, "id": 900, "job": "Sound Designer", "name": "Christopher Boyes"}, {"credit_id": "54491cb70e0a267480001bd0", "department": "Sound", "gender": 0, "id": 900, "job": "Supervising Sound Editor", "name": "Christopher Boyes"}, {"credit_id": "539c4a4cc3a36810c9002101", "department": "Production", "gender": 1, "id": 1262, "job": "Casting", "name": "Mali Finn"}, {"credit_id": "5544ee3b925141499f0008fc", "department": "Sound", "gender": 2, "id": 1729, "job": "Original Music Composer", "name": "James Horner"}, {"credit_id": "52fe48009251416c750ac9c3", "department": "Directing", "gender": 2, "id": 2710, "job": "Director", "name": "James Cameron"},

In [84]:
# We need a list not a string so let us use the ast module

cleaned = ast.literal_eval(movies_df.loc[0]['crew'])
cleaned[:5]

[{'credit_id': '52fe48009251416c750aca23',
  'department': 'Editing',
  'gender': 0,
  'id': 1721,
  'job': 'Editor',
  'name': 'Stephen E. Rivkin'},
 {'credit_id': '539c47ecc3a36810e3001f87',
  'department': 'Art',
  'gender': 2,
  'id': 496,
  'job': 'Production Design',
  'name': 'Rick Carter'},
 {'credit_id': '54491c89c3a3680fb4001cf7',
  'department': 'Sound',
  'gender': 0,
  'id': 900,
  'job': 'Sound Designer',
  'name': 'Christopher Boyes'},
 {'credit_id': '54491cb70e0a267480001bd0',
  'department': 'Sound',
  'gender': 0,
  'id': 900,
  'job': 'Supervising Sound Editor',
  'name': 'Christopher Boyes'},
 {'credit_id': '539c4a4cc3a36810c9002101',
  'department': 'Production',
  'gender': 1,
  'id': 1262,
  'job': 'Casting',
  'name': 'Mali Finn'}]

In [86]:
# For our model we need to take only the crew which are directors

directors = []
for crew in cleaned:
    if crew['job'] == 'Director':
        directors.append(crew['name'])
directors

['James Cameron']

In [87]:
# So now let us do this for each column
# Let us create a function to do that

def find_director(text):
    
    directors = []
    for crew in ast.literal_eval(text):
        if crew['job'] == 'Director':
            directors.append(crew['name'])
    
    return directors

In [88]:
# Let us apply the above function to the crew column

movies_df['crew'] = movies_df['crew'].apply(find_director)
movies_df.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weave...",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley, ...",[Gore Verbinski]
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux, R...",[Sam Mendes]
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman, A...",[Christopher Nolan]
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton,...",[Andrew Stanton]


In [94]:
# Let us remove all the spaces in our columns
# But let us keep the overview and title columns 
# Let us create a function to do that

def replace_space(data):
    
    cleaned = []
    
    for value in data:
        cleaned.append(value.replace(' ',''))
        
    return cleaned

In [97]:
# Let us apply the function to the columns

for col in movies_df.columns:
    if col != 'title' and col != 'overview' and col != 'movie_id':
        movies_df[col] = movies_df[col].apply(replace_space)

In [98]:
# Let us check if it was succesful

movies_df.sample(5)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
1070,9981,Kicking & Screaming,Phil Weston has been unathletic his entire lif...,"[Romance, Comedy, Family]","[fathersonrelationship, generationsconfilct, s...","[WillFerrell, RobertDuvall, KateWalsh, Musetta...",[JesseDylan]
3559,91586,Insidious: Chapter 2,The haunted Lambert family seeks to uncover th...,"[Horror, Thriller]","[hauntedhouse, possession, demon, family, ghos...","[PatrickWilson, RoseByrne, TySimpkins, LinShaye]",[JamesWan]
532,2539,Spanglish,Mexican immigrant and single mother Flor Moren...,[Comedy],"[upperclass, mother, singleparent, parentskids...","[AdamSandler, TéaLeoni, PazVega, ClorisLeachman]",[JamesL.Brooks]
4460,11446,Welcome to the Dollhouse,An unattractive 7th grader struggles to cope w...,"[Comedy, Drama]","[parentskidsrelationship, sistersisterrelation...","[HeatherMatarazzo, VictoriaDavis, ChristinaBru...",[ToddSolondz]
1077,11359,The Indian in the Cupboard,A nine-year-old boy gets a plastic Indian and ...,"[Adventure, Family, Fantasy]","[cupboard, games, puppet, parallelworld, toyco...","[HalScardino, Litefoot, LindsayCrouse, Richard...",[FrankOz]


In [102]:
# Let us merge all the columns except title and movie_id into tags column
# But before doing that let us change the oveview string column into list

movies_df['overview'] = movies_df['overview'].apply(lambda x:x.split())
movies_df.sample(5)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
2001,2787,Pitch Black,"[When, their, ship, crash-lands, on, a, remote...","[Thriller, ScienceFiction, Action]","[darkness, dystopia, comet, alienlife-form, su...","[VinDiesel, RadhaMitchell, ColeHauser, LewisFi...",[DavidTwohy]
3866,241766,Lords of London,"[Tony, is, a, notorious, gangster, with, a, bi...","[Crime, Mystery, Thriller]","[wifehusbandrelationship, gangster, mysterious...","[GlenMurphy, RayWinstone, GiovanniCapalbo, Ser...",[AntonioSimoncini]
803,8840,DragonHeart,"[In, an, ancient, time, when, majestic, fire-b...",[Fantasy],"[magic, kingdom, despot, immortality, village,...","[DennisQuaid, DavidThewlis, PetePostlethwaite,...",[RobCohen]
368,76285,Percy Jackson: Sea of Monsters,"[In, their, quest, to, confront, the, ultimate...","[Adventure, Family, Fantasy]","[poison, hermes, poseidon, demigod, goldenflee...","[LoganLerman, AlexandraDaddario, DouglasSmith,...",[ThorFreudenthal]
3940,18892,Jawbreaker,"[3, of, Reagan, High, School's, most, popular,...",[Comedy],"[confession, jealousy, nightmare, ambition, gr...","[RoseMcGowan, JulieBenz, RebeccaGayheart, Caro...",[DarrenStein]


In [103]:
# Now let us combine the columns into one column and drop those columns

movies_df['tags'] = movies_df['overview'] + movies_df['genres'] \
                    + movies_df['keywords'] + movies_df['cast'] \
                    + movies_df['crew']
final_df = movies_df.drop(columns = ['overview','genres','keywords','cast','crew'])
final_df.sample(5)

Unnamed: 0,movie_id,title,tags
289,11688,The Emperor's New Groove,"[Kuzco, is, a, self-centered, emperor, who, su..."
4026,25636,Jesus' Son,"[A, young, man, turns, from, drug, addiction, ..."
2403,205,Hotel Rwanda,"[Inspired, by, true, events,, this, film, take..."
1517,12920,Dreamer: Inspired By a True Story,"[Ben, Crane, believes, that, a, severely, inju..."
29,37724,Skyfall,"[When, Bond's, latest, assignment, goes, grave..."


In [104]:
# Now let us make the list tag into string back

final_df['tags'] = final_df['tags'].apply(lambda x:' '.join(x))
final_df.sample(5)

Unnamed: 0,movie_id,title,tags
360,2253,Valkyrie,"Wounded in Africa during World War II, Nazi Co..."
4558,117942,Girls Gone Dead,A group of six ex-high school cheerleaders are...
2172,577,To Die For,Susan wants to work in television and will the...
2158,38,Eternal Sunshine of the Spotless Mind,"Joel Barish, heartbroken that his girlfriend u..."
4091,55604,Boom Town,McMasters and Sand come to oil towns to get ri...


# Creating the recommendation system

In [113]:
# Let us vectorize the tags column

cv = CountVectorizer(max_features=5000,
                    stop_words='english')
vector = cv.fit_transform(final_df['tags']).toarray()
vector.shape

(4805, 5000)

In [115]:
# Now that we have the vectorized version of our tags let us creat a similarity vector

similarity = cosine_similarity(vector)
similarity.shape

(4805, 4805)

In [129]:
# Let us see how we can get the index of the movie

index_ = final_df[final_df['title'] == 'Superman Returns'].index[0]


10

In [139]:
# Let us see the similarity vector for this index and find the similar movies

similaity_vec = list(enumerate(similarity[index_]))
similarity_vec = sorted(similaity_vec,reverse=True,key = lambda x:x[1])
for i in range(1,10):
    title = final_df.loc[similarity_vec[i][0]]['title']
    print(f'{i} -- {title}')

1 -- Superman II
2 -- Superman III
3 -- Superman IV: The Quest for Peace
4 -- Superman
5 -- Man of Steel
6 -- Avengers: Age of Ultron
7 -- X-Men: Days of Future Past
8 -- Batman v Superman: Dawn of Justice
9 -- The Crow


In [173]:
# Let us create a list of all movie titles

movie_titles = list(final_df['title'].values)
movie_titles[:10]

['Avatar',
 "Pirates of the Caribbean: At World's End",
 'Spectre',
 'The Dark Knight Rises',
 'John Carter',
 'Spider-Man 3',
 'Tangled',
 'Avengers: Age of Ultron',
 'Harry Potter and the Half-Blood Prince',
 'Batman v Superman: Dawn of Justice']

In [175]:
# Let us see how difflib module finds the best match to our passed movie name

movie = 'spider man'
closest_match_to_the_input = difflib.get_close_matches(movie,
                                                       movie_titles)
print(closest_match_to_the_input)
print(f'The first top match is {closest_match_to_the_input[0]}')

['Spider-Man', 'Inside Man', 'Superman']
The first top match is Spider-Man


In [178]:
# Now we are ready to create the recommendation system given a movie title

def recommend(movie):
    
    try:
        
        closest_match_to_the_input = difflib.get_close_matches(movie,
                                                       movie_titles)
        movie = closest_match_to_the_input[0]
        index_ = final_df[final_df['title'] == movie].index[0]
    
        similaity_vec = list(enumerate(similarity[index_]))
        similarity_vec = sorted(similaity_vec,reverse=True,key = lambda x:x[1])

        print('**************************************************')
        print(f'These are the top 10 recommended movies for {movie}:')
        print('**************************************************')
        for i in range(1,10):
            title = final_df.loc[similarity_vec[i][0]]['title']
            print(f'{i} -- {title}')
            
    except:
        print('You passed a movie that is not in our dataset')

In [182]:
sample = final_df.sample(5)
sample

Unnamed: 0,movie_id,title,tags
3475,92591,Bernie,"In this true story in the tiny, rural town of ..."
1822,239571,The Best of Me,A pair of former high school sweethearts reuni...
3035,9362,Tremors,Hick handymen Val McKee and Earl Bassett can b...
4020,24748,Hud,Hud Bannon is a ruthless young man who tarnish...
4187,58492,The Greatest Movie Ever Sold,"A documentary about branding, advertising and ..."


In [183]:
for title in sample['title'].values:
    recommend(title)

**************************************************
These are the top 10 recommended movies for Bernie:
**************************************************
1 -- Oceans
2 -- Keeping Up with the Steins
3 -- Deep Rising
4 -- Guiana 1838
5 -- The Living Wake
6 -- Fabled
7 -- In the Heart of the Sea
8 -- The Life Aquatic with Steve Zissou
9 -- Dogtown and Z-Boys
**************************************************
These are the top 10 recommended movies for The Best of Me:
**************************************************
1 -- Fiza
2 -- Resurrecting the Champ
3 -- Tiger Orange
4 -- Outside Providence
5 -- Over Her Dead Body
6 -- Dumb and Dumberer: When Harry Met Lloyd
7 -- Dick
8 -- My Life Without Me
9 -- Sex Drive
**************************************************
These are the top 10 recommended movies for Tremors:
**************************************************
1 -- The Boy
2 -- Book of Shadows: Blair Witch 2
3 -- Dysfunctional Friends
4 -- Caramel
5 -- The Howling
6 -- House of Wax
7 -

In [184]:
# Let us pass a dummy name and see what it will return

recommend('Micky is programming')

You passed a movie that is not in our dataset


In [190]:
# Let us pass now a correct movie name but with spelling error

recommend('SuPeRr Man')

**************************************************
These are the top 10 recommended movies for Superman:
**************************************************
1 -- Superman II
2 -- Superman Returns
3 -- Superman IV: The Quest for Peace
4 -- Superman III
5 -- Man of Steel
6 -- Iron Man 2
7 -- Iron Man 3
8 -- Batman v Superman: Dawn of Justice
9 -- X-Men: Apocalypse


# Saving the dataframe and similarity index

In [193]:
save_movie ='../models/movie_list.pkl'
save_similarity = '../models/similairty.pkl'

with open(save_movie,'wb') as f:
    pickle.dump(final_df,f)

with open(save_similarity,'wb') as f:
    pickle.dump(similarity,f)