# Importing libraries

In [86]:
import numpy as np
import pandas as pd
import ast
import difflib
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Data Collection and Processing

In [2]:
#loading the data from file
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [3]:
#printing the first row of movies table
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [4]:
#printing the first row of credits table
credits.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [5]:
#merging the 2 tables
movies = movies.merge(credits, on ='title')

In [6]:
#printing the first row of the new table
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [7]:
#Selecting relevant features for the model 
new_features = ['movie_id', 'title', 'genres', 'keywords', 'overview', 'cast', 'crew']
movies = movies[['movie_id', 'title', 'genres', 'keywords', 'overview', 'cast', 'crew']]
movies.head(1)

Unnamed: 0,movie_id,title,genres,keywords,overview,cast,crew
0,19995,Avatar,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","In the 22nd century, a paraplegic Marine is di...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [8]:
#finding null values
movies.isnull().sum()

movie_id    0
title       0
genres      0
keywords    0
overview    3
cast        0
crew        0
dtype: int64

In [9]:
#eradicating the rows containing null values
movies.dropna(inplace=True)
movies.isnull().sum()

movie_id    0
title       0
genres      0
keywords    0
overview    0
cast        0
crew        0
dtype: int64

In [10]:
movies.duplicated().sum()

0

In [11]:
movies.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

# Converting the list of dictionaries to that of strings in each coloumn(features)

In [12]:
def convert(obj):
    List = []
    for i in ast.literal_eval(obj):
        List.append(i['name'])
    return List
    


In [13]:
movies['genres'] = movies['genres'].apply(convert)

In [14]:
movies.head()

Unnamed: 0,movie_id,title,genres,keywords,overview,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","In the 22nd century, a paraplegic Marine is di...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","Captain Barbossa, long believed to be dead, ha...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[Action, Adventure, Crime]","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",A cryptic message from Bond’s past sends him o...,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",Following the death of District Attorney Harve...,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[Action, Adventure, Science Fiction]","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","John Carter is a war-weary, former military ca...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [15]:
movies['keywords'] = movies['keywords'].apply(convert)

In [16]:
# function to select the 3 most imp. cast in a movie
def conv1(obj):
    L = []
    counter = 0
    for i in ast.literal_eval(obj):
        if counter != 3:
           L.append(i['name'])
           counter+=1
        else:
            break
    return L

In [17]:
movies['cast'] = movies['cast'].apply(conv1)

In [18]:
movies.head()

Unnamed: 0,movie_id,title,genres,keywords,overview,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","In the 22nd century, a paraplegic Marine is di...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","Captain Barbossa, long believed to be dead, ha...","[Johnny Depp, Orlando Bloom, Keira Knightley]","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...",A cryptic message from Bond’s past sends him o...,"[Daniel Craig, Christoph Waltz, Léa Seydoux]","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...",Following the death of District Attorney Harve...,"[Christian Bale, Michael Caine, Gary Oldman]","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","John Carter is a war-weary, former military ca...","[Taylor Kitsch, Lynn Collins, Samantha Morton]","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [19]:
movies['crew'][0]

'[{"credit_id": "52fe48009251416c750aca23", "department": "Editing", "gender": 0, "id": 1721, "job": "Editor", "name": "Stephen E. Rivkin"}, {"credit_id": "539c47ecc3a36810e3001f87", "department": "Art", "gender": 2, "id": 496, "job": "Production Design", "name": "Rick Carter"}, {"credit_id": "54491c89c3a3680fb4001cf7", "department": "Sound", "gender": 0, "id": 900, "job": "Sound Designer", "name": "Christopher Boyes"}, {"credit_id": "54491cb70e0a267480001bd0", "department": "Sound", "gender": 0, "id": 900, "job": "Supervising Sound Editor", "name": "Christopher Boyes"}, {"credit_id": "539c4a4cc3a36810c9002101", "department": "Production", "gender": 1, "id": 1262, "job": "Casting", "name": "Mali Finn"}, {"credit_id": "5544ee3b925141499f0008fc", "department": "Sound", "gender": 2, "id": 1729, "job": "Original Music Composer", "name": "James Horner"}, {"credit_id": "52fe48009251416c750ac9c3", "department": "Directing", "gender": 2, "id": 2710, "job": "Director", "name": "James Cameron"},

In [20]:
#function to select only director's name from the crew feature
def director(obj):
    L = []
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            L.append(i['name'])
            break
    return L

In [21]:
movies['crew'] = movies['crew'].apply(director)

In [22]:
movies['overview']=movies['overview'].apply(lambda x:x.split())

In [23]:
movies.head()

Unnamed: 0,movie_id,title,genres,keywords,overview,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[In, the, 22nd, century,, a, paraplegic, Marin...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Captain, Barbossa,, long, believed, to, be, d...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,206647,Spectre,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[A, cryptic, message, from, Bond’s, past, send...","[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Following, the, death, of, District, Attorney...","[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan]
4,49529,John Carter,"[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[John, Carter, is, a, war-weary,, former, mili...","[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton]


In [24]:
#removing space from strings in each coloumn
movies['genres'] = movies['genres'].apply(lambda n:[i.replace(" ","") for i in n])
movies['cast'] = movies['cast'].apply(lambda n:[i.replace(" ","") for i in n])
movies['crew'] = movies['crew'].apply(lambda n:[i.replace(" ","") for i in n])
movies['keywords'] = movies['keywords'].apply(lambda n:[i.replace(" ","") for i in n])

In [25]:
movies.head()

Unnamed: 0,movie_id,title,genres,keywords,overview,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[In, the, 22nd, century,, a, paraplegic, Marin...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[Captain, Barbossa,, long, believed, to, be, d...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski]
2,206647,Spectre,"[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[A, cryptic, message, from, Bond’s, past, send...","[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes]
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[Following, the, death, of, District, Attorney...","[ChristianBale, MichaelCaine, GaryOldman]",[ChristopherNolan]
4,49529,John Carter,"[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[John, Carter, is, a, war-weary,, former, mili...","[TaylorKitsch, LynnCollins, SamanthaMorton]",[AndrewStanton]


In [26]:
#combining all the important features to one coloumn
movies['tags'] = movies['genres'] + movies['overview'] + movies['cast'] + movies['crew'] + movies['keywords']
new_data = movies[['movie_id', 'title', 'tags']]
new_data

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"[Action, Adventure, Fantasy, ScienceFiction, I..."
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action, Captain, Barbossa..."
2,206647,Spectre,"[Action, Adventure, Crime, A, cryptic, message..."
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller, Following, th..."
4,49529,John Carter,"[Action, Adventure, ScienceFiction, John, Cart..."
...,...,...,...
4804,9367,El Mariachi,"[Action, Crime, Thriller, El, Mariachi, just, ..."
4805,72766,Newlyweds,"[Comedy, Romance, A, newlywed, couple's, honey..."
4806,231617,"Signed, Sealed, Delivered","[Comedy, Drama, Romance, TVMovie, ""Signed,, Se..."
4807,126186,Shanghai Calling,"[When, ambitious, New, York, attorney, Sam, is..."


In [27]:
#converting the list to string 
new_data['tags'] = new_data['tags'].apply(lambda n:" ".join(n))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_data['tags'] = new_data['tags'].apply(lambda n:" ".join(n))


In [28]:
new_data.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,Action Adventure Fantasy ScienceFiction In the...
1,285,Pirates of the Caribbean: At World's End,"Adventure Fantasy Action Captain Barbossa, lon..."
2,206647,Spectre,Action Adventure Crime A cryptic message from ...
3,49026,The Dark Knight Rises,Action Crime Drama Thriller Following the deat...
4,49529,John Carter,Action Adventure ScienceFiction John Carter is...


In [53]:
new_data['tags'][0]

'Action Adventure Fantasy ScienceFiction In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. SamWorthington ZoeSaldana SigourneyWeaver JamesCameron cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d'

In [30]:
#converting the test data into vectors
vectorizer = CountVectorizer(max_features = 5000, stop_words = 'english')
vect = vectorizer.fit_transform(new_data['tags']).toarray()
vect[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [31]:
vectorizer.get_feature_names()



['000',
 '007',
 '10',
 '100',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '18th',
 '19',
 '1930s',
 '1940s',
 '1950',
 '1950s',
 '1960s',
 '1970s',
 '1980',
 '1980s',
 '1985',
 '1990s',
 '1999',
 '19th',
 '19thcentury',
 '20',
 '200',
 '2009',
 '20th',
 '24',
 '25',
 '30',
 '300',
 '3d',
 '40',
 '50',
 '500',
 '60',
 '60s',
 '70',
 '70s',
 'aaron',
 'aaroneckhart',
 'abandoned',
 'abducted',
 'abigailbreslin',
 'abilities',
 'ability',
 'able',
 'aboard',
 'abuse',
 'abusive',
 'academy',
 'accept',
 'accepted',
 'accepts',
 'access',
 'accident',
 'accidental',
 'accidentally',
 'accompanied',
 'accomplish',
 'account',
 'accountant',
 'accused',
 'ace',
 'achieve',
 'act',
 'acting',
 'action',
 'actionhero',
 'actions',
 'activist',
 'activities',
 'activity',
 'actor',
 'actors',
 'actress',
 'acts',
 'actual',
 'actually',
 'adam',
 'adams',
 'adamsandler',
 'adamshankman',
 'adaptation',
 'adapted',
 'addict',
 'addicted',
 'addiction',
 'adolescence',
 'adolescent'

# Applying Cosine Similarity Algorithm

In [136]:
movie_name = input(" Enter the movie name : ")

 Enter the movie name : iron man


In [137]:
#creating a list of all movies present in dataset
titles_list = new_data['title'].tolist()
titles_list

['Avatar',
 "Pirates of the Caribbean: At World's End",
 'Spectre',
 'The Dark Knight Rises',
 'John Carter',
 'Spider-Man 3',
 'Tangled',
 'Avengers: Age of Ultron',
 'Harry Potter and the Half-Blood Prince',
 'Batman v Superman: Dawn of Justice',
 'Superman Returns',
 'Quantum of Solace',
 "Pirates of the Caribbean: Dead Man's Chest",
 'The Lone Ranger',
 'Man of Steel',
 'The Chronicles of Narnia: Prince Caspian',
 'The Avengers',
 'Pirates of the Caribbean: On Stranger Tides',
 'Men in Black 3',
 'The Hobbit: The Battle of the Five Armies',
 'The Amazing Spider-Man',
 'Robin Hood',
 'The Hobbit: The Desolation of Smaug',
 'The Golden Compass',
 'King Kong',
 'Titanic',
 'Captain America: Civil War',
 'Battleship',
 'Jurassic World',
 'Skyfall',
 'Spider-Man 2',
 'Iron Man 3',
 'Alice in Wonderland',
 'X-Men: The Last Stand',
 'Monsters University',
 'Transformers: Revenge of the Fallen',
 'Transformers: Age of Extinction',
 'Oz: The Great and Powerful',
 'The Amazing Spider-Man 2',

In [156]:
search_closest_movie = difflib.get_close_matches(movie_name, titles_list)
search_closest_movie

['Shutter Island', 'The Island', 'Cutthroat Island']

In [154]:
closest_movie = search_closest_movie[0]
close_match

'Batman'

In [140]:
#Finding the index of the given movie
movie_index = new_data[new_data['title'] == close_match].index[0]
movie_index

68

In [141]:
#getting similarity scores matrix
similar = cosine_similarity(vect)
similar

array([[1.        , 0.08964215, 0.05976143, ..., 0.02519763, 0.02817181,
        0.        ],
       [0.08964215, 1.        , 0.0625    , ..., 0.02635231, 0.        ,
        0.        ],
       [0.05976143, 0.0625    , 1.        , ..., 0.02635231, 0.        ,
        0.        ],
       ...,
       [0.02519763, 0.02635231, 0.02635231, ..., 1.        , 0.0745356 ,
        0.04836508],
       [0.02817181, 0.        , 0.        , ..., 0.0745356 , 1.        ,
        0.05407381],
       [0.        , 0.        , 0.        , ..., 0.04836508, 0.05407381,
        1.        ]])

In [142]:
#getting list of simmilar movies 
similarity_score = list(enumerate(similar[movie_index]))
similarity_score

[(0, 0.13522468075656266),
 (1, 0.10606601717798213),
 (2, 0.07071067811865475),
 (3, 0.047140452079103175),
 (4, 0.09258200997725516),
 (5, 0.11202240672224079),
 (6, 0.026726124191242442),
 (7, 0.38758511609996354),
 (8, 0.03651483716701107),
 (9, 0.12060453783110546),
 (10, 0.17407765595569785),
 (11, 0.06405126152203486),
 (12, 0.11338934190276817),
 (13, 0.05601120336112039),
 (14, 0.16903085094570333),
 (15, 0.06030226891555273),
 (16, 0.2919985580353726),
 (17, 0.10160010160015241),
 (18, 0.053452248382484885),
 (19, 0.051639777949432225),
 (20, 0.06882472016116853),
 (21, 0.07559289460184544),
 (22, 0.04170288281141496),
 (23, 0.07559289460184544),
 (24, 0.06666666666666667),
 (25, 0.0),
 (26, 0.304255531702266),
 (27, 0.140028008402801),
 (28, 0.11141720290623112),
 (29, 0.05601120336112039),
 (30, 0.17320508075688776),
 (31, 0.42339019740572564),
 (32, 0.026967994498529685),
 (33, 0.24494897427831788),
 (34, 0.04082482904638631),
 (35, 0.12247448713915893),
 (36, 0.0937042571

In [143]:
# sorting the movies on the bais of smilarity score
sorted(similarity_score, reverse = True, key = lambda n:n[1])[1:6]

[(79, 0.46852128566581824),
 (31, 0.42339019740572564),
 (7, 0.38758511609996354),
 (26, 0.304255531702266),
 (16, 0.2919985580353726)]

In [152]:
# function to print the name of recommended movies
def recommend(movie_name):
    distances = similar[movie_index]
    list_of_movies = sorted(similarity_score, reverse = True, key = lambda n:n[1])[1:6]
    for i in list_of_movies:
        print(new_data.iloc[i[0]].title)
    

In [153]:
# printing the name of recommended movies
recommend(movie_name)

Gone Girl
Miracle at St. Anna
Black Book
Catch-22
The Girl with the Dragon Tattoo


# Movie Recommender System

In [157]:
movie_name = input(" Enter the movie name : ")
titles_list = new_data['title'].tolist()
search_closest_movie = difflib.get_close_matches(movie_name, titles_list)
closest_movie = search_closest_movie[0]
movie_index = new_data[new_data['title'] == closest_movie].index[0]
similar = cosine_similarity(vect)
similarity_score = list(enumerate(similar[movie_index]))

def recommend(movie_name):
    distances = similar[movie_index]
    list_of_movies = sorted(similarity_score, reverse = True, key = lambda n:n[1])[1:11]
    for i in list_of_movies:
        print(new_data.iloc[i[0]].title)

recommend(movie_name)      

 Enter the movie name : krish
Girl 6
Harrison Montgomery
My Lucky Star
Inside Deep Throat
Incident at Loch Ness
The Stewardesses
A Beginner's Guide to Snuff
Smilla's Sense of Snow
Roll Bounce
Movie 43
