## Import Libraries

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

## Import required files

In [2]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

#### Exploring the imported files

In [3]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [4]:
movies.shape

(4803, 20)

In [5]:
credits.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


## Merge the two files into one

In [6]:
movies = movies.merge(credits,on='title') #joining the two databases on the title column

In [7]:
movies.head(1)
# budget
# homepage
# id
# original_language
# original_title
# popularity
# production_comapny
# production_countries
# release-date

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


## Selecting the necessary columns from to be used for processing

In [8]:
movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']]

In [9]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


## Function to convert the "genres" and "keywords" column values to a more suitable format

In [10]:
import ast

def convert(text):
    L = []
    for i in ast.literal_eval(text):
        L.append(i['name']) 
    return L 

movies.dropna(inplace=True) # drops tuples with non applicable values present 
                            #in any column

movies['genres'] = movies['genres'].apply(convert)
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [11]:
movies['keywords'] = movies['keywords'].apply(convert)
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [12]:
import ast
ast.literal_eval('[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]')

[{'id': 28, 'name': 'Action'},
 {'id': 12, 'name': 'Adventure'},
 {'id': 14, 'name': 'Fantasy'},
 {'id': 878, 'name': 'Science Fiction'}]

## Function to find the names of first three cast members

In [13]:
def convert3(text):
    L = []
    counter = 0
    for i in ast.literal_eval(text):
        if counter < 3:
            L.append(i['name'])
        counter+=1
    return L 

In [14]:
movies['cast'] = movies['cast'].apply(convert3)
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [15]:
movies['cast'][0]

['Sam Worthington', 'Zoe Saldana', 'Sigourney Weaver']

## Function to find the name of the Director from the list of crew members

In [16]:
def fetch_director(text):
    L = []
    for i in ast.literal_eval(text):
        if i['job'] == 'Director':
            L.append(i['name'])
    return L 

movies['crew'] = movies['crew'].apply(fetch_director)

In [17]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]


In [18]:
movies.sample(5)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
3983,325173,Close Range,A rogue soldier turned outlaw is thrust into a...,"[Crime, Action]",[],"[Scott Adkins, Nick Chinlund, Caitlin Keats]",[Isaac Florentine]
973,1255,The Host,A parasitic alien soul is injected into the bo...,"[Action, Adventure, Romance, Science Fiction, ...","[based on novel, mass murder, dystopia, genoci...","[Song Kang-ho, Park Hae-il, Bae Doona]",[Bong Joon-ho]
2665,137,Groundhog Day,"A narcissistic TV weatherman, along with his a...","[Romance, Fantasy, Drama, Comedy]","[deja vu, groundhog, weather forecast, telecas...","[Bill Murray, Andie MacDowell, Chris Elliott]",[Harold Ramis]
3842,310131,The Witch,New England in the 1630s: William and Katherin...,"[Mystery, Horror]","[witch, new england, 17th century]","[Anya Taylor-Joy, Ralph Ineson, Kate Dickie]",[Robert Eggers]
3941,549,Basquiat,Director Julian Schnabel illustrates the portr...,"[Drama, History]","[new york, sex, drug abuse, homeless person, n...","[Jeffrey Wright, David Bowie, Dennis Hopper]",[Julian Schnabel]


## Function to remove in-between spaces from certain columns

In [19]:
def collapse(L):
    L1 = []
    for i in L:
        L1.append(i.replace(" ",""))
    return L1

In [20]:
movies['cast'] = movies['cast'].apply(collapse)
movies['crew'] = movies['crew'].apply(collapse)
movies['genres'] = movies['genres'].apply(collapse)
movies['keywords'] = movies['keywords'].apply(collapse)


In [21]:
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]


## Splitting values in "overview" column into list of words

In [22]:
movies['overview'] = movies['overview'].apply(lambda x:x.split())
movies['overview'].sample(5)

1163    [Lorraine, and, Ed, Warren, travel, to, north,...
4357    [The, relationship, between, Sergeant, Stryker...
3888    [A, young, man, is, rocked, by, two, announcem...
438     [Harry, Sanborn, is, an, aged, music, industry...
3986    [Some, of, the, world's, most, innovative, doc...
Name: overview, dtype: object

## Creating a "tag" column using specific existing columns

In [23]:
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']
movies['tags'].sample(5)

414     [The, Mystery, Inc., gang, have, gone, their, ...
2488    [Trying, to, rescue, her, home, planet, from, ...
151     [6th-century, Scandinavian, warrior,, Beowulf,...
2631    [A, petty, thief, posing, as, an, actor, is, b...
3609    [Officially,, Apollo, 17, was, the, last, mann...
Name: tags, dtype: object

## Creating a new table using suitable columns

In [24]:
new = movies.drop(columns=['overview','genres','keywords','cast','crew'])
new.head(1)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."


## Changing the "tags" column into paragraph

In [25]:
new['tags'] = new['tags'].apply(lambda x: " ".join(x))
new.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."


## Creating the ML model

In [26]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000,stop_words='english')    

In [27]:
vector = cv.fit_transform(new['tags']).toarray()

In [28]:
vector.shape

(4806, 5000)

In [29]:
from sklearn.metrics.pairwise import cosine_similarity

In [30]:
similarity = cosine_similarity(vector)

In [31]:
similarity

array([[1.        , 0.0860309 , 0.05735393, ..., 0.0244558 , 0.0270369 ,
        0.        ],
       [0.0860309 , 1.        , 0.0625    , ..., 0.02665009, 0.        ,
        0.        ],
       [0.05735393, 0.0625    , 1.        , ..., 0.02665009, 0.        ,
        0.        ],
       ...,
       [0.0244558 , 0.02665009, 0.02665009, ..., 1.        , 0.07537784,
        0.0489116 ],
       [0.0270369 , 0.        , 0.        , ..., 0.07537784, 1.        ,
        0.05407381],
       [0.        , 0.        , 0.        , ..., 0.0489116 , 0.05407381,
        1.        ]])

In [32]:
new[new['title'] == 'Batman'].index[0]

1362

## Function to recommend movies in response to the user input

In [33]:
def recommend(movie):
    index = new[new['title'] == movie].index[0]
    distances = sorted(list(enumerate(similarity[index])),reverse=True,key = lambda x: x[1])
    for i in distances[1:6]:
        print(new.iloc[i[0]].title)
        

In [48]:
x=input("Enter Movie Name : ")

Enter Movie Name :  The Avengers


## Calling the "recommend" function

In [49]:
recommend(x)

Avengers: Age of Ultron
Captain America: Civil War
Iron Man 3
Captain America: The First Avenger
Iron Man
