In [1]:
#Import Libraries
import pandas as pd #data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np #linear algebra


In [2]:
# Import dataset
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [3]:
movies.head(2) #show first two rows 

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500


In [4]:
credits.head(2)#show first two rows 

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [5]:
# Merge movie and credit dataset
movie_df = movies.merge(credits, on= 'title')
movie_df.head(2) #show first two rows 

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,285,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [6]:
# Distribution of Original_language
import matplotlib.pyplot as plt
movie_df['original_language'].value_counts()

en    4510
fr      70
es      32
zh      27
de      27
hi      19
ja      16
it      14
ko      12
cn      12
ru      11
pt       9
da       7
sv       5
nl       4
fa       4
th       3
he       3
ta       2
cs       2
ro       2
id       2
ar       2
vi       1
sl       1
ps       1
no       1
ky       1
hu       1
pl       1
af       1
nb       1
tr       1
is       1
xx       1
te       1
el       1
Name: original_language, dtype: int64

Observations:
1. Budget does not create any value to our model. Viewer does not watch the movie on the basis of budget. So, we drop the feature.
2. Homepage is an irrelevent feature.
3. Original_language feature have more than 95% of value as 'English'. So, dropping the feature does not create any obstacle.
4. Original title and Ttile are almost same. But, sometime, the original title could be on native language. So, among these two, we keep only Title dropping Original title.
5. Popularity could be an important feature. But, the way we are going to create a recommender ( through tags), we do not take this feature.
6. Release date could be a factor. But, we are ignoring it this time.
7. Revenue is not important.
8. Ryn-time is also an irrelevent feature.
9. Tagline is also not relevent because we can see that movie title and tagline do not match all the time. So, we drop it.

In [7]:
# Filter relevent features
movie_final = movie_df[['id', 'genres','keywords', 'overview','production_companies','status','title','cast','crew']]
movie_final.head(2)

Unnamed: 0,id,genres,keywords,overview,production_companies,status,title,cast,crew
0,19995,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","In the 22nd century, a paraplegic Marine is di...","[{""name"": ""Ingenious Film Partners"", ""id"": 289...",Released,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","Captain Barbossa, long believed to be dead, ha...","[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",Released,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [8]:
# Check missing values
movie_final.isna().sum() #show sum of missing values 

id                      0
genres                  0
keywords                0
overview                3
production_companies    0
status                  0
title                   0
cast                    0
crew                    0
dtype: int64

In [9]:
# Drop missing values
movie_final.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_final.dropna(inplace=True)


In [10]:
# Check for duplicates 
movie_final.duplicated().sum() #show sum of duplicates values

0

In [11]:
#show only first row fo genres column
movie_final.iloc[0]['genres']

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [12]:
import ast
# Function to extract name
def convert(obj):
    L=[]
    for i in ast.literal_eval(obj): #ast.literal_eval() convert str into list
        L.append(i['name'])
    return L
    

In [13]:
# Apply convert function to genres
movie_final['genres'] = movie_final['genres'].apply(convert)
movie_final.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_final['genres'] = movie_final['genres'].apply(convert)


Unnamed: 0,id,genres,keywords,overview,production_companies,status,title,cast,crew
0,19995,"[Action, Adventure, Fantasy, Science Fiction]","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","In the 22nd century, a paraplegic Marine is di...","[{""name"": ""Ingenious Film Partners"", ""id"": 289...",Released,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,"[Adventure, Fantasy, Action]","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","Captain Barbossa, long believed to be dead, ha...","[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",Released,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [14]:
movie_final.iloc[0]['keywords']

'[{"id": 1463, "name": "culture clash"}, {"id": 2964, "name": "future"}, {"id": 3386, "name": "space war"}, {"id": 3388, "name": "space colony"}, {"id": 3679, "name": "society"}, {"id": 3801, "name": "space travel"}, {"id": 9685, "name": "futuristic"}, {"id": 9840, "name": "romance"}, {"id": 9882, "name": "space"}, {"id": 9951, "name": "alien"}, {"id": 10148, "name": "tribe"}, {"id": 10158, "name": "alien planet"}, {"id": 10987, "name": "cgi"}, {"id": 11399, "name": "marine"}, {"id": 13065, "name": "soldier"}, {"id": 14643, "name": "battle"}, {"id": 14720, "name": "love affair"}, {"id": 165431, "name": "anti war"}, {"id": 193554, "name": "power relations"}, {"id": 206690, "name": "mind and soul"}, {"id": 209714, "name": "3d"}]'

In [15]:
movie_final['keywords'] = movie_final['keywords'].apply(convert)
movie_final.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_final['keywords'] = movie_final['keywords'].apply(convert)


Unnamed: 0,id,genres,keywords,overview,production_companies,status,title,cast,crew
0,19995,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","In the 22nd century, a paraplegic Marine is di...","[{""name"": ""Ingenious Film Partners"", ""id"": 289...",Released,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","Captain Barbossa, long believed to be dead, ha...","[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",Released,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [16]:
movie_final.iloc[0]['overview']

'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization.'

In [17]:
# String to list 
movie_final['overview'] = movie_final['overview'].apply(lambda x:x.split())
movie_final.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_final['overview'] = movie_final['overview'].apply(lambda x:x.split())


Unnamed: 0,id,genres,keywords,overview,production_companies,status,title,cast,crew
0,19995,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[In, the, 22nd, century,, a, paraplegic, Marin...","[{""name"": ""Ingenious Film Partners"", ""id"": 289...",Released,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Captain, Barbossa,, long, believed, to, be, d...","[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",Released,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [18]:
movie_final.iloc[0]['production_companies']

'[{"name": "Ingenious Film Partners", "id": 289}, {"name": "Twentieth Century Fox Film Corporation", "id": 306}, {"name": "Dune Entertainment", "id": 444}, {"name": "Lightstorm Entertainment", "id": 574}]'

In [19]:
movie_final['production_companies']=movie_final['production_companies'].apply(convert)
movie_final.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_final['production_companies']=movie_final['production_companies'].apply(convert)


Unnamed: 0,id,genres,keywords,overview,production_companies,status,title,cast,crew
0,19995,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[In, the, 22nd, century,, a, paraplegic, Marin...","[Ingenious Film Partners, Twentieth Century Fo...",Released,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Captain, Barbossa,, long, believed, to, be, d...","[Walt Disney Pictures, Jerry Bruckheimer Films...",Released,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [20]:
def convert_cast(obj):
    L=[]
    counter = 0
    for i in ast.literal_eval(obj):
        if counter !=3: # Limit 3 only
            L.append(i['name'])
            counter +=1
    return L

In [21]:
# Apply convert_cast function on cast column
movie_final['cast']=movie_final['cast'].apply(convert_cast)
movie_final.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_final['cast']=movie_final['cast'].apply(convert_cast)


Unnamed: 0,id,genres,keywords,overview,production_companies,status,title,cast,crew
0,19995,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[In, the, 22nd, century,, a, paraplegic, Marin...","[Ingenious Film Partners, Twentieth Century Fo...",Released,Avatar,"[Sam Worthington, Zoe Saldana, Sigourney Weaver]","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Captain, Barbossa,, long, believed, to, be, d...","[Walt Disney Pictures, Jerry Bruckheimer Films...",Released,Pirates of the Caribbean: At World's End,"[Johnny Depp, Orlando Bloom, Keira Knightley]","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [22]:
movie_final.iloc[0]['crew']

'[{"credit_id": "52fe48009251416c750aca23", "department": "Editing", "gender": 0, "id": 1721, "job": "Editor", "name": "Stephen E. Rivkin"}, {"credit_id": "539c47ecc3a36810e3001f87", "department": "Art", "gender": 2, "id": 496, "job": "Production Design", "name": "Rick Carter"}, {"credit_id": "54491c89c3a3680fb4001cf7", "department": "Sound", "gender": 0, "id": 900, "job": "Sound Designer", "name": "Christopher Boyes"}, {"credit_id": "54491cb70e0a267480001bd0", "department": "Sound", "gender": 0, "id": 900, "job": "Supervising Sound Editor", "name": "Christopher Boyes"}, {"credit_id": "539c4a4cc3a36810c9002101", "department": "Production", "gender": 1, "id": 1262, "job": "Casting", "name": "Mali Finn"}, {"credit_id": "5544ee3b925141499f0008fc", "department": "Sound", "gender": 2, "id": 1729, "job": "Original Music Composer", "name": "James Horner"}, {"credit_id": "52fe48009251416c750ac9c3", "department": "Directing", "gender": 2, "id": 2710, "job": "Director", "name": "James Cameron"},

In [23]:
#Funtion for extract Director Name from Crew Column
def fetch_director(obj):
    L=[]
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            L.append(i['name'])
    return L

In [24]:
#Applay Director_Name function on Crew Column for Extract Director Name
movie_final['crew'] = movie_final['crew'].apply(fetch_director)
movie_final.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_final['crew'] = movie_final['crew'].apply(fetch_director)


Unnamed: 0,id,genres,keywords,overview,production_companies,status,title,cast,crew
0,19995,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[In, the, 22nd, century,, a, paraplegic, Marin...","[Ingenious Film Partners, Twentieth Century Fo...",Released,Avatar,"[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Captain, Barbossa,, long, believed, to, be, d...","[Walt Disney Pictures, Jerry Bruckheimer Films...",Released,Pirates of the Caribbean: At World's End,"[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]


In [25]:
movie_final.head(2)

Unnamed: 0,id,genres,keywords,overview,production_companies,status,title,cast,crew
0,19995,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[In, the, 22nd, century,, a, paraplegic, Marin...","[Ingenious Film Partners, Twentieth Century Fo...",Released,Avatar,"[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Captain, Barbossa,, long, believed, to, be, d...","[Walt Disney Pictures, Jerry Bruckheimer Films...",Released,Pirates of the Caribbean: At World's End,"[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]


In [26]:
movie_final.sample(5)

Unnamed: 0,id,genres,keywords,overview,production_companies,status,title,cast,crew
1813,36355,"[Adventure, Family]","[dolphin, florida, florida keys, summer]","[Sandy, Ricks, is, sent, by, his, mom, to, Cor...","[Universal Pictures, The Bubble Factory, Ameri...",Released,Flipper,"[Elijah Wood, Paul Hogan, Jonathan Banks]",[Alan Shapiro]
1498,8055,"[Drama, Romance]","[germany, war crimes, trial, female prisoner, ...","[The, story, of, Michael, Berg,, a, German, la...","[Studio Babelsberg, Filmförderanstalt (FFA), T...",Released,The Reader,"[Kate Winslet, Ralph Fiennes, David Kross]",[Stephen Daldry]
3197,29514,"[Horror, Thriller]","[cat, photographer, nightmare, hallucination, ...","[Two, horror, segments, based, on, Edgar, Alla...",[],Released,Two Evil Eyes,"[Adrienne Barbeau, Harvey Keitel, Ramy Zada]","[George A. Romero, Dario Argento]"
4743,64973,[Documentary],[],"[This, video, shows, how, the, foreign, policy...",[],Released,"Peace, Propaganda & the Promised Land",[],[Sut Jhally]
1413,10066,[Horror],"[american football, traffic jam, remake, murde...","[A, group, of, unwitting, teens, are, stranded...","[Village Roadshow Pictures, Dark Castle Entert...",Released,House of Wax,"[Elisha Cuthbert, Paris Hilton, Brian Van Holt]",[Jaume Collet-Serra]


In [27]:
# Removing spaces between words
movie_final['genres'] = movie_final['genres'].apply(lambda x :[i.replace(" ", "") for i in x])
movie_final['keywords'] = movie_final['keywords'].apply(lambda x :[i.replace(" ", "") for i in x])
movie_final['overview'] = movie_final['overview'].apply(lambda x :[i.replace(" ", "") for i in x])
movie_final['production_companies'] = movie_final['production_companies'].apply(lambda x :[i.replace(" ", "") for i in x])
movie_final['cast'] = movie_final['cast'].apply(lambda x :[i.replace(" ", "") for i in x])
movie_final['crew'] = movie_final['crew'].apply(lambda x :[i.replace(" ", "") for i in x])
movie_final.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_final['genres'] = movie_final['genres'].apply(lambda x :[i.replace(" ", "") for i in x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_final['keywords'] = movie_final['keywords'].apply(lambda x :[i.replace(" ", "") for i in x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_final

Unnamed: 0,id,genres,keywords,overview,production_companies,status,title,cast,crew
0,19995,"[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[In, the, 22nd, century,, a, paraplegic, Marin...","[IngeniousFilmPartners, TwentiethCenturyFoxFil...",Released,Avatar,"[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]
1,285,"[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[Captain, Barbossa,, long, believed, to, be, d...","[WaltDisneyPictures, JerryBruckheimerFilms, Se...",Released,Pirates of the Caribbean: At World's End,"[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski]


We have removed the spaces between words. Sometime, we could get double words of same name. At that time, our model may get confused and recommed wrong movie.

In [28]:
# Create new feature named 'tags'
movie_final['tags'] =  movie_final['genres'] +  movie_final['keywords']+ movie_final['overview']+ movie_final['production_companies']+ movie_final['cast']+ movie_final['crew']
movie_final.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_final['tags'] =  movie_final['genres'] +  movie_final['keywords']+ movie_final['overview']+ movie_final['production_companies']+ movie_final['cast']+ movie_final['crew']


Unnamed: 0,id,genres,keywords,overview,production_companies,status,title,cast,crew,tags
0,19995,"[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[In, the, 22nd, century,, a, paraplegic, Marin...","[IngeniousFilmPartners, TwentiethCenturyFoxFil...",Released,Avatar,"[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],"[Action, Adventure, Fantasy, ScienceFiction, c..."
1,285,"[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[Captain, Barbossa,, long, believed, to, be, d...","[WaltDisneyPictures, JerryBruckheimerFilms, Se...",Released,Pirates of the Caribbean: At World's End,"[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski],"[Adventure, Fantasy, Action, ocean, drugabuse,..."


In [29]:
# Convert tags column from list to string
movie_final['tags'] =movie_final['tags'].apply(lambda x:" ".join(x))
movie_final.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_final['tags'] =movie_final['tags'].apply(lambda x:" ".join(x))


Unnamed: 0,id,genres,keywords,overview,production_companies,status,title,cast,crew,tags
0,19995,"[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[In, the, 22nd, century,, a, paraplegic, Marin...","[IngeniousFilmPartners, TwentiethCenturyFoxFil...",Released,Avatar,"[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],Action Adventure Fantasy ScienceFiction cultur...
1,285,"[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[Captain, Barbossa,, long, believed, to, be, d...","[WaltDisneyPictures, JerryBruckheimerFilms, Se...",Released,Pirates of the Caribbean: At World's End,"[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski],Adventure Fantasy Action ocean drugabuse exoti...


In [30]:
# add status and titile into tags column
movie_final['tags'] =movie_final['tags'] + " " + movie_final['status'] + " " + movie_final['title']
movie_final.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_final['tags'] =movie_final['tags'] + " " + movie_final['status'] + " " + movie_final['title']


Unnamed: 0,id,genres,keywords,overview,production_companies,status,title,cast,crew,tags
0,19995,"[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[In, the, 22nd, century,, a, paraplegic, Marin...","[IngeniousFilmPartners, TwentiethCenturyFoxFil...",Released,Avatar,"[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],Action Adventure Fantasy ScienceFiction cultur...
1,285,"[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[Captain, Barbossa,, long, believed, to, be, d...","[WaltDisneyPictures, JerryBruckheimerFilms, Se...",Released,Pirates of the Caribbean: At World's End,"[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski],Adventure Fantasy Action ocean drugabuse exoti...


In [31]:
movie_final.iloc[0]['tags']

'Action Adventure Fantasy ScienceFiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. IngeniousFilmPartners TwentiethCenturyFoxFilmCorporation DuneEntertainment LightstormEntertainment SamWorthington ZoeSaldana SigourneyWeaver JamesCameron Released Avatar'

In [32]:
# Select required features only
new_df = movie_final[['id', 'title', 'tags']]
new_df.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,Action Adventure Fantasy ScienceFiction cultur...
1,285,Pirates of the Caribbean: At World's End,Adventure Fantasy Action ocean drugabuse exoti...
2,206647,Spectre,Action Adventure Crime spy basedonnovel secret...
3,49026,The Dark Knight Rises,Action Crime Drama Thriller dccomics crimefigh...
4,49529,John Carter,Action Adventure ScienceFiction basedonnovel m...


In [33]:
# Convert to lower case
new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())
new_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())


Unnamed: 0,id,title,tags
0,19995,Avatar,action adventure fantasy sciencefiction cultur...
1,285,Pirates of the Caribbean: At World's End,adventure fantasy action ocean drugabuse exoti...
2,206647,Spectre,action adventure crime spy basedonnovel secret...
3,49026,The Dark Knight Rises,action crime drama thriller dccomics crimefigh...
4,49529,John Carter,action adventure sciencefiction basedonnovel m...


In [34]:
#convert Same Words into single word (love, loving, loved convert into love)
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [35]:
def stem(text):
    y = []
    
    for i in text.split():
        y.append(ps.stem(i))
        
    return " ".join(y)

In [36]:
new_df['tags'] = new_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(stem)


In [37]:
#Convert Tags into Vactors (2D)
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000,stop_words='english')

In [38]:
vector = cv.fit_transform(new_df['tags']).toarray()

In [39]:
vector.shape

(4806, 5000)

In [40]:
# Find Consine Similarity (angle) between two vactors (movies)
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vector)

In [41]:
similarity

array([[1.        , 0.09639254, 0.10559274, ..., 0.06375767, 0.02105847,
        0.0216574 ],
       [0.09639254, 1.        , 0.07824608, ..., 0.06299408, 0.04161252,
        0.04279605],
       [0.10559274, 0.07824608, 1.        , ..., 0.04600437, 0.02279212,
        0.02344036],
       ...,
       [0.06375767, 0.06299408, 0.04600437, ..., 1.        , 0.05504819,
        0.07548514],
       [0.02105847, 0.04161252, 0.02279212, ..., 0.05504819, 1.        ,
        0.0934947 ],
       [0.0216574 , 0.04279605, 0.02344036, ..., 0.07548514, 0.0934947 ,
        1.        ]])

In [42]:
#Fatch Index 
index= new_df[new_df['title'] == 'Ramanujan'].index[0]
index

2277

In [43]:
#Function for Recommend Movies 
def recommend(movie_name):
    movie_index= new_df[new_df['title'] == movie_name].index[0]
    distances = sorted(list(enumerate(similarity[index])),reverse=True,key = lambda x: x[1]) #enumerate() function adds a counter as the key of the enumerate object.
    for i in distances[1:6]:
        print(new_df.iloc[i[0]].title)

In [44]:
recommend('Gandhi')

A Beautiful Mind
Son of God
Le Havre
School for Scoundrels
Hesher


In [45]:
import pickle
pickle.dump(new_df,open('movie_list.pkl','wb'))
pickle.dump(similarity,open('similarity.pkl','wb'))