# Basic Setup

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [2]:
!unzip -qq credits.zip

In [3]:
# Reading all the csv files as Data Frames
links = pd.read_csv('links.csv')
movies_metadata = pd.read_csv('movies_metadata.csv')
keywords = pd.read_csv('keywords.csv')
credits = pd.read_csv('credits.csv')

# Pre Processing Data

Here we analyse each file, drop null and duplicated values, merge dataframes and streamline columns.

## Links

In [4]:
links.info()  # tmdbId - 219 values null (movies)
links.isnull().sum()
# We drop the entries for which tmdb values are missing
links = links.dropna(subset=['tmdbId'])
convertDict = {'tmdbId' : int}
links = links.astype(convertDict)
links.rename(columns={'tmdbId': 'id'}, inplace=True)
links

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45843 entries, 0 to 45842
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  45843 non-null  int64  
 1   imdbId   45843 non-null  int64  
 2   tmdbId   45624 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 1.0 MB


Unnamed: 0,movieId,imdbId,id
0,1,114709,862
1,2,113497,8844
2,3,113228,15602
3,4,114885,31357
4,5,113041,11862
...,...,...,...
45838,176269,6209470,439050
45839,176271,2028550,111109
45840,176273,303758,67758
45841,176275,8536,227506


## Credits and Keywords

In [5]:
credits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45476 entries, 0 to 45475
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   cast    45476 non-null  object
 1   crew    45476 non-null  object
 2   id      45476 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.0+ MB


In [6]:
keywords.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46419 entries, 0 to 46418
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        46419 non-null  int64 
 1   keywords  46419 non-null  object
dtypes: int64(1), object(1)
memory usage: 725.4+ KB


Merge Credits, links and keywords dataframe on the basis of id

In [7]:
credits_df = credits.merge(keywords, on = 'id')

In [8]:
credits_df = credits_df.merge(links, on = 'id')

In [9]:
credits_df

Unnamed: 0,cast,crew,id,keywords,movieId,imdbId
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...",1,114709
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1...",2,113497
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392...",3,113228
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357,"[{'id': 818, 'name': 'based on novel'}, {'id':...",4,114885
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...",5,113041
...,...,...,...,...,...,...
46623,"[{'cast_id': 0, 'character': '', 'credit_id': ...","[{'credit_id': '5894a97d925141426c00818c', 'de...",439050,"[{'id': 10703, 'name': 'tragic love'}]",176269,6209470
46624,"[{'cast_id': 1002, 'character': 'Sister Angela...","[{'credit_id': '52fe4af1c3a36847f81e9b15', 'de...",111109,"[{'id': 2679, 'name': 'artist'}, {'id': 14531,...",176271,2028550
46625,"[{'cast_id': 6, 'character': 'Emily Shaw', 'cr...","[{'credit_id': '52fe4776c3a368484e0c8387', 'de...",67758,[],176273,303758
46626,"[{'cast_id': 2, 'character': '', 'credit_id': ...","[{'credit_id': '533bccebc3a36844cf0011a7', 'de...",227506,[],176275,8536


## Movies Meta dataset
Main movies database

In [10]:
movies_metadata

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45461,False,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n...",http://www.imdb.com/title/tt6209470/,439050,tt6209470,fa,رگ خواب,Rising and falling between a man and woman.,...,,0.0,90.0,"[{'iso_639_1': 'fa', 'name': 'فارسی'}]",Released,Rising and falling between a man and woman,Subdue,False,4.0,1.0
45462,False,,0,"[{'id': 18, 'name': 'Drama'}]",,111109,tt2028550,tl,Siglo ng Pagluluwal,An artist struggles to finish his work while a...,...,2011-11-17,0.0,360.0,"[{'iso_639_1': 'tl', 'name': ''}]",Released,,Century of Birthing,False,9.0,3.0
45463,False,,0,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",,67758,tt0303758,en,Betrayal,"When one of her hits goes wrong, a professiona...",...,2003-08-01,0.0,90.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A deadly game of wits.,Betrayal,False,3.8,6.0
45464,False,,0,[],,227506,tt0008536,en,Satana likuyushchiy,"In a small town live two brothers, one a minis...",...,1917-10-21,0.0,87.0,[],Released,,Satan Triumphant,False,0.0,0.0


In [11]:
movies_metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

1. Removing "tt0" and "tt" from the imdb id's
2. Dropping non important columns

For content based recommender we keep only these features
*   Genres
*   id
* overview
* title

We drop some outliers here

In [12]:
movies_metadata['imdb_id'] = movies_metadata['imdb_id'].str.replace("tt0", '')
movies_metadata['imdb_id'] = movies_metadata['imdb_id'].str.replace("tt", '')

Most of the movies are already released as we can see from the value_counts.

In [13]:
movies_metadata.status.value_counts()

status
Released           45014
Rumored              230
Post Production       98
In Production         20
Planned               15
Canceled               2
Name: count, dtype: int64

Deleting the non Released entries from the database

In [14]:
indices = list(movies_metadata[movies_metadata.status != 'Released'].index)
movies_metadata = movies_metadata.drop(index = indices)

In [15]:
movies_metadata[movies_metadata.adult == 'True']

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
19489,True,,0,"[{'id': 27, 'name': 'Horror'}]",,5422,79642,it,Le notti erotiche dei morti viventi,A sailor takes an American businessman and his...,...,1980-11-18,0.0,112.0,"[{'iso_639_1': 'it', 'name': 'Italiano'}]",Released,,Erotic Nights of the Living Dead,False,2.2,7.0
28701,True,,0,"[{'id': 80, 'name': 'Crime'}]",http://www.imdb.com/title/tt0119931/,169158,119931,en,Standoff,The FBI and Texas police make ready to storm t...,...,1998-01-11,0.0,91.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Trust Will Get You Killed,Standoff,False,4.8,5.0
31934,True,,0,"[{'id': 35, 'name': 'Comedy'}]",,44781,322232,cn,發電悄嬌娃,Electrical Girl centers around a horny young w...,...,2001-04-26,0.0,89.0,"[{'iso_639_1': 'cn', 'name': '广州话 / 廣州話'}]",Released,,Electrical Girl,False,0.0,0.0
32113,True,,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",http://www.dietofsex.com/,324230,3094816,es,Diet of Sex,Ágata suffers from a psychological disorder wh...,...,2014-02-14,0.0,72.0,"[{'iso_639_1': 'es', 'name': 'Español'}]",Released,"Comedy, food, drama and sex, a lot of sex",Diet of Sex,False,4.0,12.0
39901,True,,0,"[{'id': 80, 'name': 'Crime'}, {'id': 27, 'name...",,35731,1161951,en,Amateur Porn Star Killer 2,Shane Ryan's sequel to the disturbing Amateur ...,...,2008-05-13,0.0,0.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Humiliation. Rape. Murder. You know the drill.,Amateur Porn Star Killer 2,False,6.3,8.0
39902,True,,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...",,55774,1153101,en,The Band,Australian film about a fictional sub-par Aust...,...,2009-11-17,0.0,90.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"Sex, drugs and Rock 'n Roll",The Band,False,3.3,7.0
40574,True,,0,"[{'id': 80, 'name': 'Crime'}, {'id': 27, 'name...",,39922,70696,da,Dværgen,Olaf and his mother run a boarding house and a...,...,1973-12-20,0.0,92.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Mother of all Dwarfsploitation films!,The Sinful Dwarf,False,5.8,13.0
41009,True,,750000,"[{'id': 53, 'name': 'Thriller'}, {'id': 80, 'n...",,376004,4044464,en,Adulterers,A man who returns home to find his wife cheati...,...,2016-01-05,0.0,80.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Love. Betrayal. Vengeance,Adulterers,False,5.2,16.0


Since only 8 movies have adult rating, we can drop those movies from the dataset, and then drop the adult column, because now its redundant.

In [16]:
indices = list(movies_metadata[movies_metadata.adult == 'True'].index)
movies_metadata = movies_metadata.drop(index = indices)

In [17]:
movie_df = movies_metadata[['id', 'title', 'overview', 'genres']]
convertDict = {'id' : int}
movie_df = movie_df.astype(convertDict)
movie_df = movie_df.merge(credits_df, on = 'id')

In [18]:
movie_df

Unnamed: 0,id,title,overview,genres,cast,crew,keywords,movieId,imdbId
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...",1,114709
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...","[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1...",2,113497
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...","[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392...",3,113228
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[{'id': 818, 'name': 'based on novel'}, {'id':...",4,114885
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,"[{'id': 35, 'name': 'Comedy'}]","[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...",5,113041
...,...,...,...,...,...,...,...,...,...
46425,439050,Subdue,Rising and falling between a man and woman.,"[{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n...","[{'cast_id': 0, 'character': '', 'credit_id': ...","[{'credit_id': '5894a97d925141426c00818c', 'de...","[{'id': 10703, 'name': 'tragic love'}]",176269,6209470
46426,111109,Century of Birthing,An artist struggles to finish his work while a...,"[{'id': 18, 'name': 'Drama'}]","[{'cast_id': 1002, 'character': 'Sister Angela...","[{'credit_id': '52fe4af1c3a36847f81e9b15', 'de...","[{'id': 2679, 'name': 'artist'}, {'id': 14531,...",176271,2028550
46427,67758,Betrayal,"When one of her hits goes wrong, a professiona...","[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...","[{'cast_id': 6, 'character': 'Emily Shaw', 'cr...","[{'credit_id': '52fe4776c3a368484e0c8387', 'de...",[],176273,303758
46428,227506,Satan Triumphant,"In a small town live two brothers, one a minis...",[],"[{'cast_id': 2, 'character': '', 'credit_id': ...","[{'credit_id': '533bccebc3a36844cf0011a7', 'de...",[],176275,8536


In [19]:
movie_df.isnull().sum()

id            0
title         0
overview    960
genres        0
cast          0
crew          0
keywords      0
movieId       0
imdbId        0
dtype: int64

In [20]:
movie_df.dropna(inplace = True) # 960 movies with no overview dropped

In [21]:
movie_df.duplicated().sum() # 1371 rows dupicated

1371

In [22]:
movie_df.drop_duplicates(inplace = True)

# Pre Processing Final Dataset

In [23]:
movie_df.iloc[0].genres

"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"

'[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]'
<br>We have to convert it to this from -> ['Animation', 'Comedy', 'Family']

For that we can define a convert function, which loops through each element (dictionary) in the list and returns the name element of each dictionary. The problem is, each input is a string of list, which we resolve usinf ast.literal_eval()

In [24]:
import ast
ast.literal_eval("[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]")

[{'id': 16, 'name': 'Animation'},
 {'id': 35, 'name': 'Comedy'},
 {'id': 10751, 'name': 'Family'}]

In [25]:
def convert(obj):
  L = []
  for i in ast.literal_eval(obj):
    L.append(i['name'])
  return L

In [26]:
convert(movie_df.iloc[0].genres)

['Animation', 'Comedy', 'Family']

In [27]:
movie_df['genres'] = movie_df['genres'].astype(str)
movie_df.genres = movie_df.genres.apply(convert)

In [28]:
convert(movie_df.iloc[0].keywords)

['jealousy',
 'toy',
 'boy',
 'friendship',
 'friends',
 'rivalry',
 'boy next door',
 'new toy',
 'toy comes to life']

In [29]:
movie_df.keywords = movie_df.keywords.astype(str)
movie_df.keywords = movie_df.keywords.apply(convert)

In [30]:
movie_df

Unnamed: 0,id,title,overview,genres,cast,crew,keywords,movieId,imdbId
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[Animation, Comedy, Family]","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[jealousy, toy, boy, friendship, friends, riva...",1,114709
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,"[Adventure, Fantasy, Family]","[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[board game, disappearance, based on children'...",2,113497
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[Romance, Comedy]","[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[fishing, best friend, duringcreditsstinger, o...",3,113228
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","[Comedy, Drama, Romance]","[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[based on novel, interracial relationship, sin...",4,114885
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,[Comedy],"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[baby, midlife crisis, confidence, aging, daug...",5,113041
...,...,...,...,...,...,...,...,...,...
46425,439050,Subdue,Rising and falling between a man and woman.,"[Drama, Family]","[{'cast_id': 0, 'character': '', 'credit_id': ...","[{'credit_id': '5894a97d925141426c00818c', 'de...",[tragic love],176269,6209470
46426,111109,Century of Birthing,An artist struggles to finish his work while a...,[Drama],"[{'cast_id': 1002, 'character': 'Sister Angela...","[{'credit_id': '52fe4af1c3a36847f81e9b15', 'de...","[artist, play, pinoy]",176271,2028550
46427,67758,Betrayal,"When one of her hits goes wrong, a professiona...","[Action, Drama, Thriller]","[{'cast_id': 6, 'character': 'Emily Shaw', 'cr...","[{'credit_id': '52fe4776c3a368484e0c8387', 'de...",[],176273,303758
46428,227506,Satan Triumphant,"In a small town live two brothers, one a minis...",[],"[{'cast_id': 2, 'character': '', 'credit_id': ...","[{'credit_id': '533bccebc3a36844cf0011a7', 'de...",[],176275,8536


In [31]:
movie_df.iloc[0].cast

"[{'cast_id': 14, 'character': 'Woody (voice)', 'credit_id': '52fe4284c3a36847f8024f95', 'gender': 2, 'id': 31, 'name': 'Tom Hanks', 'order': 0, 'profile_path': '/pQFoyx7rp09CJTAb932F2g8Nlho.jpg'}, {'cast_id': 15, 'character': 'Buzz Lightyear (voice)', 'credit_id': '52fe4284c3a36847f8024f99', 'gender': 2, 'id': 12898, 'name': 'Tim Allen', 'order': 1, 'profile_path': '/uX2xVf6pMmPepxnvFWyBtjexzgY.jpg'}, {'cast_id': 16, 'character': 'Mr. Potato Head (voice)', 'credit_id': '52fe4284c3a36847f8024f9d', 'gender': 2, 'id': 7167, 'name': 'Don Rickles', 'order': 2, 'profile_path': '/h5BcaDMPRVLHLDzbQavec4xfSdt.jpg'}, {'cast_id': 17, 'character': 'Slinky Dog (voice)', 'credit_id': '52fe4284c3a36847f8024fa1', 'gender': 2, 'id': 12899, 'name': 'Jim Varney', 'order': 3, 'profile_path': '/eIo2jVVXYgjDtaHoF19Ll9vtW7h.jpg'}, {'cast_id': 18, 'character': 'Rex (voice)', 'credit_id': '52fe4284c3a36847f8024fa5', 'gender': 2, 'id': 12900, 'name': 'Wallace Shawn', 'order': 4, 'profile_path': '/oGE6JqPP2xH4t

Extracting the names of the top 3 actors

In [40]:
def convert3(obj):
  L = []
  count = 0
  for i in ast.literal_eval(obj):
    if count < 3:
      L.append(i['name'])
      count += 1
    else:
      break
  return L

In [41]:
convert3(movie_df.iloc[0].cast)

ValueError: malformed node or string: ['Tom Hanks', 'Tim Allen', 'Don Rickles']

In [None]:
movie_df.cast = movie_df.cast.apply(convert3)

Extract the name of the Director

In [42]:
def convertCrew(obj):
  L = []
  for i in ast.literal_eval(obj):
    if i['job'] == 'Director':
      L.append(i['name'])
      break
  return L

In [43]:
convertCrew(movie_df.iloc[0].crew)

ValueError: malformed node or string: ['John Lasseter']

In [44]:
movie_df.crew = movie_df.crew.apply(convertCrew)

ValueError: malformed node or string: ['John Lasseter']

In [45]:
movie_df

Unnamed: 0,id,title,overview,genres,cast,crew,keywords,movieId,imdbId
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[Animation, Comedy, Family]","[Tom Hanks, Tim Allen, Don Rickles]",[John Lasseter],"[jealousy, toy, boy, friendship, friends, riva...",1,114709
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,"[Adventure, Fantasy, Family]","[Robin Williams, Jonathan Hyde, Kirsten Dunst]",[Joe Johnston],"[board game, disappearance, based on children'...",2,113497
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[Romance, Comedy]","[Walter Matthau, Jack Lemmon, Ann-Margret]",[Howard Deutch],"[fishing, best friend, duringcreditsstinger, o...",3,113228
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","[Comedy, Drama, Romance]","[Whitney Houston, Angela Bassett, Loretta Devine]",[Forest Whitaker],"[based on novel, interracial relationship, sin...",4,114885
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,[Comedy],"[Steve Martin, Diane Keaton, Martin Short]",[Charles Shyer],"[baby, midlife crisis, confidence, aging, daug...",5,113041
...,...,...,...,...,...,...,...,...,...
46425,439050,Subdue,Rising and falling between a man and woman.,"[Drama, Family]","[Leila Hatami, Kourosh Tahami, Elham Korda]",[Hamid Nematollah],[tragic love],176269,6209470
46426,111109,Century of Birthing,An artist struggles to finish his work while a...,[Drama],"[Angel Aquino, Perry Dizon, Hazel Orencio]",[Lav Diaz],"[artist, play, pinoy]",176271,2028550
46427,67758,Betrayal,"When one of her hits goes wrong, a professiona...","[Action, Drama, Thriller]","[Erika Eleniak, Adam Baldwin, Julie du Page]",[Mark L. Lester],[],176273,303758
46428,227506,Satan Triumphant,"In a small town live two brothers, one a minis...",[],"[Iwan Mosschuchin, Nathalie Lissenko, Pavel Pa...",[Yakov Protazanov],[],176275,8536


In [46]:
movie_df.iloc[0].overview

"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences."

In [47]:
movie_df.overview = movie_df.overview.apply(lambda x: x.split())

In [48]:
movie_df

Unnamed: 0,id,title,overview,genres,cast,crew,keywords,movieId,imdbId
0,862,Toy Story,"[Led, by, Woody,, Andy's, toys, live, happily,...","[Animation, Comedy, Family]","[Tom Hanks, Tim Allen, Don Rickles]",[John Lasseter],"[jealousy, toy, boy, friendship, friends, riva...",1,114709
1,8844,Jumanji,"[When, siblings, Judy, and, Peter, discover, a...","[Adventure, Fantasy, Family]","[Robin Williams, Jonathan Hyde, Kirsten Dunst]",[Joe Johnston],"[board game, disappearance, based on children'...",2,113497
2,15602,Grumpier Old Men,"[A, family, wedding, reignites, the, ancient, ...","[Romance, Comedy]","[Walter Matthau, Jack Lemmon, Ann-Margret]",[Howard Deutch],"[fishing, best friend, duringcreditsstinger, o...",3,113228
3,31357,Waiting to Exhale,"[Cheated, on,, mistreated, and, stepped, on,, ...","[Comedy, Drama, Romance]","[Whitney Houston, Angela Bassett, Loretta Devine]",[Forest Whitaker],"[based on novel, interracial relationship, sin...",4,114885
4,11862,Father of the Bride Part II,"[Just, when, George, Banks, has, recovered, fr...",[Comedy],"[Steve Martin, Diane Keaton, Martin Short]",[Charles Shyer],"[baby, midlife crisis, confidence, aging, daug...",5,113041
...,...,...,...,...,...,...,...,...,...
46425,439050,Subdue,"[Rising, and, falling, between, a, man, and, w...","[Drama, Family]","[Leila Hatami, Kourosh Tahami, Elham Korda]",[Hamid Nematollah],[tragic love],176269,6209470
46426,111109,Century of Birthing,"[An, artist, struggles, to, finish, his, work,...",[Drama],"[Angel Aquino, Perry Dizon, Hazel Orencio]",[Lav Diaz],"[artist, play, pinoy]",176271,2028550
46427,67758,Betrayal,"[When, one, of, her, hits, goes, wrong,, a, pr...","[Action, Drama, Thriller]","[Erika Eleniak, Adam Baldwin, Julie du Page]",[Mark L. Lester],[],176273,303758
46428,227506,Satan Triumphant,"[In, a, small, town, live, two, brothers,, one...",[],"[Iwan Mosschuchin, Nathalie Lissenko, Pavel Pa...",[Yakov Protazanov],[],176275,8536


In [49]:
movie_df.genres = movie_df.genres.apply(lambda x: [i.replace(" ", "_") for i in x])
movie_df.keywords = movie_df.keywords.apply(lambda x: [i.replace(" ", "_") for i in x])
movie_df.cast = movie_df.cast.apply(lambda x: [i.replace(" ", "_") for i in x])
movie_df.crew = movie_df.crew.apply(lambda x: [i.replace(" ", "_") for i in x])

In [50]:
movie_df["tags"] = movie_df.overview + movie_df.genres + movie_df.keywords + movie_df.cast + movie_df.crew

# Movie Database - With Tags

In [51]:
movie_df = movie_df[['id', 'title', 'tags']]

In [52]:
movie_df

Unnamed: 0,id,title,tags
0,862,Toy Story,"[Led, by, Woody,, Andy's, toys, live, happily,..."
1,8844,Jumanji,"[When, siblings, Judy, and, Peter, discover, a..."
2,15602,Grumpier Old Men,"[A, family, wedding, reignites, the, ancient, ..."
3,31357,Waiting to Exhale,"[Cheated, on,, mistreated, and, stepped, on,, ..."
4,11862,Father of the Bride Part II,"[Just, when, George, Banks, has, recovered, fr..."
...,...,...,...
46425,439050,Subdue,"[Rising, and, falling, between, a, man, and, w..."
46426,111109,Century of Birthing,"[An, artist, struggles, to, finish, his, work,..."
46427,67758,Betrayal,"[When, one, of, her, hits, goes, wrong,, a, pr..."
46428,227506,Satan Triumphant,"[In, a, small, town, live, two, brothers,, one..."


In [53]:
movie_df.drop_duplicates(subset = 'title', inplace = True)

In [54]:
movie_df

Unnamed: 0,id,title,tags
0,862,Toy Story,"[Led, by, Woody,, Andy's, toys, live, happily,..."
1,8844,Jumanji,"[When, siblings, Judy, and, Peter, discover, a..."
2,15602,Grumpier Old Men,"[A, family, wedding, reignites, the, ancient, ..."
3,31357,Waiting to Exhale,"[Cheated, on,, mistreated, and, stepped, on,, ..."
4,11862,Father of the Bride Part II,"[Just, when, George, Banks, has, recovered, fr..."
...,...,...,...
46423,222848,Caged Heat 3000,"[It's, the, year, 3000, AD., The, world's, mos..."
46425,439050,Subdue,"[Rising, and, falling, between, a, man, and, w..."
46426,111109,Century of Birthing,"[An, artist, struggles, to, finish, his, work,..."
46428,227506,Satan Triumphant,"[In, a, small, town, live, two, brothers,, one..."


In [55]:
movie_df.tags = movie_df.tags.apply(lambda x: " ".join(x))

In [56]:
# Convert the tags to lowercase - Recommended practice
movie_df.tags = movie_df.tags.apply(lambda x: x.lower())

In [57]:
movie_df

Unnamed: 0,id,title,tags
0,862,Toy Story,"led by woody, andy's toys live happily in his ..."
1,8844,Jumanji,when siblings judy and peter discover an encha...
2,15602,Grumpier Old Men,a family wedding reignites the ancient feud be...
3,31357,Waiting to Exhale,"cheated on, mistreated and stepped on, the wom..."
4,11862,Father of the Bride Part II,just when george banks has recovered from his ...
...,...,...,...
46423,222848,Caged Heat 3000,it's the year 3000 ad. the world's most danger...
46425,439050,Subdue,rising and falling between a man and woman. dr...
46426,111109,Century of Birthing,an artist struggles to finish his work while a...
46428,227506,Satan Triumphant,"in a small town live two brothers, one a minis..."


# Stemming the tags

In [58]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [59]:
ps.stem('loving'), ps.stem('love'), ps.stem('loved')

('love', 'love', 'love')

In [60]:
def stemmer(text):
  t = []
  for i in text.split():
    t.append(ps.stem(i))
  res = " ".join(t)
  return res

In [61]:
movie_df.tags = movie_df.tags.apply(stemmer)

# Text Vectorization
We convert text to a vector using the concept of Bag of Words
1. Concatenate all the tags into a single string
2. Take out the top 5000 (any number) words in this string on the basis of frequency
3. Now calculate the frequency of each of these words in each movie. Thus, you get a vector.
4. Now we recommned the 5 closest vectors to a particular vector

For vectorization we donot consider stop words like are, from, an, to ,in ,the etc.

In [62]:
m1, m2 = movie_df[:20000], movie_df[20000:]
m1

Unnamed: 0,id,title,tags
0,862,Toy Story,"led by woody, andy' toy live happili in hi roo..."
1,8844,Jumanji,when sibl judi and peter discov an enchant boa...
2,15602,Grumpier Old Men,a famili wed reignit the ancient feud between ...
3,31357,Waiting to Exhale,"cheat on, mistreat and step on, the women are ..."
4,11862,Father of the Bride Part II,just when georg bank ha recov from hi daughter...
...,...,...,...
21585,197602,Death Is a Woman,narcot agent denni goe undercov in the mediter...
21586,182127,Ip Man: The Final Fight,"in postwar hong kong, legendari wing chun gran..."
21587,199420,Pawn Shop Chronicles,"the stori of a miss wife, a coupl of meth head..."
21588,77663,Killing Season,"two veteran of the bosnian war, one american, ..."


In [63]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000, stop_words = 'english')

In [64]:
cv

In [65]:
from sklearn.metrics.pairwise import cosine_similarity

# Recommendation Function

In [66]:
movie = input("Enter the name of the movie you recently watched: ")
index = movie_df[movie_df.title == movie].index[0]
if(index > 20000):
  df = m2
else:
  df = m1
print(index)

Enter the name of the movie you recently watched: Avatar
14882


In [67]:
v = cv.fit_transform(df['tags']).toarray()

In [68]:
similarity = cosine_similarity(v)

In [69]:
movies_list = sorted(list(enumerate(similarity[index])), reverse = True, key = lambda x: x[1])[1:11]
print(f"Since you watched {movie}, you may also like:-")
for i in movies_list:
  print(movie_df.iloc[i[0]].title)

Since you watched Avatar, you may also like:-
Shakedown
Disorganized Crime
Showtime
Code of Silence
My Lucky Stars
48 Hrs.
The Thieves
Flash Point
PTU
Mad Detective
