### Loading Datasets

In [1]:
import pandas as pd
import numpy as np

In [2]:
movies = pd.read_csv('datasets/movies.csv')
credits = pd.read_csv('datasets/credits.csv')

In [3]:
print(movies.shape)
movies.head(2)

(4803, 20)


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500


In [4]:
print(credits.shape)
credits.head(2)

(4803, 4)


Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


### Preprocessing Datasets

In [5]:
# Merging movies and credits datasets using 'title' as common column
movies = movies.merge(credits, on='title')
print(movies.shape)

(4809, 23)


In [6]:
movies.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count', 'movie_id', 'cast', 'crew'],
      dtype='object')

In [7]:
# Choosing specific columns for content filtering based recommendation
movies = movies[['id', 'title', 'overview', 'genres',
                 'keywords', 'popularity', 'cast', 'crew']]
movies.head()

Unnamed: 0,id,title,overview,genres,keywords,popularity,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",150.437577,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",139.082615,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",107.376788,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",112.31295,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",43.926995,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [8]:
# Missing values check
movies.isnull().sum()

id            0
title         0
overview      3
genres        0
keywords      0
popularity    0
cast          0
crew          0
dtype: int64

In [9]:
# Remove missing(NaN) rows
movies.dropna(inplace=True)

In [10]:
# Duplicated values check
movies.duplicated().sum()

0

In [11]:
# Remove duplicated rows
movies.drop_duplicates(inplace=True)

In [12]:
# Display integer location based index of genres in movies df
movies.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [13]:
# Feature Extraction of genres from list of dictionaries format
# ['Action','Adventure','Fantasy','Science Fiction']

import ast

def extract(obj):
    x = []
    for i in ast.literal_eval(obj):
        x.append(i['name'])
    return x

In [14]:
movies['genres'] = movies['genres'].apply(extract)
movies['genres']

0       [Action, Adventure, Fantasy, Science Fiction]
1                        [Adventure, Fantasy, Action]
2                          [Action, Adventure, Crime]
3                    [Action, Crime, Drama, Thriller]
4                [Action, Adventure, Science Fiction]
                            ...                      
4804                        [Action, Crime, Thriller]
4805                                [Comedy, Romance]
4806               [Comedy, Drama, Romance, TV Movie]
4807                                               []
4808                                    [Documentary]
Name: genres, Length: 4806, dtype: object

In [15]:
# Performing same task for keywords
movies['keywords'] = movies['keywords'].apply(extract)
movies['keywords']

0       [culture clash, future, space war, space colon...
1       [ocean, drug abuse, exotic island, east india ...
2       [spy, based on novel, secret agent, sequel, mi...
3       [dc comics, crime fighter, terrorist, secret i...
4       [based on novel, mars, medallion, space travel...
                              ...                        
4804    [united states–mexico barrier, legs, arms, pap...
4805                                                   []
4806    [date, love at first sight, narration, investi...
4807                                                   []
4808            [obsession, camcorder, crush, dream girl]
Name: keywords, Length: 4806, dtype: object

In [16]:
movies.iloc[0].cast[:150] # [:150] displays 150 max characters

'[{"cast_id": 242, "character": "Jake Sully", "credit_id": "5602a8a7c3a3685532001c9a", "gender": 2, "id": 65731, "name": "Sam Worthington", "order": 0}'

In [17]:
# Feature Extraction of cast from list of dictionaries format
def extract1(obj):
    x = []
    count = 0
    for i in ast.literal_eval(obj):
        if count != 3:
            x.append(i['name'])
            count += 1
        else:
            break
    return x

In [18]:
movies['cast'] = movies['cast'].apply(extract1)
movies['cast']

0        [Sam Worthington, Zoe Saldana, Sigourney Weaver]
1           [Johnny Depp, Orlando Bloom, Keira Knightley]
2            [Daniel Craig, Christoph Waltz, Léa Seydoux]
3            [Christian Bale, Michael Caine, Gary Oldman]
4          [Taylor Kitsch, Lynn Collins, Samantha Morton]
                              ...                        
4804    [Carlos Gallardo, Jaime de Hoyos, Peter Marqua...
4805         [Edward Burns, Kerry Bishé, Marsha Dietlein]
4806           [Eric Mabius, Kristin Booth, Crystal Lowe]
4807            [Daniel Henney, Eliza Coupe, Bill Paxton]
4808    [Drew Barrymore, Brian Herzlinger, Corey Feldman]
Name: cast, Length: 4806, dtype: object

In [19]:
movies.iloc[0].crew[:999]

'[{"credit_id": "52fe48009251416c750aca23", "department": "Editing", "gender": 0, "id": 1721, "job": "Editor", "name": "Stephen E. Rivkin"}, {"credit_id": "539c47ecc3a36810e3001f87", "department": "Art", "gender": 2, "id": 496, "job": "Production Design", "name": "Rick Carter"}, {"credit_id": "54491c89c3a3680fb4001cf7", "department": "Sound", "gender": 0, "id": 900, "job": "Sound Designer", "name": "Christopher Boyes"}, {"credit_id": "54491cb70e0a267480001bd0", "department": "Sound", "gender": 0, "id": 900, "job": "Supervising Sound Editor", "name": "Christopher Boyes"}, {"credit_id": "539c4a4cc3a36810c9002101", "department": "Production", "gender": 1, "id": 1262, "job": "Casting", "name": "Mali Finn"}, {"credit_id": "5544ee3b925141499f0008fc", "department": "Sound", "gender": 2, "id": 1729, "job": "Original Music Composer", "name": "James Horner"}, {"credit_id": "52fe48009251416c750ac9c3", "department": "Directing", "gender": 2, "id": 2710, "job": "Director", "name": "James Cameron"},

In [20]:
# Feature Extraction of crew from list of dictionaries format to fetch movie_director
def extract_director(obj):
    x = []
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            x.append(i['name'])
    return x

In [21]:
movies['crew'] = movies['crew'].apply(extract_director)
movies['crew']

0                                [James Cameron]
1                               [Gore Verbinski]
2                                   [Sam Mendes]
3                            [Christopher Nolan]
4                               [Andrew Stanton]
                          ...                   
4804                          [Robert Rodriguez]
4805                              [Edward Burns]
4806                               [Scott Smith]
4807                               [Daniel Hsia]
4808    [Brian Herzlinger, Jon Gunn, Brett Winn]
Name: crew, Length: 4806, dtype: object

In [22]:
# Turning string values of overview column into list
movies['overview'] = movies['overview'].apply(lambda x:x.split())
movies['overview']

0       [In, the, 22nd, century,, a, paraplegic, Marin...
1       [Captain, Barbossa,, long, believed, to, be, d...
2       [A, cryptic, message, from, Bond’s, past, send...
3       [Following, the, death, of, District, Attorney...
4       [John, Carter, is, a, war-weary,, former, mili...
                              ...                        
4804    [El, Mariachi, just, wants, to, play, his, gui...
4805    [A, newlywed, couple's, honeymoon, is, upended...
4806    ["Signed,, Sealed,, Delivered", introduces, a,...
4807    [When, ambitious, New, York, attorney, Sam, is...
4808    [Ever, since, the, second, grade, when, he, fi...
Name: overview, Length: 4806, dtype: object

In [23]:
movies.head()

Unnamed: 0,id,title,overview,genres,keywords,popularity,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...",150.437577,"[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...",139.082615,"[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...",107.376788,"[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...",112.31295,"[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan]
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...",43.926995,"[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton]


In [24]:
# Replace blankspace with nospace
movies['genres'] = movies['genres'].apply(
    lambda x: [i.replace(' ', '') for i in x])

movies['keywords'] = movies['keywords'].apply(
    lambda x: [i.replace(' ', '') for i in x])

movies['crew'] = movies['crew'].apply(
    lambda x: [i.replace(' ', '') for i in x])

movies['cast'] = movies['cast'].apply(
    lambda x: [i.replace(' ', '') for i in x])

In [25]:
movies.sample(3)

Unnamed: 0,id,title,overview,genres,keywords,popularity,cast,crew
4449,15708,Latter Days,"[Aaron, Davis, (Steve, Sandvoss), and, Christi...","[Drama, Comedy, Romance]","[gay, comingout, religion, mormon]",5.424916,"[SteveSandvoss, WesRamsey, JacquelineBisset]",[C.JayCox]
601,11535,Rollerball,"[From, the, director, of, Die, Hard, comes, th...","[Action, ScienceFiction, Thriller]","[manager, arena, dystopia, wrestling, sport, r...",9.306253,"[ChrisKlein, LLCoolJ, RebeccaRomijn]",[JohnMcTiernan]
3344,19913,(500) Days of Summer,"[Tom, (Joseph, Gordon-Levitt),, greeting-card,...","[Comedy, Drama, Romance]","[date, sex, jealousy, fight, architect, galler...",45.610993,"[JosephGordon-Levitt, ZooeyDeschanel, ChloëGra...",[MarcWebb]


In [26]:
# Concatenate multiple columns into one column'tags'
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['crew'] + movies['cast']

In [27]:
movies.sample(3)

Unnamed: 0,id,title,overview,genres,keywords,popularity,cast,crew,tags
3191,14057,Trust the Man,"[Overachieving, actress,, Rebecca, (Moore),, m...","[Comedy, Drama, Romance]",[independentfilm],5.2545,"[DavidDuchovny, JulianneMoore, BillyCrudup]",[BartFreundlich],"[Overachieving, actress,, Rebecca, (Moore),, m..."
796,347969,The Ridiculous 6,"[When, his, long-lost, outlaw, father, returns...","[Comedy, Western]",[wildwest],19.694695,"[AdamSandler, TaylorLautner, SteveBuscemi]",[FrankCoraci],"[When, his, long-lost, outlaw, father, returns..."
784,2312,In the Name of the King: A Dungeon Siege Tale,"[A, man, named, Farmer, sets, out, to, rescue,...","[Adventure, Fantasy, Action, Drama]","[fictionalplace, monster, lossoffamily, newlov...",15.406173,"[JasonStatham, JohnRhys-Davies, RayLiotta]",[UweBoll],"[A, man, named, Farmer, sets, out, to, rescue,..."


In [28]:
new_df = movies[['id','title','tags']]
new_df

Unnamed: 0,id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili..."
...,...,...,...
4804,9367,El Mariachi,"[El, Mariachi, just, wants, to, play, his, gui..."
4805,72766,Newlyweds,"[A, newlywed, couple's, honeymoon, is, upended..."
4806,231617,"Signed, Sealed, Delivered","[""Signed,, Sealed,, Delivered"", introduces, a,..."
4807,126186,Shanghai Calling,"[When, ambitious, New, York, attorney, Sam, is..."


In [29]:
# Turning list values of tags column into string
new_df['tags'] = new_df['tags'].apply(lambda x:' '.join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x:' '.join(x))


In [30]:
new_df['tags'][0]

'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. Action Adventure Fantasy ScienceFiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d JamesCameron SamWorthington ZoeSaldana SigourneyWeaver'

In [31]:
# Lowercase conversion of letters in tags
new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())


In [32]:
new_df.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


In [33]:
new_df.tags[0]

'in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. action adventure fantasy sciencefiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d jamescameron samworthington zoesaldana sigourneyweaver'

### Vectorization

In [34]:
# Documentation: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
from sklearn.feature_extraction.text import CountVectorizer

In [35]:
cv = CountVectorizer(max_features=5000, stop_words='english')
vectors = cv.fit_transform(new_df['tags']).toarray()
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [36]:
cv.get_feature_names_out()[:100]  #[:100] displays 100 max characters

array(['000', '007', '10', '100', '11', '12', '13', '14', '15', '16',
       '17', '18', '18th', '19', '1930s', '1940s', '1944', '1950',
       '1950s', '1960s', '1970s', '1971', '1974', '1976', '1980', '1980s',
       '1985', '1990s', '1999', '19th', '19thcentury', '20', '200',
       '2003', '2009', '20th', '24', '25', '30', '300', '3d', '40', '50',
       '500', '60', '60s', '70', 'aaron', 'aaroneckhart', 'abandoned',
       'abducted', 'abigailbreslin', 'abilities', 'ability', 'able',
       'aboard', 'abuse', 'abusive', 'academic', 'academy', 'accept',
       'accepted', 'accepts', 'access', 'accident', 'accidental',
       'accidentally', 'accompanied', 'accomplish', 'account',
       'accountant', 'accused', 'ace', 'achieve', 'act', 'acting',
       'action', 'actionhero', 'actions', 'activist', 'activities',
       'activity', 'actor', 'actors', 'actress', 'acts', 'actual',
       'actually', 'adam', 'adams', 'adamsandler', 'adamshankman',
       'adaptation', 'adapted', 'addic

### Stemmer
<p><ul>A text processing technique that involves reducing words to their base or root form.</br><i>Example: ['accept', 'accepted', 'accepts'] ---Stemming---> ['accept', 'accept', 'accept']</i></ul></p>

In [37]:
# Documentaion: https://www.nltk.org/howto/stem.html
import nltk
from nltk.stem.porter import PorterStemmer

In [38]:
ps = PorterStemmer()

In [39]:
# function 'stem' takes a text input and applies stemming
def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [40]:
new_df['tags'] = new_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(stem)


In [41]:
new_df['tags'][0]

'in the 22nd century, a parapleg marin is dispatch to the moon pandora on a uniqu mission, but becom torn between follow order and protect an alien civilization. action adventur fantasi sciencefict cultureclash futur spacewar spacecoloni societi spacetravel futurist romanc space alien tribe alienplanet cgi marin soldier battl loveaffair antiwar powerrel mindandsoul 3d jamescameron samworthington zoesaldana sigourneyweav'

In [42]:
cv = CountVectorizer(max_features=5000, stop_words='english')
vectors = cv.fit_transform(new_df['tags']).toarray()
cv.get_feature_names_out()[50:100]

array(['500', '60', '70', '80', 'aaron', 'aaroneckhart', 'abandon',
       'abduct', 'abigailbreslin', 'abil', 'abl', 'aboard', 'abov',
       'abus', 'academ', 'academi', 'accept', 'access', 'accid',
       'accident', 'acclaim', 'accompani', 'accomplish', 'account',
       'accus', 'ace', 'achiev', 'acquaint', 'act', 'action',
       'actionhero', 'activ', 'activist', 'activities', 'actor',
       'actress', 'actual', 'ad', 'adam', 'adamsandl', 'adamshankman',
       'adapt', 'add', 'addict', 'adjust', 'admir', 'admit', 'adolesc',
       'adopt', 'ador'], dtype=object)

## Content Filtering
### Cosine Similarity

In [43]:
from sklearn.metrics.pairwise import cosine_similarity

In [44]:
similarity = cosine_similarity(vectors)

print('Size:',similarity.shape)
similarity

Size: (4806, 4806)


array([[1.        , 0.08346223, 0.0860309 , ..., 0.04499213, 0.        ,
        0.        ],
       [0.08346223, 1.        , 0.06063391, ..., 0.02378257, 0.        ,
        0.02615329],
       [0.0860309 , 0.06063391, 1.        , ..., 0.02451452, 0.        ,
        0.        ],
       ...,
       [0.04499213, 0.02378257, 0.02451452, ..., 1.        , 0.03962144,
        0.04229549],
       [0.        , 0.        , 0.        , ..., 0.03962144, 1.        ,
        0.08714204],
       [0.        , 0.02615329, 0.        , ..., 0.04229549, 0.08714204,
        1.        ]])

In [45]:
# calling enumerate fn to save indexing of similarity between 0-4806 movies
sorted(list(enumerate(similarity[0])),reverse=True,key=lambda x:x[1])[:5]

[(0, 1.0000000000000002),
 (1216, 0.28676966733820225),
 (2409, 0.26901379342448517),
 (3730, 0.2605130246476754),
 (507, 0.255608593705383)]

In [46]:
def recommend(movie):
    movie_index = new_df[new_df['title'] == movie].index[0]
    distance = similarity[movie_index]
    movies_list = sorted(list(enumerate(distance)),reverse=True,key=lambda x:x[1])[1:5]

    for i in movies_list:
        print(new_df.iloc[i[0]].title)

In [47]:
recommend('The Dark Knight Rises')

The Dark Knight
Batman Returns
Batman
Batman Forever


In [48]:
new_df

Unnamed: 0,id,title,tags
0,19995,Avatar,"in the 22nd century, a parapleg marin is dispa..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believ to be dead, ha c..."
2,206647,Spectre,a cryptic messag from bond’ past send him on a...
3,49026,The Dark Knight Rises,follow the death of district attorney harvey d...
4,49529,John Carter,"john carter is a war-weary, former militari ca..."
...,...,...,...
4804,9367,El Mariachi,el mariachi just want to play hi guitar and ca...
4805,72766,Newlyweds,a newlyw couple' honeymoon is upend by the arr...
4806,231617,"Signed, Sealed, Delivered","""signed, sealed, delivered"" introduc a dedic q..."
4807,126186,Shanghai Calling,when ambiti new york attorney sam is sent to s...


In [49]:
import pickle

pickle.dump(new_df,open('movies.pkl', 'wb'))
pickle.dump(new_df.to_dict(),open('movies_dict.pkl', 'wb'))
pickle.dump(similarity,open('similarity.pkl', 'wb'))