In [1]:
import pandas as pd
# Read movies.csv (path relative to the current working directory) into the
# DataFrame that every later cell reads and modifies.
movies_df = pd.read_csv('movies.csv')

In [2]:
from ast import literal_eval
import numpy as np
# These columns are converted from their string form into Python objects with
# ast.literal_eval; later cells index the results as lists of dicts.
features = ['cast','crew','keywords','genres']
for feature in features:
    # Only strings are parsed. A value that is already parsed (this cell was
    # run a second time) or is missing (NaN) is passed through unchanged
    # instead of raising ValueError inside literal_eval.
    movies_df[feature] = movies_df[feature].apply(
        lambda value: literal_eval(value) if isinstance(value, str) else value
    )
    
def get_director(x):
    """Return the name of the first crew record whose job is 'Director'.

    Parameters
    ----------
    x : list or tuple of dict
        Crew records; a director record is expected to carry the keys
        'job' and 'name'.

    Returns
    -------
    str or float
        The 'name' of the first record with job == 'Director'. np.nan is
        returned when `x` is not a list/tuple (for example a NaN cell), when
        no record matches, or when the matching record has no 'name'.
    """
    # Same guard as get_list: a missing cell must give the "no value" marker
    # rather than a TypeError from iterating a float.
    if not isinstance(x, (list, tuple)):
        return np.nan
    for member in x:
        # .get() tolerates records that lack a 'job' key.
        if isinstance(member, dict) and member.get('job') == 'Director':
            return member.get('name', np.nan)
    return np.nan

def get_list(x, max_items=3):
    """Return the 'name' values of the leading records of a list.

    Parameters
    ----------
    x : list of dict
        Records carrying a 'name' key. Any value that is not a list
        (for example NaN for a missing cell) produces an empty list.
    max_items : int or None, default 3
        Maximum number of names to keep, counted from the front of `x`
        (non-negative). None keeps every name.

    Returns
    -------
    list
        At most `max_items` names, in their original order.
    """
    if not isinstance(x, list):
        return []
    # Slice before extracting so only the kept records are touched; a slice
    # longer than the list is harmless, so no length check is needed.
    return [entry['name'] for entry in x[:max_items]]

# Reduce the list-valued columns to short lists of names. 'crew' is not one of
# them, so the director lookup that follows still sees the full crew records.
features = ['cast','keywords','genres']
for feature in features:
    movies_df[feature] = movies_df[feature].apply(get_list)
movies_df['director'] = movies_df['crew'].apply(get_director)

In [3]:
# Show the first rows of the title column next to the four extracted feature columns.
movies_df[['title','cast','director','keywords','genres']].head()

Unnamed: 0,title,cast,director,keywords,genres
0,Avatar,"[Sam Worthington, Zoe Saldana, Sigourney Weaver]",James Cameron,"[culture clash, future, space war]","[Action, Adventure, Fantasy]"
1,Pirates of the Caribbean: At World's End,"[Johnny Depp, Orlando Bloom, Keira Knightley]",Gore Verbinski,"[ocean, drug abuse, exotic island]","[Adventure, Fantasy, Action]"
2,Spectre,"[Daniel Craig, Christoph Waltz, Léa Seydoux]",Sam Mendes,"[spy, based on novel, secret agent]","[Action, Adventure, Crime]"
3,The Dark Knight Rises,"[Christian Bale, Michael Caine, Gary Oldman]",Christopher Nolan,"[dc comics, crime fighter, terrorist]","[Action, Crime, Drama]"
4,John Carter,"[Taylor Kitsch, Lynn Collins, Samantha Morton]",Andrew Stanton,"[based on novel, mars, medallion]","[Action, Adventure, Science Fiction]"


In [4]:
def clean_data(x):
    """Normalise a feature value by deleting spaces and lower-casing it.

    A list is processed element by element and returned as a new list, a
    string is returned as one normalised string, and any other value
    becomes the empty string.
    """
    def squash(text):
        # "Sam Worthington" -> "samworthington"
        return text.replace(" ", "").lower()

    if isinstance(x, str):
        return squash(x)
    if isinstance(x, list):
        return list(map(squash, x))
    return ''
        
# Normalise every column that feeds the soup: clean_data strips spaces and
# lower-cases each value, so a multi-word name becomes a single token.
features = ['cast','keywords','director','genres']
for feature in features:
    movies_df[feature] = movies_df[feature].apply(clean_data)
    
def create_soup(x):
    """Build the space-separated text "soup" for one movie row.

    The parts appear in the order keywords, cast, director, genres. `x` is
    indexed by those four names; 'keywords', 'cast' and 'genres' must hold
    iterables of strings and 'director' a single string.
    """
    parts = (
        ' '.join(x['keywords']),
        ' '.join(x['cast']),
        x['director'],
        ' '.join(x['genres']),
    )
    return ' '.join(parts)

# axis=1 passes each row to create_soup; the returned string becomes that row's 'soup' value.
movies_df['soup'] = movies_df.apply(create_soup, axis=1)

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
# Token counts over the soup strings, one row per movie; scikit-learn's
# built-in English stop-word list is applied.
count = CountVectorizer(stop_words = 'english')
count_matrix = count.fit_transform(movies_df['soup'])
# Bare expression so the notebook displays the matrix summary.
count_matrix

<4803x11520 sparse matrix of type '<class 'numpy.int64'>'
	with 42935 stored elements in Compressed Sparse Row format>

In [6]:
from sklearn.metrics.pairwise import cosine_similarity
# Cosine similarity of count_matrix against itself: entry [i, j] scores row i
# against row j.
cos_similarity = cosine_similarity(count_matrix,count_matrix)
# Bare expression so the notebook displays the array.
cos_similarity

array([[1. , 0.3, 0.2, ..., 0. , 0. , 0. ],
       [0.3, 1. , 0.2, ..., 0. , 0. , 0. ],
       [0.2, 0.2, 1. , ..., 0. , 0. , 0. ],
       ...,
       [0. , 0. , 0. , ..., 1. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 1. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 1. ]])

In [7]:
def get_recommendations(title, cos_similarity=cos_similarity, top_n=10):
    """Return the titles of the movies most similar to `title`.

    Parameters
    ----------
    title : str
        A value of the 'original_title' column, looked up in the module-level
        `indices` series.
    cos_similarity : 2-D array-like, optional
        Similarity matrix; row i is read as the scores of movie i against
        every movie. Defaults to the object bound to `cos_similarity` when
        this function was defined.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        'original_title' values of the `top_n` best-scoring other movies,
        best first. Movies with equal scores keep ascending row order.

    Raises
    ------
    KeyError
        If `title` is not present in `indices`.
    """
    try:
        idx = indices[title]
    except KeyError:
        raise KeyError("no movie with original_title {!r}".format(title)) from None
    # A title shared by several rows makes the lookup return a Series; use the
    # first match so the row lookup below stays one-dimensional.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # sorted() is stable, also with reverse=True, so ties stay in row order.
    ranked = sorted(
        enumerate(cos_similarity[idx]),
        key=lambda pair: pair[1],
        reverse=True,
    )
    # Exclude the query movie by position instead of assuming it sorts first:
    # a lower-numbered movie with an identical soup also scores 1.0 and would
    # otherwise push the query movie itself into the results.
    movies_indices = [row for row, _ in ranked if row != idx][:top_n]
    return movies_df['original_title'].iloc[movies_indices]

# Lookup table: original_title -> row label of movies_df.
indices = pd.Series(movies_df.index, index=movies_df['original_title'])
# Keep only the first row for a repeated title. With duplicates left in,
# indices[title] returns a Series instead of a single row number, which
# cannot be used as a scalar row lookup.
indices = indices[~indices.index.duplicated(keep='first')]

In [8]:
# Example query: movies most similar to 'Avatar'.
get_recommendations('Avatar')

206                         Clash of the Titans
71        The Mummy: Tomb of the Dragon Emperor
786                                西游记之孙悟空三打白骨精
103                   The Sorcerer's Apprentice
131                                     G-Force
215                4: Rise of the Silver Surfer
466                            The Time Machine
715                           The Scorpion King
1      Pirates of the Caribbean: At World's End
5                                  Spider-Man 3
Name: original_title, dtype: object

In [9]:
# Example query: movies most similar to 'Newlyweds'.
get_recommendations('Newlyweds')

3805                Purple Violets
3837                 She's the One
1258     Life or Something Like It
4749         The Brothers McMullen
4247         Me You and Five Bucks
2597                    Confidence
1300                The Ugly Truth
2325    My Big Fat Greek Wedding 2
3989          Lage Raho Munna Bhai
1862                   Beauty Shop
Name: original_title, dtype: object

In [10]:
# Example query: movies most similar to 'The Hobbit: The Desolation of Smaug'.
get_recommendations('The Hobbit: The Desolation of Smaug')

19              The Hobbit: The Battle of the Five Armies
98                      The Hobbit: An Unexpected Journey
262     The Lord of the Rings: The Fellowship of the Ring
329         The Lord of the Rings: The Return of the King
330                 The Lord of the Rings: The Two Towers
107                                              Warcraft
1686                                        The Borrowers
1322                                        City of Ember
71                  The Mummy: Tomb of the Dragon Emperor
786                                          西游记之孙悟空三打白骨精
Name: original_title, dtype: object

In [11]:
# Example query: movies most similar to 'The Amazing Spider-Man'.
get_recommendations('The Amazing Spider-Man')

38                     The Amazing Spider-Man 2
9            Batman v Superman: Dawn of Justice
71        The Mummy: Tomb of the Dragon Emperor
786                                西游记之孙悟空三打白骨精
103                   The Sorcerer's Apprentice
131                                     G-Force
215                4: Rise of the Silver Surfer
715                           The Scorpion King
0                                        Avatar
1      Pirates of the Caribbean: At World's End
Name: original_title, dtype: object