In [145]:
pip install surprise



### Importing the required libraries

In [146]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
from google.colab import drive

### Mounting the Google Drive

In [147]:
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


### Reading the Meta dataset

In [148]:
meta_data = pd.read_csv("/content/drive/MyDrive/MOVIE_DATASET_BY_ROUNAK_BANIK/movies_metadata.csv")
meta_data.head()

  meta_data = pd.read_csv("/content/drive/MyDrive/MOVIE_DATASET_BY_ROUNAK_BANIK/movies_metadata.csv")


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [149]:
meta_data.iloc[1,3]

"[{'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {'id': 10751, 'name': 'Family'}]"

In [150]:
meta_data['genres'] = meta_data['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [151]:
meta_data['genres']

0         [Animation, Comedy, Family]
1        [Adventure, Fantasy, Family]
2                   [Romance, Comedy]
3            [Comedy, Drama, Romance]
4                            [Comedy]
                     ...             
45461                 [Drama, Family]
45462                         [Drama]
45463       [Action, Drama, Thriller]
45464                              []
45465                              []
Name: genres, Length: 45466, dtype: object

1. Use the TMDB ratings to come up with the top movie chart
2. It can be mathematically represented as:
   Weighted Rating (WR) = (v/v+m).R + (m/v+m).C
   
   where,

   - v is the number of votes for the movie
   - m is the minimum number of votes to be considered for being listed in the chart
   - R is the average rating of the movie
   - C is the mean vote across the whole report


In [152]:
#getting the mean vote across the whole report
C = meta_data[meta_data['vote_average'].notnull()]['vote_average'].astype('int').mean()
C

5.244896612406511

In [153]:
#I am using 85th percentile as the cut off the define the minimum number of votes
m = meta_data[meta_data['vote_count'].notnull()]['vote_count'].astype('int').quantile(0.85)
m

82.0

In [154]:
#extracting the year of release
meta_data['year'] = pd.to_datetime(meta_data['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

### Calculating the weighted rating

In [155]:
def weighted_rating(movie):
  v = movie['vote_count']
  R = movie['vote_average']
  return (v/v+m)*R + (m/v+m)*C

In [156]:
#qualified movies for top rating
qualified_movies = meta_data[(meta_data['vote_count'].notnull()) & (meta_data['vote_average'].notnull()) & (meta_data['vote_count']>=m)]
qualified_movies = qualified_movies[['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres']]
qualified_movies['vote_count'] = qualified_movies['vote_count'].astype('int')
qualified_movies['vote_average'] = qualified_movies['vote_average'].astype('int')
qualified_movies.shape

(6832, 6)

In [157]:
qualified_movies.head()

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres
0,Toy Story,1995,5415,7,21.946943,"[Animation, Comedy, Family]"
1,Jumanji,1995,2413,6,17.015539,"[Adventure, Fantasy, Family]"
2,Grumpier Old Men,1995,92,6,11.7129,"[Romance, Comedy]"
4,Father of the Bride Part II,1995,173,5,8.387519,[Comedy]
5,Heat,1995,1886,7,17.924927,"[Action, Crime, Drama, Thriller]"


In [158]:
#getting the weighted-rating for the qualified movies
qualified_movies['weighted_rating'] = qualified_movies.apply(weighted_rating, axis=1)

In [159]:
qualified_movies = qualified_movies.sort_values( 'weighted_rating' , ascending = False )

In [160]:
qualified_movies.head()

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,weighted_rating
10309,Dilwale Dulhania Le Jayenge,1995,661,9,34.457024,"[Comedy, Drama, Romance]",1177.732175
29183,The Jinx: The Life and Deaths of Robert Durst,2015,85,8,5.403881,[Documentary],1099.141305
5341,Z,1969,87,8,11.943621,"[Drama, History, Thriller]",1099.024988
41702,I Am Not Your Negro,2017,87,8,11.781919,[Documentary],1099.024988
32934,Silenced,2011,88,8,4.385574,[Drama],1098.968812


In [161]:
s = meta_data.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)
s

  s = meta_data.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)


0        Animation
0           Comedy
0           Family
1        Adventure
1          Fantasy
           ...    
45461       Family
45462        Drama
45463       Action
45463        Drama
45463     Thriller
Length: 91106, dtype: object

### Function to get the poularity of the movie based on its genre

In [162]:
def build_chart(meta_data, genre, percentile=0.85):
  series_df = meta_data.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)
  series_df.name = 'genre'
  modified_df = meta_data.drop('genres', axis=1).join(series_df)
  modified_df = modified_df[modified_df['genre'] == genre]
  qualified_movies = modified_df[(modified_df['vote_count'].notnull()) & (modified_df['vote_average'].notnull()) & (modified_df['vote_count']>=m)]
  qualified_movies = qualified_movies[['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genre']]
  qualified_movies['vote_count'] = qualified_movies['vote_count'].astype('int')
  qualified_movies['vote_average'] = qualified_movies['vote_average'].astype('int')
  #getting the weighted-rating for the qualified movies
  qualified_movies['weighted_rating'] = qualified_movies.apply(weighted_rating, axis=1)
  qualified_movies = qualified_movies.sort_values('weighted_rating', ascending=False)

  return qualified_movies



In [163]:
build_chart(meta_data,'Adventure').head(15)

  series_df = meta_data.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)


Unnamed: 0,title,year,vote_count,vote_average,popularity,genre,weighted_rating
3350,Dersu Uzala,1975,90,8,4.713259,Adventure,1098.860206
2820,Sullivan's Travels,1941,100,8,8.64244,Adventure,1098.382337
7518,The Hidden Fortress,1958,117,8,5.624255,Adventure,1097.757433
9278,A Dog's Will,2000,120,8,23.950396,Adventure,1097.665535
2906,The General,1926,240,8,8.002953,Adventure,1095.873529
2884,Princess Mononoke,1997,2041,8,17.166725,Adventure,1094.292243
9698,Howl's Moving Castle,2004,2049,8,16.136048,Adventure,1094.29142
5481,Spirited Away,2001,3968,8,41.048867,Adventure,1094.18991
1154,The Empire Strikes Back,1980,5998,8,19.470959,Adventure,1094.153226
1225,Back to the Future,1985,6239,8,25.778509,Adventure,1094.150457


Studio Ghibli's movies 'Princess Mononoke', 'Howl's Moving Castle', 'the Empire Strikes Back' are among the top movies as per our defined metric
They are personally my favorite movies too.



In [164]:
build_chart(meta_data,'Romance').head(15)

  series_df = meta_data.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)


Unnamed: 0,title,year,vote_count,vote_average,popularity,genre,weighted_rating
10309,Dilwale Dulhania Le Jayenge,1995,661,9,34.457024,Romance,1177.732175
14361,Love Exposure,2008,89,8,5.619382,Romance,1098.913899
3437,Limelight,1952,99,8,10.755881,Romance,1098.42578
2820,Sullivan's Travels,1941,100,8,8.64244,Romance,1098.382337
25718,Hotarubi no Mori e,2011,105,8,0.005007,Romance,1098.177537
7650,Hiroshima Mon Amour,1959,123,8,6.953676,Romance,1097.57812
6511,The Red Shoes,1948,124,8,10.44551,Romance,1097.549922
16694,Castaway on the Moon,2009,130,8,6.266701,Romance,1097.389842
45437,In a Heartbeat,2017,146,8,20.82178,Romance,1097.027286
24886,The Way He Looks,2014,262,8,5.711274,Romance,1095.723055


- The above mentioned method is flawed in detecting top movies because it does not provide personalized move recommendation as per users preferences.

- For instance let's say that I am a big fan of Studio Ghibli movies, in that case the recommendation engine must provide me with personal recommendations of movies such as 'Ocean waves', 'Whisper Of the heart' as per my preferences



To adress this issue, I am going to build a recommendation system that suggest movies that are more similar to a particular movie that user liked. Since we will be using meta-data for making such a system we will call it **Content Based Filtering**

## Content Based Recommender System

We will be building Content based Recommender for two cases:

- Case 1: With Movie Reviews and Taglines
- Case 2: With Cast, Crew, Keywords and Genre


In [165]:
links_small_meta_data = pd.read_csv("/content/drive/MyDrive/MOVIE_DATASET_BY_ROUNAK_BANIK/links_small.csv")
links_small_meta_data = links_small_meta_data[links_small_meta_data['tmdbId'].notnull()]['tmdbId'].astype('int')

In [166]:
links_small_meta_data.head()
meta_data = meta_data.drop([19730, 29503, 35587])
#Check EDA Notebook for how and why I got these indices.
meta_data['id'] = meta_data['id'].astype('int')

In [167]:
#due to limited computing power available I am using smaller_meta_data
small_meta_data = meta_data[meta_data['id'].isin(links_small_meta_data)]
small_meta_data.shape

(9099, 25)

### There are about 9000 movies available in the smaller meta-data

In [168]:
small_meta_data.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'year'],
      dtype='object')

In [169]:
small_meta_data['tagline'] = small_meta_data['tagline'].fillna('')
small_meta_data['description'] = small_meta_data['tagline'] + small_meta_data['overview']
small_meta_data['description'] = small_meta_data['description'].fillna('')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  small_meta_data['tagline'] = small_meta_data['tagline'].fillna('')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  small_meta_data['description'] = small_meta_data['tagline'] + small_meta_data['overview']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  small_meta_data['description'] = small_meta_data

In [170]:
tfidf_matrix = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english').fit_transform(small_meta_data['description'])

In [171]:
tfidf_matrix.shape

(9099, 269262)

### Calculating the cosine similarity to mesaure the degree of similarity between two vectors

In [172]:
#On L2-normalized data, cosine function is equivalent to linear_kernel.
cos_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cos_sim[0]

array([1.        , 0.00680204, 0.        , ..., 0.        , 0.00344826,
       0.        ])

In [173]:
#getting the recommenadtions of the top-30 movies based on cosine similarity
def get_recommendations(title,small_meta_data):
  small_meta_data = small_meta_data.reset_index()
  small_meta_data.head()
  titles = small_meta_data['title']
  indices = pd.Series(small_meta_data.index, index=small_meta_data['title'])
  idx  = indices[title]
  sim_scores = list(enumerate(cos_sim[idx]))
  sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
  sim_scores = sim_scores[1:31]
  movie_indices = [i[0] for i in sim_scores]
  return titles.iloc[movie_indices]

In [174]:
get_recommendations('The Dark Knight',small_meta_data)

7931                      The Dark Knight Rises
132                              Batman Forever
1113                             Batman Returns
8227    Batman: The Dark Knight Returns, Part 2
7565                 Batman: Under the Red Hood
524                                      Batman
7901                           Batman: Year One
2579               Batman: Mask of the Phantasm
2696                                        JFK
8165    Batman: The Dark Knight Returns, Part 1
6144                              Batman Begins
7933         Sherlock Holmes: A Game of Shadows
5511                            To End All Wars
4489                                      Q & A
7344                        Law Abiding Citizen
7242                  The File on Thelma Jordon
3537                               Criminal Law
2893                              Flying Tigers
1135                   Night Falls on Manhattan
8680                          The Young Savages
8917         Batman v Superman: Dawn of 

Above given is the recommendation based on the movie content, the above system is not able to take care of cases where the preference must be based on cast, crew, directors etc.

Let's try to build a recommender system taking care of these

In [175]:
#getting the credits and keywords
credits = pd.read_csv('/content/drive/MyDrive/MOVIE_DATASET_BY_ROUNAK_BANIK/credits.csv')
keywords = pd.read_csv('/content/drive/MyDrive/MOVIE_DATASET_BY_ROUNAK_BANIK/keywords.csv')

In [176]:
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
meta_data['id'] = meta_data['id'].astype('int')


In [177]:
meta_data = meta_data.merge(credits, on='id')
meta_data = meta_data.merge(keywords, on='id')

In [178]:
small_meta_data = meta_data[meta_data['id'].isin(links_small_meta_data)]
small_meta_data.shape

(9219, 28)

Combining cast, crew, genres, and credits all in one dataframe
 1. From the crew we pick only the director since the other do not contribute towards the feel of the movie
 2. We arbitrarily select the top 3 actors that appear in the list

In [179]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [180]:
small_meta_data.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'year', 'cast', 'crew', 'keywords'],
      dtype='object')

In [181]:
small_meta_data['cast'] = small_meta_data['cast'].apply(literal_eval)
small_meta_data['crew'] = small_meta_data['crew'].apply(literal_eval)
small_meta_data['keywords'] = small_meta_data['keywords'].apply(literal_eval)
small_meta_data['cast_size'] = small_meta_data['cast'].apply(lambda x: len(x))
small_meta_data['crew_size'] = small_meta_data['crew'].apply(lambda x: len(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  small_meta_data['cast'] = small_meta_data['cast'].apply(literal_eval)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  small_meta_data['crew'] = small_meta_data['crew'].apply(literal_eval)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  small_meta_data['keywords'] = small_meta_data['keywords'].apply(l

In [182]:
small_meta_data['director'] = small_meta_data['crew'].apply(get_director)
small_meta_data['cast'] = small_meta_data['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
small_meta_data['cast'] = small_meta_data['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)
small_meta_data['keywords'] = small_meta_data['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  small_meta_data['director'] = small_meta_data['crew'].apply(get_director)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  small_meta_data['cast'] = small_meta_data['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  small_meta

In [183]:
#preprocessing of the data
# 1. Strip spaces and convert to lower case
small_meta_data['cast'] = small_meta_data['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])
small_meta_data['director'] = small_meta_data['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
# we will give more weight to the director relative to the outer cast
# inorder to do so, we make the replca of director 3 times
small_meta_data['director'] = small_meta_data['director'].apply(lambda x: [x,x, x])
# 2. we calculate the frequency of all the keywords that appear in the dataset, we then eliminate all the key words whose ocurrence is less than eual t0 1
small_data_freq = small_meta_data.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
small_data_freq = small_data_freq.value_counts()
small_data_freq = small_data_freq [small_data_freq >1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  small_meta_data['cast'] = small_meta_data['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  small_meta_data['director'] = small_meta_data['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning

In [184]:
def filter_keywords(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words

In [185]:
#filtering out the key-words from the dataset
small_meta_data['keywords'] = small_meta_data['keywords'].apply(filter_keywords)
#performing stemming in the key words
stemmer = SnowballStemmer('english')
small_meta_data['keywords'] = small_meta_data['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
#remove spaces and convert to lower case
small_meta_data['keywords'] = small_meta_data['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  small_meta_data['keywords'] = small_meta_data['keywords'].apply(filter_keywords)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  small_meta_data['keywords'] = small_meta_data['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  small_meta_data['key

In [186]:
small_meta_data['combine_detail'] = small_meta_data['keywords'] + small_meta_data['cast'] + small_meta_data['director'] + small_meta_data['genres']
small_meta_data['combine_detail'] = small_meta_data['combine_detail'].apply(lambda x: ' '.join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  small_meta_data['combine_detail'] = small_meta_data['keywords'] + small_meta_data['cast'] + small_meta_data['director'] + small_meta_data['genres']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  small_meta_data['combine_detail'] = small_meta_data['combine_detail'].apply(lambda x: ' '.join(x))


In [187]:
small_meta_data['combine_detail']

0        tomhanks timallen donrickles johnlasseter john...
1        robinwilliams jonathanhyde kirstendunst joejoh...
2        waltermatthau jacklemmon ann-margret howarddeu...
3        whitneyhouston angelabassett lorettadevine for...
4        stevemartin dianekeaton martinshort charlesshy...
                               ...                        
40952    sidneypoitier wendycrewson jayo.sanders greggc...
41172    akshaykumar ileanad'cruz eshagupta tinusureshd...
41225    hrithikroshan poojahegde kabirbedi ashutoshgow...
41391    hirokihasegawa yutakatakenouchi satomiishihara...
41669    paulmccartney ringostarr johnlennon ronhoward ...
Name: combine_detail, Length: 9219, dtype: object

In [188]:
count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
count_matrix = count.fit_transform(small_meta_data['combine_detail'])
cos_sim = cosine_similarity(count_matrix, count_matrix)



In [189]:
get_recommendations('The Dark Knight', small_meta_data).head(15)

8031      The Dark Knight Rises
6218              Batman Begins
6623               The Prestige
2085                  Following
4145                   Insomnia
3381                    Memento
8613               Interstellar
7648                  Inception
5943                   Thursday
8927    Kidnapping Mr. Heineken
3864               The Gauntlet
149                     Hackers
7561                Harry Brown
440           Menace II Society
628               Force of Evil
Name: title, dtype: object

The recommendations seem to have recognized other Christopher Nolan movies (due to the high weightage given to director), director of  'Dark Knight'
 and put them as top recommendations.