## Imports and data loading

In [68]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Load the movie metadata. low_memory=False makes pandas parse the file in a
# single pass so each column gets one consistently inferred dtype.
movie_data = pd.read_csv('movies.csv', low_memory=False)
# Preview a bounded number of rows instead of rendering the entire frame.
movie_data.head(10)

Unnamed: 0,adult,budget,id,original_title,overview,popularity,production_companies,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average
0,False,30000000,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,Pixar Animation Studios',10/30/1995,373554033,81,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,7.7
1,False,90000000,863,Toy Story 2,"Andy heads off to Cowboy Camp, leaving his toy...",17.547693,Pixar Animation Studios',10/30/1999,497366869,92,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The toys are back!,Toy Story 2,7.3
2,False,200000000,10193,Toy Story 3,"Woody, Buzz, and the rest of Andy's toys haven...",16.96647,"Walt Disney Pictures', 'Pixar Animation Studios'",6/16/2010,1066969703,103,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,No toy gets left behind.,Toy Story 3,7.6
3,False,0,213121,Toy Story of Terror!,What starts out as a fun road trip for the Toy...,0.512025,"Walt Disney Picture' ,'Pixar Animation Studios'",10/15/2013,0,22,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,One toy gets left behind!,Toy Story of Terror!,7.3
4,False,0,256835,Toy Story That Time Forgot,"During a post-Christmas play date, the gang fi...",8.609555,'Pixar Animation Studios',12/2/2014,0,22,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story That Time Forgot,6.8
5,False,55000000,10530,Pocahontas,History comes gloriously to life in Disney's e...,13.280069,"Walt Disney Pictures', 'Walt Disney Feature An...",6/14/1995,346079773,81,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,An American legend comes to life.,Pocahontas,6.7
6,False,38000000,10534,White Squall,Teenage boys discover discipline and camarader...,9.568604,"'Hollywood Pictures', 'Largo Entertainment',...",2/2/1996,10300000,129,"[{'iso_639_1': 'da', 'name': 'Dansk'}, {'iso_6...",Released,,White Squall,6.3
7,False,1300000,103,Taxi Driver,A mentally unstable Vietnam War veteran works ...,14.092713,"'Columbia Pictures Corporation', 'Italo/Judeo...",2/7/1976,28262574,114,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,"On every street in every city, there's a nobod...",Taxi Driver,8.1
8,False,35000000,43566,Before and After,Two parents deal with the effects when their s...,2.938293,"Caravan Pictures','Hollywood Pictures'",2/23/1996,0,108,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,A murder. A suspect. A shadow of a doubt.,Before and After,5.8
9,False,0,51352,Anne Frank Remembered,Using previously unreleased archival material ...,0.243986,,6/8/1995,1,117,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"She is perhaps Hitler's best known victim, but...",Anne Frank Remembered,7.3


## Define vectorizer

In [69]:
# Replace missing overviews with empty strings so the vectorizer only sees text
movie_data['overview'] = movie_data['overview'].fillna(value='')

# TF-IDF vectorizer that discards common English stop words
tfidf_vector = TfidfVectorizer(stop_words='english')

# Learn the vocabulary from the overviews and build the TF-IDF matrix
# (one row per movie, one column per vocabulary term)
tfidf_matrix = tfidf_vector.fit_transform(movie_data['overview'])

## Similarity matrix

In [70]:
# Pairwise dot products between every pair of movie overview vectors.
# TfidfVectorizer L2-normalises rows by default, so this equals cosine similarity.
sim_matrix = linear_kernel(X=tfidf_matrix, Y=tfidf_matrix)

## Reverse map from movie title to row index

In [71]:
# Reverse map: movie title -> row index in movie_data.
indices = pd.Series(movie_data.index, index=movie_data['title'])
# Keep only the first row for each title. Series.drop_duplicates() would compare
# the *values* (row indices, which are already unique) and leave repeated titles
# in place, so de-duplicate on the title labels instead.
indices = indices[~indices.index.duplicated(keep='first')]
# displaying the indices
indices

title
Toy Story                     0
Toy Story 2                   1
Toy Story 3                   2
Toy Story of Terror!          3
Toy Story That Time Forgot    4
Pocahontas                    5
White Squall                  6
Taxi Driver                   7
Before and After              8
Anne Frank Remembered         9
dtype: int64

## Implementing recommender function

In [72]:
def content_based_recommender(title, sim_scores = sim_matrix, top_n = 9):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title to look up in the module-level ``indices`` Series.
    sim_scores : 2-D array-like, optional
        Pairwise similarity matrix whose row ``i`` holds the similarity of
        movie ``i`` to every movie. Defaults to the ``sim_matrix`` that exists
        when this function is defined.
    top_n : int, optional
        Maximum number of recommendations to return. The default of 9 matches
        the number of results the function has always returned.

    Returns
    -------
    pandas.Series
        Titles from ``movie_data['title']`` ordered from most to least
        similar, indexed by their row in ``movie_data``. The queried movie
        itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    # get the index of the movie having the given title
    idx = indices[title]
    # A title that occurs more than once yields a Series of row indices;
    # fall back to its first occurrence so the matrix lookup gets a scalar.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Pair every movie position with its similarity to the queried movie and
    # order the pairs from most to least similar (sorted() is stable, so ties
    # keep their original row order).
    ranked = sorted(enumerate(sim_scores[idx]), key=lambda pair: pair[1], reverse=True)

    # Drop the queried movie by position rather than assuming it sorted first:
    # with tied scores (e.g. identical or empty overviews) it may not be.
    movie_indices = [pos for pos, _ in ranked if pos != idx][:top_n]

    # enumerate() produced positions, so select positionally
    return movie_data['title'].iloc[movie_indices]

In [73]:
# Example query: list the movies whose overviews are closest to 'Toy Story'
content_based_recommender(title='Toy Story')

2                   Toy Story 3
1                   Toy Story 2
4    Toy Story That Time Forgot
3          Toy Story of Terror!
5                    Pocahontas
6                  White Squall
7                   Taxi Driver
8              Before and After
9         Anne Frank Remembered
Name: title, dtype: object