In [None]:
'''
A recommendation engine, also known as a recommender system, 
is software that analyzes available data to make suggestions for something that a user might be interested in.

Depending on who you ask, there are three or four basic types of recommendation engines:
1.Content based recommendation engine
2.Collaborative filtering based recommendation engine
3.Popularity based recommendation engine
4.Hybrid recommendation engine

The engine we will build is a content based recommendation engine: 
a recommendation system that takes in a movie that a user likes 
and analyzes it to get the movie's content (e.g. genre, cast, director, keywords, etc.). 
It then ranks the other movies by how similar they 
are to the liked movie, using a measure called a similarity score.
'''

#Description: Build a movie recommendation engine (more specifically a content based recommendation engine)

#Resources: https://medium.com/code-heroku/building-a-movie-recommendation-engine-in-python-using-scikit-learn-c7489d7cb145

'\nA recommendation engine, also known as a recommender system, \nis software that analyzes available data to make suggestions for something that a user might be interested in.\n\nThere are basically four or three types of recommendation engines depending on who you ask:\n1.Content based recommendation engine\n2.Collaborative filtering based recommendation engine\n3.Popularity based recommendation engine\n4.Hybrid recommendation engine\n\nThe engine we will use is called a content based recommendation engine and it is \na recommendation system that takes in a movie that a user likes \nand then analyzes it to get the movies content (e.g. genre, cast, director, keywords, etc.), \nit then ranks the recommended movies based on how similar the recommended movies \nare to the liked movie using something called similarity scores\n'

In [None]:
#Import Libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
#Load the data
# google.colab only exists inside Google Colab, so the import is guarded to keep the
# notebook runnable elsewhere (Jupyter, VS Code, plain scripts).
try:
    from google.colab import files # Use to load data on Google Colab
except ImportError:
    files = None # Not on Colab: movie_dataset.csv must already be in the working directory

# On Colab, prompt the user to upload the CSV; elsewhere there is nothing to upload,
# so `uploaded` is left as an empty dict.
uploaded = files.upload() if files is not None else {}
df = pd.read_csv("movie_dataset.csv")

Saving movie_dataset.csv to movie_dataset.csv


In [None]:
#Preview the first 3 rows of the data set to see which columns are available
df.head(3)

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes


In [None]:
#Dimensions of the data set as (number of rows, number of columns); each row is one movie
df.shape

(4803, 24)

In [None]:
#Columns that describe the content of a movie; only these are used to compare movies
features = ["keywords", "cast", "genres", "director"]

#Preview just the selected columns
df.loc[:, features].head(3)

Unnamed: 0,keywords,cast,genres,director
0,culture clash future space war space colony so...,Sam Worthington Zoe Saldana Sigourney Weaver S...,Action Adventure Fantasy Science Fiction,James Cameron
1,ocean drug abuse exotic island east india trad...,Johnny Depp Orlando Bloom Keira Knightley Stel...,Adventure Fantasy Action,Gore Verbinski
2,spy based on novel secret agent sequel mi6,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,Action Adventure Crime,Sam Mendes


In [None]:
#Clean and preprocess the data:
# replace missing values in every feature column with the empty string,
# so that the columns can later be concatenated as plain text
df[features] = df[features].fillna('')
    

In [None]:
#Combine the values of the important columns into a single space-separated string
def combine_features(row):
    """Join the keywords, cast, genres and director values of `row` with single spaces.

    `row` is anything that can be indexed by those four column names
    (for example a DataFrame row); each value is expected to be a string.
    """
    content_columns = ("keywords", "cast", "genres", "director")
    return " ".join(row[column] for column in content_columns)

In [None]:
#Build the new column 'combined_features' by running combine_features on every row
# (axis="columns" passes each row of the data set to the function in turn)
df["combined_features"] = df.apply(combine_features, axis="columns")

In [None]:
#Preview the first 3 rows again; the frame now ends with the new 'combined_features' column
df.head(3)

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director,combined_features
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron,culture clash future space war space colony so...
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski,ocean drug abuse exotic island east india trad...
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes,spy based on novel secret agent sequel mi6 Dan...


In [None]:
#Convert the combined text of every movie into a matrix of token counts
# (one row per movie, one column per distinct token)
vectorizer = CountVectorizer()
count_matrix = vectorizer.fit_transform(df["combined_features"])

#To inspect the counts as a dense array use: count_matrix.toarray()

In [None]:
#Get the cosine similarity matrix from the count matrix (cos(theta)).
# Entry [i][j] is the cosine similarity between the token-count rows of movie i and movie j.
cosine_sim = cosine_similarity(count_matrix)

#Print the cosine similarity matrix
print(cosine_sim)

[[1.         0.10540926 0.12038585 ... 0.         0.         0.        ]
 [0.10540926 1.         0.0761387  ... 0.03651484 0.         0.        ]
 [0.12038585 0.0761387  1.         ... 0.         0.11145564 0.        ]
 ...
 [0.         0.03651484 0.         ... 1.         0.         0.04264014]
 [0.         0.         0.11145564 ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.04264014 0.         1.        ]]


In [None]:
#Get the number of rows and columns of the similarity matrix (not of the data set)
cosine_sim.shape

(4803, 4803)

In [None]:
#Helper function to get the title from the index
def get_title_from_index(index):
  """Return the title of the movie whose DataFrame index label equals `index`.

  Raises IndexError when no row carries that label.
  """
  matching_titles = df.loc[df.index == index, "title"]
  return matching_titles.values[0]

#Helper function to get the index from the title
def get_index_from_title(title):
  """Return the DataFrame index label of the first movie titled exactly `title`.

  The label is taken from df.index, the same index get_title_from_index looks up,
  rather than from the CSV's "index" column, so the two helpers stay consistent
  even if that column and the row index ever differ.
  NOTE(review): using the result as a row number of the similarity matrix
  assumes df still has its default 0..n-1 index - confirm if df is ever
  filtered or re-sorted.

  Raises IndexError (with the offending title in the message) when no movie
  has that title.
  """
  matches = df.index[(df.title == title).values]
  if len(matches) == 0:
    raise IndexError("No movie titled %r in the data set" % (title,))
  return matches[0]

In [None]:
#Title of the movie that the user likes (it is looked up by exact match against the 'title' column)
movie_user_likes = "Wrong Turn"

#Find that movie's index
movie_index = get_index_from_title(movie_user_likes) 

In [None]:
#Access the row of the similarity matrix that belongs to the liked movie (via the movie's index);
# that row holds the similarity score of every movie, including the liked movie itself, to the liked movie.

#Enumerate that row to pair every similarity score with its movie index.
#  For example a row of scores [1.0 0.6 0.3 0.9] becomes [(0, 1.0) (1, 0.6) (2, 0.3) (3, 0.9)],
#  i.e. each item has the form (movie index, similarity score)
similar_movies =  list(enumerate(cosine_sim[movie_index]))

#Sort the similar movies by similarity score in descending order, leaving out the liked movie itself.
#The liked movie is excluded by its index instead of by dropping the first sorted element:
# when another movie ties with it for the top score (e.g. a movie with identical features),
# the first element after sorting is not guaranteed to be the liked movie.
sorted_similar_movies = sorted(
    (pair for pair in similar_movies if pair[0] != movie_index),
    key=lambda pair: pair[1],
    reverse=True,
)

In [None]:
#Print the 10 most similar movies to the movie the user likes
# (printing the whole list would dump one tuple for nearly every movie in the data set).
# The tuples are in the form (movie_index, similarity value)
print(sorted_similar_movies[:10])

[(4658, 0.31448545101657543), (2088, 0.28867513459481287), (2592, 0.28749445424997294), (2534, 0.28203803740888306), (1786, 0.264575131106459), (116, 0.2592814894208657), (448, 0.25077519565890877), (2149, 0.24999999999999994), (1577, 0.24743582965269678), (2705, 0.24743582965269678), (2187, 0.2456518422202587), (1839, 0.24174688920761406), (2561, 0.23759548165574573), (3342, 0.2364331218717302), (658, 0.2338535866733713), (3560, 0.23145502494313785), (670, 0.22677868380553634), (223, 0.22237479499833035), (3054, 0.21821789023599236), (1215, 0.21428571428571425), (1364, 0.21428571428571425), (2118, 0.21428571428571425), (1490, 0.2105587219030789), (217, 0.20701966780270625), (3813, 0.20701966780270623), (2832, 0.20044593143431824), (758, 0.19702760155977517), (1478, 0.19702760155977517), (1845, 0.19702760155977517), (2329, 0.19702760155977517), (1993, 0.1928791874526149), (114, 0.1889822365046136), (3390, 0.18641092980036), (759, 0.1853123291652753), (512, 0.181848241863327), (971, 0.1

In [None]:
#Print the titles of the first 5 entries from the sorted similar movies list

top_n = 5 #Number of recommendations to show
print("Top "+str(top_n)+" similar movies to "+movie_user_likes+" are:")
#Slicing caps the loop at top_n entries (fewer if the list is shorter), so no manual counter is needed
for movie_idx, _score in sorted_similar_movies[:top_n]:
    print(get_title_from_index(movie_idx))

Top 5 similar movies to Wrong Turn are:
The Frozen
Pulse
Highlander: Endgame
When a Stranger Calls
Flatliners


In [None]:
#Print the first 5 entries from the sorted similar movies list
# together with their similarity scores

top_n = 5 #Number of recommendations to show
print("Top "+str(top_n)+" similar movies to "+movie_user_likes+" are:")
#Each entry is a (movie index, similarity score) tuple; slicing caps the loop at top_n entries,
# so the loop variable never has to be modified inside the loop body
for movie_idx, score in sorted_similar_movies[:top_n]:
    print('Movie title:',get_title_from_index(movie_idx), ', Similarity Score: ', score )

Top 5 similar movies to The Amazing Spider-Man are:
Movie title: The Amazing Spider-Man 2 , Similarity Score:  0.5604485383178051
Movie title: Duma , Similarity Score:  0.2553769592276246
Movie title: Highlander: Endgame , Similarity Score:  0.241522945769824
Movie title: Cold Mountain , Similarity Score:  0.24077170617153842
Movie title: Spider-Man 2 , Similarity Score:  0.24019223070763074
