In [69]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [70]:
data = pd.read_csv("moviedataset.csv")      #https://grouplens.org/datasets/movielens/
data.head(3)

#we have total 23 columns

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes


In [71]:
features = ['keywords','cast','genres','director']

#if we have book recommendation system we can consider title,author, publisher of the book, similarly 
#we can consider " 'keywords','cast','genres','director'" as a input features (content).

In [72]:
def combine_features(row):
 return row['keywords']+" "+row['cast']+" "+row["genres"]+" "+row["director"]

In [73]:
for feature in features:
    data[feature] = data[feature].fillna(' ')

In [74]:
data["combined_features"] = data.apply(combine_features,axis=1)

In [75]:
countv = CountVectorizer()
count_matrix = countv.fit_transform(data["combined_features"])

#Convert a collection of text documents to a matrix of token counts

In [76]:
cosine_sim = cosine_similarity(count_matrix)

#use cosine similarity in X and Y.

#following techniques are available for similarity.
#Jaccard Similarity ☹☹☹
#Different embeddings+ K-means ☹☹
#Different embeddings+ Cosine Similarity ☹
#Word2Vec

In [77]:
def get_title_from_index(index):
    return data[data.index == index]["title"].values[0]

def get_index_from_title(title):
    return data[data.title == title]["index"].values[0]

In [78]:
movie_user_likes = "Avengers: Age of Ultron"
#write your favourite movie here.

movie_index = get_index_from_title(movie_user_likes)
similar_movies = list(enumerate(cosine_sim[movie_index]))

In [79]:
sorted_similar_movies = sorted(similar_movies,key=lambda x:x[1],reverse=True)[1:]

In [80]:
i=0
print("Top 7 content based movies to "+movie_user_likes+" are:\n")
for element in sorted_similar_movies:
    print(get_title_from_index(element[0]))
    i=i+1
    if i>7:
        break
        
#you can print first 7 content based movies.

Top 7 content based movies to Avengers: Age of Ultron are:

The Avengers
Iron Man 2
Captain America: Civil War
Captain America: The Winter Soldier
Thor: The Dark World
Ant-Man
X-Men
X2
