In [1]:
import ast
import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
import pickle
import operator

In [2]:
# Getting data from csv files and making them into dataframes
movies=pd.read_csv('movies.csv')
credits=pd.read_csv('credits.csv')
links=pd.read_csv('links.csv')
ratings=pd.read_csv('ratings.csv')

In [3]:
movies.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [4]:
credits.head()

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [5]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [6]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [7]:
# Merging the dataframes to make df dataframe for movies details
df=movies.merge(credits,how='inner',left_on='id',right_on='movie_id')
df=df.merge(links,how='inner',left_on='id',right_on='tmdbId')
df.rename(columns = {'title_x':'title'}, inplace = True)
df=df.merge(ratings,how='inner',on='movieId')
df.rename(columns = {'id_x':'id'}, inplace = True)

# Getting the year from the release date column
df['year'] = pd.DatetimeIndex(df['release_date']).year

# Selecting columns needed
df=df[['id','title','year','overview','genres','keywords','cast','crew','vote_average','vote_count','movieId']]

# Dropping duplicates 
df.drop_duplicates(subset=['id'], keep='last',inplace=True,ignore_index=True)

In [8]:
df.head()

Unnamed: 0,id,title,year,overview,genres,keywords,cast,crew,vote_average,vote_count,movieId
0,19995,Avatar,2009,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...",7.2,11800,72998
1,285,Pirates of the Caribbean: At World's End,2007,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de...",6.9,4500,53125
2,206647,Spectre,2015,A cryptic message from Bond’s past sends him o...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de...",6.3,4466,136020
3,49026,The Dark Knight Rises,2012,Following the death of District Attorney Harve...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de...",7.6,9106,91529
4,49529,John Carter,2012,"John Carter is a war-weary, former military ca...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de...",6.1,2124,93363


In [9]:
# Checking if any columns contain null values
df.isnull().sum()

id              0
title           0
year            0
overview        0
genres          0
keywords        0
cast            0
crew            0
vote_average    0
vote_count      0
movieId         0
dtype: int64

In [10]:
# Dropping duplicates if any
df.drop_duplicates(subset='title', inplace=True, ignore_index=True)

In [11]:
# Checking if any duplicates exist
df.duplicated().sum()

0

In [12]:
# Merging the dataframes to make ratings dataframe for ratings details
ratings=ratings.merge(df,how='inner',on='movieId')

# Selecting columns needed
ratings=ratings[['id','title','userId','movieId','rating',]]

In [13]:
ratings.head(5)

Unnamed: 0,id,title,userId,movieId,rating
0,819,Sleepers,1,1061,3.0
1,819,Sleepers,19,1061,3.0
2,819,Sleepers,23,1061,3.5
3,819,Sleepers,30,1061,3.0
4,819,Sleepers,70,1061,5.0


In [14]:
# Checking if any columns contain null values
ratings.isnull().sum() 

id         0
title      0
userId     0
movieId    0
rating     0
dtype: int64

In [15]:
# Checking if any duplicates exist
ratings.duplicated().sum()

0

In [16]:
# Checking if all ratings are numeric datatype are not
print(is_numeric_dtype(ratings['rating']))

True


In [17]:
# Checking if all userids are numeric datatype are not
print(is_numeric_dtype(ratings['userId']))

True


In [18]:
# Resetting index 
ratings.reset_index(drop=True, inplace=True)

In [19]:
# Getting names from the dictionaries (For genres and keywords)
def convert(obj):
    list=[]
    for i in ast.literal_eval(obj):
        list.append(i['name'])
    return list

In [20]:
df['genres']=df['genres'].apply(convert)
df['keywords']=df['keywords'].apply(convert)

In [21]:
df.head(1)

Unnamed: 0,id,title,year,overview,genres,keywords,cast,crew,vote_average,vote_count,movieId
0,19995,Avatar,2009,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...",7.2,11800,72998


In [22]:
# Getting top 5 cast names
def convert_cast(obj):
    list=[]
    counter=0
    for i in ast.literal_eval(obj):
        if counter!=5:
            list.append(i['name'])
            counter+=1
        else:
            break
    return list

In [23]:
df['cast']=df['cast'].apply(convert_cast)

In [24]:
df.head(1)

Unnamed: 0,id,title,year,overview,genres,keywords,cast,crew,vote_average,vote_count,movieId
0,19995,Avatar,2009,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weave...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...",7.2,11800,72998


In [25]:
# Getting director name from crew
def get_director(obj):
    list=[]
    for i in ast.literal_eval(obj):
        if i['job']=='Director':
            list.append(i['name'])
            break
    return list

In [26]:
df['crew']=df['crew'].apply(get_director)

In [27]:
df.head(1)

Unnamed: 0,id,title,year,overview,genres,keywords,cast,crew,vote_average,vote_count,movieId
0,19995,Avatar,2009,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weave...",[James Cameron],7.2,11800,72998


In [28]:
# Converting overview string to list
df['overview']=df['overview'].apply(lambda x:x.split())

In [29]:
df.head(5)

Unnamed: 0,id,title,year,overview,genres,keywords,cast,crew,vote_average,vote_count,movieId
0,19995,Avatar,2009,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weave...",[James Cameron],7.2,11800,72998
1,285,Pirates of the Caribbean: At World's End,2007,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley, ...",[Gore Verbinski],6.9,4500,53125
2,206647,Spectre,2015,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux, R...",[Sam Mendes],6.3,4466,136020
3,49026,The Dark Knight Rises,2012,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman, A...",[Christopher Nolan],7.6,9106,91529
4,49529,John Carter,2012,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton,...",[Andrew Stanton],6.1,2124,93363


In [30]:
# Removing spaces between words that are together
df['genres']=df['genres'].apply(lambda x:[i.replace(" ","") for i in x])
df['keywords']=df['keywords'].apply(lambda x:[i.replace(" ","") for i in x])
df['cast']=df['cast'].apply(lambda x:[i.replace(" ","") for i in x])
df['crew']=df['crew'].apply(lambda x:[i.replace(" ","") for i in x])
df['overview']=df['overview'].apply(lambda x:[i.replace(" ","") for i in x])

In [31]:
# Combining all these columns to get a new column
df['tags']=df['cast']+df['crew']+df['genres']+df['keywords']+df['overview']

In [32]:
df.head(1)

Unnamed: 0,id,title,year,overview,genres,keywords,cast,crew,vote_average,vote_count,movieId,tags
0,19995,Avatar,2009,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver, ...",[JamesCameron],7.2,11800,72998,"[SamWorthington, ZoeSaldana, SigourneyWeaver, ..."


In [33]:
# Joining all words in list to a string
df.loc[:,'tags']=df.loc[:,'tags'].apply(lambda x:" ".join(x))

In [34]:
df['tags'].head(3)

0    SamWorthington ZoeSaldana SigourneyWeaver Step...
1    JohnnyDepp OrlandoBloom KeiraKnightley Stellan...
2    DanielCraig ChristophWaltz LéaSeydoux RalphFie...
Name: tags, dtype: object

In [35]:
# Changing the case to lowercase
df.loc[:,'tags']=df.loc[:,'tags'].apply(lambda x:x.lower())

In [36]:
df['tags'].head(3)

0    samworthington zoesaldana sigourneyweaver step...
1    johnnydepp orlandobloom keiraknightley stellan...
2    danielcraig christophwaltz léaseydoux ralphfie...
Name: tags, dtype: object

In [37]:
# Using CountVectorizer and cosine_similarity
cv=CountVectorizer(stop_words='english')
vectors=cv.fit_transform(df['tags']).toarray()
similarity=cosine_similarity(vectors)

In [38]:
# Copying similarity in a pickle object
pickle.dump(similarity,open('similarity.pkl','wb'))

In [39]:
# Copying df in a pickle object by converting it to a dict
pickle.dump(df.to_dict(),open('df_dict.pkl','wb'))

In [40]:
def Content_filtering(movie):
    n = 10
    # Getting index of 'movie'
    movie_index = df[df['title']==movie].index[0]
    #Getting the similarity of 'movie' using it's index
    distances = similarity[movie_index]
    # Sorting it in descending order
    movies_list = sorted(list(enumerate(distances)),reverse=True,key=lambda x:x[1])[1:n+1]
    
    content = []
    print("Recommended Movies:")
    for i in movies_list:
        content.append(df.iloc[i[0]].title)
    return content

In [41]:
Content_filtering('Avatar')

Recommended Movies:


['Titan A.E.',
 'Aliens vs Predator: Requiem',
 'Aliens',
 'Independence Day',
 'Battle: Los Angeles',
 'Predators',
 'Lifeforce',
 "Ender's Game",
 'Small Soldiers',
 'Edge of Tomorrow']

In [42]:
Content_filtering('Batman Begins')

Recommended Movies:


['The Dark Knight',
 'The Dark Knight Rises',
 'Batman',
 'Batman & Robin',
 'Batman v Superman: Dawn of Justice',
 'Batman Forever',
 'Defendor',
 'Batman Returns',
 'Teenage Mutant Ninja Turtles',
 'Brick Mansions']

In [43]:
Content_filtering('Captain America: The First Avenger')

Recommended Movies:


['Captain America: Civil War',
 'Captain America: The Winter Soldier',
 'Saints and Soldiers',
 'The Avengers',
 'Red Tails',
 'The Great Raid',
 'Letters from Iwo Jima',
 'U-571',
 'Tora! Tora! Tora!',
 'Iron Man 3']

In [44]:
Content_filtering('Titanic')

Recommended Movies:


['The Notebook',
 'Ghost Ship',
 'Captain Phillips',
 'Poseidon',
 'The Bounty',
 'Pirates of the Caribbean: On Stranger Tides',
 'Supernova',
 'Triangle',
 'The Black Hole',
 'Master and Commander: The Far Side of the World']

In [45]:
Content_filtering('Enchanted')

Recommended Movies:


['Aladdin',
 'Snow White and the Seven Dwarfs',
 'About Last Night',
 'Return to Never Land',
 'Pocahontas',
 'Into the Woods',
 'Frozen',
 'Vamps',
 'Mirror Mirror',
 'The Princess Bride']

In [46]:
def Popularity_year(movie):
    n = 5
    # Getting index of 'movie'
    movie_index = df[df['title']==movie].index[0]
    # Getting release year of the 'movie'
    needed_year = df['year'][movie_index]
    # Getting movies released in the same year
    result = df[df['year']==needed_year]
    # Sorting by vote_average in descending order
    data = result.sort_values(by='vote_average',ascending=False)
    
    pop_year = []
    counter = 0
    print("{} is released in {}".format(movie, needed_year))
    print("\nRecommended Movies:")
    
    for ind in data.index:
        if counter < n:
            pop_year.append(data["title"][ind])
            counter += 1
    return pop_year

In [47]:
Popularity_year('Avatar')

Avatar is released in 2009

Recommended Movies:


['Inglourious Basterds',
 'The Secret in Their Eyes',
 "Hachi: A Dog's Tale",
 'Up',
 'Moon']

In [48]:
Popularity_year('Batman Returns')

Batman Returns is released in 1992

Recommended Movies:


['Reservoir Dogs',
 'Unforgiven',
 'Glengarry Glen Ross',
 'Aladdin',
 'Army of Darkness']

In [49]:
Popularity_year('Captain America: The First Avenger')

Captain America: The First Avenger is released in 2011

Recommended Movies:


['Samsara',
 'The Help',
 'Warrior',
 'A Separation',
 'We Need to Talk About Kevin']

In [50]:
Popularity_year('Titanic')

Titanic is released in 1997

Recommended Movies:


['Princess Mononoke',
 'Good Will Hunting',
 'Children of Heaven',
 'Dream with the Fishes',
 'Character']

In [51]:
Popularity_year('Enchanted')

Enchanted is released in 2007

Recommended Movies:


['There Will Be Blood',
 'Into the Wild',
 'Elite Squad',
 'The Man from Earth',
 'No Country for Old Men']

In [52]:
def Popularity_based(n):
    if n >= 1 and n <= len(df):
        # Getting movies whose vote_count are more than 5000
        result = df[df['vote_count']>5000]
        # Sorting by vote_average in descending order
        data = result.sort_values(by='vote_average',ascending=False)
        
        pop = []
        counter = 0
        for ind in data.index:
            if counter < n:
                pop.append(data["title"][ind])
                counter += 1
        return pop
    return "Invalid number of movies entered!!"

In [53]:
print("Top 10 Popular movies are: ")
Popularity_based(10)

Top 10 Popular movies are: 


['The Shawshank Redemption',
 'The Godfather',
 'Pulp Fiction',
 'Fight Club',
 'The Dark Knight',
 'The Empire Strikes Back',
 'Forrest Gump',
 'Star Wars',
 'The Lord of the Rings: The Return of the King',
 'Se7en']

In [54]:
def Weighted_mean(n):
    v = df['vote_count'] # number of ratings
    r = df['vote_average']  # rating of the movie
    c = df['vote_average'].mean() # mean rating
    m = df['vote_count'].quantile(0.90) # minimum votes required 
    # adding a new column weighted_mean whose values are calculated using formula (v*r/(v+m)) + (m*c/(m+v))
    df['weighted_mean'] = (v*r/(v+m)) + (m*c/(m+v))
    
    print("Recommended Movies:")
    if n >= 1 and n <= len(df):
        # Sorting by weighted_mean in descending order
        data = df.sort_values('weighted_mean',ascending=False)
        weighted = []
        counter = 0
        for ind in data.index:
            if counter < n:
                weighted.append(data["title"][ind])
                counter += 1
        return weighted
    return "Invalid number of movies entered!!"


In [55]:
Weighted_mean(10)

Recommended Movies:


['The Shawshank Redemption',
 'Fight Club',
 'The Dark Knight',
 'Pulp Fiction',
 'Inception',
 'The Godfather',
 'Interstellar',
 'Forrest Gump',
 'The Lord of the Rings: The Return of the King',
 'The Empire Strikes Back']

In [56]:
# Creating a new dataframe to have the number of ratings a movie has
r = pd.DataFrame(ratings['movieId'].value_counts())
r['count'] = r['movieId']
r['movieId'] = r.index
r.reset_index(level=0, inplace=True)
r = r.drop('index',axis=1)

In [57]:
r.head(10)

Unnamed: 0,movieId,count
0,356,341
1,296,324
2,318,311
3,593,304
4,260,291
5,480,274
6,2571,259
7,1,247
8,527,244
9,589,237


In [58]:
# Merging dataframes so ratings dataframe has 'count' column
ratings=ratings.merge(r,how='inner',on='movieId')
ratings.sort_values(by='userId', axis=0, ascending=True, inplace=True, ignore_index=True)

In [59]:
ratings.head(5)

Unnamed: 0,id,title,userId,movieId,rating,count
0,819,Sleepers,1,1061,3.0,33
1,783,Gandhi,1,1293,2.0,46
2,11072,Blazing Saddles,1,3671,3.0,62
3,1103,Escape from New York,1,1129,2.0,48
4,36819,Time Bandits,1,2968,1.0,43


In [60]:
# Creating a new dataframe so as to groupby using ids
testdf= ratings[['id','userId','rating']]
testdf = testdf[['userId','rating']].groupby(testdf['id'])

In [61]:
# Copying ratings in a pickle object by converting it to a dict
pickle.dump(ratings.to_dict(),open('ratings_dict.pkl','wb'))

In [62]:
listOfDictionaries=[]
indexMap = {}
reverseIndexMap = {}
ptr=0

for groupKey in testdf.groups.keys():
    tempDict={}
    groupDF = testdf.get_group(groupKey)
    for i in range(0,len(groupDF)):
        tempDict[groupDF.iloc[i,0]] = groupDF.iloc[i,1]
    indexMap[ptr]=groupKey
    reverseIndexMap[groupKey] = ptr
    ptr=ptr+1
    listOfDictionaries.append(tempDict)

# using DictVectorizer and cosine_similarity
dictVectorizer = DictVectorizer(sparse=True)
vector = dictVectorizer.fit_transform(listOfDictionaries)
pairwiseSimilarity = cosine_similarity(vector)

In [63]:
def Collaborative_filtering(movie):
    n=10
    k = list(df['title'])
    m = list(df['id'])
    # Getting id of 'movie'
    movie_id=m[k.index(movie)]
    collaborative = []
    
    row = reverseIndexMap[movie_id]
    
    counter = 0
    similar=[]
    similar.append(df[df['id']==movie_id]['title'].values[0])
    for i in np.argsort(pairwiseSimilarity[row])[:-2][::-1]:
          if ratings[ratings['id']==indexMap[i]]['title'].values[0] not in similar:
                if counter<n:
                    counter+=1
                    similar.append(ratings[ratings['id']==indexMap[i]]['title'].values[0])
                    collaborative.append(ratings[ratings['id']==indexMap[i]]['title'].values[0])
                
    return collaborative


In [64]:
Collaborative_filtering('Avatar')

['Iron Man',
 'The Dark Knight',
 'District 9',
 'Star Trek',
 'Sherlock Holmes',
 'Up',
 'The Dark Knight Rises',
 'WALL·E',
 'The Avengers',
 'The Hangover']

In [65]:
Collaborative_filtering('Batman Returns')

['Die Hard 2',
 'Highlander',
 'Lethal Weapon 3',
 'Snake Eyes',
 'Batman',
 'Wild Wild West',
 'The Lost World: Jurassic Park',
 'Indiana Jones and the Temple of Doom',
 'Mars Attacks!',
 'Indiana Jones and the Last Crusade']

In [66]:
Collaborative_filtering('Captain America: The First Avenger')

['Captain America: The Winter Soldier',
 'Thor',
 'Guardians of the Galaxy',
 'Ant-Man',
 'Sherlock Holmes: A Game of Shadows',
 'Star Trek',
 'Iron Man 2',
 'X-Men: First Class',
 'The Avengers',
 'Edge of Tomorrow']

In [67]:
Collaborative_filtering('Titanic')

['The Sixth Sense',
 'The Truman Show',
 'Forrest Gump',
 'Saving Private Ryan',
 'Jerry Maguire',
 "There's Something About Mary",
 'Good Will Hunting',
 'Back to the Future',
 'The Matrix',
 'Shrek']

In [68]:
Collaborative_filtering('Enchanted')

['Stardust',
 'Yours, Mine and Ours',
 'Dreamgirls',
 'The Spectacular Now',
 'Bolt',
 'A Single Man',
 'Flushed Away',
 'Ghost Town',
 'Just Like Heaven',
 'Spider-Man 3']

In [69]:
# Creating a pivot_table with id, userId and ratings
matrix = ratings.pivot_table(index='userId', columns='id', values = 'rating').fillna(0)

In [70]:
matrix

id,5,11,12,13,14,16,18,19,20,22,...,325173,328111,328425,329833,332411,332567,333371,334074,342521,347969
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,3.0,5.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,5.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
668,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
669,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
670,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [71]:
# Creating a new dataframe with rating average and count
avg_rating = pd.DataFrame(ratings.groupby('id')['rating'].mean())
avg_rating['ratingCount'] = pd.DataFrame(ratings.groupby('id')['rating'].count())
avg_rating.sort_values('ratingCount', ascending=False).head(5)

Unnamed: 0_level_0,rating,ratingCount
id,Unnamed: 1_level_1,Unnamed: 2_level_1
13,4.054252,341
680,4.256173,324
278,4.487138,311
274,4.138158,304
11,4.221649,291


In [72]:
# Copying avg_rating in a pickle object by converting it to a dict
pickle.dump(avg_rating.to_dict(),open('avg_rating_dict.pkl','wb'))

In [73]:
def Correlative_filtering(movie):
    n=10
    # Get id of 'movie'
    id = df.loc[df['title'] == movie].reset_index(drop = True).iloc[0]['id']
    row = matrix[id]
    # Get pairwise correlation of 'movie'
    correlation = pd.DataFrame(matrix.corrwith(row), columns = ['Pearson Corr'])
    corr = correlation.join(avg_rating['ratingCount'])
    
    # Sorting by 'Pearson Corr' in descending order
    res = corr.sort_values('Pearson Corr', ascending=False).head(n+1)[1:].index
    correlation=[]
    print("Recommended Movies:")
    for i in res:
        correlation.append(df.loc[df['id'] == i].reset_index(drop = True).iloc[0]['title'])
    
    return correlation


In [74]:
Correlative_filtering('Avatar')

Recommended Movies:


['Iron Man',
 'Inception',
 'Sherlock Holmes',
 'District 9',
 'Star Trek',
 'The Dark Knight',
 'Up',
 'The Dark Knight Rises',
 'The Hangover',
 'The Avengers']

In [75]:
Correlative_filtering('Batman Returns')

Recommended Movies:


['Batman & Robin',
 'Snake Eyes',
 'Highlander',
 'Lethal Weapon 3',
 'Die Hard 2',
 'Wild Wild West',
 'The Lost World: Jurassic Park',
 'From Dusk Till Dawn',
 'Deep Impact',
 'Mars Attacks!']

In [76]:
Correlative_filtering('Captain America: The First Avenger')

Recommended Movies:


['X-Men Origins: Wolverine',
 'Captain America: The Winter Soldier',
 'Thor',
 'Ant-Man',
 'Guardians of the Galaxy',
 'Sherlock Holmes: A Game of Shadows',
 'Iron Man 2',
 'Star Trek',
 'X-Men: First Class',
 'Edge of Tomorrow']

In [77]:
Correlative_filtering('Titanic')

Recommended Movies:


['Men in Black',
 'The Sixth Sense',
 'The Truman Show',
 'Jerry Maguire',
 "There's Something About Mary",
 'Saving Private Ryan',
 "You've Got Mail",
 "Ocean's Eleven",
 'Good Will Hunting',
 "My Best Friend's Wedding"]

In [78]:
Correlative_filtering('Enchanted')

Recommended Movies:


['The Fault in Our Stars',
 'Stardust',
 'Yours, Mine and Ours',
 'Dreamgirls',
 'The Spectacular Now',
 'Bolt',
 'A Single Man',
 'Flushed Away',
 'Ghost Town',
 'The Upside of Anger']

In [79]:
# Creating a new dataframe by using groupby id with id and count as columns
data = (ratings.groupby(by = ['id'])['rating'].count().reset_index().
        rename(columns = {'rating': 'count'})[['id', 'count']])
# Merging data and ratings
result = pd.merge(data, ratings, on='id')
# Creating a pivot_table with result dataframe
matrixn = result.pivot_table(index = 'id', columns = 'userId', values = 'rating').fillna(0)
# Creating a csr_matrix
up_matrix = csr_matrix(matrixn)

In [80]:
def Nearest_neighbours(movie):
    n=10
    # Getting 'id' of movie
    id = df.loc[df['title'] == movie].reset_index(drop = True).iloc[0]['id']
    # Using NearestNeighbors with cosine metric and brute force algorithm
    model = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
    model.fit(up_matrix)
    # Getting distances and indices of n+1 neighbors as the first one will be itself
    distances, indices = model.kneighbors(matrixn.loc[id].values.reshape(1, -1), n_neighbors = n+1)
    neighbours=[]
    print("Recommended Movies:")
    for i in range(0, len(distances.flatten())):
        if i > 0:
            required_id=matrixn.index[indices.flatten()[i]]
            name = df.loc[df['id'] == required_id].reset_index(drop = True).iloc[0]['title']
            neighbours.append(name)
    return neighbours

In [81]:
Nearest_neighbours('Avatar')

Recommended Movies:


['Inception',
 'Iron Man',
 'The Dark Knight',
 'District 9',
 'Star Trek',
 'Sherlock Holmes',
 'Up',
 'The Dark Knight Rises',
 'WALL·E',
 'The Avengers']

In [82]:
Nearest_neighbours('Batman Returns')

Recommended Movies:


['Batman & Robin',
 'Die Hard 2',
 'Highlander',
 'Lethal Weapon 3',
 'Snake Eyes',
 'Batman',
 'Wild Wild West',
 'The Lost World: Jurassic Park',
 'Indiana Jones and the Temple of Doom',
 'Mars Attacks!']

In [83]:
Nearest_neighbours('Captain America: The First Avenger')

Recommended Movies:


['X-Men Origins: Wolverine',
 'Captain America: The Winter Soldier',
 'Thor',
 'Guardians of the Galaxy',
 'Ant-Man',
 'Sherlock Holmes: A Game of Shadows',
 'Star Trek',
 'Iron Man 2',
 'X-Men: First Class',
 'The Avengers']

In [84]:
Nearest_neighbours('Titanic')

Recommended Movies:


['Men in Black',
 'The Sixth Sense',
 'The Truman Show',
 'Forrest Gump',
 'Saving Private Ryan',
 'Jerry Maguire',
 "There's Something About Mary",
 'Good Will Hunting',
 'Back to the Future',
 'The Matrix']

In [85]:
Nearest_neighbours('Enchanted')

Recommended Movies:


['The Fault in Our Stars',
 'Stardust',
 'Yours, Mine and Ours',
 'Dreamgirls',
 'The Spectacular Now',
 'Bolt',
 'A Single Man',
 'Flushed Away',
 'Ghost Town',
 'Just Like Heaven']

In [86]:
def Hybrid_model(movie):
    n=10
    z = []
    k = float(1/n)
    for x in range(n):
          z.append(1-k*x)

    content=Content_filtering(movie)
    collab=Collaborative_filtering(movie)
    dictid = {}
    
    for x in collab:
          dictid[x] = z[collab.index(x)]

    for x in content:
        if x not in dictid:
            dictid[x] = z[content.index(x)]
        else:
            dictid[x] += z[content.index(x)]
    
    # Sorting dictid with z in descending order
    id = dict(sorted(dictid.items(),key=operator.itemgetter(1),reverse=True))
    counter=0
    hybrid=[]
    for x in id.keys():
        if counter<n:
            hybrid.append(x)
            counter+=1
    return hybrid

In [87]:
Hybrid_model('Avatar')

Recommended Movies:


['Iron Man',
 'Titan A.E.',
 'The Dark Knight',
 'Aliens vs Predator: Requiem',
 'District 9',
 'Aliens',
 'Star Trek',
 'Independence Day',
 'Sherlock Holmes',
 'Battle: Los Angeles']

In [88]:
Hybrid_model('Batman Returns')

Recommended Movies:


['Batman',
 'Die Hard 2',
 'The Dark Knight Rises',
 'Highlander',
 'Lethal Weapon 3',
 'The Dark Knight',
 'Snake Eyes',
 'Batman: The Dark Knight Returns, Part 2',
 'Batman Begins',
 'Wild Wild West']

In [89]:
Hybrid_model('Captain America: The First Avenger')

Recommended Movies:


['Captain America: The Winter Soldier',
 'Captain America: Civil War',
 'Thor',
 'The Avengers',
 'Guardians of the Galaxy',
 'Saints and Soldiers',
 'Ant-Man',
 'Sherlock Holmes: A Game of Shadows',
 'Red Tails',
 'Star Trek']

In [90]:
Hybrid_model('Titanic')

Recommended Movies:


['The Sixth Sense',
 'The Notebook',
 'The Truman Show',
 'Ghost Ship',
 'Forrest Gump',
 'Captain Phillips',
 'Saving Private Ryan',
 'Poseidon',
 'Jerry Maguire',
 'The Bounty']

In [91]:
Hybrid_model('Enchanted')

Recommended Movies:


['Stardust',
 'Aladdin',
 'Yours, Mine and Ours',
 'Snow White and the Seven Dwarfs',
 'Dreamgirls',
 'About Last Night',
 'The Spectacular Now',
 'Return to Never Land',
 'Bolt',
 'Pocahontas']