This is the extension of previous simple recommender system where here instead of using lightfm inbuilt dataset we used 
two datasets on movies 
on these we did content-based ,collabourative and hybrid recommender systems

# Content Based Filtering
In this recommender system the content of the movie (overview, cast, crew, keyword, tagline etc) is used to find its similarity with other movies. Then the movies that are most likely to be similar are recommended.

In [5]:
#content based filtering
import pandas as pd 
import numpy as np 
df1=pd.read_csv('tmdb_5000_credits.csv')
df2=pd.read_csv('tmdb_5000_movies.csv')
df1.columns = ['id','tittle','cast','crew']
df2= df2.merge(df1,on='id')
df2['overview'].head(5)
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
df2['overview'] = df2['overview'].fillna('')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(df2['overview'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
#Construct a reverse map of indices and movie titles
indices = pd.Series(df2.index, index=df2['title']).drop_duplicates()
def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return df2['title'].iloc[movie_indices]
get_recommendations('The Dark Knight Rises')

65                              The Dark Knight
299                              Batman Forever
428                              Batman Returns
1359                                     Batman
3854    Batman: The Dark Knight Returns, Part 2
119                               Batman Begins
2507                                  Slow Burn
9            Batman v Superman: Dawn of Justice
1181                                        JFK
210                              Batman & Robin
Name: title, dtype: object

It has done a decent job of finding movies with similar plot descriptions, 
the quality of recommendations is not that great so we improve that using Credits, Genres and Keywords Based Recommender

# Credits, Genres and Keywords Based Recommender

here we are going to build a recommender based on the following metadata:
    the 3 top actors, the director, related genres and the movie plot keywords.so that our recommender accuracy increases.

In [6]:
# Parse the stringified features into their corresponding python objects
from ast import literal_eval

features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    df2[feature] = df2[feature].apply(literal_eval)
#Next, we'll write functions that will help us to extract the required information from each feature.

# Get the director's name from the crew feature. If director is not listed, return NaN
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan
# Returns the list top 3 elements or entire list; whichever is more.
def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        #Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.
        if len(names) > 3:
            names = names[:3]
        return names

    #Return empty list in case of missing/malformed data
    return []
# Define new director, cast, genres and keywords features that are in a suitable form.
df2['director'] = df2['crew'].apply(get_director)

features = ['cast', 'keywords', 'genres']
for feature in features:
    df2[feature] = df2[feature].apply(get_list)
# Print the new features of the first 3 films
df2[['title', 'cast', 'director', 'keywords', 'genres']].head(3)
# Function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''
# Apply clean_data function to your features.
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    df2[feature] = df2[feature].apply(clean_data)
def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])
df2['soup'] = df2.apply(create_soup, axis=1)
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df2['soup'])
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim2 = cosine_similarity(count_matrix, count_matrix)
# Reset index of our main DataFrame and construct reverse mapping as before
df2 = df2.reset_index()
indices = pd.Series(df2.index, index=df2['title'])
get_recommendations('The Dark Knight Rises', cosine_sim2)

65               The Dark Knight
119                Batman Begins
4638    Amidst the Devil's Wings
1196                The Prestige
3073           Romeo Is Bleeding
3326              Black November
1503                      Takers
1986                      Faster
303                     Catwoman
747               Gangster Squad
Name: title, dtype: object

# Collaborative Filtering 

Collaborative filters can further be classified into two types:

User-based Filtering: these systems recommend products to a user that similar users have liked
Item-based Filtering: these systems are extremely similar to the content recommendation engine that you built. 
These systems identify similar items based on how people have rated it in the past. 

In [20]:
#collabourative
from surprise import Reader, Dataset, SVD, evaluate
reader = Reader()
ratings = pd.read_csv('ratings_small.csv')
ratings.head()
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
data.split(n_folds=5)
svd = SVD()
evaluate(svd, data, measures=['RMSE', 'MAE'])
trainset = data.build_full_trainset()
svd.fit(trainset)
svd.predict(1, 302, 3)



Evaluating RMSE, MAE of algorithm SVD.

------------
Fold 1
RMSE: 0.9006
MAE:  0.6904
------------
Fold 2
RMSE: 0.8932
MAE:  0.6866
------------
Fold 3
RMSE: 0.8960
MAE:  0.6928
------------
Fold 4
RMSE: 0.8941
MAE:  0.6894
------------
Fold 5
RMSE: 0.9031
MAE:  0.6966
------------
------------
Mean RMSE: 0.8974
Mean MAE : 0.6912
------------
------------


Prediction(uid=1, iid=302, r_ui=3, est=2.756867286395355, details={'was_impossible': False})

# Hybrid systems

Hybrid Systems can take advantage of 
content-based and collaborative filtering as the two approaches are proved to be almost complimentary.

In [12]:
#hybrid one


def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan
links_small = pd.read_csv('links_small.csv')
md = pd.read_csv('movies_metadata.csv')
md['genres'] = md['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i[
    'name'] for i in x] if isinstance(x, list) else [])
md['year'] = pd.to_datetime(md['release_date'], errors='coerce').apply(
    lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')


md['id'] = md['id'].apply(convert_int)
md[md['id'].isnull()]
md = md.drop([19730, 29503, 35587])
md['id'] = md['id'].astype('int')
smd = md[md['id'].isin(links_small)]

  interactivity=interactivity, compiler=compiler, result=result)


In [21]:

def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan
links_small = pd.read_csv('links_small.csv')
id_map = pd.read_csv('links_small.csv')[['movieId', 'tmdbId']]
id_map['tmdbId'] = id_map['tmdbId'].apply(convert_int)
id_map.columns = ['movieId', 'id']
id_map = id_map.merge(smd[['title', 'id']], on='id').set_index('title')
indices_map = id_map.set_index('id')
def hybrid(userId, title):
    idx = indices[title]
    tmdbId = id_map.loc[title]['id']
    movie_id = id_map.loc[title]['movieId']
    sim_scores = list(enumerate(cosine_sim[int(idx)]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'release_date', 'id']]
    movies['est'] = movies['id'].apply(lambda x: svd.predict(userId, indices_map.loc[x]['movieId']).est)
    movies = movies.sort_values('est', ascending=False)
    return movies.head(10)
hybrid(1, 'Avatar')

Unnamed: 0,title,vote_count,vote_average,release_date,id,est
2149,One Crazy Summer,54.0,6.4,1986-08-08,18282,3.173402
1238,High Noon,343.0,7.6,1952-03-27,288,3.10794
4461,Enemies: A Love Story,11.0,5.7,1989-12-13,116014,2.964289
654,Faithful,4.0,4.8,1996-04-03,47502,2.84995
3720,Steel Magnolias,146.0,7.1,1989-11-15,10860,2.825194
2547,The War of the Worlds,172.0,6.8,1953-08-13,8974,2.82145
746,Heavy,11.0,7.7,1995-01-20,22621,2.811527
3350,Dersu Uzala,90.0,8.0,1975-08-02,9764,2.797765
1613,Alien Escape,2.0,4.5,1996-01-01,29938,2.776404
1020,Looking for Richard,33.0,6.9,1996-10-11,42314,2.744534
