In [None]:
import pandas as pd 
import numpy as np

# Read the TMDB 5000 CSV files: df1 holds the credits file, df2 the movies file
df1 = pd.read_csv('../input/tmdb-movie-metadata/tmdb_5000_credits.csv')
df2 = pd.read_csv('../input/tmdb-movie-metadata/tmdb_5000_movies.csv')

In [None]:
# Give the credits table fixed column names, then join it onto the movies table by 'id'.
# NOTE(review): 'tittle' (sic) is presumably spelled this way so the merge does not
# clash with the 'title' column that later cells read from df2 — confirm before renaming.
# Run this cell once per kernel: it rebinds df2, so re-running it would merge the
# credits columns in a second time.
df1.columns = ['id', 'tittle', 'cast', 'crew']
df2 = pd.merge(df2, df1, on='id')

In [None]:
# Preview the first five rows of the merged table
df2.head(n=5)

Demographic Filtering
---------------------

In [None]:
# C: mean of 'vote_average' over every movie in df2
C = df2['vote_average'].mean()
C

In [None]:
# m: 90th percentile of 'vote_count', used as the minimum number of votes a movie must have
m = df2['vote_count'].quantile(q=0.9)
m

In [None]:
# Keep only the movies with at least m votes. Filter first and copy the filtered
# result (instead of copying all of df2 and then discarding most of it), so that
# q_movies is an independent frame that later cells can add columns to without
# touching df2.
q_movies = df2.loc[df2['vote_count'] >= m].copy()
q_movies.shape

IMDB's weighted rating (WR)
https://tutorialedge.net/python/building-imdb-top-250-clone-pandas/#the-metric

$$WR = \frac{v}{v+m} \cdot R + \frac{m}{v+m} \cdot C$$

where

- v is the number of votes for the movie;
- m is the minimum number of votes required to be listed in the chart;
- R is the average rating of the movie; and
- C is the mean vote across the whole report.

In [None]:
def weighted_rating(x, m=m, C=C):
    """Compute the IMDB-style weighted rating ``v/(v+m) * R + m/(v+m) * C``.

    Args:
        x: Object indexable by column name (for example one DataFrame row)
            that provides ``'vote_count'`` (v) and ``'vote_average'`` (R).
        m: Minimum-votes threshold. Defaults to the value the module-level
            ``m`` had when this function was defined.
        C: Mean vote. Defaults to the value the module-level ``C`` had when
            this function was defined.

    Returns:
        The weighted rating computed from the two fields of ``x``.
    """
    votes = x['vote_count']
    rating = x['vote_average']
    total = votes + m
    # The movie's own rating and the global mean C, each weighted by its share of (v + m)
    return (votes / total * rating) + (m / total * C)

In [None]:
# Define a new feature 'score' and calculate its value with `weighted_rating()`.
# The function only does column lookups and element-wise arithmetic, so it is
# called once on the whole frame rather than once per row via `apply(axis=1)`;
# the resulting values are the same.
q_movies['score'] = weighted_rating(q_movies)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Rank the qualified movies from the highest to the lowest score
q_movies = q_movies.sort_values(by='score', ascending=False)

# Show the 15 best-scored movies with the columns relevant to the ranking
q_movies.head(15)[['title', 'vote_count', 'vote_average', 'score', 'popularity']]

In [None]:
# Horizontal bar chart of the first 15 rows of q_movies
# (assumes q_movies has already been sorted by 'score', highest first)
plt.figure(figsize=(12, 4))
plt.barh(
    q_movies['title'].iloc[:15],
    q_movies['score'].iloc[:15],
    align='center',
    color='orange',
)
plt.gca().invert_yaxis()  # put the first row at the top of the chart
plt.xlabel("Score")
plt.title("Trending Movies")

In [None]:
# Order every movie by its 'popularity' value, most popular first
pop = df2.sort_values(by='popularity', ascending=False)

# Horizontal bar chart of the 15 most popular movies
plt.figure(figsize=(12, 4))
plt.barh(
    pop['title'].iloc[:15],
    pop['popularity'].iloc[:15],
    align='center',
    color='lightgreen',
)
plt.gca().invert_yaxis()  # put the most popular movie at the top of the chart
plt.xlabel("Popularity")
plt.title("Popular Movies")

Content Based Filtering
-----------------------

In this recommender system, the content of a movie (overview, cast, crew, keywords, tagline, etc.) is used to measure its similarity to other movies. The movies that are most similar to the given one are then recommended.

In [None]:
# Look at the first ten plot overviews
df2['overview'].head(n=10)

In [None]:
# Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

# Replace NaN overviews with an empty string so the vectorizer only receives text
df2['overview'] = df2['overview'].fillna('')

# TF-IDF vectorizer that drops English stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary from the overviews and turn them into the TF-IDF matrix
tfidf_matrix = tfidf.fit_transform(df2['overview'])

# Shape is (number of overviews, number of vocabulary terms)
tfidf_matrix.shape

In [None]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Pairwise dot products between all overview vectors (Y defaults to X).
# NOTE(review): this equals cosine similarity only while the TF-IDF rows are
# L2-normalised, which is TfidfVectorizer's default (norm='l2') — revisit if
# the vectorizer settings change.
cosine_sim = linear_kernel(tfidf_matrix)

In [None]:
# Construct a reverse map from movie title to its row label in df2.
# `Series.drop_duplicates()` compares the *values* (the row labels), not the
# titles in the index, so it left repeated titles in place and `indices[title]`
# could return a Series instead of a single number. De-duplicate on the index
# instead, keeping the first movie for each title.
indices = pd.Series(df2.index, index=df2['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [None]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim):
    """Return the movies most similar to ``title``.

    Args:
        title: Movie title to look up in the module-level ``indices`` map.
        cosine_sim: Square pairwise-similarity matrix; row ``i`` holds the
            similarity of movie ``i`` to every movie. Defaults to the
            module-level ``cosine_sim`` as it was when this function was defined.

    Returns:
        A DataFrame with the ``'title'``, ``'release_date'`` and ``'id'``
        columns of up to ten rows of ``df2``, ordered from most to least
        similar and never containing the queried movie itself. If the title
        is not found, the string
        ``'Cannot find the index of the movie that matches the title'``.

    Note:
        The looked-up value is used as a positional index into both
        ``cosine_sim`` and ``df2.iloc``, which assumes ``df2`` has a default
        0..n-1 index — TODO confirm if ``df2`` is ever re-indexed.
    """
    not_found = 'Cannot find the index of the movie that matches the title'

    try:
        # Get the index of the movie that matches the title
        idx = indices[title]
    except (KeyError, TypeError):
        # Only a failed lookup is reported this way; other errors propagate
        return not_found

    # A title that occurs more than once in `indices` yields a Series of
    # positions; use the first match instead of letting the ranking below fail.
    if isinstance(idx, pd.Series):
        if idx.empty:
            return not_found
        idx = idx.iloc[0]
    idx = int(idx)

    # Pairwise similarity scores of every *other* movie with that movie.
    # The movie is excluded explicitly rather than by dropping the first
    # sorted entry, which is not guaranteed to be the movie itself when
    # scores tie (e.g. identical or empty overviews).
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Sort by similarity, highest first (stable, so ties keep row order)
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Row positions of the (at most) 10 most similar movies
    movie_indices = [position for position, _ in sim_scores[:10]]

    # Return the top 10 most similar movies
    return df2[['title', 'release_date', 'id']].iloc[movie_indices]

In [None]:
# Example: movies whose overviews are closest to 'Spider-Man'
get_recommendations(title='Spider-Man')

In [None]:
# Example: movies whose overviews are closest to 'Superman'
get_recommendations(title='Superman')