In [51]:
#importing pandas
import pandas as pd
# Load the movie metadata table into `data`.
# Source: https://www.kaggle.com/rounakbanik/movie-recommender-systems/data
# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, which avoids mixed-dtype inference within a column.
data = pd.read_csv(filepath_or_buffer="movies_metadata.csv", low_memory=False)

In [52]:
""" 
In order to calculate a score for each movie based on
its vote average and vote count, we will use the
weighted rating formula ' WR = (v/(v+m) * R) + (m/(v+m) * C) ', where
v = number of votes for the movie, m = minimum votes required to be ranked,
R = average rating of the movie, C = mean rating across all movies in the dataset
"""
# C: mean of "vote_average" over every movie in the dataset.
# Missing ratings are skipped (skipna=True is the pandas default, spelled out here).
C = data["vote_average"].mean(skipna=True)
print(C)

5.618207215134185


In [53]:
# m: vote-count cutoff used for filtering, taken as the 95th percentile
# of "vote_count" across the dataset.
m = data["vote_count"].quantile(q=0.95)
print(m)

434.0


In [54]:
# Keep only the movies whose vote count reaches the cutoff m.
# Filter first and copy afterwards: this copies just the surviving rows
# instead of duplicating the whole dataset before discarding most of it.
# The explicit .copy() still gives an independent frame, so columns can be
# added to filtered_data later without touching `data`.
filtered_data = data.loc[data["vote_count"] >= m].copy()
filtered_data.shape

(2274, 24)

In [55]:
#Define Weighted Rating function
def weighted_rating(dataframe,m=m,C=C):
    """Compute the weighted rating WR = (v/(v+m) * R) + (m/(v+m) * C).

    Args:
        dataframe: Any object indexable by the keys "vote_count" and
            "vote_average", e.g. a single DataFrame row or a whole DataFrame.
        m: Minimum-votes term of the formula. Defaults to the value the
            global ``m`` had when this function was defined.
        C: Mean-vote term of the formula. Defaults to the value the
            global ``C`` had when this function was defined.

    Returns:
        The weighted rating, with the same shape as the indexed
        "vote_count" / "vote_average" values.
    """
    votes = dataframe["vote_count"]
    rating = dataframe["vote_average"]
    # Shared denominator of both weights in the formula.
    total = votes + m
    # The movie's own rating weighted by v/(v+m), plus the mean C weighted by m/(v+m).
    return (votes / total * rating) + (m / total * C)

In [56]:
# Compute the weighted rating for every movie and store it in a new "score" column.
# weighted_rating only does elementwise arithmetic on the "vote_count" and
# "vote_average" columns, so it is called once on the whole frame instead of
# once per row through DataFrame.apply(axis=1).
filtered_data["score"] = weighted_rating(filtered_data)

In [57]:
# Rank the filtered movies from highest to lowest score
filtered_data = filtered_data.sort_values(by="score", ascending=False)
# Show the 20 highest-scoring movies together with the inputs behind each score
filtered_data.loc[:, ["title", "vote_count", "vote_average", "score"]].head(20)

Unnamed: 0,title,vote_count,vote_average,score
314,The Shawshank Redemption,8358.0,8.5,8.357746
834,The Godfather,6024.0,8.5,8.306334
12481,The Dark Knight,12269.0,8.3,8.208376
2843,Fight Club,9678.0,8.3,8.184899
292,Pulp Fiction,8670.0,8.3,8.172155
351,Forrest Gump,8147.0,8.2,8.069421
522,Schindler's List,4436.0,8.3,8.061007
23673,Whiplash,4376.0,8.3,8.058025
5481,Spirited Away,3968.0,8.3,8.035598
1154,The Empire Strikes Back,5998.0,8.2,8.025793
