In [34]:
#importing pandas
import pandas as pd
#Load the movie metadata CSV into the DataFrame `data`
#Dataset source: https://www.kaggle.com/rounakbanik/movie-recommender-systems/data
#low_memory=False parses the file in a single pass instead of in chunks
data = pd.read_csv(
    "movies_metadata.csv",
    low_memory=False,
)

In [35]:
"""
To score each movie from its vote average and vote count, we use the
weighted rating formula:

    WR = (v / (v + m)) * R + (m / (v + m)) * C

where
    v = number of votes the movie received,
    m = minimum number of votes required for a movie to be kept,
    R = average rating of the movie,
    C = mean vote average across all movies in the dataset.
"""
# C: mean of the vote_average column taken over every row of the dataset
C = data.loc[:, "vote_average"].mean()
print(C)

5.618207215134185


In [36]:
# m: vote-count threshold used for filtering, taken as the 0.95 quantile
# (95th percentile) of the vote_count column
m = data["vote_count"].quantile(q=0.95)
print(m)

434.0


In [30]:
# Keep only the movies whose vote count reaches the threshold m.
# Filtering first and copying afterwards duplicates just the selected rows
# (not the whole dataset) and still yields an independent DataFrame, so
# columns added to filtered_data later never touch `data`.
filtered_data = data.loc[data["vote_count"] >= m].copy()
filtered_data.shape

(2274, 24)

In [37]:
#Weighted rating: WR = (v/(v+m) * R) + (m/(v+m) * C)
def weighted_rating(dataframe, m=m, C=C):
    """Return the weighted rating computed from vote count and vote average.

    dataframe -- object supporting lookup by the keys "vote_count" (v) and
                 "vote_average" (R), e.g. a single row as passed by
                 DataFrame.apply(..., axis=1).
    m         -- minimum-votes threshold; the default is the value the global
                 `m` had when this function was defined.
    C         -- mean vote; the default is the value the global `C` had when
                 this function was defined.

    Because the defaults are bound at definition time, this cell must be
    re-run after `m` or `C` is recomputed for the new values to take effect.
    """
    votes = dataframe["vote_count"]
    rating = dataframe["vote_average"]
    # Both terms share the same denominator, v + m
    total = votes + m
    movie_term = (votes / total) * rating
    prior_term = (m / total) * C
    return movie_term + prior_term

In [44]:
#Compute the weighted rating of every movie and store it in a new column named score.
#weighted_rating only does column lookups and arithmetic, so it is called once on the
#whole DataFrame (vectorized) instead of once per row through apply(axis=1).
filtered_data["score"] = weighted_rating(filtered_data)

In [50]:
#Display the weighted rating of each movie in filtered_data.
#One vectorized call on the whole DataFrame replaces the row-by-row apply(axis=1).
weighted_rating(filtered_data)

314      8.357746
834      8.306334
12481    8.208376
2843     8.184899
292      8.172155
351      8.069421
522      8.061007
23673    8.058025
5481     8.035598
1154     8.025793
15480    8.025763
2211     8.014521
18465    8.008265
22879    8.007315
1178     7.997846
7000     7.975624
289      7.962958
1152     7.961165
3030     7.956413
256      7.950652
46       7.930352
1170     7.892593
1176     7.890032
4863     7.889160
2216     7.884722
586      7.883845
5814     7.871988
4099     7.865950
1213     7.850902
1057     7.846863
           ...   
17272    5.175344
12284    5.161495
32891    5.137216
1436     5.135635
6019     5.135165
20598    5.133218
12699    5.122327
16190    5.119476
11445    5.118892
24438    5.116460
6417     5.115539
10010    5.098901
21018    5.089048
13803    5.054632
42096    5.035323
35220    5.019312
30556    5.004458
20791    4.954372
18609    4.949532
15422    4.946903
23110    4.941703
20314    4.888242
1486     4.854756
7939     4.685795
18101    4

In [33]:
#Order the filtered movies from highest to lowest score
filtered_data = filtered_data.sort_values(by="score", ascending=False)
#Show title, vote statistics and score for the 20 best-scored movies
filtered_data.head(20)[["title", "vote_count", "vote_average", "score"]]

Unnamed: 0,title,vote_count,vote_average,score
314,The Shawshank Redemption,8358.0,8.5,8.357746
834,The Godfather,6024.0,8.5,8.306334
12481,The Dark Knight,12269.0,8.3,8.208376
2843,Fight Club,9678.0,8.3,8.184899
292,Pulp Fiction,8670.0,8.3,8.172155
351,Forrest Gump,8147.0,8.2,8.069421
522,Schindler's List,4436.0,8.3,8.061007
23673,Whiplash,4376.0,8.3,8.058025
5481,Spirited Away,3968.0,8.3,8.035598
1154,The Empire Strikes Back,5998.0,8.2,8.025793
