In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

### V2

In [2]:
# Location of the movies metadata CSV, relative to the notebook's working directory.
MOVIES_CSV_PATH = '../datasets/movies_metadata.csv'

# low_memory=False makes pandas infer each column's dtype from the whole file in a
# single pass instead of chunk by chunk, which avoids the mixed-type DtypeWarning
# raised when different chunks of the same column are inferred differently.
movies_df_raw = pd.read_csv(MOVIES_CSV_PATH, low_memory=False)

  movies_df_raw = pd.read_csv('../datasets/movies_metadata.csv')


In [3]:
# List every column of the raw metadata before choosing the subset to keep.
available_columns = movies_df_raw.columns
print(f"Movies columns: {available_columns}")

Movies columns: Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')


In [4]:
# Columns carried forward: identifier, title, genres and the popularity/vote signals.
movies_columns = ['id', 'title', 'genres', 'popularity', 'vote_average', 'vote_count']

# Work on an independent copy so later column assignments never touch the raw frame.
movies_df = movies_df_raw.loc[:, movies_columns].copy()

In [5]:
# Weighted (Bayesian-average) rating for each movie:
#
#     weighted rating (WR) = (v / (v + m)) * R + (m / (v + m)) * C
#
# where:
#   R = average rating of the movie
#   v = number of votes for the movie
#   C = mean rating across all movies
#   m = minimum number of votes required to be listed
#       (here: the 80th percentile of the vote counts)
R = movies_df['vote_average']
v = movies_df['vote_count']

C = R.mean()
m = v.quantile(0.80)

# Same formula as above, written over the common denominator (v + m).
movies_df['vote_weight'] = (R * v + C * m) / (v + m)

movies_df.head()

Unnamed: 0,id,title,genres,popularity,vote_average,vote_count,vote_weight
0,862,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",21.946943,7.7,5415.0,7.680953
1,8844,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",17.015539,6.9,2413.0,6.873979
2,15602,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",11.7129,6.5,92.0,6.18951
3,31357,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",3.859495,6.1,34.0,5.813219
4,11862,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",8.387519,5.7,173.0,5.681661


In [6]:
# Coerce popularity to numbers: entries that cannot be parsed become NaN,
# and every NaN is then replaced with 0.
popularity_numeric = pd.to_numeric(movies_df['popularity'], errors='coerce')
movies_df['popularity'] = popularity_numeric.fillna(0)

In [7]:
# Rescale both ranking signals with min-max scaling so they share a common range
# before being blended into a single score.
scaled_columns = ['popularity', 'vote_weight']
scaler = MinMaxScaler()
scaled = scaler.fit_transform(movies_df[scaled_columns])

# Reuse movies_df's index for the scaled frame: the 'id' assignment below aligns on
# index labels, so a fresh 0..n-1 index would silently attach the wrong ids (or NaN)
# if movies_df were ever filtered or re-indexed before this cell.
movies_scaled_df = pd.DataFrame(scaled, columns=scaled_columns, index=movies_df.index)
movies_scaled_df['id'] = movies_df['id']
movies_scaled_df.head()

Unnamed: 0,popularity,vote_weight,id
0,0.040087,0.812253,862
1,0.031079,0.683222,8844
2,0.021394,0.573779,15602
3,0.007049,0.513612,31357
4,0.01532,0.492577,11862


In [8]:
# Blend the two scaled signals into one ranking score. Popularity deliberately
# receives the larger share of the weight.
VOTE_WEIGHT_SHARE = 0.4
POPULARITY_SHARE = 0.6

movies_scaled_df['score'] = (
    movies_scaled_df['vote_weight'] * VOTE_WEIGHT_SHARE
    + movies_scaled_df['popularity'] * POPULARITY_SHARE
)

In [9]:
# Order all movies from highest to lowest score, then keep the first ten.
movies_ranked = movies_scaled_df.sort_values(by='score', ascending=False)
top_10_movies = movies_ranked.head(10)
top_10_movies

Unnamed: 0,popularity,vote_weight,id,score
30700,1.0,0.606128,211672,0.842451
33356,0.537613,0.732859,297762,0.615712
42222,0.524675,0.6697,321612,0.582685
24455,0.390602,0.828536,177572,0.565775
43644,0.416507,0.729422,339403,0.541673
26566,0.338511,0.796081,283995,0.521539
292,0.257449,0.908776,680,0.51798
26564,0.343132,0.766091,293660,0.512315
23675,0.282748,0.844273,210577,0.507358
12481,0.224968,0.909494,155,0.498778


In [10]:
# Attach the metadata (title, genres, ...) to the ranked movies.
# The right-hand side is de-duplicated on 'id' first, keeping the first occurrence,
# so an id that appears more than once in movies_df cannot fan a ranked movie out
# into several rows. how='left' keeps the rows in ranking order.
movies_unique_by_id = movies_df.drop_duplicates(subset='id')
top_10_movies_details = top_10_movies.merge(movies_unique_by_id, on='id', how='left')

# One merged row per ranked movie, so a single pass over the merged frame is enough;
# there is no need to re-filter it by id for every row.
for title, score in zip(top_10_movies_details['title'], top_10_movies_details['score']):
    print(f"Title: {title}. Score: {score}.")

Title: Minions. Score: 0.842451046393174.
Title: Wonder Woman. Score: 0.6157117081931772.
Title: Beauty and the Beast. Score: 0.58268526610886.
Title: Big Hero 6. Score: 0.565775474646886.
Title: Baby Driver. Score: 0.5416731504614627.
Title: Guardians of the Galaxy Vol. 2. Score: 0.5215390437551555.
Title: Pulp Fiction. Score: 0.5179797763629913.
Title: Deadpool. Score: 0.5123151381315753.
Title: Gone Girl. Score: 0.5073578806636065.
Title: The Dark Knight. Score: 0.4987784941949063.
