In [None]:
# Databricks notebook source

# MAGIC %md
# MAGIC # Camada Gold
# MAGIC
# MAGIC Criando tabelas de fatos e dimensões para análise e BI.

# COMMAND ----------

# DBTITLE 1,Definir o caminho da Camada Gold
# Base path of the Gold layer for the movies database.
# NOTE(review): none of the cells visible here read this variable (the tables
# are saved by name via saveAsTable) — confirm it is used elsewhere.
gold_path = "/mnt/gold/movies_db"

# COMMAND ----------

# DBTITLE 1,Carregar as tabelas da camada Silver
# Load every Silver-layer table consumed by the Gold layer. The tables are read
# one by one, in the order listed, and bound to their module-level names.
(
    df_movies_silver,
    df_genres_silver,
    df_directors_silver,
    df_actors_silver,
    df_countries_silver,
) = (
    spark.read.format("delta").table(silver_table)
    for silver_table in (
        "silver.movies",
        "silver.genres",
        "silver.directors",
        "silver.actors",
        "silver.countries",
    )
)

# COMMAND ----------

# DBTITLE 1,Dimensão: Filmes (dim_movies)
# Movie dimension: the descriptive attributes of each movie, taken from the
# Silver movies table.
dim_movies = df_movies_silver.select(["rank", "title", "year", "runtime_mins"])

# Replace the contents of gold.dim_movies with the freshly built dimension.
dim_movies.write.saveAsTable("gold.dim_movies", format="delta", mode="overwrite")

# Render the dimension in the notebook output.
display(dim_movies)

# COMMAND ----------

# DBTITLE 1,Fato: Avaliações e Bilheteria (fact_ratings_boxoffice)
# Ratings / box-office fact: the numeric measures of each movie, carrying the
# "rank" column alongside them.
fact_rating_columns = [
    "rank",
    "imdb_rating",
    "rotten_tomatoes_pct",
    "metacritic_score",
    "oscars_won",
    "box_office_million",
]
fact_ratings = df_movies_silver.select(*fact_rating_columns)

# Replace the contents of gold.fact_ratings_boxoffice with the new fact rows.
fact_ratings.write.saveAsTable(
    "gold.fact_ratings_boxoffice", format="delta", mode="overwrite"
)

# Render the fact table in the notebook output.
display(fact_ratings)

# COMMAND ----------

# DBTITLE 1,Visão Agregada: Média de Avaliações por Gênero
# Aggregated view: average of each rating metric per genre.
rating_metrics = ("imdb_rating", "rotten_tomatoes_pct", "metacritic_score")

# Attach each genre row to its movie (genres.movie_rank == movies.rank) using
# the default join type.
movies_by_genre = df_genres_silver.join(
    df_movies_silver,
    df_genres_silver.movie_rank == df_movies_silver.rank,
)

# Dict-style aggregation yields columns named "avg(<metric>)".
df_avg_ratings_by_genre = movies_by_genre.groupBy("genre").agg(
    {metric: "avg" for metric in rating_metrics}
)

# Give the generated "avg(<metric>)" columns stable "avg_<metric>" names.
for metric in rating_metrics:
    df_avg_ratings_by_genre = df_avg_ratings_by_genre.withColumnRenamed(
        f"avg({metric})", f"avg_{metric}"
    )

# Replace the contents of gold.avg_ratings_by_genre and show the result.
df_avg_ratings_by_genre.write.saveAsTable(
    "gold.avg_ratings_by_genre", format="delta", mode="overwrite"
)
display(df_avg_ratings_by_genre)