In [None]:
# DBTITLE 1,Definir os caminhos
# Path named for the bronze layer; no cell visible in this notebook reads it.
bronze_path = "/mnt/bronze/movies_db"
# Where the CSV file was uploaded. Adjust if the file lives somewhere else.
landing_zone_path = "/Volumes/workspace/default/landing/top_100_movies_full_best_effort.csv"

# Create the `bronze` database up front so the saveAsTable calls in later
# cells have a target; IF NOT EXISTS makes re-running this cell a no-op.
spark.sql("CREATE DATABASE IF NOT EXISTS bronze")

In [None]:
# DBTITLE 1,Carregar o CSV em um DataFrame
# Load the landing-zone CSV into a DataFrame.
#   header=true      -> the first line supplies the column names
#   inferSchema=true -> Spark derives each column's type from the data
df_movies_raw = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true")
    .load(landing_zone_path)
)

display(df_movies_raw)

In [None]:
# DBTITLE 1,Tabela 1: Filmes (Movies)
from pyspark.sql.functions import col

# CSV header -> bronze column name, listed in output column order.
movie_column_aliases = [
    ("Rank", "rank"),
    ("Title", "title"),
    ("Year", "year"),
    ("Runtime (mins)", "runtime_mins"),
    ("IMDb Rating", "imdb_rating"),
    ("Rotten Tomatoes %", "rotten_tomatoes_pct"),
    ("Metacritic Score", "metacritic_score"),
    ("Oscars Won", "oscars_won"),
    ("Box Office ($M)", "box_office_million"),
]

# Project the per-movie columns out of the raw frame, renaming each one
# to its snake_case alias.
df_movies = df_movies_raw.select(
    *[
        col(source_name).alias(target_name)
        for source_name, target_name in movie_column_aliases
    ]
)

# Save as a Delta table; "overwrite" replaces whatever bronze.movies held.
(
    df_movies.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("bronze.movies")
)

display(df_movies)

In [None]:
# DBTITLE 1,Tabela 2: Gêneros (Genres)
from pyspark.sql.functions import explode, split, col

# split() takes a regular expression, so the pipe is escaped ("\\|") to
# match a literal "|" between genre names.
genre_array = split(col("Genre(s)"), "\\|")

# One output row per (movie, genre) pair. explode() emits no row for a null
# array, so a movie whose Genre(s) is null does not appear in this table.
df_genres = df_movies_raw.select(
    col("Rank").alias("movie_rank"),
    explode(genre_array).alias("genre"),
)

# Save as a Delta table; "overwrite" replaces whatever bronze.genres held.
(
    df_genres.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("bronze.genres")
)

display(df_genres)

In [None]:
# DBTITLE 1,Tabela 3: Diretores (Directors)
from pyspark.sql.functions import col, explode_outer, split

# The other multi-valued attributes in this notebook (Genre(s), Main Actor(s),
# Country) are split on "|" and exploded into one row per value. Director was
# the only one copied verbatim, so a co-directed film would land as a single
# concatenated string. Handle it the same way as its siblings.
# NOTE(review): assumes Director uses the same "|" delimiter as the other
# list columns -- confirm against the CSV.
#
# Backward compatibility with the previous plain select:
#   * a value without "|" splits into a one-element array -> exactly one row
#     carrying the unchanged name;
#   * explode_outer (unlike explode) still emits a row with a null
#     director_name when Director is null, so no movie disappears.
df_directors = df_movies_raw.select(
    col("Rank").alias("movie_rank"),
    explode_outer(split(col("Director"), "\\|")).alias("director_name"),
)

# Save as a Delta table; "overwrite" replaces whatever bronze.directors held.
df_directors.write.format("delta").mode("overwrite").saveAsTable("bronze.directors")

display(df_directors)

In [None]:
# DBTITLE 1,Tabela 4: Atores (Actors)
from pyspark.sql.functions import explode, split, col

# Column expressions for the actors table, built first and selected below.
# "\\|" is the regex for a literal pipe, the separator between actor names.
movie_rank_column = col("Rank").alias("movie_rank")
actor_name_column = explode(split(col("Main Actor(s)"), "\\|")).alias("actor_name")

# One output row per (movie, actor) pair.
df_actors = df_movies_raw.select(movie_rank_column, actor_name_column)

# Save as a Delta table; "overwrite" replaces whatever bronze.actors held.
actors_writer = df_actors.write.format("delta").mode("overwrite")
actors_writer.saveAsTable("bronze.actors")

display(df_actors)

In [None]:
# DBTITLE 1,Tabela 5: Países (Countries)
from pyspark.sql.functions import explode, split, col

# Break the pipe-delimited Country value into an array ("\\|" is the regex
# for a literal "|"), then fan the array out to one row per country.
country_array = split(col("Country"), "\\|")
country_per_row = explode(country_array)

df_countries = df_movies_raw.select(
    col("Rank").alias("movie_rank"),
    country_per_row.alias("country_name"),
)

# Save as a Delta table; "overwrite" replaces whatever bronze.countries held.
(
    df_countries.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("bronze.countries")
)

display(df_countries)