In [0]:

# Select the catalog and schema that the unqualified table names in this notebook
# are resolved against.
for context_stmt in ("USE CATALOG workspace", "USE SCHEMA default"):
    spark.sql(context_stmt)

# Show the active catalog/schema so the selection can be checked visually.
active_context_df = spark.sql("SELECT current_catalog(), current_schema()")
display(active_context_df)


In [0]:
# Bronze inputs, grouped by split. Each group is loaded left to right in the same
# order as the names on the left-hand side.

# TRAIN: operational readouts, tte, specifications
df_train_ops_bronze, df_train_tte_bronze, df_train_specs_bronze = (
    spark.table("bronze_train_operational_readouts"),
    spark.table("bronze_train_tte"),
    spark.table("bronze_train_specifications"),
)

# VALIDATION: operational readouts, labels, specifications
df_val_ops_bronze, df_val_labels_bronze, df_val_specs_bronze = (
    spark.table("bronze_validation_operational_readouts"),
    spark.table("bronze_validation_labels"),
    spark.table("bronze_validation_specifications"),
)

# TEST: operational readouts, labels, specifications
# The original note on the labels table reads "if it exists in the package".
# NOTE(review): this load is not guarded, so the cell presumably fails when
# bronze_test_labels is absent — confirm the table is always available.
df_test_ops_bronze, df_test_labels_bronze, df_test_specs_bronze = (
    spark.table("bronze_test_operational_readouts"),
    spark.table("bronze_test_labels"),
    spark.table("bronze_test_specifications"),
)



In [0]:
# Print the schema of each train bronze table (readouts, tte, specifications) for inspection.
for _bronze_df in (df_train_ops_bronze, df_train_tte_bronze, df_train_specs_bronze):
    _bronze_df.printSchema()

In [0]:
from pyspark.sql.functions import col, trim, lower

def normalize_vehicle_id(df, colname="vehicle_id"):
    """
    Return `df` with the vehicle identifier column rewritten in a canonical form.

    The value of `colname` is first trimmed (surrounding spaces removed) and then
    converted to lower case. The result is written back under the same column name.

    Args:
        df: DataFrame that contains the `colname` column.
        colname: Name of the identifier column to normalize (default "vehicle_id").

    Returns:
        A new DataFrame with `colname` replaced by its normalized value.
    """
    trimmed_id = trim(col(colname))
    canonical_id = lower(trimmed_id)
    return df.withColumn(colname, canonical_id)

In [0]:
from pyspark.sql.functions import lit , col

# Prepare each split in a single chain: normalize vehicle_id, force time_step to an
# integer (it may arrive as long/double/string) and tag the rows with their split.
# NOTE(review): casting a double to int drops the fractional part — confirm that
# time_step is integral in the bronze tables before relying on this cast.
df_train_ops_s = (
    normalize_vehicle_id(df_train_ops_bronze)
    .withColumn("time_step", col("time_step").cast("int"))
    .withColumn("dataset_split", lit("train"))
)
df_val_ops_s = (
    normalize_vehicle_id(df_val_ops_bronze)
    .withColumn("time_step", col("time_step").cast("int"))
    .withColumn("dataset_split", lit("validation"))
)
df_test_ops_s = (
    normalize_vehicle_id(df_test_ops_bronze)
    .withColumn("time_step", col("time_step").cast("int"))
    .withColumn("dataset_split", lit("test"))
)

# Stack the three splits into one Silver DataFrame; columns are matched by name.
df_ops_silver_all = df_train_ops_s.unionByName(df_val_ops_s).unionByName(df_test_ops_s)

# Quick visual check of the unified result.
df_ops_silver_all.printSchema()
display(df_ops_silver_all.limit(5))


In [0]:
# Persist the unified readouts as a Delta table, overwriting any existing contents.
ops_writer = df_ops_silver_all.write.format("delta").mode("overwrite")
ops_writer.saveAsTable("silver_operational_readouts")

print("Tabela silver_operational_readouts criada!")


In [0]:
# Prepare each specifications split in one chain: normalize vehicle_id and tag the
# rows with the split they came from.
df_train_specs_s = normalize_vehicle_id(df_train_specs_bronze).withColumn(
    "dataset_split", lit("train")
)
df_val_specs_s = normalize_vehicle_id(df_val_specs_bronze).withColumn(
    "dataset_split", lit("validation")
)
df_test_specs_s = normalize_vehicle_id(df_test_specs_bronze).withColumn(
    "dataset_split", lit("test")
)

# Stack the three splits into one specifications DataFrame; columns are matched by name.
df_specs_silver_all = df_train_specs_s.unionByName(df_val_specs_s).unionByName(
    df_test_specs_s
)

# Quick visual check of the unified result.
df_specs_silver_all.printSchema()
display(df_specs_silver_all.limit(5))

# Save as Silver (Delta table, overwriting any existing contents).
specs_writer = df_specs_silver_all.write.format("delta").mode("overwrite")
specs_writer.saveAsTable("silver_specifications")




In [0]:
from pyspark.sql.types import IntegerType

# Build the Silver train TTE table.
# vehicle_id goes through the same normalization (trim + lower-case) as the other
# Silver tables, so the identifier has one form everywhere and the tables can be
# matched on it. The two study columns are forced to integers.
# NOTE(review): assumes bronze_train_tte has a vehicle_id column like the other
# bronze tables — confirm against the schema printed earlier in the notebook.
# NOTE(review): casting a double to int drops the fractional part — confirm that
# length_of_study_time_step is integral in the bronze table.
df_train_tte_s = (
    normalize_vehicle_id(df_train_tte_bronze)
    .withColumn("length_of_study_time_step", col("length_of_study_time_step").cast(IntegerType()))
    .withColumn("in_study_repair",         col("in_study_repair").cast(IntegerType()))
)

df_train_tte_s.printSchema()
display(df_train_tte_s.limit(5))

# Save as Silver. overwriteSchema lets the overwrite succeed when an earlier run
# created silver_train_tte with a different vehicle_id column type.
(
    df_train_tte_s.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .format("delta")
    .saveAsTable("silver_train_tte")
)




In [0]:
# Prepare each labels split in one chain: normalize vehicle_id, force class_label
# to an integer and tag the rows with the split they came from.
df_val_labels_s = (
    normalize_vehicle_id(df_val_labels_bronze)
    .withColumn("class_label", col("class_label").cast("int"))
    .withColumn("dataset_split", lit("validation"))
)
df_test_labels_s = (
    normalize_vehicle_id(df_test_labels_bronze)
    .withColumn("class_label", col("class_label").cast("int"))
    .withColumn("dataset_split", lit("test"))
)

# Stack validation and test labels into one DataFrame; columns are matched by name.
df_labels_silver_all = df_val_labels_s.unionByName(df_test_labels_s)

# Quick visual check of the unified result.
df_labels_silver_all.printSchema()
display(df_labels_silver_all.limit(5))

# Save as Silver (Delta table, overwriting any existing contents).
labels_writer = df_labels_silver_all.write.format("delta").mode("overwrite")
labels_writer.saveAsTable("silver_labels")




In [0]:
# List up to 100 tables in workspace.default to confirm the Silver tables were created.
tables_df = spark.sql("SHOW TABLES IN workspace.default")
tables_df.show(100)

In [0]:
# Read the Silver tables back from the catalog to verify what was persisted.
df_ops_silver = spark.table("silver_operational_readouts")
df_specs_silver = spark.table("silver_specifications")
df_labels_silver = spark.table("silver_labels")

# Backward-compatible alias for the original misspelled name, in case other cells
# still reference it.
df_specs_siver = df_specs_silver

# Show a small sample of each table.
display(df_ops_silver.limit(5))
display(df_specs_silver.limit(5))
display(df_labels_silver.limit(5))
