# Conexão com o drive

In [1]:
# Mount Google Drive at /content/drive; every path used later in this notebook lives under it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Preparação do ambiente

In [2]:
!pip install -q findspark pyspark

In [3]:
# findspark.init() is called before pyspark is imported, keeping the original cell order.
import findspark
findspark.init()
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session of this notebook with 4g of driver memory.
spark = (
    SparkSession.builder
    .appName("featureengineering")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)
print("Spark configurado corretamente!")

Spark configurado corretamente!


# Bibliotecas

In [4]:
import os
import sys
import pytz
import numpy as np
import pandas as pd
import builtins
import datetime
import pyspark.sql.types as T
from datetime import datetime
from datetime import timedelta
from datetime import date
from pyspark.sql import SQLContext
from pyspark.sql.functions import udf, lpad, translate
from pyspark.sql.types import *
from pyspark.sql.functions import count, avg, col, to_date, trim, when,sum, min,to_timestamp, year, date_format,split, lpad, concat_ws
from pyspark.sql.functions import max as Fmax, add_months, lit
from pyspark.sql import Window
from pyspark.sql.functions import trunc, lead, count as Fcount
from pyspark.sql import functions as F

# Leitura e Governança dos Dados

In [5]:
# Processing timestamp taken in the Europe/Lisbon timezone; it versions the artifacts of this run.
tz = pytz.timezone("Europe/Lisbon")
agora = datetime.now(tz)
dthproc = f"{agora:%Y%m%d%H%M%S}"

# Logical file name of this run and its generation timestamp (dthproc padded with '00').
file = f'previsao_demanda_{dthproc}.csv'
ts_file_generation = dthproc + '00'
print("ts_file_generation =", ts_file_generation)

ts_file_generation = 2025061915243000


In [28]:
# Root folders: RAW is read from, TRUSTED is written to.
base_dir = "/content/drive/MyDrive/PREVISAO_DEMANDA/RAW"
lake_dir = "/content/drive/MyDrive/PREVISAO_DEMANDA/TRUSTED"

# Input data folder inside RAW.
train_dir = os.path.join(base_dir, "DADOS")

# Output folders inside TRUSTED; the control folder is versioned by the run timestamp.
metadados_dir = os.path.join(lake_dir, "METADADOS_TRUSTED")
controle_dir = os.path.join(
    lake_dir,
    "CONTROLE_TRUSTED",
    f"trusted_tb_controle_procesamento_{dthproc}",
)

In [7]:
# Read the parquet dataset under train_dir (basePath is set to the same directory).
path_input = train_dir
df_demanda = spark.read.option("basePath", train_dir).parquet(path_input)

# Mark the DataFrame for caching and expose it to SQL; the count below materialises it.
df_demanda.cache()
df_demanda.createOrReplaceTempView("df_demanda")

qtd = df_demanda.count()
print("records df_demanda:", qtd)

records df_demanda: 1036696


In [30]:
# One-row control record for this run: file name, record count and the two timestamps.
controle_query = f"""
  SELECT
    '{file}'               AS name_file,
    {qtd}                  AS qtd_records,
    '{dthproc}'            AS ts_proc,
    '{ts_file_generation}' AS ts_file_generation
"""
controle = spark.sql(controle_query)

# Append the record to the control folder, partitioned by processing timestamp.
(
    controle.write
    .mode("append")
    .partitionBy("ts_proc")
    .parquet(controle_dir)
)
print("Controle gravado em:", controle_dir)

Controle gravado em: /content/drive/MyDrive/PREVISAO_DEMANDA/TRUSTED/CONTROLE_TRUSTED/trusted_tb_controle_procesamento_20250619152430


# Metadados iniciais

In [9]:
def generate_metadados(dataframe, df_name: str):
    """
    Build a per-column metadata DataFrame for `dataframe`.

    For every column it reports the Spark dtype, the number and percentage of
    null values and the cardinality (number of distinct values; a null counts
    as one distinct value). Every row also carries a leading 'Warehouse'
    column holding `df_name`, the name of the DataFrame of origin.

    Parameters
    ----------
    dataframe : pyspark.sql.DataFrame
        DataFrame to profile.
    df_name : str
        Label written to the 'Warehouse' column of every output row.

    Returns
    -------
    pyspark.sql.DataFrame
        Columns: Warehouse, nome_variavel, tipo, qt_nulos, percent_nulos,
        cardinalidade; ordered by percent_nulos descending.
    """
    column_names = dataframe.columns
    dtypes = [dtype for _, dtype in dataframe.dtypes]

    # Total row count and every null count in a single aggregation
    # (one Spark job instead of one job per column).
    stats = dataframe.agg(
        F.count(F.lit(1)),
        *[F.sum(F.col(c).isNull().cast("int")) for c in column_names]
    ).first()
    total_count = stats[0]
    # sum() over zero rows yields NULL, hence the `or 0`.
    null_counts = [cnt or 0 for cnt in stats[1:]]

    # Guard against an empty DataFrame: there is no percentage to compute.
    null_percents = [
        round((cnt / total_count) * 100, 2) if total_count else 0.0
        for cnt in null_counts
    ]

    # distinct() keeps a null as its own value, so it is counted here as well.
    cardinality = [dataframe.select(c).distinct().count() for c in column_names]

    # Assemble the metadata DataFrame, most incomplete columns first.
    metadata = spark.createDataFrame(
        list(zip(column_names, dtypes, null_counts, null_percents, cardinality)),
        schema=["nome_variavel", "tipo", "qt_nulos", "percent_nulos", "cardinalidade"]
    ).orderBy(F.desc("percent_nulos"))

    # Identification column with the name of the source DataFrame.
    metadata = metadata.withColumn("Warehouse", F.lit(df_name))

    # Move 'Warehouse' to the front so the source name is the first column.
    cols = ["Warehouse"] + [c for c in metadata.columns if c != "Warehouse"]
    return metadata.select(*cols)

In [10]:
# Metadata of the complete demand DataFrame, shown without truncating cell contents.
md_demanda = generate_metadados(dataframe=df_demanda, df_name="df_demanda")
md_demanda.show(truncate=False)

+----------+----------------+------+--------+-------------+-------------+
|Warehouse |nome_variavel   |tipo  |qt_nulos|percent_nulos|cardinalidade|
+----------+----------------+------+--------+-------------+-------------+
|df_demanda|Order_Demand    |int   |5859    |0.57         |3281         |
|df_demanda|Product_Code    |int   |0       |0.0          |2160         |
|df_demanda|year            |int   |0       |0.0          |6            |
|df_demanda|Warehouse       |string|0       |0.0          |4            |
|df_demanda|month_reference |string|0       |0.0          |61           |
|df_demanda|Product_Category|int   |0       |0.0          |33           |
|df_demanda|Date            |date  |0       |0.0          |1687         |
+----------+----------------+------+--------+-------------+-------------+



# Separando os DF's por warehouse

In [11]:
# Warehouses to process
warehouses = ['J', 'A', 'S', 'C']
dfs = {}

for wh in warehouses:
    view_name = "df_demanda_" + wh.lower()

    # Select this warehouse's rows straight from the df_demanda SQL view
    wh_frame = spark.sql(f"""
        SELECT *
        FROM df_demanda
        WHERE warehouse = '{wh}'
    """)

    # Cache the slice and expose it under its own view name for later queries
    wh_frame.cache()
    wh_frame.createOrReplaceTempView(view_name)

    # Keep a handle in the dict, then count and report
    dfs[view_name] = wh_frame
    n_records = wh_frame.count()
    print(f"records {view_name}: {n_records}")

records df_demanda_j: 764254
records df_demanda_a: 142197
records df_demanda_s: 88026
records df_demanda_c: 42219


In [12]:
# Bind each per-warehouse DataFrame stored in `dfs` to a variable of the same name.
df_demanda_j, df_demanda_a, df_demanda_s, df_demanda_c = (
    dfs[key]
    for key in ('df_demanda_j', 'df_demanda_a', 'df_demanda_s', 'df_demanda_c')
)

# Register every DataFrame as a temp view under that same name.
for _view, _frame in (
    ("df_demanda_j", df_demanda_j),
    ("df_demanda_a", df_demanda_a),
    ("df_demanda_s", df_demanda_s),
    ("df_demanda_c", df_demanda_c),
):
    _frame.createOrReplaceTempView(_view)

# Check de sanidade


In [13]:
# The four per-warehouse DataFrames
df_names = ["df_demanda_j", "df_demanda_a", "df_demanda_s", "df_demanda_c"]

# Look each name up in the global namespace and report its number of columns
for name in df_names:
    df = globals().get(name)
    if df is None:
        print(f"{name} não encontrado!")
        continue
    print(f"{name}: {len(df.columns)} colunas")


df_demanda_j: 7 colunas
df_demanda_a: 7 colunas
df_demanda_s: 7 colunas
df_demanda_c: 7 colunas


In [14]:
from pyspark.sql.functions import min as Fmin

# Names of the four per-warehouse DataFrames
df_names = ["df_demanda_j", "df_demanda_a", "df_demanda_s", "df_demanda_c"]

# Show the earliest and latest Date of every DataFrame found in the global namespace
for name in df_names:
    df = globals().get(name)
    if df is None:
        print(f"{name} não encontrado no namespace global.")
        continue

    print(f"=== {name} ===")
    date_range = df.agg(
        Fmin("Date").alias("min_date"),
        Fmax("Date").alias("max_date")
    )
    date_range.show()


=== df_demanda_j ===
+----------+----------+
|  min_date|  max_date|
+----------+----------+
|2012-01-04|2017-01-06|
+----------+----------+

=== df_demanda_a ===
+----------+----------+
|  min_date|  max_date|
+----------+----------+
|2012-01-02|2016-12-30|
+----------+----------+

=== df_demanda_s ===
+----------+----------+
|  min_date|  max_date|
+----------+----------+
|2012-01-01|2017-01-03|
+----------+----------+

=== df_demanda_c ===
+----------+----------+
|  min_date|  max_date|
+----------+----------+
|2012-01-03|2017-01-09|
+----------+----------+



* Olhando as datas máximas, vejo que o warehouse A possui o último registro em 30/12/2016, enquanto os outros possuem registros no início do mês subsequente (01/2017). Para não atrapalhar a criação do target nem ter períodos diferentes na minha base de validação, vou retirar os registros de 01/2017.

Ou seja, meu último target será em 11/2016.


# Retirando o mês 01/2017 do dataset.

In [15]:
# Names of the four per-warehouse DataFrames
df_names = [
    "df_demanda_j",
    "df_demanda_a",
    "df_demanda_s",
    "df_demanda_c"
]

# Remove the month 201701 from every per-warehouse DataFrame and report how many rows were dropped.
for name in df_names:
    df = globals().get(name)
    if df is None:
        print(f"{name} não encontrado!")
        continue

    # Row count before the filter
    orig_count = df.count()

    # Keep every row whose month_reference is not 201701
    df_filtered = df.filter(col("month_reference") != "201701")

    # Row count after the filter
    new_count = df_filtered.count()

    # Number of rows removed
    removed = orig_count - new_count

    # Rebind the global variable AND the temp view of the same name, so that
    # SQL queries against the view see the same filtered data as the variable
    # (previously the view kept pointing at the unfiltered DataFrame).
    globals()[name] = df_filtered
    df_filtered.createOrReplaceTempView(name)

    # Report
    print(f"{name}: {orig_count} registros originais, {new_count} após filtro, {removed} removidos.")


df_demanda_j: 764254 registros originais, 764215 após filtro, 39 removidos.
df_demanda_a: 142197 registros originais, 142197 após filtro, 0 removidos.
df_demanda_s: 88026 registros originais, 88024 após filtro, 2 removidos.
df_demanda_c: 42219 registros originais, 42207 após filtro, 12 removidos.


# METADADOS dos DF's separados.

In [26]:
# Generate and display the metadata of every per-warehouse DataFrame.
# `md` accumulates the union of all of them, so that whatever persists `md`
# afterwards stores the metadata of ALL warehouses (previously `md` was
# overwritten on each iteration and only the last warehouse survived).
md = None
for wh in ['j','a','s','c']:
    df = globals().get(f"df_demanda_{wh}")
    if df is None:
        continue
    md_wh = generate_metadados(df, f"df_demanda_{wh}")
    md_wh.show(truncate=False)
    md = md_wh if md is None else md.unionByName(md_wh)

+------------+----------------+------+--------+-------------+-------------+
|Warehouse   |nome_variavel   |tipo  |qt_nulos|percent_nulos|cardinalidade|
+------------+----------------+------+--------+-------------+-------------+
|df_demanda_j|Order_Demand    |int   |3547    |0.46         |1644         |
|df_demanda_j|Product_Code    |int   |0       |0.0          |1625         |
|df_demanda_j|year            |int   |0       |0.0          |5            |
|df_demanda_j|Warehouse       |string|0       |0.0          |1            |
|df_demanda_j|month_reference |string|0       |0.0          |60           |
|df_demanda_j|Product_Category|int   |0       |0.0          |25           |
|df_demanda_j|Date            |date  |0       |0.0          |1394         |
+------------+----------------+------+--------+-------------+-------------+

+------------+----------------+------+--------+-------------+-------------+
|Warehouse   |nome_variavel   |tipo  |qt_nulos|percent_nulos|cardinalidade|
+----------

In [29]:
# Destination folder of the metadata table, versioned by the run timestamp.
path_metadados = os.path.join(metadados_dir, f"tb_metadados_por_df_inicial_trusted_{dthproc}")

# Stamp the DataFrame currently bound to `md` with the run timestamp.
md = md.withColumn("dthproc", lit(dthproc))

# Append it as parquet, partitioned by that timestamp.
(
    md.write
    .mode("append")
    .partitionBy("dthproc")
    .parquet(path_metadados)
)
print("Metadados gravados em:", path_metadados)

Metadados gravados em: /content/drive/MyDrive/PREVISAO_DEMANDA/TRUSTED/METADADOS_TRUSTED/tb_metadados_por_df_inicial_trusted_20250619152430


# Salvando os DF's separados.


In [31]:
# Base directory that receives one folder per DataFrame
base_dados = "/content/drive/MyDrive/PREVISAO_DEMANDA/TRUSTED/DADOS"

# Current per-warehouse DataFrames, keyed by the folder name they are written to
dfs = dict(
    df_demanda_j=df_demanda_j,
    df_demanda_a=df_demanda_a,
    df_demanda_s=df_demanda_s,
    df_demanda_c=df_demanda_c,
)

# Overwrite each folder with the parquet of its DataFrame
for name, df in dfs.items():
    output_path = f"{base_dados}/{name}"
    writer = df.write.mode("overwrite")
    writer.parquet(output_path)
    print(f"Gravado com sucesso em: {output_path}")

Gravado com sucesso em: /content/drive/MyDrive/PREVISAO_DEMANDA/TRUSTED/DADOS/df_demanda_j
Gravado com sucesso em: /content/drive/MyDrive/PREVISAO_DEMANDA/TRUSTED/DADOS/df_demanda_a
Gravado com sucesso em: /content/drive/MyDrive/PREVISAO_DEMANDA/TRUSTED/DADOS/df_demanda_s
Gravado com sucesso em: /content/drive/MyDrive/PREVISAO_DEMANDA/TRUSTED/DADOS/df_demanda_c


### Agora tenho em memória:
* dfs = df_demanda_j, df_demanda_a, df_demanda_s, df_demanda_c.
Para fazer a feature engineering separada para cada warehouse, pois será um modelo para cada um.

# Criação do Target para todos os DF's.

In [32]:
# Warehouses to process
warehouses = ['j','a','s','c']

for wh in warehouses:
    # Input DataFrame of this warehouse
    df_input = globals()[f"df_demanda_{wh}"]

    # Monthly aggregation: number of demand records per product and month.
    # No orderBy here: the window below partitions and sorts the data itself,
    # so a global sort at this point would only add cost.
    df_monthly = (
        df_input
          .withColumn("month_dt", trunc(col("Date"), "month"))
          .groupBy("Product_Code", "month_dt")
          .agg(Fcount("*").alias("count_demand"))
    )

    # Target = demand of the NEXT CALENDAR month.
    # lead(..., 1) returns the next available row of the product, which is not
    # necessarily the following month (months without records have no row).
    # So the next row's count is used only when that row is exactly month_dt + 1;
    # when a later row exists but skips the following month, that month had no
    # records and the target is 0. With no later row the target stays null.
    w_prod = Window.partitionBy("Product_Code").orderBy("month_dt")
    next_row_month = lead("month_dt", 1).over(w_prod)
    next_row_count = lead("count_demand", 1).over(w_prod)
    df_monthly = df_monthly.withColumn(
        "target",
        when(next_row_month == add_months(col("month_dt"), 1), next_row_count)
        .when(next_row_month.isNotNull(), lit(0).cast("long"))
    )

    # Register as a temp view for the following cells
    df_monthly.createOrReplaceTempView(f"df_monthly_{wh}")

    print(f"✔ df_monthly_{wh} criado com target")


✔ df_monthly_j criado com target
✔ df_monthly_a criado com target
✔ df_monthly_s criado com target
✔ df_monthly_c criado com target


In [33]:
# warehouses
warehouses = ['j','a','s','c']

for wh in warehouses:
    # Monthly table registered by the target-creation cell
    df_monthly = spark.table(f"df_monthly_{wh}")

    # Cut-off: validation starts 4 months before the latest month_dt
    # (so it spans max_month - 4 through max_month, inclusive)
    max_month = df_monthly.agg(Fmax("month_dt").alias("max_month")).first()["max_month"]
    val_start = add_months(lit(max_month), -4)

    # Rows before the cut-off go to training, the rest to validation
    is_train = col("month_dt") < val_start
    is_val   = col("month_dt") >= val_start
    df_train = df_monthly.filter(is_train)
    df_val   = df_monthly.filter(is_val)

    # Expose both splits as temp views
    for view_name, split_df in ((f"df_{wh}_train", df_train), (f"df_{wh}_val", df_val)):
        split_df.createOrReplaceTempView(view_name)

    # Report
    print(f"=== Warehouse {wh.upper()} ===")
    print(" TRAIN:", df_train.count(), " registros")
    print("  VAL :", df_val.count(), " registros")


=== Warehouse J ===
 TRAIN: 74293  registros
  VAL : 7342  registros
=== Warehouse A ===
 TRAIN: 18220  registros
  VAL : 1819  registros
=== Warehouse S ===
 TRAIN: 21910  registros
  VAL : 1285  registros
=== Warehouse C ===
 TRAIN: 9649  registros
  VAL : 1100  registros


# Criação de features para todos os DF's.

In [34]:
from pyspark.sql import Window
from pyspark.sql.functions import (
    col,trunc,month,sin,cos,expr,sum as Fsum,
    count as Fcount,
    avg as Favg,
    stddev as Fstd,
    lag,lead,when,last,months_between
)

def add_features(df):
    """
    Add lag, rolling-window and seasonality features to a monthly demand DataFrame.

    Expects a DataFrame with the columns:
      - Product_Code
      - month_dt      (Date truncated to the first day of the month)
      - count_demand  (number of occurrences in that month)
      - target        (optional: demand of the following month)

    Returns the same DataFrame with these additional columns:
      * months_since_last: months between month_dt and the latest earlier row
        of the product with count_demand > 0
      * lag / diff / ratio against 1 and 12 rows back, plus delta_ratio_1m
        (ratio_1m - ratio_2m)
      * rolling sum / avg / std / median over windows of 3, 6, 9 and 12 rows
      * zero_count in each window (rows with count_demand == 0)
      * lag / diff / ratio of each rolling statistic
      * sine / cosine seasonality terms for 12, 6 and 4 month cycles

    All windows are partitioned by Product_Code, ordered by month_dt and
    measured in ROWS: "k months" means the current row plus the k-1 preceding
    rows of the same product. Ratios are null when the denominator is 0 or null.
    """
    df = df.cache()

    # Window ordered by month for each product
    w = Window.partitionBy("Product_Code").orderBy("month_dt")
    # Same window, covering everything strictly before the current row
    # (used to find the last row with non-zero demand)
    w_unb = w.rowsBetween(Window.unboundedPreceding, -1)

    # 1) time since last sale
    df = df.withColumn(
        "last_sale_month",
        last(when(col("count_demand") > 0, col("month_dt")), True).over(w_unb)
    ).withColumn(
        "months_since_last",
        months_between(col("month_dt"), col("last_sale_month"))
    )

    # 2) lags, diffs and ratios at 1 and 12 rows
    df = (
        df
        .withColumn("lag_1m", lag("count_demand", 1).over(w))
        .withColumn("diff_1m", col("count_demand") - col("lag_1m"))
        .withColumn("ratio_1m", when(col("lag_1m") != 0,
                                     col("count_demand") / col("lag_1m")))

        .withColumn("lag_12m", lag("count_demand", 12).over(w))
        .withColumn("diff_12m", col("count_demand") - col("lag_12m"))
        .withColumn("ratio_12m", when(col("lag_12m") != 0,
                                      col("count_demand") / col("lag_12m")))
    )

    # 3) ratio momentum: delta_ratio_1m = ratio_1m - ratio_2m
    df = (
        df
        .withColumn("lag_2m", lag("count_demand", 2).over(w))
        .withColumn("ratio_2m", when(col("lag_2m") != 0,
                                     col("count_demand") / col("lag_2m")))
        .withColumn("delta_ratio_1m", col("ratio_1m") - col("ratio_2m"))
    )

    # 4) rolling windows of 3, 6, 9 and 12 rows
    for k in [3, 6, 9, 12]:
        w_k = w.rowsBetween(-k + 1, 0)

        df = (
            df
            # statistics over the current window
            .withColumn(f"sum_{k}m",  Fsum("count_demand").over(w_k))
            .withColumn(f"avg_{k}m",  Favg("count_demand").over(w_k))
            .withColumn(f"std_{k}m",  Fstd("count_demand").over(w_k))
            .withColumn(
                f"median_{k}m",
                # SQL frame equivalent to w_k. The PRECEDING offset must be the
                # positive number k-1: unlike rowsBetween(-k + 1, 0), SQL takes
                # the distance, so a negative value here would not describe
                # the trailing k-row window.
                expr(
                    f"percentile_approx(count_demand, 0.5) "
                    f"OVER (PARTITION BY Product_Code ORDER BY month_dt "
                    f"ROWS BETWEEN {k - 1} PRECEDING AND CURRENT ROW)"
                )
            )
            # number of rows in the window with zero demand
            .withColumn(
                f"zero_count_{k}m",
                Fsum(when(col("count_demand") == 0, 1).otherwise(0)).over(w_k)
            )
            # previous-row value of each statistic
            .withColumn(f"lag_sum_{k}m",    lag(f"sum_{k}m",    1).over(w))
            .withColumn(f"lag_avg_{k}m",    lag(f"avg_{k}m",    1).over(w))
            .withColumn(f"lag_std_{k}m",    lag(f"std_{k}m",    1).over(w))
            .withColumn(f"lag_med_{k}m",    lag(f"median_{k}m", 1).over(w))
            # absolute differences
            .withColumn(f"diff_sum_{k}m",   col(f"sum_{k}m")    - col(f"lag_sum_{k}m"))
            .withColumn(f"diff_avg_{k}m",   col(f"avg_{k}m")    - col(f"lag_avg_{k}m"))
            .withColumn(f"diff_std_{k}m",   col(f"std_{k}m")    - col(f"lag_std_{k}m"))
            .withColumn(f"diff_med_{k}m",   col(f"median_{k}m") - col(f"lag_med_{k}m"))
            # ratios (relative growth)
            .withColumn(
                f"ratio_sum_{k}m",
                when(col(f"lag_sum_{k}m") != 0,
                     col(f"sum_{k}m") / col(f"lag_sum_{k}m"))
            )
            .withColumn(
                f"ratio_avg_{k}m",
                when(col(f"lag_avg_{k}m") != 0,
                     col(f"avg_{k}m") / col(f"lag_avg_{k}m"))
            )
            .withColumn(
                f"ratio_std_{k}m",
                when(col(f"lag_std_{k}m") != 0,
                     col(f"std_{k}m") / col(f"lag_std_{k}m"))
            )
            .withColumn(
                f"ratio_med_{k}m",
                when(col(f"lag_med_{k}m") != 0,
                     col(f"median_{k}m") / col(f"lag_med_{k}m"))
            )
        )

    # 5) multiple seasonalities (12, 6 and 4 months); helper columns are dropped at the end
    df = (
        df
        .withColumn("month_num", month(col("month_dt")))
        .withColumn("sin_12m", sin(expr("2 * PI() * month_num / 12")))
        .withColumn("cos_12m", cos(expr("2 * PI() * month_num / 12")))
        .withColumn("sin_6m",  sin(expr("2 * PI() * month_num / 6")))
        .withColumn("cos_6m",  cos(expr("2 * PI() * month_num / 6")))
        .withColumn("sin_4m",  sin(expr("2 * PI() * month_num / 4")))
        .withColumn("cos_4m",  cos(expr("2 * PI() * month_num / 4")))
        .drop("month_num", "last_sale_month", "lag_2m", "ratio_2m")
    )

    return df

In [35]:
# Warehouses to process
warehouses = ['j','a','s','c']

for wh in warehouses:
    # Validation split of this warehouse; its earliest month is the train/val boundary
    df_va = spark.table(f"df_{wh}_val")
    val_start = df_va.agg(F.min("month_dt")).first()[0]

    # Compute the features ONCE on the full monthly history and split afterwards.
    # add_features only looks at the current and preceding rows, so the training
    # features are the same as when computed on the training slice alone, while
    # the validation rows now get lags and rolling windows that reach back into
    # the training period. Computed on the validation slice alone, every window
    # restarts at its first month and e.g. lag_12m can never be filled.
    df_all_feat = add_features(spark.table(f"df_monthly_{wh}"))

    if val_start is None:
        # Empty validation split: everything is training data
        df_tr_feat = df_all_feat
        df_va_feat = df_all_feat.limit(0)
    else:
        df_tr_feat = df_all_feat.filter(col("month_dt") < lit(val_start))
        df_va_feat = df_all_feat.filter(col("month_dt") >= lit(val_start))

    # Keep both in the global namespace for the following cells
    globals()[f"df_train_warehouse_{wh}"] = df_tr_feat
    globals()[f"df_val_warehouse_{wh}"]   = df_va_feat

    # Register them as SQL views as well
    df_tr_feat.createOrReplaceTempView(f"df_train_warehouse_{wh}")
    df_va_feat.createOrReplaceTempView(f"df_val_warehouse_{wh}")

    # Quick feedback
    print(f"=== Warehouse {wh.upper()} ===")
    print(" TRAIN_SCHEMA:")
    df_tr_feat.printSchema()
    print(" VALIDATION sample:")
    df_va_feat.show(3, truncate=False)


=== Warehouse J ===
 TRAIN_SCHEMA:
root
 |-- Product_Code: integer (nullable = true)
 |-- month_dt: date (nullable = true)
 |-- count_demand: long (nullable = false)
 |-- target: long (nullable = true)
 |-- months_since_last: double (nullable = true)
 |-- lag_1m: long (nullable = true)
 |-- diff_1m: long (nullable = true)
 |-- ratio_1m: double (nullable = true)
 |-- lag_12m: long (nullable = true)
 |-- diff_12m: long (nullable = true)
 |-- ratio_12m: double (nullable = true)
 |-- delta_ratio_1m: double (nullable = true)
 |-- sum_3m: long (nullable = true)
 |-- avg_3m: double (nullable = true)
 |-- std_3m: double (nullable = true)
 |-- median_3m: long (nullable = true)
 |-- zero_count_3m: long (nullable = true)
 |-- lag_sum_3m: long (nullable = true)
 |-- lag_avg_3m: double (nullable = true)
 |-- lag_std_3m: double (nullable = true)
 |-- lag_med_3m: long (nullable = true)
 |-- diff_sum_3m: long (nullable = true)
 |-- diff_avg_3m: double (nullable = true)
 |-- diff_std_3m: double (nullab

* Esses valores nulos são de produtos que não tiveram demanda anteriormente e, como não tenho o motivo, criei variáveis de contagem de zeros numa janela móvel. Essa contagem de zeros (zero_count_km) é um forte sinal de regularidade x esporadicidade.

Produtos esporádicos (“long tail”) normalmente têm muitos meses com zero vendas; produtos regulares quase nunca.

In [22]:
# Warehouses and splits
warehouses = ['j','a','s','c']
splits     = ['train', 'val']

# Move the 'target' column to the last position of every train/val DataFrame
for wh, split in ((w, s) for w in warehouses for s in splits):
    var_name = f"df_{split}_warehouse_{wh}"
    df = globals().get(var_name)
    if df is None:
        print(f"❌ {var_name} não existe, pulando.")
        continue

    # every column except 'target', in the current order, then 'target'
    feature_cols = [c for c in df.columns if c != "target"]
    df_reordered = df.select(*feature_cols, "target")

    # rebind the global variable and refresh the view of the same name
    globals()[var_name] = df_reordered
    df_reordered.createOrReplaceTempView(var_name)

    # confirmation
    print(f"✔ {var_name} colunas reordenadas:")
    print(df_reordered.columns)


✔ df_train_warehouse_j colunas reordenadas:
['Product_Code', 'month_dt', 'count_demand', 'months_since_last', 'lag_1m', 'diff_1m', 'ratio_1m', 'lag_12m', 'diff_12m', 'ratio_12m', 'delta_ratio_1m', 'sum_3m', 'avg_3m', 'std_3m', 'median_3m', 'zero_count_3m', 'lag_sum_3m', 'lag_avg_3m', 'lag_std_3m', 'lag_med_3m', 'diff_sum_3m', 'diff_avg_3m', 'diff_std_3m', 'diff_med_3m', 'ratio_sum_3m', 'ratio_avg_3m', 'ratio_std_3m', 'ratio_med_3m', 'sum_6m', 'avg_6m', 'std_6m', 'median_6m', 'zero_count_6m', 'lag_sum_6m', 'lag_avg_6m', 'lag_std_6m', 'lag_med_6m', 'diff_sum_6m', 'diff_avg_6m', 'diff_std_6m', 'diff_med_6m', 'ratio_sum_6m', 'ratio_avg_6m', 'ratio_std_6m', 'ratio_med_6m', 'sum_9m', 'avg_9m', 'std_9m', 'median_9m', 'zero_count_9m', 'lag_sum_9m', 'lag_avg_9m', 'lag_std_9m', 'lag_med_9m', 'diff_sum_9m', 'diff_avg_9m', 'diff_std_9m', 'diff_med_9m', 'ratio_sum_9m', 'ratio_avg_9m', 'ratio_std_9m', 'ratio_med_9m', 'sum_12m', 'avg_12m', 'std_12m', 'median_12m', 'zero_count_12m', 'lag_sum_12m', 'la

# Check de sanidade

In [36]:
# Warehouses and splits
warehouses = ['j','a','s','c']
splits     = ['train', 'val']

# Print the row and column count of every train/val DataFrame, one group per warehouse
for wh in warehouses:
    for split in splits:
        var_name = f"df_{split}_warehouse_{wh}"
        df = globals().get(var_name)
        if df is None:
            print(f"❌ {var_name} não encontrado, pulando.")
            continue

        num_rows, num_cols = df.count(), len(df.columns)
        print(f"[{split.upper()} - WH {wh.upper()}] Linhas: {num_rows} | Colunas: {num_cols}")
    print("-" * 50)

[TRAIN - WH J] Linhas: 74293 | Colunas: 86
[VAL - WH J] Linhas: 7342 | Colunas: 86
--------------------------------------------------
[TRAIN - WH A] Linhas: 18220 | Colunas: 86
[VAL - WH A] Linhas: 1819 | Colunas: 86
--------------------------------------------------
[TRAIN - WH S] Linhas: 21910 | Colunas: 86
[VAL - WH S] Linhas: 1285 | Colunas: 86
--------------------------------------------------
[TRAIN - WH C] Linhas: 9649 | Colunas: 86
[VAL - WH C] Linhas: 1100 | Colunas: 86
--------------------------------------------------


# Salvando os DF's na camada Refined, separando treino e val, cada um no seu próprio diretório.

In [46]:
from pyspark.sql import DataFrame

# Base directory.
refined_base = "/content/drive/MyDrive/PREVISAO_DEMANDA/REFINED"

# Warehouses and splits
warehouses = ['J','A','S','C']
splits     = ['train', 'val']

# Write every train/val DataFrame to <refined_base>/FEATURES_<WH>/<split>
for wh in warehouses:
    feature_folder = f"{refined_base}/FEATURES_{wh}"

    for split in splits:
        # variable name uses the lower-case warehouse, e.g. "df_train_warehouse_s"
        var_name = f"df_{split}_warehouse_{wh.lower()}"
        df: DataFrame = globals().get(var_name)

        if df is None:
            print(f"Atenção: {var_name} não existe. Pulando.")
            continue

        out_path = f"{feature_folder}/{split}"
        writer = df.write.mode("overwrite")
        writer.parquet(out_path)

        print(f"✔ Gravado {var_name} em: {out_path}")

✔ Gravado df_train_warehouse_j em: /content/drive/MyDrive/PREVISAO_DEMANDA/REFINED/FEATURES_J/train
✔ Gravado df_val_warehouse_j em: /content/drive/MyDrive/PREVISAO_DEMANDA/REFINED/FEATURES_J/val
✔ Gravado df_train_warehouse_a em: /content/drive/MyDrive/PREVISAO_DEMANDA/REFINED/FEATURES_A/train
✔ Gravado df_val_warehouse_a em: /content/drive/MyDrive/PREVISAO_DEMANDA/REFINED/FEATURES_A/val
✔ Gravado df_train_warehouse_s em: /content/drive/MyDrive/PREVISAO_DEMANDA/REFINED/FEATURES_S/train
✔ Gravado df_val_warehouse_s em: /content/drive/MyDrive/PREVISAO_DEMANDA/REFINED/FEATURES_S/val
✔ Gravado df_train_warehouse_c em: /content/drive/MyDrive/PREVISAO_DEMANDA/REFINED/FEATURES_C/train
✔ Gravado df_val_warehouse_c em: /content/drive/MyDrive/PREVISAO_DEMANDA/REFINED/FEATURES_C/val
