In [1]:
from pandas import DataFrame, read_parquet, to_numeric, to_datetime
from pathlib import Path

# Load the staged sales dataset (no column renaming happens in this cell).
df: DataFrame = read_parquet("../data/staging/RWventas.parquet", engine="pyarrow")

# Output directory for the curated dataset.
curated: Path = Path("../data/curated")
# Columns to convert to float.
columnas_float: list[str] = ["precio_unitario", "descuento", "costo_envio", "total"]

# Column type conversions. errors="coerce" turns every value that cannot be
# parsed into a missing value (<NA> / NaT / NaN) instead of raising, so the
# dropna() below is what actually removes the malformed rows.
# NOTE(review): .astype("Int64") is expected to raise if a parsed quantity is
# non-integral (e.g. 1.5) -- confirm the staging data never contains such values.
df["cantidad"] = to_numeric(df["cantidad"], errors="coerce").astype("Int64")
df["fecha"] = to_datetime(df["fecha"], errors="coerce", format="%Y-%m-%d")
df[columnas_float] = df[columnas_float].apply(to_numeric, errors="coerce")

# Drop every row that has a null in ANY column, not only in the converted ones.
# The original index labels are kept, so the preview below can show gaps.
df = df.dropna()

# Save the cleaned dataset. exist_ok=True makes the directory creation
# idempotent and avoids the check-then-create race of a separate exists() test.
curated.mkdir(parents=True, exist_ok=True)
# index=False: the (now gapped) row index is not written to the parquet file.
df.to_parquet(curated / "RWventas.parquet", index=False, engine="pyarrow")

# Preview of the first 50 cleaned rows.
df.head(50)


Unnamed: 0,ciudad,fecha,producto,tipo_producto,cantidad,precio_unitario,tipo_venta,tipo_cliente,descuento,costo_envio,total
0,Antofagasta,2025-11-28,Leche,Alimentopercedero,2,1587.0,Online,Minorista,0.2,0.0,2539.0
2,Valparaíso,2025-12-07,Café,Hogar,1,3882.0,Tiendafísica,Minorista,0.0,0.0,3882.0
3,Sevilla,2025-12-01,Té,Snack,5,2060.0,Distribuidor,Corporativo,0.15,0.0,8755.0
4,Sevilla,2025-11-18,Chocolate,Snack,1,3712.0,Online,Minorista,0.05,250000.0,8526.0
5,Córdoba,2025-11-20,Café,Lácteo,5,4041.0,Callcenter,Corporativo,0.2,0.0,16164.0
6,Ciudad De México,2025-11-15,Queso,Hogar,7,4535.0,Online,Corporativo,0.05,10000.0,40157.0
7,Houston,2025-11-30,Chocolate,Snack,4,4198.0,Distribuidor,Minorista,0.15,0.0,14273.0
8,Mendoza,2025-11-20,Té,Bebida,6,1243.0,Callcenter,Mayorista,0.1,0.0,6712.0
9,Antofagasta,2025-11-15,Queso,Alimentopercedero,1,1357.0,Online,Corporativo,0.05,10000.0,11289.0
11,Chicago,2025-11-26,Queso,Bebida,7,1509.0,Tiendafísica,Gobierno,0.2,0.0,8450.0
