In [3]:
from pandas import DataFrame, read_csv
from pathlib import Path

# Special characters removed from every text value. Matches are deleted, not
# replaced by a space, so the characters on either side end up adjacent.
# NOTE(review): "_" is part of the set; values shown as "Callcenter" or
# "Tiendafísica" in the output presumably lost a word separator this way —
# confirm against the raw file that this is intended. The repeated "(" in the
# class is harmless.
pattern = r"[@/\\{}(!_#%()=*]"
staging = Path("../data/staging/")

# Load the dataset and standardize column names: trimmed, lower-cased, and
# with spaces replaced by underscores.
df: DataFrame = read_csv("../data/raw/RWventas.csv", engine="pyarrow")
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

# Clean every text column: drop special characters, title-case, trim.
for column in df.select_dtypes(include=["string", "object"]).columns:
    # On the nullable "string" dtype the .str methods propagate <NA>, so
    # missing values stay missing without an explicit notna() mask.
    df[column] = (
        df[column]
        .astype("string")
        .str.replace(pattern, "", regex=True)
        .str.title()
        .str.strip()
    )

# Rows containing null values are kept; no dropna() is applied in this step.

# Save the cleaned dataset. exist_ok=True makes the directory creation safe to
# re-run and removes the separate exists() check.
staging.mkdir(parents=True, exist_ok=True)
df.to_parquet(staging / "RWventas.parquet", index=False, engine="pyarrow")

print("Data cleaning completed successfully.")

display(df.head(50))

Data cleaning completed successfully.


Unnamed: 0,ciudad,fecha,producto,tipo_producto,cantidad,precio_unitario,tipo_venta,tipo_cliente,descuento,costo_envio,total
0,Antofagasta,2025-11-28,Leche,Alimentopercedero,2.0,1587.0,Online,Minorista,0.2,0.0,2539.0
1,Monterrey,2025-11-29,Leche,Hogar,5.0,,Callcenter,Mayorista,0.2,10000.0,20412.0
2,Valparaíso,2025-12-07,Café,Hogar,1.0,3882.0,Tiendafísica,Minorista,0.0,0.0,3882.0
3,Sevilla,2025-12-01,Té,Snack,5.0,2060.0,Distribuidor,Corporativo,0.15,0.0,8755.0
4,Sevilla,2025-11-18,Chocolate,Snack,1.0,3712.0,Online,Minorista,0.05,250000.0,8526.0
5,Córdoba,2025-11-20,Café,Lácteo,5.0,4041.0,Callcenter,Corporativo,0.2,0.0,16164.0
6,Ciudad De México,2025-11-15,Queso,Hogar,7.0,4535.0,Online,Corporativo,0.05,10000.0,40157.0
7,Houston,2025-11-30,Chocolate,Snack,4.0,4198.0,Distribuidor,Minorista,0.15,0.0,14273.0
8,Mendoza,2025-11-20,Té,Bebida,6.0,1243.0,Callcenter,Mayorista,0.1,0.0,6712.0
9,Antofagasta,2025-11-15,Queso,Alimentopercedero,1.0,1357.0,Online,Corporativo,0.05,10000.0,11289.0
