### Data Cleaning

#### General overview of raw data cleanup

This section performs data cleaning operations to:

- Standardize column names.
- Remove null values and duplicates.
- Clean special characters from data.
- Convert columns to appropriate data types.
- Filter out invalid or non-positive values.
- Prepare the dataset for analysis.


In [1]:
from pandas import read_csv, DataFrame, to_datetime, to_numeric
from pathlib import Path
from re import compile

# DATA LOADING
data: DataFrame = read_csv(
    (path := Path("../data/")) / "raw/Sales.csv",
    engine="pyarrow",
    dtype_backend="pyarrow",
)

# COLUMN REORDERING CONFIGURATION
order: dict[str, str] = {
    "fecha": "producto",
    "producto": "tipo_producto",
    "tipo_producto": "cantidad",
    "cantidad": "precio_unitario",
    "precio_unitario": "ciudad",
    "ciudad": "fecha",
    "tipo_venta": "pais",
    "tipo_cliente": "tipo_venta",
    "descuento": "tipo_cliente",
}

# PATTERN DEFINITION
re_pattern = compile(
    r"[#{}\[\]%@*<>¿?=+~|^…¡!()「」『』]|[\x00-\x1F\x7F]|[\U0001F300-\U0001F6FF]",
)

# CLEAN COLUMN NAMES
data.columns = [
    re_pattern.sub("", col).strip().replace(" ", "_").lower() for col in data.columns
]

# CLEAN DATA VALUES
for col in data.select_dtypes(include=["object", "string"]).columns:
    data[col] = (
        data[col]
        .astype(str)
        .str.replace(re_pattern, "", regex=True)
        .str.strip()
        .str.replace("_", " ")
        .str.capitalize()
        .replace({"Nan": None, "Na": None, "None": None})
    )

# REORDER COLUMN VALUES
data[list(order.keys())] = data[list(order.values())]
del data["pais"]

# REMOVE NULL VALUES
data = data.dropna()


# DATA TYPE CONVERSIONS
data["fecha"] = to_datetime(data["fecha"], format="%Y-%m-%d", errors="coerce")

# Convert numeric columns with proper null handling
numeric_columns = {
    "cantidad": "Int64",
    "precio_unitario": "Float64",
    "descuento": "Float64",
    "costo_envio": "Float64",
}

for col, dtype in numeric_columns.items():
    data[col] = to_numeric(data[col], errors="coerce").astype(dtype)

# DATA PREVIEW
print(f"\n{'=' * 10} Dataset Info {'=' * 10}")
data.info()

print(f"\n{'=' * 10} First And Last 5 Rows {'=' * 10}")
display(data.head(-1))

# SAVE CLEANED DATA
if not (file := path / "staging/Sales.parquet").parent.exists():
    (path / "staging").mkdir(parents=True, exist_ok=True)
data.to_parquet(file, index=False, engine="pyarrow")



<class 'pandas.core.frame.DataFrame'>
Index: 1238655 entries, 0 to 1249999
Data columns (total 10 columns):
 #   Column           Non-Null Count    Dtype         
---  ------           --------------    -----         
 0   fecha            1238655 non-null  datetime64[ns]
 1   producto         1238655 non-null  object        
 2   tipo_producto    1238655 non-null  object        
 3   cantidad         1237853 non-null  Int64         
 4   precio_unitario  1237806 non-null  Float64       
 5   ciudad           1238655 non-null  object        
 6   tipo_venta       1238655 non-null  object        
 7   tipo_cliente     1238655 non-null  object        
 8   descuento        1237791 non-null  Float64       
 9   costo_envio      1237853 non-null  Float64       
dtypes: Float64(3), Int64(1), datetime64[ns](1), object(5)
memory usage: 108.7+ MB



Unnamed: 0,fecha,producto,tipo_producto,cantidad,precio_unitario,ciudad,tipo_venta,tipo_cliente,descuento,costo_envio
0,2025-10-30,Arepa,Abarrotes,2,3681.0,Santiago,Online,Minorista,0.2,5889.0
1,2025-11-17,Arepa,Abarrotes,7,2321.0,Córdoba,Distribuidor,Gobierno,0.15,13809.0
2,2025-10-22,Leche,Lácteo,9,3540.0,Barranquilla,Distribuidor,Gobierno,0.2,25488.0
3,2025-10-20,Cereal,Lácteo,3,3287.0,New york,Tienda física,Gobierno,0.05,9367.0
4,2025-10-20,Leche,Hogar,2,3414.0,Madrid,Distribuidor,Mayorista,0.0,6828.0
...,...,...,...,...,...,...,...,...,...,...
1249994,2025-10-31,Té,Alimento percedero,8,4822.0,Tijuana,Online,Minorista,0.05,46647.0
1249995,2025-10-30,Café,Abarrotes,10,4088.0,Miami,Tienda física,Corporativo,0.2,32704.0
1249996,2025-11-05,Pan,Abarrotes,9,3123.0,Mendoza,Distribuidor,Corporativo,0.05,26701.0
1249997,2025-10-27,Gaseosa,Snack,7,3821.0,Chicago,Distribuidor,Mayorista,0.2,21397.0
