### Data Cleaning

#### General overview of raw data cleanup

This section performs data cleaning operations to:

- Standardize column names.
- Remove null values and duplicates.
- Clean special characters from data.
- Convert columns to appropriate data types.
- Filter out invalid or non-positive values.
- Prepare the dataset for analysis.


In [None]:
from pandas import read_csv, DataFrame, to_datetime, to_numeric
from pathlib import Path
from re import compile

# DATA LOADING
# Root data directory; bound to `path` on its own line so the name is
# easy to spot (it is reused when the cleaned file is saved).
path = Path("../data/")

# Read the raw sales CSV with the pyarrow parser and pyarrow-backed dtypes.
data: DataFrame = read_csv(
    path / "raw/Sales.csv", engine="pyarrow", dtype_backend="pyarrow"
)

# COLUMN REORDERING CONFIGURATION
# Read each entry as `destination="source"`: the destination column receives
# the values currently stored under the source column. Entry order matters
# because keys and values are later consumed as two parallel lists.
order: dict[str, str] = dict(
    fecha="producto",
    producto="tipo_producto",
    tipo_producto="cantidad",
    cantidad="precio_unitario",
    precio_unitario="ciudad",
    ciudad="fecha",
    tipo_venta="pais",
    tipo_cliente="tipo_venta",
    descuento="tipo_cliente",
)

# PATTERN DEFINITION
# One alternation of three character classes; every match is deleted from
# column names and text values. The adjacent raw strings are concatenated by
# the parser into a single pattern.
re_pattern = compile(
    r"[#{}\[\]%@*<>¿?=+~|^…¡!()「」『』]"  # punctuation, symbols, CJK quote brackets
    r"|[\x00-\x1F\x7F]"  # ASCII control characters, including DEL
    r"|[\U0001F300-\U0001F6FF]",  # code points U+1F300..U+1F6FF (pictographs, emoticons, transport)
)

# CLEAN COLUMN NAMES
def _normalize_column_name(raw_name: str) -> str:
    """Return *raw_name* stripped of unwanted characters, in lower snake case."""
    without_specials = re_pattern.sub("", raw_name)
    trimmed = without_specials.strip()
    return trimmed.replace(" ", "_").lower()


data.columns = list(map(_normalize_column_name, data.columns))

# CLEAN DATA VALUES
for col in data.select_dtypes(include=["object", "string"]).columns:
    # Remember which entries are genuinely missing before casting to text:
    # astype(str) turns a null into literal text (e.g. "nan" or "<NA>",
    # depending on the column dtype), which the later dropna() step would no
    # longer recognise as a missing value.
    present = data[col].notna()
    data[col] = (
        data[col]
        .astype(str)
        # Remove special characters and replace underscores with spaces
        .apply(lambda x: re_pattern.sub("", x).strip().replace("_", " "))
        # Restore nulls wherever the original value was missing
        .where(present)
    )

# REORDER COLUMN VALUES
# Destination and source names, paired by position.
destination_cols = list(order)
source_cols = list(order.values())

# The right-hand side is selected in full before any destination column is
# written, so every destination receives the pre-assignment source values.
data[destination_cols] = data[source_cols]

# "pais" is only used as a source above; discard it afterwards.
del data["pais"]


# REMOVE NULL VALUES
# Drop every row that contains at least one missing value.
data = data.dropna(axis=0, how="any")

# DATA TYPE CONVERSIONS
# Dates are expected as YYYY-MM-DD; values that do not parse become NaT.
parsed_dates = to_datetime(data["fecha"], errors="coerce", format="%Y-%m-%d")
data["fecha"] = parsed_dates

# Target nullable dtype for each numeric column.
numeric_columns = {
    "cantidad": "Int64",
    "precio_unitario": "Float64",
    "descuento": "Float64",
    "costo_envio": "Float64",
}

for column_name in numeric_columns:
    # Unparseable entries are coerced to missing instead of raising.
    as_number = to_numeric(data[column_name], errors="coerce")
    data[column_name] = as_number.astype(numeric_columns[column_name])

# DATA PREVIEW
print(f"\n{'=' * 10} Dataset Info {'=' * 10}")
data.info()

print(f"\n{'=' * 10} First And Last 5 Rows {'=' * 10}")
# Show the frame itself rather than data.head(-1): head(-1) returns every row
# except the last one, so the final row could never appear in this preview.
# NOTE(review): limiting the output to the first and last rows relies on the
# notebook's truncated rendering of long frames — confirm the display options.
display(data)


# SAVE CLEANED DATA
# The staging file is written only once; an existing file is left untouched.
if (file := path / "staging/Sales.parquet").exists():
    # Report the skip only when the write was actually skipped.
    print(f"\nFile already exists: {file}")
else:
    # Make sure the staging directory exists before writing into it.
    file.parent.mkdir(parents=True, exist_ok=True)
    print(f"\nSaving cleaned data to: {file}")
    data.to_parquet(file, index=False, engine="pyarrow")