In [5]:
import pandas as pd

# Load the five raw tables, one parquet file per table. The order of the
# paths below matches the order of the names being assigned.
products, countries, sales_orders, order_lines, CRM = (
    pd.read_parquet(source)
    for source in (
        "../data/raw/products.parquet",
        "../data/raw/countries.parquet",
        "../data/raw/sales_orders.parquet",
        "../data/raw/order_lines.parquet",
        "../data/raw/CRM.parquet",
    )
)


def convert_numeric_columns(data, ignore=None):
    """Run ``pd.to_numeric`` over every non-object column of ``data``, in place.

    Columns with ``object`` dtype and columns listed in ``ignore`` are left
    untouched. Every other column is replaced by
    ``pd.to_numeric(column, errors="coerce")``, so values that cannot be
    parsed as numbers become NaN instead of raising.

    NOTE(review): only the ``object`` dtype is skipped. Other non-numeric
    dtypes (datetime, category, pandas string dtype) are passed to
    ``pd.to_numeric`` unless they are named in ``ignore`` -- presumably that
    is why the notebook excludes ``date_order`` by name. Confirm this is the
    intended selection rule.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to convert. It is modified in place.
    ignore : collection of column labels, optional
        Columns to skip. ``None`` (the default) skips nothing.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in.
    """
    # A ``None`` sentinel replaces a mutable ``[]`` default, which would be a
    # single list shared by every call of the function.
    ignored = () if ignore is None else ignore

    for column in data.columns:
        # Guard clause: skip explicitly ignored columns and object columns.
        if column in ignored or pd.api.types.is_object_dtype(data[column]):
            continue
        try:
            data[column] = pd.to_numeric(data[column], errors="coerce")
        except Exception as e:
            # Best effort: report the failing column and keep going.
            print(f"⚠️ No se pudo convertir '{column}': {e}")

    return data

# Normalise the dtypes of every raw table. The helper mutates the frame it
# receives, so its return value is not needed here.
convert_numeric_columns(products)
convert_numeric_columns(countries)
convert_numeric_columns(sales_orders, ignore=["date_order"])
convert_numeric_columns(order_lines)
convert_numeric_columns(CRM)

# Parse the order timestamps; with errors="coerce" unparseable values
# become NaT instead of raising.
parsed_order_dates = pd.to_datetime(sales_orders["date_order"], errors="coerce")
sales_orders["date_order"] = parsed_order_dates

# Report the resulting dtypes, one headed section per table.
print("\n📌 Datasets Dtypes:")
for heading, frame in (
    ("\n Products:", products),
    ("\n Countries:", countries),
    ("\n Sales orders:", sales_orders),
    ("\n Order lines:", order_lines),
    ("\n CRM:", CRM),
):
    print(heading)
    print(frame.dtypes)


📌 Datasets Dtypes:

 Products:
id              int64
name           object
category       object
image_route    object
dtype: object

 Countries:
id       int64
code    object
name    object
dtype: object

 Sales orders:
order_id                    object
partner_invoice             object
partner_shipping            object
date_order          datetime64[ns]
state                       object
amount_untaxed             float64
amount_tax                 float64
amount_total               float64
country_code                object
dtype: object

 Order lines:
id               object
order_id         object
product_name     object
quantity        float64
unit_price      float64
subtotal        float64
dtype: object

 CRM:
lead_id          int64
name            object
email_from      object
phone           object
stage           object
contact_name    object
image_html      object
dtype: object


In [6]:
def fill_numeric_columns(data, ignore=None):
    """Replace missing values with 0 in every non-object column, in place.

    Columns with ``object`` dtype and columns listed in ``ignore`` are left
    untouched; every other column is replaced by ``column.fillna(0)``.

    NOTE(review): only the ``object`` dtype is skipped, so non-numeric
    dtypes such as datetime are also filled with 0 unless they are named in
    ``ignore`` -- confirm callers always exclude those columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to fill. It is modified in place.
    ignore : collection of column labels, optional
        Columns to skip. ``None`` (the default) skips nothing.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in.
    """
    # A ``None`` sentinel replaces a mutable ``[]`` default, which would be a
    # single list shared by every call of the function.
    ignored = () if ignore is None else ignore

    for column in data.columns:
        # Guard clause: skip explicitly ignored columns and object columns.
        if column in ignored or pd.api.types.is_object_dtype(data[column]):
            continue
        try:
            # Sales figures can reasonably be filled with 0 when missing.
            data[column] = data[column].fillna(0)
        except Exception as e:
            # Best effort: report the failing column and keep going.
            print(f"⚠️ No se pudo rellenar '{column}': {e}")

    return data

# Fill missing values in the non-object columns of every table with 0.
# The helper mutates each frame in place, so its return value is unused.
fill_numeric_columns(data=products)
fill_numeric_columns(data=countries)
fill_numeric_columns(data=sales_orders, ignore=["date_order"])
fill_numeric_columns(data=order_lines)
fill_numeric_columns(data=CRM)

# Forward-fill missing order dates from the preceding row. Note that
# `.fillna("ffill")` would NOT do this: its first argument is the fill
# *value*, so missing dates would be replaced by the literal string "ffill".
# A missing date in the very first row has no predecessor and stays NaT.
sales_orders["date_order"] = sales_orders["date_order"].ffill()


In [7]:
# Remove the image columns from the CRM and product tables.
CRM = CRM.drop(columns="image_html")
products = products.drop(columns="image_route")

# De-duplicate the three lookup/order tables in place.
# NOTE(review): order_lines and CRM are not de-duplicated here -- confirm
# that is intentional.
for frame in (products, countries, sales_orders):
    frame.drop_duplicates(inplace=True)




In [8]:
def convert_object_lowercase(data, ignore=None):
    """Lower-case and strip every ``object`` column of ``data``, in place.

    For each column with ``object`` dtype that is not listed in ``ignore``,
    missing values are first replaced with the empty string, then the text is
    lower-cased and surrounding whitespace is stripped. Columns of any other
    dtype are left untouched.

    NOTE(review): elements of an object column that are not strings are
    presumably turned into NaN by the ``.str`` methods -- confirm that
    mixed-type object columns are not expected here.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to normalise. It is modified in place.
    ignore : collection of column labels, optional
        Columns to skip. ``None`` (the default) skips nothing.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in.
    """
    # A ``None`` sentinel replaces a mutable ``[]`` default, which would be a
    # single list shared by every call of the function.
    ignored = () if ignore is None else ignore

    for column in data.columns:
        # Guard clause: only non-ignored object columns are normalised.
        if column in ignored or not pd.api.types.is_object_dtype(data[column]):
            continue
        try:
            data[column] = data[column].fillna("").str.lower().str.strip()
        except Exception as e:
            # Best effort: report the failing column and keep going.
            print(f"⚠️ No se pudo convertir '{column}': {e}")

    return data

# Normalise the text columns of every table. The helper mutates the frame
# it receives, so its return value is not needed here.
convert_object_lowercase(products)
convert_object_lowercase(countries)
convert_object_lowercase(sales_orders, ignore=["date_order"])
convert_object_lowercase(order_lines)
convert_object_lowercase(CRM)

# Preview the first rows of each cleaned table, one headed section per table.
print("\n📌 Primeros registros de cada dataset:")
for heading, frame in (
    ("\n Products", products),
    ("\n Countries", countries),
    ("\n Sales orders", sales_orders),
    ("\n Order lines", order_lines),
    ("\n CRM", CRM),
):
    print(heading)
    print(frame.head())


📌 Primeros registros de cada dataset:

 Products
   id                                    name     category
0   5                           communication          all
1   7                       standard delivery   deliveries
2   6                                expenses     expenses
3  12  audifonos argom bluetooth arg-hs-2552b  electronics
4   1                                   meals          all

 Countries
   id code            name
0   3   af     afghanistan
1   6   al         albania
2  62   dz         algeria
3  11   as  american samoa
4   1   ad         andorra

 Sales orders
  order_id   partner_invoice    partner_shipping          date_order state  \
0   s00051    lilianna perez  harvard university 2025-01-29 00:23:15  sale   
1   s00050     milena garcia           microsoft 2025-01-28 21:39:34  sale   
2   s00049     jordana alphy  harvard university 2025-01-28 21:39:00  sale   
3   s00048  humberto marcebo           microsoft 2025-01-28 21:38:35  sale   
4   s00047  humbe

In [9]:
# Persist each cleaned table next to the raw data, one parquet file per table.
for frame, destination in (
    (products, "../data/processed/products.parquet"),
    (countries, "../data/processed/countries.parquet"),
    (sales_orders, "../data/processed/sales_orders.parquet"),
    (order_lines, "../data/processed/order_lines.parquet"),
    (CRM, "../data/processed/CRM.parquet"),
):
    frame.to_parquet(destination)