In [0]:
%load_ext autoreload
%autoreload 2
# Enables autoreload; learn more at https://docs.databricks.com/en/files/workspace-modules.html#autoreload-for-python-modules
# To disable autoreload; run %autoreload 0

In [0]:
import sys
# -----------------------------
# Add the repo to the Python path
# -----------------------------
# NOTE(review): this is an absolute, user-specific workspace path (it embeds an
# e-mail address); the notebook will not find `lib` when run from another
# user's workspace — consider deriving it from a configurable base directory.
sys.path.append("/Workspace/Users/mandu543@gmail.com/databricks-ecommerce/Pipelines/")

# Star import: none of the helpers/constants used below (getDF, BRONZE_ZONE,
# SILVER_ZONE, normalize_dataframe, setUppercase, col, regexp_replace, ...) are
# defined in this notebook, so they presumably all come from here — TODO confirm
# and switch to explicit imports.
from lib.utils import *

In [0]:
# Names of the e-commerce source tables; each one is passed to getDF further
# down to select the table to read.
DB_ECOMMERCE_BRANDS = "ecommerce_brands"
DB_ECOMMERCE_PRODUCTS = "ecommerce_products"
DB_ECOMMERCE_CATEGORIES = "ecommerce_categories"
DB_ECOMMERCE_CUSTOMERS = "ecommerce_customers"
DB_ECOMMERCE_ORDER_ITEMS = "ecommerce_order_items"

# Backward-compatible alias for the original, misspelled constant name, which
# existing cells still reference. Prefer DB_ECOMMERCE_CATEGORIES in new code.
DB_ECOMMERCE_CATEOGIES = DB_ECOMMERCE_CATEGORIES

### Table Ecommerce_Brands

In [0]:
# Fetch the brands DataFrame for the Bronze zone and preview it.
# getDF is not defined in this notebook (presumably lib.utils) — its exact
# read semantics should be checked there.
df_brz_brands = getDF(spark, BRONZE_ZONE, DB_ECOMMERCE_BRANDS)
display(df_brz_brands)


**Anomalies :** 
- `brand_code` : caractères spéciaux 
- `brand_name` : espaces en début et en fin de chaîne

In [0]:
# Clean the Bronze brands frame before it is pushed to Silver.
# normalize_dataframe is an external helper (not visible here); presumably it
# handles the special-character / whitespace anomalies listed above — TODO confirm.
df_brz_brands = normalize_dataframe(df_brz_brands)

# Original behaviour, kept as-is.
# NOTE(review): confirm the brands table really has a `category_code` column.
df_brz_brands = setUppercase(df_brz_brands,["category_code"])

# Fix: `brand_code` is the business key used for the Silver brands table, and
# the products cell upper-cases its own `brand_code`. Upper-case it here too so
# both sides of the brand relationship use the same casing.
df_brz_brands = setUppercase(df_brz_brands,["brand_code"])

df_brz_brands.show(15)

### Table Ecommerce_Products

In [0]:
# Fetch the products DataFrame for the Bronze zone and preview it.
# getDF is not defined in this notebook (presumably lib.utils).
df_brz_products = getDF(spark, BRONZE_ZONE, DB_ECOMMERCE_PRODUCTS)
display(df_brz_products)


**Anomalies :** 
- `weight_grams` : unité de poids 'g' 
- `length_cm` : des ',' au lieu de '.'
- `width_cm` : des ',' au lieu de '.'
- `height_cm` : des ',' au lieu de '.'
- `brand_code` : en minuscules

In [0]:
# Keep only the digits of the weight, which drops the 'g' unit suffix.
# NOTE(review): this also removes any decimal separator or sign character.
df_brz_products = df_brz_products.withColumn(
    "weight_grams",
    regexp_replace(col("weight_grams"), r"[^0-9]", ""),
)

# The three dimension columns use a decimal comma in the source; replace it
# with a decimal point, one column after the other.
for dimension_col in ("length_cm", "width_cm", "height_cm"):
    df_brz_products = df_brz_products.withColumn(
        dimension_col,
        regexp_replace(col(dimension_col), r"[,]", "."),
    )

# Upper-case the code columns, one setUppercase call per column.
for code_col in ("category_code", "brand_code"):
    df_brz_products = setUppercase(df_brz_products, [code_col])

display(df_brz_products)

In [0]:
# Target Spark type for each products column, in the order the casts are listed.
columns_products_types = dict(
    product_id="long",
    sku="string",
    category_code="string",
    brand_code="string",
    color="string",
    size="string",
    material="string",
    weight_grams="integer",
    length_cm="double",
    width_cm="double",
    height_cm="double",
    rating_count="integer",
)

# cast_columns_spark_types is an external helper (not visible in this notebook).
df_brz_products = cast_columns_spark_types(
    df_brz_products,
    columns_products_types,
)

display(df_brz_products)

### Table Ecommerce_Categories

In [0]:
# Fetch the categories DataFrame for the Bronze zone and print up to 150 rows.
# NOTE(review): the constant name is misspelled ("CATEOGIES") where it is
# defined earlier in the notebook; it still resolves to "ecommerce_categories".
df_brz_categories = getDF(spark, BRONZE_ZONE, DB_ECOMMERCE_CATEOGIES)
df_brz_categories.show(150)

**Anomalies :** 
- `category_code` : minuscule

In [0]:
# Normalize the frame first, then upper-case `category_code` on the result.
df_brz_categories = setUppercase(
    normalize_dataframe(df_brz_categories),
    ["category_code"],
)
display(df_brz_categories)

### Table Ecommerce_Customers

In [0]:
# Fetch the customers DataFrame for the Bronze zone and preview it.
# NOTE(review): the preview may expose customer PII (e.g. phone numbers) —
# clear outputs before sharing the notebook.
df_brz_customers = getDF(spark, BRONZE_ZONE, DB_ECOMMERCE_CUSTOMERS)
display(df_brz_customers)

**Anomalies :** 
- `phone` : valeurs nulles et numéros au format décimal


In [0]:
# Fix the `phone` column first, then run the generic normalization on the result.
# Both helpers are external (not visible in this notebook).
df_brz_customers = normalize_dataframe(
    setPhoneNumber(df_brz_customers, ["phone"])
)

display(df_brz_customers)

### Table Ecommerce_Order_Items

In [0]:
# Fetch the order-items DataFrame for the Bronze zone and preview it.
# getDF is not defined in this notebook (presumably lib.utils).
df_brz_order_items = getDF(spark, BRONZE_ZONE, DB_ECOMMERCE_ORDER_ITEMS)
display(df_brz_order_items)

**Anomalies :** 
- `quantity` : mélange de chaînes de caractères et de nombres
- `unit_price` : symbole `$` dans la colonne
- `discount_pct` : symbole `%` dans la colonne


In [0]:
# Clean the Bronze order-items frame and cast every column to its target type.
# normalize_dataframe is an external helper whose behaviour is not visible here.
df_brz_order_items = normalize_dataframe(df_brz_order_items)

# Quantities are sometimes spelled out; "Two" is the only word handled here.
df_brz_order_items = df_brz_order_items.replace("Two","2", subset=["quantity"])

# Fix: the documented anomalies are a '$' in `unit_price` and a '%' in
# `discount_pct`, but nothing visible removed them before the numeric casts
# below, and a string such as "$10" does not cast to a number. Strip the symbols
# (and stray whitespace) first; this is a no-op if normalize_dataframe already
# removed them.
for symbol_col in ("unit_price", "discount_pct"):
    df_brz_order_items = df_brz_order_items.withColumn(
        symbol_col,
        regexp_replace(col(symbol_col), r"[$%\s]", ""),
    )

# Target Spark type per column.
# NOTE(review): `unit_price`, `discount_pct` and `tax_amount` are cast to
# integer, which would drop any fractional part (cents) — confirm the source
# values are whole numbers, otherwise a decimal type is needed here and in the
# Silver table.
columns_types = {
    "dt": "date",
    "order_ts": "timestamp",
    "customer_id": "string",
    "order_id": "long",
    "item_seq": "integer",
    "product_id": "long",
    "quantity": "integer",
    "unit_price_currency": "string",
    "unit_price": "integer",
    "discount_pct": "integer",
    "tax_amount": "integer",
    "channel": "string",
    "coupon_code": "string"
}

df_brz_order_items=cast_columns_spark_types(df_brz_order_items, columns_types)

display(df_brz_order_items)


### Insertion Zone Silver

In [0]:
# Source: the cleaned brands frame built earlier in this notebook.
# (Original note: the Bronze table carries an '_ingested_at' column.)

# Keyword arguments for the Bronze -> Silver load of the brands table.
params = dict(
    spark=spark,
    source_df=df_brz_brands,
    target_table=SILVER_ZONE+".slv_ecommerce_brands",
    table_name="Silver Table Brands",
    reject_path="/Volumes/workspace/datasets/ecommerce/1_errors_data/",
    # Columns that together identify a unique row.
    business_key=["brand_code"],
)

# Run the load; process_bronze_to_silver is an external helper.
process_bronze_to_silver(**params)
 

In [0]:
# Keyword arguments for the Bronze -> Silver load of the products table.
param_table_products = dict(
    spark=spark,
    source_df=df_brz_products,
    target_table=SILVER_ZONE+".slv_ecommerce_products",
    table_name="Silver Table Products",
    reject_path="/Volumes/workspace/datasets/ecommerce/1_errors_data/",
    # Columns that together identify a unique row.
    business_key=["product_id"],
)

process_bronze_to_silver(**param_table_products)

In [0]:
# Keyword arguments for the Bronze -> Silver load of the categories table.
param_table_categories = dict(
    spark=spark,
    source_df=df_brz_categories,
    target_table=SILVER_ZONE+".slv_ecommerce_categories",
    table_name="Silver Table Categories",
    reject_path="/Volumes/workspace/datasets/ecommerce/1_errors_data/",
    # Columns that together identify a unique row.
    business_key=["category_code"],
)

process_bronze_to_silver(**param_table_categories)

In [0]:
# Keyword arguments for the Bronze -> Silver load of the customers table.
param_table_customers = dict(
    spark=spark,
    source_df=df_brz_customers,
    target_table=SILVER_ZONE+".slv_ecommerce_customers",
    table_name="Silver Table Customers",
    reject_path="/Volumes/workspace/datasets/ecommerce/1_errors_data/",
    # Columns that together identify a unique row.
    business_key=["customer_id"],
)

process_bronze_to_silver(**param_table_customers)

In [0]:
# Keyword arguments for the Bronze -> Silver load of the order-items table.
param_table_order_items = dict(
    spark=spark,
    source_df=df_brz_order_items,
    target_table=SILVER_ZONE+".slv_ecommerce_order_items",
    table_name="Silver Table Order Items",
    reject_path="/Volumes/workspace/datasets/ecommerce/1_errors_data/",
    # Columns that together identify a unique row.
    # NOTE(review): `item_seq` is not part of the key — confirm that a product
    # cannot appear on two lines of the same order.
    business_key=["dt", "customer_id","order_id","product_id" ],
)

process_bronze_to_silver(**param_table_order_items)