# Loading the ERP customer file

## Import necessary functions

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import StringType
from pyspark.sql.functions import trim, col, current_date

### Load the ERP customer table

In [0]:
# Read the bronze-layer ERP customer table into the working DataFrame.
bronze_table_name = "dev_project.bronze.erp_cust_az12"
df = spark.table(bronze_table_name)

## Trim leading and trailing spaces from string columns

In [0]:
# Names of every string-typed column in the DataFrame's schema.
string_columns = [
    f.name for f in df.schema.fields if isinstance(f.dataType, StringType)
]

# Overwrite each string column with its trimmed value.
for name in string_columns:
    df = df.withColumn(name, trim(col(name)))

## Clean the customer ID (strip the `NAS` prefix)

In [0]:
# Strip a leading "NAS" prefix from the customer ID. Values that do not start
# with "NAS" (and NULLs) pass through unchanged.
# An anchored regexp_replace is used instead of
# F.substring(col, 4, F.length(col)): F.substring only accepts a Column for
# its length argument on newer PySpark releases, whereas regexp_replace with
# a literal pattern works across versions and yields the same result.
df = df.withColumn("CID", F.regexp_replace(col("CID"), r"^NAS", ""))

## Validate the birth date column

In [0]:
# A birth date must not lie in the future: any BDATE later than today is
# replaced with NULL; every other value is kept as-is.
is_future_birth_date = col("BDATE") > current_date()
df = df.withColumn(
    "BDATE",
    F.when(is_future_birth_date, F.lit(None)).otherwise(col("BDATE")),
)

## Normalize the gender column

In [0]:
# Map the GEN codes onto a fixed vocabulary. Matching is case-insensitive;
# any value that is not one of the listed codes becomes "n/a".
gender_upper = F.upper(col("GEN"))
normalized_gender = (
    F.when(gender_upper.isin("M", "MALE"), "Male")
    .when(gender_upper.isin("F", "FEMALE"), "Female")
    .otherwise("n/a")
)
df = df.withColumn("GEN", normalized_gender)

## Rename columns and add the ingest timestamp

In [0]:
# Source column name -> name used in the silver table.
RENAME_MAP = {
    "CID": "customer_number",
    "BDATE": "birth_date",
    "GEN": "gender",
}

# Apply the renames one at a time.
for source_name, target_name in RENAME_MAP.items():
    df = df.withColumnRenamed(source_name, target_name)

# Stamp every row with the timestamp of this load.
ingest_time = F.current_timestamp()
df = df.withColumn("_ingest_time", ingest_time)

## Write to the silver table

In [0]:
# Persist the cleaned DataFrame as the silver table, overwriting any
# existing contents. The mergeSchema option is passed through to the writer.
(
    df.write
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable("dev_project.silver.erp_cust_az12")
)

In [0]:
# Show the transformed DataFrame in the notebook output for a visual check.
df.display()

In [0]:
%sql
-- Spot-check: show 10 rows from the silver table written above.
SELECT * FROM dev_project.silver.erp_cust_az12 LIMIT 10;