# Extract customer data
1. Remove records with NULL customer_id
2. Remove exact duplicate records
3. Remove duplicate records based on created_timestamp (keep the latest record per customer_id)
4. CAST the columns to the correct data types
5. Write transformed data to the Silver schema

## 0. Read data from table

In [0]:
# Load the bronze customers table into a DataFrame.
df = spark.table('gizmobox.bronze.py_customers')

# Preview the DataFrame in the notebook output.
display(df)

In [0]:
# Alternative way to read the same table: spark.read returns a DataFrameReader,
# so reader options can be chained (e.g. .option(...)) before .table(...).
df = spark.read.table('gizmobox.bronze.py_customers')
display(df)

## 1. Remove records with NULL customer_id

In [0]:
# Start again from the bronze table and keep only the rows whose customer_id
# is populated.
df = (
    spark.read.table('gizmobox.bronze.py_customers')
    .where('customer_id IS NOT NULL')
)

display(df)

## 2. Remove exact duplicate records

In [0]:
# Collapse rows that are identical across every column into a single row.
df = df.distinct()

display(df)

## 3. Remove duplicate records based on created_timestamp (keep the latest record per customer_id)

In [0]:
import pyspark.sql.functions as F

# Latest created_timestamp per customer. F.max ignores NULLs, so the result is
# NULL only when every record of that customer has a NULL created_timestamp.
df_max_ts = df.groupBy('customer_id').agg(
    F.max('created_timestamp').alias('max_created_timestamp')
)

# Keep only the rows that carry each customer's latest timestamp.
# The timestamp comparison is null-safe (<=>): with a plain == the inner join
# would silently drop every customer whose created_timestamp is NULL on all of
# its records, because NULL == NULL is not true in SQL.
# Rows that tie on the latest timestamp are all kept, as before.
df = df.join(
    df_max_ts,
    on=(df.customer_id == df_max_ts.customer_id)
    & df.created_timestamp.eqNullSafe(df_max_ts.max_created_timestamp),
).select(df['*'])

display(df)


## 4. CAST the column values to correct data types

In [0]:
# Cast each column to its target type and fix the output column order.
# Columns without a CAST are passed through unchanged. The same result can be
# expressed with the column API: F.col(name).cast(type) inside df.select(...).
df = df.selectExpr(
    'CAST(created_timestamp AS TIMESTAMP) AS created_timestamp',
    'CAST(customer_id AS INT) AS customer_id',
    'customer_name',
    'CAST(date_of_birth AS DATE) AS date_of_birth',
    'email',
    'CAST(member_since AS DATE) AS member_since',
    'telephone',
)

display(df)

## 5. Write data to Delta table in Silver layer

In [0]:
# Create the silver customers table from df, replacing it if it already exists.
df.writeTo('gizmobox.silver.py_customers').createOrReplace()

In [0]:
%sql
-- Preview the rows of the silver customers table.
SELECT * FROM gizmobox.silver.py_customers;

In [0]:
%sql
-- Show the column schema plus detailed table metadata for the silver customers table.
DESCRIBE EXTENDED gizmobox.silver.py_customers;