In [52]:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

# Create SparkContext
sparkContext = SparkContext.getOrCreate()
# Create Glue Context
glueContext = GlueContext(sparkContext)
# Get spark session
spark = glueContext.spark_session

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [136]:
# gdf = Glue Dynamic Frame
gdf = glueContext.create_dynamic_frame.from_options(
    connection_type="s3", 
    connection_options=dict(
        paths=[
            "s3://aws-data-lab-sanhe-for-everything-us-east-2/poc/2022-02-26-learn_big_data_on_aws/dataset/ds_003_walmart_mongodb/"
        ],
        recurse=True,
    ),
    format="json",
    format_options=dict(multiLine=True),
    transformation_ctx="datasource",
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [98]:
gdf.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
|-- order_id: string
|-- create_time: long
|-- customer: struct
|    |-- customer_id: int
|    |-- email: string
|    |-- name: string
|    |-- dob: string
|    |-- gender: int
|    |-- billing_address: string
|    |-- shipping_address: string
|-- items: array
|    |-- element: struct
|    |    |-- item_id: int
|    |    |-- name: string
|    |    |-- price: double
|    |    |-- quantity: int

In [137]:
gdf.toDF().show(5)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+-------------+--------------------+--------------------+
|            order_id|  create_time|            customer|               items|
+--------------------+-------------+--------------------+--------------------+
|c844cf1d-d8e5-479...|1641013042000|[226, emily79@exa...|[[161, agent, 1.7...|
|5b320f98-4ab1-429...|1641039937000|[98, langsherri@e...|[[155, thing, 1.4...|
|29079285-1cc7-4a9...|1641078303000|[528, teresa57@ex...|[[61, stop, 32.69...|
|e0f7e2bf-a85b-424...|1641069306000|[861, orrtamara@e...|[[287, hundred, 1...|
|ec4863c9-3b84-4cd...|1641070432000|[761, abullock@ex...|[[287, hundred, 1...|
+--------------------+-------------+--------------------+--------------------+
only showing top 5 rows

In [138]:
gdf.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

6562

In [102]:
# pdf = PySpark Data Frame
pdf = gdf.toDF()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [135]:
from pyspark.sql.functions import explode

# unnest based on an array
pdf_tmp = pdf.select(
    pdf.order_id, 
    pdf.create_time,
    pdf.customer,
    explode(pdf.items).alias("item"),
)

pdf_denormalized = pdf_tmp.select(
    pdf_tmp.order_id.alias("order_id"),
    pdf_tmp.create_time.alias("order_create_time"),
    pdf_tmp.customer["dob"].alias("customer_date_of_birth"),
    pdf_tmp.customer["customer_id"].alias("customer_id"),
    pdf_tmp.customer["email"].alias("customer_id"),
    pdf_tmp.customer["name"].alias("customer_name"),
    pdf_tmp.customer["dob"].alias("customer_dob"),
    pdf_tmp.customer["billing_address"].alias("customer_billing_address"),
    pdf_tmp.customer["shipping_address"].alias("customer_shipping_address"),
    pdf_tmp.item["item_id"].alias("item_id"),
    pdf_tmp.item["name"].alias("item_name"),
    pdf_tmp.item["price"].alias("item_price"),
    pdf_tmp.item["quantity"].alias("item_quantity"),
)

pdf_denormalized.show(5, truncate=False, vertical=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

-RECORD 0---------------------------------------------------------------------------
 order_id                  | f6d627d1-7b08-4482-809e-0225ce5f7e52                   
 order_create_time         | 1641078805000                                          
 customer_date_of_birth    | 2008-02-12                                             
 customer_id               | 376                                                    
 customer_id               | fjones@example.com                                     
 customer_name             | Brandon Brown                                          
 customer_dob              | 2008-02-12                                             
 customer_billing_address  | 100 Barrett Roads
North Nathaniel, OK 36025            
 customer_shipping_address | 41939 Aguilar Trail Apt. 702
New Teresahaven, MN 28819 
 item_id                   | 111                                                    
 item_name                 | reason                              