# Glue ETL Transformation - Flatten Array

In [51]:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrame
from pprint import pprint

# Create the SparkContext, or reuse the one already running in this session
sparkContext = SparkContext.getOrCreate()
# Wrap it in a GlueContext; later cells use it to read data into DynamicFrames
glueContext = GlueContext(sparkContext)
# Spark session exposed by the GlueContext
spark = glueContext.spark_session

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [2]:
class Config:
    """Settings for this example: where the data lives on S3 and dataset sizes."""

    # S3 location of the example dataset (read by the data-loading cell)
    bucket: str = "aws-data-lab-sanhe-for-everything-us-east-2"
    prefix: str = "poc/learn-big-data-on-aws/glue-job-examples/tranform-flatten-array"

    # Dataset size parameters. Not read by any cell visible here --
    # presumably consumed by the script that generated the data (TODO confirm).
    n_files: int = 10
    n_records_per_file: int = 1000
    n_product_category: int = 20

    @property
    def s3path_prefix(self):
        """Return ``S3Path(bucket, prefix)`` for the dataset location.

        NOTE(review): ``S3Path`` is not imported in the visible cells, so
        accessing this property raises ``NameError`` unless it is imported
        elsewhere -- presumably from ``s3pathlib``; confirm.
        """
        bucket, prefix = self.bucket, self.prefix
        return S3Path(bucket, prefix)


config = Config()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
# gdf = Glue Dynamic Frame
# Read the JSON files under the configured S3 bucket/prefix into a DynamicFrame.
gdf = glueContext.create_dynamic_frame.from_options(
    connection_type="s3",
    connection_options={
        "paths": [f"s3://{config.bucket}/{config.prefix}/"],
        "recurse": True,
    },
    format="json",
    format_options={"multiLine": True},
    transformation_ctx="datasource",
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
# Print the schema of the loaded DynamicFrame to see which fields are nested
# (struct / array) before flattening them
gdf.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
|-- id: int
|-- name: string
|-- price: int
|-- specs: struct
|    |-- color: string
|-- categories: array
|    |-- element: string
|-- reviews: array
|    |-- element: struct
|    |    |-- rank: int
|    |    |-- comment: string

In [11]:
# Preview the first 3 records: one field per line (vertical), values not truncated
gdf.toDF().show(3, truncate=False, vertical=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

-RECORD 0--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 id         | 1                                                                                                                                                                                                                                                  
 name       | marriage                                                                                                                                                                                                                                           
 price      | 83                                                                                                                                                                                                                  

## UNNEST

There are two types of things that can be unnested:

1. struct
2. array

For a struct, it moves the leaf nodes to the root level and uses the full JSON path as the key:

```python
# Input
{"id": 1, "specs": {"color": "red"}}

# Output
{"id": 1, "specs.color": "red"}
```

For an array, one record expands into as many records as the array has elements:

```python
# Input
{"id": 1, "categories": ["cate1", "cate2", "cate3"]}

# Output
{"id": 1, "categories": "cate1"}
{"id": 1, "categories": "cate2"}
{"id": 1, "categories": "cate3"}
```

If you unnest based on multiple array fields, the record expands into every combination of their elements:

```python
# Input
{"id": "7e3f", "array1": [1, 2], "array2": [3, 4, 5]}

# Output
{"id": "7e3f", "array1": 1, "array2": 3}
{"id": "7e3f", "array1": 1, "array2": 4}
{"id": "7e3f", "array1": 1, "array2": 5}
{"id": "7e3f", "array1": 2, "array2": 3}
{"id": "7e3f", "array1": 2, "array2": 4}
{"id": "7e3f", "array1": 2, "array2": 5}
```

### UNNEST struct example

In [34]:
# Keep only ``id`` and the ``specs`` struct so the effect of unnesting is easy to see
gdf_unnest_struct_selected = SelectFields.apply(frame=gdf, paths=["id", "specs"])
# Show the "before" state: ``specs`` is still a nested struct
gdf_unnest_struct_selected.toDF().show(3, truncate=False, vertical=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

-RECORD 0-------------
 id    | 1            
 specs | [Red]        
-RECORD 1-------------
 id    | 2            
 specs | [DarkOrange] 
-RECORD 2-------------
 id    | 3            
 specs | [LightGreen] 
only showing top 3 rows

In [35]:
# Apply the ``UnnestFrame`` transformation operator: the nested ``specs`` struct
# is flattened into top-level columns keyed by their full path (``specs.color``).
gdf_unnested_struct = UnnestFrame.apply(frame=gdf_unnest_struct_selected)
# Print the first 3 flattened records as plain dicts.
# Read from ``gdf_unnested_struct`` (the frame assigned just above); the name
# ``gdf_unnested`` is never defined in this notebook and would raise NameError
# on a clean top-to-bottom run.
for row in gdf_unnested_struct.toDF().toPandas().head(3).to_dict(orient="records"):
    print(row)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

{'id': 1, 'specs.color': 'Red'}
{'id': 2, 'specs.color': 'DarkOrange'}
{'id': 3, 'specs.color': 'LightGreen'}

### UNNEST array example

In [40]:
# Double check the "before" state: show the first 3 original records,
# whose array fields have not been exploded yet
gdf.toDF().show(3, truncate=False, vertical=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

-RECORD 0--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 id         | 1                                                                                                                                                                                                                                                  
 name       | marriage                                                                                                                                                                                                                                           
 price      | 83                                                                                                                                                                                                                  

In [41]:
# import the explode function (used below to turn each array element into its own row)
# ref: https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.functions.explode.html
from pyspark.sql.functions import explode

# pdf = PySpark Data Frame
# Convert the Glue DynamicFrame to a PySpark DataFrame so ``explode`` can be used in ``select``
pdf = gdf.toDF()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [44]:
# Unnest an array of strings: each element of ``categories`` becomes its own
# row, paired with the record's ``id``.
pdf_unnest_array_of_string = pdf.select(
    pdf["id"],
    explode(pdf["categories"]).alias("category"),
)
pdf_unnest_array_of_string.show(3, truncate=False, vertical=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

-RECORD 0---------------
 id       | 1           
 category | interesting 
-RECORD 1---------------
 id       | 1           
 category | leave       
-RECORD 2---------------
 id       | 2           
 category | spring      
only showing top 3 rows

In [45]:
# Unnest an array of structs: each element of ``reviews`` becomes its own row,
# paired with the record's ``id``; the ``review`` value itself is still a struct.
pdf_unnest_array_of_struct = pdf.select(
    pdf["id"],
    explode(pdf["reviews"]).alias("review"),
)
pdf_unnest_array_of_struct.show(3, truncate=False, vertical=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

-RECORD 0----------------------------------------------------------------
 id     | 1                                                              
 review | [1, Several bad appear make she common heart.]                 
-RECORD 1----------------------------------------------------------------
 id     | 1                                                              
 review | [2, Leader of theory force democratic subject grow operation.] 
-RECORD 2----------------------------------------------------------------
 id     | 1                                                              
 review | [1, Wall just few check only center sense work.]               
only showing top 3 rows

In [None]:
# Flatten everything in three steps:
# 1. explode ``categories`` (array of string), carrying ``reviews`` along
pdf_unnest_tmp_1 = pdf.select(
    pdf["id"],
    explode(pdf["categories"]).alias("category"),
    pdf["reviews"],
)
# 2. explode ``reviews`` (array of struct) on top of the first result
pdf_unnest_tmp_2 = pdf_unnest_tmp_1.select(
    pdf_unnest_tmp_1["id"],
    pdf_unnest_tmp_1["category"],
    explode(pdf_unnest_tmp_1["reviews"]).alias("review"),
)
# 3. convert back to a Glue DynamicFrame and apply ``UnnestFrame`` to the
#    remaining ``review`` struct
gdf_unnest_tmp = DynamicFrame.fromDF(pdf_unnest_tmp_2, glueContext, "gdf_unnest_tmp")
gdf_unnest_everything = UnnestFrame.apply(frame=gdf_unnest_tmp)
gdf_unnest_everything.toDF().show(10)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…