# Common Data Wrangling Operations



In [54]:
# initialize spark session
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrame
from pprint import pprint

# Reuse the SparkContext of the current session, or create one if none exists yet
sparkContext = SparkContext.getOrCreate()
# Wrap the SparkContext in an AWS Glue GlueContext
glueContext = GlueContext(sparkContext)
# SparkSession exposed by the GlueContext; the cells below use it as `spark`
# to build DataFrames
spark = glueContext.spark_session

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [55]:
import numpy
import pandas
import pyarrow
import pyspark

# Print the installed version of each core library as "name==version",
# one library per line.
package_list = [numpy, pandas, pyarrow, pyspark]
for package in package_list:
    print(package.__name__, package.__version__, sep="==")

# import
from datetime import datetime, date, timedelta
    
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from pyspark.sql import Row, Column, DataFrame
import pyspark.sql.functions as sql_funcs
import pyspark.sql.types as sql_types

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

numpy==1.14.5
pandas==0.23.4
pyarrow==0.13.0
pyspark==2.4.3

## DataFrame Creation

In [56]:
# 1. Create a PySpark DataFrame from a list of ``Row`` objects.
# Column names come from the Row field names; no schema is passed, so the
# column types are derived from the Python values.
# The rows hold the same values as the other three creation examples, so
# every variant produces the same DataFrame.
pdf = spark.createDataFrame([
    Row(a=1, b=2., c='string1', d=date(2000, 1, 1), e=datetime(2000, 1, 1, 12, 0)),
    Row(a=2, b=3., c='string2', d=date(2000, 2, 1), e=datetime(2000, 1, 2, 12, 0)),
    Row(a=3, b=4., c='string3', d=date(2000, 3, 1), e=datetime(2000, 1, 3, 12, 0)),
])
pdf

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]

In [57]:
# 2. Create a PySpark DataFrame from plain tuples with an explicit schema.
# The schema is a single string of comma-separated "name type" pairs, which
# declares both the column names and the column types.
pdf = spark.createDataFrame([
    (1, 2., 'string1', date(2000, 1, 1), datetime(2000, 1, 1, 12, 0)),
    (2, 3., 'string2', date(2000, 2, 1), datetime(2000, 1, 2, 12, 0)),
    (3, 4., 'string3', date(2000, 3, 1), datetime(2000, 1, 3, 12, 0))
], schema='a long, b double, c string, d date, e timestamp')
pdf

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]

In [58]:
# 3. Create a PySpark DataFrame from a pandas DataFrame.
# Naming: `df` is the pandas DataFrame, `pdf` is the resulting PySpark DataFrame.
df = pd.DataFrame({
    'a': [1, 2, 3],
    'b': [2., 3., 4.],
    'c': ['string1', 'string2', 'string3'],
    'd': [date(2000, 1, 1), date(2000, 2, 1), date(2000, 3, 1)],
    'e': [datetime(2000, 1, 1, 12, 0), datetime(2000, 1, 2, 12, 0), datetime(2000, 1, 3, 12, 0)]
})
pdf = spark.createDataFrame(df)
pdf

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]

In [59]:
# 4. Create a PySpark DataFrame from an RDD consisting of a list of tuples.
# Only the column names are passed as the schema; no column types are
# declared here.
rdd = spark.sparkContext.parallelize([
    (1, 2., 'string1', date(2000, 1, 1), datetime(2000, 1, 1, 12, 0)),
    (2, 3., 'string2', date(2000, 2, 1), datetime(2000, 1, 2, 12, 0)),
    (3, 4., 'string3', date(2000, 3, 1), datetime(2000, 1, 3, 12, 0))
])
pdf = spark.createDataFrame(rdd, schema=['a', 'b', 'c', 'd', 'e'])
pdf

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]

In [60]:
# All four creation methods above yield the same schema
# (a: bigint, b: double, c: string, d: date, e: timestamp).
# printSchema() prints it as a tree, including each column's nullability.
pdf.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- a: long (nullable = true)
 |-- b: double (nullable = true)
 |-- c: string (nullable = true)
 |-- d: date (nullable = true)
 |-- e: timestamp (nullable = true)

In [61]:
# Print the DataFrame's rows as a plain-text table
pdf.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|
|  2|3.0|string2|2000-02-01|2000-01-02 12:00:00|
|  3|4.0|string3|2000-03-01|2000-01-03 12:00:00|
+---+---+-------+----------+-------------------+

## DataFrame Attribute

Accessing an attribute doesn't trigger any computation.

Ref:

- https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.html

In [62]:
# Column names as a list of str
pdf.columns

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

['a', 'b', 'c', 'd', 'e']

In [63]:
# Column data types as a list of (column name, type name) tuples
pdf.dtypes

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[('a', 'bigint'), ('b', 'double'), ('c', 'string'), ('d', 'date'), ('e', 'timestamp')]

In [64]:
# Return the schema as a StructType object (unlike printSchema(), this
# returns the object rather than printing a tree)
pdf.schema

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

StructType(List(StructField(a,LongType,true),StructField(b,DoubleType,true),StructField(c,StringType,true),StructField(d,DateType,true),StructField(e,TimestampType,true)))

## DataFrame Method

Calling a method usually triggers computation.

Ref:

- https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.html

In [65]:
# Count the number of rows in the DataFrame
pdf.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

3

In [66]:
# Return the first row as a ``pyspark.sql.Row``
pdf.first()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Row(a=1, b=2.0, c='string1', d=datetime.date(2000, 1, 1), e=datetime.datetime(2000, 1, 1, 12, 0))

In [67]:
# head(N): return the first N rows as a list of ``pyspark.sql.Row``
pdf.head(3)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[Row(a=1, b=2.0, c='string1', d=datetime.date(2000, 1, 1), e=datetime.datetime(2000, 1, 1, 12, 0)), Row(a=2, b=3.0, c='string2', d=datetime.date(2000, 2, 1), e=datetime.datetime(2000, 1, 2, 12, 0)), Row(a=3, b=4.0, c='string3', d=datetime.date(2000, 3, 1), e=datetime.datetime(2000, 1, 3, 12, 0))]

In [68]:
# take(N): return the first N rows as a list of ``pyspark.sql.Row``
# (for this DataFrame the result is the same as head(N))
pdf.take(3)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[Row(a=1, b=2.0, c='string1', d=datetime.date(2000, 1, 1), e=datetime.datetime(2000, 1, 1, 12, 0)), Row(a=2, b=3.0, c='string2', d=datetime.date(2000, 2, 1), e=datetime.datetime(2000, 1, 2, 12, 0)), Row(a=3, b=4.0, c='string3', d=datetime.date(2000, 3, 1), e=datetime.datetime(2000, 1, 3, 12, 0))]

In [69]:
# Return the last N rows as a list of ``pyspark.sql.Row``.
# DataFrame.tail() was added in PySpark 3.0.0, while this session reports
# pyspark==2.4.3 -- presumably why the call is left commented out.
# pdf.tail(3)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [70]:
# Pretty-print the first 3 rows: vertical=True prints each record as a block
# with one field per line; truncate=False keeps long values from being cut off
pdf.show(3, truncate=False, vertical=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

-RECORD 0------------------
 a   | 1                   
 b   | 2.0                 
 c   | string1             
 d   | 2000-01-01          
 e   | 2000-01-01 12:00:00 
-RECORD 1------------------
 a   | 2                   
 b   | 3.0                 
 c   | string2             
 d   | 2000-02-01          
 e   | 2000-01-02 12:00:00 
-RECORD 2------------------
 a   | 3                   
 b   | 4.0                 
 c   | string3             
 d   | 2000-03-01          
 e   | 2000-01-03 12:00:00

In [71]:
# Access one column, method 1: attribute syntax.
# The result is a Column object, not the column's values.
pdf.a

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Column<b'a'>

In [72]:
# Access one column, method 2: bracket syntax with the column name.
# The result is a Column object, not the column's values.
pdf["a"]

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Column<b'a'>

In [73]:
# Access multiple columns by indexing with a list of column names; the result
# is a DataFrame containing only those columns, printed here with show()
pdf[["a", "b"]].show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---+---+
|  a|  b|
+---+---+
|  1|2.0|
|  2|3.0|
|  3|4.0|
+---+---+

In [74]:
# Drop columns "c", "d" and "e". drop() returns a new DataFrame; `pdf` itself
# keeps all of its columns.
pdf.drop("c", "d", "e").show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---+---+
|  a|  b|
+---+---+
|  1|2.0|
|  2|3.0|
|  3|4.0|
+---+---+

## Add Column to DataFrame

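To add a single column, use the ``pyspark.sql.DataFrame.withColumn`` method. Each ``withColumn`` call introduces a projection internally, so calling it repeatedly to add many columns can generate a large query plan, which can cause performance issues and even a ``StackOverflowException``. To add multiple columns, the official documentation recommends a single ``pyspark.sql.DataFrame.select`` call that handles all the columns at once.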

- https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.withColumn.html#pyspark.sql.DataFrame.withColumn
- https://sparkbyexamples.com/pyspark/pyspark-add-new-column-to-dataframe/

In [75]:
# 1. Add a new column "f" holding the same constant value in every row.
# lit() wraps the Python value in a Column expression, which withColumn() requires.
pdf.withColumn("f", sql_funcs.lit("Alice")).show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---+---+-------+----------+-------------------+-----+
|  a|  b|      c|         d|                  e|    f|
+---+---+-------+----------+-------------------+-----+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|Alice|
|  2|3.0|string2|2000-02-01|2000-01-02 12:00:00|Alice|
|  3|4.0|string3|2000-03-01|2000-01-03 12:00:00|Alice|
+---+---+-------+----------+-------------------+-----+

In [76]:
# 2. Add a column "f" computed from an existing column (here: a squared)
pdf.withColumn("f", pdf.a * pdf.a).show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---+---+-------+----------+-------------------+---+
|  a|  b|      c|         d|                  e|  f|
+---+---+-------+----------+-------------------+---+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|  1|
|  2|3.0|string2|2000-02-01|2000-01-02 12:00:00|  4|
|  3|4.0|string3|2000-03-01|2000-01-03 12:00:00|  9|
+---+---+-------+----------+-------------------+---+

In [77]:
# 3. Add a column "f" whose value depends on a condition.
# The ``when`` branches are checked in order; rows matching none of them get
# the ``otherwise`` value. Plain Python strings are accepted as branch values.
pdf.withColumn(
    "f",
    sql_funcs.when(pdf.a <= 1, "<=1")
    .when((pdf.a > 1) & (pdf.a < 3), "1 < ? < 3")
    .otherwise(">=3"),
).show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---+---+-------+----------+-------------------+---------+
|  a|  b|      c|         d|                  e|        f|
+---+---+-------+----------+-------------------+---------+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|      <=1|
|  2|3.0|string2|2000-02-01|2000-01-02 12:00:00|1 < ? < 3|
|  3|4.0|string3|2000-03-01|2000-01-03 12:00:00|      >=3|
+---+---+-------+----------+-------------------+---------+

In [78]:
# Each ``DataFrame.withColumn`` call introduces a projection internally, so
# calling it repeatedly (for example in a loop) to add several columns can
# build a very large plan, causing performance issues and even a
# StackOverflowException.
# Instead, add all new columns in a single select(): "*" keeps every existing
# column and the aliased expressions append the new ones.
pdf.select(
    "*",
    (pdf.a * pdf.a).alias("f"),
    (pdf.b * 10).alias("g"),
).show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---+---+-------+----------+-------------------+---+----+
|  a|  b|      c|         d|                  e|  f|   g|
+---+---+-------+----------+-------------------+---+----+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|  1|20.0|
|  2|3.0|string2|2000-02-01|2000-01-02 12:00:00|  4|30.0|
|  3|4.0|string3|2000-03-01|2000-01-03 12:00:00|  9|40.0|
+---+---+-------+----------+-------------------+---+----+