# Column APIs

In [5]:
# initialize spark session
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrame
from pprint import pprint

# Reuse the active SparkContext if one exists, otherwise create it.
sparkContext = SparkContext.getOrCreate()
# Wrap the SparkContext in a GlueContext.
glueContext = GlueContext(sparkContext)
# Spark session taken from the GlueContext; the cells below use it to build DataFrames.
spark = glueContext.spark_session

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## UDF (User Defined Function)

In [12]:
# Create sample dataset
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

# Sample dataset: three rows in which id, v1 and v2 all carry the same value (1, 2, 3).
df = spark.createDataFrame(
    data=[(n, n, n) for n in range(1, 4)],
    schema=("id", "v1", "v2"),
)
df.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---+---+---+
| id| v1| v2|
+---+---+---+
|  1|  1|  1|
|  2|  2|  2|
|  3|  3|  3|
+---+---+---+

In [13]:
# Single-argument UDF: multiply the input by 100 and return a Spark integer.
# ref: https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.functions.udf.html#pyspark.sql.functions.udf
udf_x_100 = udf(lambda value: value * 100, IntegerType())

# Run the UDF over v1 and v2 independently, keeping id next to the results.
df_v1_v2_x_100 = df.select(
    df["id"],
    udf_x_100(df["v1"]).alias("v1_x_100"),
    udf_x_100(df["v2"]).alias("v2_x_100"),
)

df_v1_v2_x_100.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---+--------+--------+
| id|v1_x_100|v2_x_100|
+---+--------+--------+
|  1|     100|     100|
|  2|     200|     200|
|  3|     300|     300|
+---+--------+--------+

In [14]:
# Two-argument UDF: sum both inputs, then scale the sum by 100.
udf_add_then_x_100 = udf(lambda left, right: (left + right) * 100, IntegerType())

# Feed v1 and v2 into the UDF together to produce one derived column.
df_add_then_x_100 = df.select(
    df["id"],
    udf_add_then_x_100(df["v1"], df["v2"]).alias("add_v1_v2_then_x_100"),
)

df_add_then_x_100.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---+--------------------+
| id|add_v1_v2_then_x_100|
+---+--------------------+
|  1|                 200|
|  2|                 400|
|  3|                 600|
+---+--------------------+

In [None]:
# Compose UDFs: increment v1 and v2 separately, then keep the larger of the two.
def f(x):
    """Return ``x + 1``."""
    return x + 1


# UDF built from a lambda: the larger of its two arguments.
max_udf = udf(lambda x, y: max(x, y), IntegerType())
# UDF built from the named function ``f`` above.
f_udf = udf(f, IntegerType())

# `df` has the columns id, v1 and v2 (there is no col1/col2), so the nested
# UDF calls operate on v1 and v2.
df2 = df.withColumn("result", max_udf(f_udf(df.v1), f_udf(df.v2)))
df2.show()