In [13]:
from pyspark.sql import SparkSession
from pyspark.sql.types import LongType, IntegerType

In [2]:
# Build (or reuse) the notebook's SparkSession with Hive support enabled.
spark = (
    SparkSession.builder
    .appName("Spark Chap5 Parctice")
    .enableHiveSupport()
    .getOrCreate()
)

#### Spark SQL UDFs

In [8]:
def cubed(s):
    """Return the cube of ``s`` (``s * s * s``).

    Args:
        s: A numeric value, or ``None``.

    Returns:
        ``s`` multiplied by itself twice, or ``None`` when ``s`` is ``None``.
    """
    # Null-aware: Spark passes SQL NULLs to Python UDFs as None, and
    # None * None would raise TypeError and fail the whole query.
    if s is None:
        return None
    return s * s * s

In [9]:
# Expose the Python function to Spark SQL under the name "cubed",
# declaring that it returns a 64-bit integer column.
spark.udf.register(name="cubed", f=cubed, returnType=LongType())

<function __main__.cubed(s)>

In [10]:
# Temp view with a single `id` column holding 1..8 (the end bound is exclusive).
spark.range(start=1, end=9).createOrReplaceTempView("udf_test")

In [11]:
# Invoke the registered `cubed` UDF from SQL and print the resulting rows.
spark.sql(
    "SELECT id,cubed(id) as id_cubed FROM udf_test"
).show()

+---+--------+
| id|id_cubed|
+---+--------+
|  1|       1|
|  2|       8|
|  3|      27|
|  4|      64|
|  5|     125|
|  6|     216|
|  7|     343|
|  8|     512|
+---+--------+



##### Evaluation order and null checking in Spark SQL

In [21]:
def strlen(s):
    """Return the length of ``s`` as reported by ``len``.

    This function is intentionally not null-safe: ``len(None)`` raises
    ``TypeError``, so callers must guard against null inputs themselves.
    """
    length = len(s)
    return length

# Alternative registration using a lambda and a DDL type string:
#   spark.udf.register("strlen", lambda s: len(s), "int")
spark.udf.register(name="strlen", f=strlen, returnType=IntegerType())

<function __main__.strlen(s)>

In [30]:
# Null-aware variant: a None input yields -1 instead of raising inside len().
spark.udf.register(
    name="strlen_nullsafe",
    f=lambda s: -1 if s is None else len(s),
    returnType="int",
)
# Two ways to filter safely on string length
# (assumes a table/view `test1` with a column `s` exists -- TODO confirm, it is not created here):
#   spark.sql("select s from test1 where s is not null and strlen_nullsafe(s) > 1")  # OK
#   spark.sql("select s from test1 where if(s is not null, strlen(s), null) > 1")    # OK

<function __main__.<lambda>(s)>

#### Speeding up and distributing PySpark UDFs with Pandas UDFs

In [32]:
# Prefer Pandas UDFs over plain PySpark UDFs because
# PySpark UDFs require data movement between the JVM and Python,
# which is quite expensive.

# To resolve this, Pandas UDFs (also known as vectorized UDFs) were introduced.
# Pandas UDFs use Apache Arrow to transfer data and Pandas to work with the data.


In [None]:
# Pandas UDFs were split into two API categories:
# 1. Pandas UDFs
# 2. Pandas Function APIs

In [40]:
import pandas as pd
from pyspark.sql.functions import col, pandas_udf
from pyspark.sql.types import LongType

In [41]:
def cubed(a: pd.Series) -> pd.Series:
    """Element-wise cube of a pandas Series.

    Note: this rebinds the name ``cubed`` that an earlier cell used for the
    scalar (row-at-a-time) version of the function.
    """
    squared = a * a
    return squared * a

In [42]:
# Wrap the Series-based `cubed` as a vectorized (Pandas) UDF returning longs.
cubed_udf = pandas_udf(f=cubed, returnType=LongType())