In [1]:
from pyspark.sql import SparkSession

# Get (or reuse) a local-mode session named "UDF"; "local[1]" is the master URL.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("UDF")
    .getOrCreate()
)

# Bare expression so the notebook renders the session summary.
spark

22/12/22 17:04:00 WARN Utils: Your hostname, Magus resolves to a loopback address: 127.0.1.1; using 172.23.232.161 instead (on interface eth0)
22/12/22 17:04:00 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/12/22 17:04:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/12/22 17:04:02 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
# Column names for the sample frame, followed by its three (Seqno, Name) rows.
columns = ["Seqno", "Name"]
data = [
    ("1", "john jones"),
    ("2", "tracey smith"),
    ("3", "amy sanders"),
]

# Build the DataFrame from the in-memory rows, naming columns from `columns`.
df = spark.createDataFrame(data, schema=columns)

# truncate=False prints full cell contents instead of shortening long values.
df.show(truncate=False)

                                                                                

+-----+------------+
|Seqno|Name        |
+-----+------------+
|1    |john jones  |
|2    |tracey smith|
|3    |amy sanders |
+-----+------------+



## User Defined Function

* Used to extend PySpark's built-in capabilities
* Allows plain Python functions to be applied to PySpark DataFrame columns


In [13]:
# Creating a function

def capitalize_(item: str):
    """Capitalize every space-separated word in ``item``.

    The text is split on single spaces, each piece is passed through
    ``str.capitalize`` (first character upper-cased, the rest lower-cased),
    and the pieces are re-joined with single spaces. Runs of consecutive
    spaces in the input are therefore preserved.

    Args:
        item: Text to transform. ``None`` is also accepted, because a null
            cell is delivered to a Python UDF as ``None``.

    Returns:
        The transformed string, or ``None`` when ``item`` is ``None``.
    """
    # Pass nulls through; calling .split on None would raise AttributeError
    # and fail the whole Spark task when this function is used as a UDF.
    if item is None:
        return None
    return " ".join(word.capitalize() for word in item.split(" "))

capitalize_("helllo world")

'Helllo World'

In [14]:
# Convert to PySpark UDF
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType

# Wrap the plain Python function directly: an extra `lambda z: capitalize_(z)`
# adds nothing, and it makes the UDF carry the name "<lambda>" instead of
# "capitalize_" (visible in un-aliased column names).
# StringType() declares the type of the column the UDF produces.
convertUDF = udf(capitalize_, StringType())


In [15]:
# Apply the UDF to "Name" and keep "Seqno" unchanged; the alias keeps the
# output column called "Name" rather than the UDF's generated column name.

(
    df
    .select(
        col("Seqno"),
        convertUDF(col("Name")).alias("Name"),
    )
    .show(truncate=False)
)

+-----+------------+
|Seqno|Name        |
+-----+------------+
|1    |John Jones  |
|2    |Tracey Smith|
|3    |Amy Sanders |
+-----+------------+



In [16]:
# The udf decorator can be applied directly to a function definition

@udf(returnType=StringType())
def upperCase(item: str):
    """Upper-case ``item``; the decorator turns this into a string-typed UDF.

    Args:
        item: Cell value from the input column. ``None`` is also accepted,
            because a null cell is delivered to a Python UDF as ``None``.

    Returns:
        ``item.upper()``, or ``None`` when ``item`` is ``None``.
    """
    # Pass nulls through; None.upper() would raise AttributeError and fail
    # the Spark task.
    return None if item is None else item.upper()


df.withColumn("UName", upperCase(col("Name"))).show(truncate=False)


+-----+------------+------------+
|Seqno|Name        |UName       |
+-----+------------+------------+
|1    |john jones  |JOHN JONES  |
|2    |tracey smith|TRACEY SMITH|
|3    |amy sanders |AMY SANDERS |
+-----+------------+------------+

