In [0]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType

# Build (or reuse) the Spark session used by every step below.
spark = (
    SparkSession.builder
    .appName('SparkByExamples.com')
    .getOrCreate()
)

# Sample rows as (Seqno, Name) tuples; both fields are strings.
columns = ["Seqno", "Name"]
data = [
    ("1", "john jones"),
    ("2", "tracey smith"),
    ("3", "amy sanders"),
]

df = spark.createDataFrame(data=data, schema=columns)

df.show(truncate=False)

def convertCase(str):
    """Capitalize the first character of each space-separated word.

    Only the first character of every word is upper-cased; the remaining
    characters are left untouched. Words are split on single spaces, so the
    spacing between words is preserved exactly and no trailing space is
    appended to the result.

    Args:
        str: The text to convert, or None. (The name shadows the builtin
            ``str``; it is kept so existing callers continue to work.)

    Returns:
        The converted text, or None when the input is None.
    """
    # Spark passes SQL NULL to Python UDFs as None; pass it through instead
    # of failing on None.split().
    if str is None:
        return None
    # Joining the words avoids the dangling " " that appending a separator
    # after every word would leave at the end of the result.
    return " ".join(word[0:1].upper() + word[1:] for word in str.split(" "))

""" Converting function to UDF """
# No return type is passed to udf(), so Spark falls back to its default
# (StringType).
convertUDF = udf(lambda name: convertCase(name))

(
    df.select(col("Seqno"), convertUDF(col("Name")).alias("Name"))
    .show(truncate=False)
)


@udf(returnType=StringType())
def upperCase(str):
    """Spark UDF that upper-cases a string column value.

    Args:
        str: The column value for the current row, or None for SQL NULL.
            (The name shadows the builtin ``str``; kept for compatibility.)

    Returns:
        The upper-cased text, or None when the input is None.
    """
    # NULL column values arrive as None; guard here because None.upper()
    # would raise and fail the whole job.
    if str is None:
        return None
    return str.upper()

# `upperCase` is already a Spark UDF through its @udf decorator. Wrapping it
# in a second udf(lambda ...) would call a UDF from inside the Python worker,
# where it operates on Columns rather than plain strings, so the existing UDF
# is simply exposed under the second name.
upperCaseUDF = upperCase

df.withColumn("Cureated Name", upperCase(col("Name"))) \
.show(truncate=False)

""" Using UDF on SQL """
# Register the plain Python function under a SQL-visible name and expose the
# DataFrame as a temporary view so both can be referenced from SQL text.
spark.udf.register("convertUDF", convertCase, StringType())
df.createOrReplaceTempView("NAME_TABLE")

spark.sql(
    "select Seqno, convertUDF(Name) as Name from NAME_TABLE"
).show(truncate=False)

# Same projection, restricted to rows whose converted name contains "John".
spark.sql(
    "select Seqno, convertUDF(Name) as Name from NAME_TABLE "
    "where Name is not null and convertUDF(Name) like '%John%'"
).show(truncate=False)
     
""" null check """

# Same sample rows as before plus one row whose Name is None (SQL NULL).
columns = ["Seqno", "Name"]
data = [
    ("1", "john jones"),
    ("2", "tracey smith"),
    ("3", "amy sanders"),
    ('4', None),
]

df2 = spark.createDataFrame(data=data, schema=columns)
df2.show(truncate=False)
df2.createOrReplaceTempView("NAME_TABLE2")

# Null-safe wrapper: None maps to "" and never reaches convertCase.
spark.udf.register(
    "_nullsafeUDF",
    lambda name: "" if name is None else convertCase(name),
    StringType(),
)

spark.sql("select _nullsafeUDF(Name) from NAME_TABLE2").show(truncate=False)

spark.sql(
    "select Seqno, _nullsafeUDF(Name) as Name from NAME_TABLE2 "
    " where Name is not null and _nullsafeUDF(Name) like '%John%'"
).show(truncate=False)




+-----+------------+
|Seqno|Name        |
+-----+------------+
|1    |john jones  |
|2    |tracey smith|
|3    |amy sanders |
+-----+------------+

+-----+-------------+
|Seqno|Name         |
+-----+-------------+
|1    |John Jones   |
|2    |Tracey Smith |
|3    |Amy Sanders  |
+-----+-------------+

+-----+------------+-------------+
|Seqno|Name        |Cureated Name|
+-----+------------+-------------+
|1    |john jones  |JOHN JONES   |
|2    |tracey smith|TRACEY SMITH |
|3    |amy sanders |AMY SANDERS  |
+-----+------------+-------------+

+-----+-------------+
|Seqno|Name         |
+-----+-------------+
|1    |John Jones   |
|2    |Tracey Smith |
|3    |Amy Sanders  |
+-----+-------------+

+-----+-----------+
|Seqno|Name       |
+-----+-----------+
|1    |John Jones |
+-----+-----------+

+-----+------------+
|Seqno|Name        |
+-----+------------+
|1    |john jones  |
|2    |tracey smith|
|3    |amy sanders |
|4    |null        |
+-----+------------+

+------------------+
|_nul

In [0]:
#The code imports necessary modules, including SparkSession and functions like col and udf from pyspark.sql, as well as the StringType data type.

#A SparkSession is created with the application name set to 'SparkByExamples.com'.

#Data is created as a list of tuples representing individuals' information.

#A DataFrame df is created using the given data and column names.

#The df DataFrame is displayed using the show() method.

#A function named convertCase is defined to capitalize the first letter of each space-separated word in a string.

#The convertUDF is created using the udf() function and the convertCase function.

#The df DataFrame is selected with the "Seqno" column and the transformed "Name" column using the convertUDF and displayed.

#Another UDF named upperCase is defined to convert a string to uppercase.

#The upperCaseUDF is created using the udf() function and the upperCase function.

#The df DataFrame is transformed by adding a new column "Cureated Name" with uppercase values using the upperCase UDF and displayed.

#The convertCase function is registered as a UDF named "convertUDF" using spark.udf.register().

#The df DataFrame is temporarily registered as "NAME_TABLE" using createOrReplaceTempView().

#A SQL query is executed using spark.sql() to select the "Seqno" column and apply the UDF "convertUDF" to the "Name" column.

#Another SQL query is executed to filter the names containing "John" using the UDF "convertUDF".

#Another DataFrame df2 is created with an additional row containing None for the "Name" column.

#The df2 DataFrame is displayed.

#The df2 DataFrame is temporarily registered as "NAME_TABLE2".

#A lambda function is defined for the UDF named "_nullsafeUDF": it returns an empty string for null input and calls the "convertCase" function otherwise.

#The "_nullsafeUDF" is registered using spark.udf.register().

#A SQL query is executed to apply the "_nullsafeUDF" to the "Name" column of the "NAME_TABLE2", handling null values.

#Another SQL query is executed to filter the names containing "John" while handling null values.

#Overall, this code demonstrates the creation and usage of UDFs in PySpark to perform custom transformations on DataFrame columns and handle null values.
