In [13]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType, StringType, LongType
from pyspark.sql.functions import udf
from pyspark.sql import Row

spark = SparkSession.builder \
    .master("local[*]") \
    .appName("Learning_Spark_UDF_LAMBDA") \
    .config('spark.ui.port', '4050')\
    .getOrCreate()

## Exemplo 1

In [14]:
def make_squared_typed_long(number):
    return number * number
spark.udf.register("squaredWithPython", make_squared_typed_long, LongType())

<function __main__.make_squared_typed_long(number)>

In [15]:
spark.range(1, 20).createOrReplaceTempView("numbers_temp_table")
numbers_dataframe = spark.table("numbers_temp_table")
numbers_dataframe.show(truncate=False)

+---+
|id |
+---+
|1  |
|2  |
|3  |
|4  |
|5  |
|6  |
|7  |
|8  |
|9  |
|10 |
|11 |
|12 |
|13 |
|14 |
|15 |
|16 |
|17 |
|18 |
|19 |
+---+



In [16]:
from pyspark.sql.functions import udf
@udf("long")
def make_squared_typed_long(number):
    return number * number
numbers_dataframe = spark.table("numbers_temp_table")
numbers_dataframe.select("id", make_squared_typed_long("id").alias("id_squared")).show(truncate=False)

+---+----------+
|id |id_squared|
+---+----------+
|1  |1         |
|2  |4         |
|3  |9         |
|4  |16        |
|5  |25        |
|6  |36        |
|7  |49        |
|8  |64        |
|9  |81        |
|10 |100       |
|11 |121       |
|12 |144       |
|13 |169       |
|14 |196       |
|15 |225       |
|16 |256       |
|17 |289       |
|18 |324       |
|19 |361       |
+---+----------+



## Python Lambda Functions

In [17]:
lista_numeros = [2, 10, 20, 54, 9]
numero_quadrado = map(lambda numero: numero ** 2, lista_numeros)
print(list(numero_quadrado))

[4, 100, 400, 2916, 81]


In [18]:
def numero_quadrado(lista_numeros):
    nova_lista = []
    for i in lista_numeros:
        quadrado = i ** 2
        nova_lista.append(quadrado)
    return nova_lista

In [19]:
numero_quadrado_2 = numero_quadrado(lista_numeros)
print(numero_quadrado_2)

[4, 100, 400, 2916, 81]


# Exemplo 2

In [30]:
schema = StructType([
    StructField("sales", FloatType(),True),    
    StructField("employee", StringType(),True),
    StructField("ID", IntegerType(),True)
])

sales_data = [[ 10.2, "Fred",123], [50.35, "Barney", 200]]

sales_dataframe = spark.createDataFrame(sales_data,schema=schema)

cols_to_int = udf(lambda z: to_int(z), IntegerType())
spark.udf.register("cols_to_int", cols_to_int)

def to_int(number):
    
    if isinstance(number, str) == True:
        converted = [str(ord(i)) for i in number]
        return(int(''.join(converted)))
    else:
         return Null


sales_dataframe_2 = sales_dataframe.withColumn( 'employee_converted',cols_to_int('employee'))

In [31]:
sales_dataframe_2.show()

+-----+--------+---+------------------+
|sales|employee| ID|employee_converted|
+-----+--------+---+------------------+
| 10.2|    Fred|123|        1394624364|
|50.35|  Barney|200|        1670219393|
+-----+--------+---+------------------+



Obs.: The ord() function in Python accepts a string of length 1 as an argument and returns the unicode code point representation of the passed argument. For example ord('B') returns 66 which is a unicode code point value of character 'B'

# Referências

1. https://docs.databricks.com/spark/latest/spark-sql/udf-python.html
2. https://www.bmc.com/blogs/how-to-write-spark-udf-python/