# Using UDF functions registration

In [0]:
data = [("manoj", 23, 1000, 2000), ("sneha", 24, 2000, 1000)]
col = ["name", "age", "salary", "bonus"]

udf_df = spark.createDataFrame(data=data, schema=col)
udf_df.display()
udf_df.printSchema()

name,age,salary,bonus
manoj,23,1000,2000
sneha,24,2000,1000


root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- salary: long (nullable = true)
 |-- bonus: long (nullable = true)



In [0]:
def total_pay(a, b):
    return a * b

from pyspark.sql.functions import udf
help(udf)

Help on function udf in module pyspark.sql.functions:

udf(f: Union[Callable[..., Any], ForwardRef('DataTypeOrString'), NoneType] = None, returnType: 'DataTypeOrString' = StringType()) -> Union[ForwardRef('UserDefinedFunctionLike'), Callable[[Callable[..., Any]], ForwardRef('UserDefinedFunctionLike')]]
    Creates a user defined function (UDF).
    
    .. versionadded:: 1.3.0
    
    .. versionchanged:: 3.4.0
        Support Spark Connect.
    
    Parameters
    ----------
    f : function
        python function if used as a standalone function
    returnType : :class:`pyspark.sql.types.DataType` or str
        the return type of the user-defined function. The value can be either a
        :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string.
    
    Examples
    --------
    >>> from pyspark.sql.types import IntegerType
    >>> slen = udf(lambda s: len(s), IntegerType())
    >>> @udf
    ... def to_upper(s):
    ...     if s is not None:
    ...         retur

In [0]:
from pyspark.sql.types import IntegerType
total_payment = udf(lambda a, b:total_pay(a, b), IntegerType())

In [0]:
udf_df.withColumn("total_pay", total_payment(udf_df.salary, udf_df.bonus)).display()
# udf_df.withColumn("total_pay", total_payment(col("salary"), col("bonus"))).display()

name,age,salary,bonus,total_pay
manoj,23,1000,2000,2000000
sneha,24,2000,1000,2000000


# Using UDF Annotion

In [0]:
# that means rather than register UDF just define annotation for that and use it
data = [("Alica", 23), ("manoj", 23), ("sneha", 24), ("anju", 25)]
col = ["name", "age"]
udf_ant_df = spark.createDataFrame(data=data, schema=col)
udf_ant_df.display()

name,age
Alica,23
manoj,23
sneha,24
anju,25


In [0]:
@udf(returnType=IntegerType())
def mul_two(x):
    return x*2

udf_ant_df.select(udf_ant_df.name, mul_two(udf_ant_df.age)).display()

name,mul_two(age)
Alica,46
manoj,46
sneha,48
anju,50


# Using UDF SQL

In [0]:
data = [("Alica", 23), ("manoj", 23), ("sneha", 24), ("anju", 25)]
col = ["name", "age"]
udf_sql_df = spark.createDataFrame(data=data, schema=col)
udf_sql_df.display()

name,age
Alica,23
manoj,23
sneha,24
anju,25


In [0]:
udf_sql_df.createOrReplaceTempView("sql_udf")

In [0]:
%sql select * from sql_udf

name,age
Alica,23
manoj,23
sneha,24
anju,25


In [0]:
def add_one(a):
    return a+1

spark.udf.register(name="plus_one_age", f=add_one, returnType=IntegerType())

Out[37]: <function __main__.add_one(a)>

In [0]:
%sql  select name, plus_one_age(age) from  sql_udf

name,plus_one_age(age)
Alica,24
manoj,24
sneha,25
anju,26
