In [0]:
#import pyspark.sql.functions as fn
import functools
import json

from pyspark.sql import SparkSession
from pyspark.sql import functions as fn, types as T

In [0]:
# Sample rows: one "integers" field per record, values 1 through 5.
data = [
  {'integers': 1},
  {'integers': 2},
  {'integers': 3},
  {'integers': 4},
  {'integers': 5}
]
# spark.read.json deals with files or RDDs of JSON strings, so serialise the
# rows to a single JSON document and wrap that string in a one-element RDD.
# Serialising explicitly with json.dumps avoids depending on the Python list
# being stringified implicitly (a Python repr is not valid JSON in general,
# e.g. for None/True/False values).
data_rdd = sc.parallelize([json.dumps(data)])
df = spark.read.option('multiline', "true").json(data_rdd)
display(df)

integers
1
2
3
4
5


In [0]:
def multiply_by_two(number):
  """Return ``number`` doubled.

  ``None`` is passed through unchanged so that, when this function is wrapped
  as a Spark UDF, a NULL input yields a NULL output instead of a TypeError.
  """
  if number is None:
    return None
  return number * 2

# Wrap the plain Python function as a Spark UDF that returns an integer column.
udf_multiply_by_two = fn.udf(multiply_by_two, returnType=T.IntegerType())

# Add the doubled values as a new column and render the frame.
display(
  df.withColumn(
    "times_two",
    udf_multiply_by_two(fn.col("integers")),
  )
)

integers,times_two
1,2
2,4
3,6
4,8
5,10


In [0]:
# Use fn.udf (the module alias imported at the top of the notebook) rather than
# a bare `udf`, which is not imported anywhere in this notebook.
@fn.udf(returnType=T.IntegerType())
def decorator_multiply_by_three(number):
  """Spark UDF (integer return type) that triples ``number``.

  ``None`` is passed through unchanged so a NULL input produces a NULL output
  rather than raising a TypeError.
  """
  if number is None:
    return None
  return number * 3

# Add the tripled values as a new column and render the frame.
display(
  df.withColumn(
    "decorator_times_three",
    decorator_multiply_by_three(fn.col("integers")),
  )
)

integers,decorator_times_three
1,3
2,6
3,9
4,12
5,15


In [0]:
def curry_multiply_by_n(n):
  """Return a one-argument function that multiplies its input by ``n``.

  The multiplier is captured in a closure, so the returned function can be
  wrapped as a UDF without passing ``n`` at call time. The returned function
  maps ``None`` to ``None`` so that NULL inputs do not raise a TypeError.
  """
  def multiply_by_n(number):
    if number is None:
      return None
    return number * n
  return multiply_by_n

# Fix the multiplier at 4, then wrap the resulting closure as an integer UDF.
multiply_by_four = curry_multiply_by_n(4)
udf_multiply_by_four = fn.udf(multiply_by_four, returnType=T.IntegerType())

# Add the quadrupled values as a new column and render the frame.
display(
  df.withColumn(
    "decorator_times_four",
    udf_multiply_by_four(fn.col("integers")),
  )
)

integers,decorator_times_four
1,4
2,8
3,12
4,16
5,20


In [0]:
def curry_value(num_times):
  """Return a Spark UDF (integer return type) that multiplies by ``num_times``.

  The multiplier is captured in a closure and the inner function is turned
  into a UDF by the decorator, so the caller receives a ready-to-use UDF.
  """
  # fn.udf is used instead of a bare `udf`, which this notebook never imports.
  @fn.udf(returnType=T.IntegerType())
  def decorator_multiply_by_n(number):
    # Pass NULLs through instead of failing on None * int.
    if number is None:
      return None
    return number * num_times
  return decorator_multiply_by_n
      
# Bind the multiplier once; the result is a UDF that can be applied directly.
curried_decorator = curry_value(5)

# Add the values multiplied by five as a new column and render the frame.
display(
  df.withColumn(
    "decorators_times_five",
    curried_decorator(fn.col("integers")),
  )
)

integers,decorators_times_five
1,5
2,10
3,15
4,20
5,25


In [0]:
def curried_times_n(num_times):
  """Return a Spark UDF (integer return type) that multiplies by ``num_times``.

  Intended to be called in curried form, e.g. ``curried_times_n(6)("integers")``:
  the first call fixes the multiplier, the second applies the UDF to a column.
  """
  # fn.udf is used instead of a bare `udf`, which this notebook never imports.
  @fn.udf(returnType=T.IntegerType())
  def decorator_multiply_by_n(number):
    # Pass NULLs through instead of failing on None * int.
    if number is None:
      return None
    return number * num_times
  return decorator_multiply_by_n

# Note the two consecutive calls on curried_times_n: the first pair of brackets
# sets the multiplier, the second applies the resulting UDF to the column.
display(
  df.withColumn(
    "decorators_times_six",
    curried_times_n(6)(fn.col("integers")),
  )
)

integers,decorators_times_six
1,6
2,12
3,18
4,24
5,30


In [0]:
# Requires functools (imported in the first cell) for functools.wraps.
def repeat(num_times):
    """Decorator factory: multiply the decorated callable's result by ``num_times``.

    ``repeat(k)`` returns a decorator. The decorated callable forwards all
    positional and keyword arguments to the original and returns the
    original's result multiplied by ``k``. ``functools.wraps`` copies the
    original's metadata (e.g. ``__name__``) onto the wrapper.
    """
    def decorate(func):
        @functools.wraps(func)
        def scaled_call(*args, **kwargs):
            # The multiplication happens here, on whatever func returns.
            return func(*args, **kwargs) * num_times
        return scaled_call
    return decorate
  
@repeat(7)
# fn.udf is used instead of a bare `udf`, which this notebook never imports.
@fn.udf(returnType=T.IntegerType())
def decorator_multiply_by_n(number):
  """Identity UDF (integer return type).

  The function just returns ``number``; the multiplication by seven is applied
  by the enclosing ``repeat(7)`` wrapper to whatever the UDF call returns.
  """
  return number
      
# Add the values multiplied by seven as a new column and render the frame.
display(
  df.withColumn(
    "decorators_times_seven",
    decorator_multiply_by_n(fn.col("integers")),
  )
)

integers,decorators_times_seven
1,7
2,14
3,21
4,28
5,35


In [0]:
# Note that the repeat decorator is NOT compatible with allowing the
# multiplication factor to be varied when the function is called: repeat
# needs its factor when the decorator is applied, so `@repeat()` raises a
# TypeError before the function is even defined.
# The error is the point of this cell, so it is caught and printed; that way
# the notebook still runs to completion under "Restart & Run All".
try:
  @repeat()
  @fn.udf(returnType=T.IntegerType())
  def decorator_multiply_by_n(number):
    return number

  display(df.withColumn("decorators_times_eight", decorator_multiply_by_n(8)("integers")))
except TypeError as err:
  print("TypeError:", err)
# gives an error. So standard currying is preferable