### Functions
- string
- numbers
- dates

In [17]:
#Imports and load data
from pyspark.sql import SparkSession
from pyspark.sql import Window as W # Needed for windowing operations
from pyspark.sql.functions import * # Needed for Filters like When, Like etc.
from pyspark.sql import functions as F # Explicit namespace for column functions (F.concat, F.round, ...)


# Start (or reuse) a Spark session named "functions" on the local[4] master.
spark = (
    SparkSession.builder
    .appName("functions")
    .master("local[4]")
    .getOrCreate()
)
# Lower the log level *before* loading, so the CSV read below is covered too.
spark.sparkContext.setLogLevel("ERROR")
# header=True takes column names from the first row; inferSchema=True lets
# Spark derive the column types (see the printSchema() output further down).
df = spark.read.csv("data/functions.csv", header=True, inferSchema=True)

In [18]:
df.printSchema()

root
 |-- string: string (nullable = true)
 |-- string2: string (nullable = true)
 |-- integer: integer (nullable = true)
 |-- float: double (nullable = true)
 |-- date: integer (nullable = true)



In [19]:
# Substring match - Column.contains(substring)
# Keeps only the rows whose `string` column contains 'bla'.
df.where(df['string'].contains('bla')).show()

+------+-------+-------+-----+----+
|string|string2|integer|float|date|
+------+-------+-------+-----+----+
+------+-------+-------+-----+----+



In [20]:
# Prefix / suffix match - Column.startswith(prefix), Column.endswith(suffix)
df.where(df['string'].startswith('Jo')).show()
# Suffix variant:
# df.where(df['string'].endswith('den')).show()

+------+-----------+-------+-----+----------+
|string|    string2|integer|float|      date|
+------+-----------+-------+-----+----------+
|Jorden|van Foreest|    234|  6.3|1673919447|
+------+-----------+-------+-----+----------+



In [21]:
# Null check - Column.isNull(), Column.isNotNull()
df.where(df['string'].isNull()).show()
# Inverse:
# df.where(df['string'].isNotNull()).show()


+------+-------+-------+-----+----+
|string|string2|integer|float|date|
+------+-------+-------+-----+----+
+------+-------+-------+-----+----+



In [25]:
# SQL LIKE - Column.like(pattern_with_sql_wildcards)
# '%ord%' keeps rows where 'ord' appears anywhere in `string`.
df.where(df['string'].like('%ord%')).show()



+------+-----------+-------+-----+----------+
|string|    string2|integer|float|      date|
+------+-----------+-------+-----+----------+
|Jorden|van Foreest|    234|  6.3|1673919447|
+------+-----------+-------+-----+----------+



In [27]:
# Regex match - Column.rlike(regex)
# The pattern is not anchored at the start: 'Jorden' matches even though 'Jor'
# is not all uppercase, so `[A-Z]*` may match nothing and only `den$`
# (ends with 'den') actually constrains the result.
df.where(df['string'].rlike('[A-Z]*den$')).show()


+------+-----------+-------+-----+----------+
|string|    string2|integer|float|      date|
+------+-----------+-------+-----+----------+
|Jorden|van Foreest|    234|  6.3|1673919447|
+------+-----------+-------+-----+----------+



In [28]:
# Membership test - Column.isin(*values)
# Keeps rows whose `string` equals any of the listed literal values.
df.where(df['string'].isin('Jorden', 'Iwan')).show()

+------+-----------+-------+-----+----------+
|string|    string2|integer|float|      date|
+------+-----------+-------+-----+----------+
|Jorden|van Foreest|    234|  6.3|1673919447|
+------+-----------+-------+-----+----------+



In [31]:
# Concatenate - F.concat(*cols)
# Plain strings are read as column names, so the separator has to be wrapped
# in F.lit(' ') to be treated as a literal value.
df.withColumn('concat', F.concat('string', F.lit(' '), 'string2')).show()

+------+-----------+-------+-----+----------+------------------+
|string|    string2|integer|float|      date|            concat|
+------+-----------+-------+-----+----------+------------------+
|Jorden|van Foreest|    234|  6.3|1673919447|Jorden van Foreest|
+------+-----------+-------+-----+----------+------------------+



In [35]:
# Round - F.round(col, scale)
# Called through F on purpose: the wildcard import above rebinds the bare name
# `round` to the Spark function, hiding Python's builtin round().
df.withColumn('rounded', F.round('float', 0)).show()
# Round down / up to whole numbers:
# df.withColumn('floor', F.floor('float')).show()
# df.withColumn('ceil', F.ceil('float')).show()



+------+-----------+-------+-----+----------+-------+
|string|    string2|integer|float|      date|rounded|
+------+-----------+-------+-----+----------+-------+
|Jorden|van Foreest|    234|  6.3|1673919447|    6.0|
+------+-----------+-------+-----+----------+-------+



In [36]:
# Select smallest value out of multiple columns – F.least(*cols)
# Compares the listed columns row by row and keeps the smallest value.
df.withColumn('least', F.least('integer', 'float')).show()
# Largest value instead:
# df.withColumn('greatest', F.greatest('integer', 'float')).show()


+------+-----------+-------+-----+----------+-----+
|string|    string2|integer|float|      date|least|
+------+-----------+-------+-----+----------+-----+
|Jorden|van Foreest|    234|  6.3|1673919447|  6.3|
+------+-----------+-------+-----+----------+-----+

