In [0]:
import pandas as pd
from pyspark.sql.functions import col, lit, coalesce, when

In [0]:
# create a simple dataframe with some missing values
data = {
  'col_1': [1.0, 2.0, None, None, None],
  'col_2': [2.0, 2.0, None, None, None],
  'col_3': [3.0, None, 3.0, 4.0, None],
}

# Build the rows directly with an explicit double schema so that every None
# becomes a genuine Spark null. Going through a pandas DataFrame turns None in
# a float column into NaN, and coalesce() treats NaN as a value unless the
# pandas -> Spark conversion happens to map NaN back to null.
rows = list(zip(*data.values()))
schema = ', '.join(f'{name} double' for name in data)

# this workbook runs on databricks so the spark context is already created
df = spark.createDataFrame(rows, schema=schema)
display(df)

col_1,col_2,col_3
1.0,2.0,3.0
2.0,2.0,
,,3.0
,,4.0
,,


In [0]:
# simple examples of coalesce: take the first non-null value, left to right

# the columns to fall back through, in priority order
fallback_cols = [col(name) for name in ('col_1', 'col_2', 'col_3')]

# plain coalesce: the result stays null when every input column is null
first_non_null = coalesce(*fallback_cols)

# a trailing literal acts as a catch-all, so the result is never null
first_non_null_or_default = coalesce(*fallback_cols, lit(5))

df_simple_coalesce = (
  df
  .withColumn('coalesced', first_non_null)
  .withColumn('coalesced_with_default', first_non_null_or_default)
)

# in the last row (all inputs null) 'coalesced' is null, whereas
# 'coalesced_with_default' falls through to the catch-all literal 5
display(df_simple_coalesce)

col_1,col_2,col_3,coalesced,coalesced_with_default
1.0,2.0,3.0,1.0,1.0
2.0,2.0,,2.0,2.0
,,3.0,3.0,3.0
,,4.0,4.0,4.0
,,,,5.0


In [0]:
# coalesce used in combination with conditional when() expressions

# without otherwise(), when() yields null whenever the condition is not true,
# so coalesce falls through to col_2 and then col_3
when_without_otherwise = when(col('col_1') > 1, 5)

# with otherwise(), the expression always yields 5 or 6 and is never null,
# so the col_2 / col_3 fallbacks are never reached
when_with_otherwise = when(col('col_1') > 1, 5).otherwise(6)

df_when_coalesce = (
  df
  .withColumn(
    'coalesced_when',
    coalesce(when_without_otherwise, col('col_2'), col('col_3')),
  )
  .withColumn(
    'coalesced_when_with_otherwise',
    coalesce(when_with_otherwise, col('col_2'), col('col_3')),
  )
)

# row 2 is the only row where col_1 > 1, so both new columns show 5 there;
# every other row of 'coalesced_when_with_otherwise' shows the otherwise value 6
display(df_when_coalesce)

col_1,col_2,col_3,coalesced_when,coalesced_when_with_otherwise
1.0,2.0,3.0,2.0,6.0
2.0,2.0,,5.0,5.0
,,3.0,3.0,6.0
,,4.0,4.0,6.0
,,,,6.0


In [0]:
# example of compiling a list of when() expressions programmatically
columns = [1, 2, 3]

# one expression per column: 0 when col_<n> equals n, otherwise the column's
# own value (which is null when the column is null); the trailing literal is a
# catch-all used when every column is null
whens = [
  *(when(col(f'col_{n}') == n, 0).otherwise(col(f'col_{n}')) for n in columns),
  lit(-1),
]

# star syntax unpacks the list so the expressions are coalesced in order
df_multiple_whens = df.withColumn('coalesced_whens_from_list', coalesce(*whens))

# row 2 gives 2: col_1 is 2 (not 1), so the first expression returns col_1 itself
display(df_multiple_whens)

col_1,col_2,col_3,coalesced_whens_from_list
1.0,2.0,3.0,0.0
2.0,2.0,,2.0
,,3.0,0.0
,,4.0,4.0
,,,-1.0
