In [0]:
from pyspark.sql import SparkSession

# Obtain a SparkSession through the builder; the application name is set
# before getOrCreate() is called.
spark = (
    SparkSession.builder
    .appName('SparkByExamples.com')
    .getOrCreate()
)

# Column names, in the same order as the fields of each tuple in `data`.
columns = ["name", "gender", "salary"]

# Sample rows. Note the edge cases: Robert has a None gender, and Jen has an
# empty-string gender together with a None salary.
data = [
    ("James", "M", 60000),
    ("Michael", "M", 70000),
    ("Robert", None, 400000),
    ("Maria", "F", 500000),
    ("Jen", "", None),
]

# Only column names are supplied as the schema argument.
df = spark.createDataFrame(data, columns)
df.show()

#Using When otherwise
from pyspark.sql.functions import when,col

# Conditional column expression, built once and reused by both approaches:
#   "M"   -> "Male"
#   "F"   -> "Female"
#   NULL  -> ""  (empty string)
#   other -> the original gender value, unchanged
gender_label = (
    when(df.gender == "M", "Male")
    .when(df.gender == "F", "Female")
    .when(df.gender.isNull(), "")
    .otherwise(df.gender)
)

# Approach 1: append the derived column with withColumn().
df2 = df.withColumn("new_gender", gender_label)
df2.show()

# Approach 2: select every existing column plus the derived one, named via alias().
df2 = df.select(col("*"), gender_label.alias("new_gender"))
df2.show()
# Using SQL Case When
from pyspark.sql.functions import expr

# SQL CASE expression shared by every approach below. Each fragment ends with
# a space so the concatenated pieces never fuse two tokens together (the
# original text ran the '' literal straight into ELSE).
GENDER_CASE_SQL = (
    "CASE WHEN gender = 'M' THEN 'Male' "
    "WHEN gender = 'F' THEN 'Female' "
    "WHEN gender IS NULL THEN '' "
    "ELSE gender END"
)

# Approach 3: withColumn() + expr() evaluating the SQL CASE expression.
df3 = df.withColumn("new_gender", expr(GENDER_CASE_SQL))
df3.show()

# Approach 4: select() + expr(), naming the derived column via alias().
# `col` is the name imported from pyspark.sql.functions earlier in this cell.
df4 = df.select(col("*"), expr(GENDER_CASE_SQL).alias("new_gender"))
df4.show()

# Spark SQL: register the DataFrame as a temporary view and query it.
df.createOrReplaceTempView("EMP")
# show() must be *called*; a bare `.show` only references the method and
# prints nothing.
spark.sql(
    "select name, " + GENDER_CASE_SQL + " as new_gender from EMP"
).show()

+-------+------+------+
|   name|gender|salary|
+-------+------+------+
|  James|     M| 60000|
|Michael|     M| 70000|
| Robert|  null|400000|
|  Maria|     F|500000|
|    Jen|      |  null|
+-------+------+------+

+-------+------+------+----------+
|   name|gender|salary|new_gender|
+-------+------+------+----------+
|  James|     M| 60000|      Male|
|Michael|     M| 70000|      Male|
| Robert|  null|400000|          |
|  Maria|     F|500000|    Female|
|    Jen|      |  null|          |
+-------+------+------+----------+

+-------+------+------+----------+
|   name|gender|salary|new_gender|
+-------+------+------+----------+
|  James|     M| 60000|      Male|
|Michael|     M| 70000|      Male|
| Robert|  null|400000|          |
|  Maria|     F|500000|    Female|
|    Jen|      |  null|          |
+-------+------+------+----------+

+-------+------+------+----------+
|   name|gender|salary|new_gender|
+-------+------+------+----------+
|  James|     M| 60000|      Male|
|Michael|  

In [0]:
#The code starts by importing the necessary modules: SparkSession from pyspark.sql and when, col, and expr functions from pyspark.sql.functions. These modules are required for creating a SparkSession, working with Spark DataFrames, and performing conditional transformations.

#A SparkSession is created using the SparkSession.builder API. The appName parameter sets the name of the Spark application. getOrCreate() returns the SparkSession that already exists in the process, if there is one (regardless of its application name); otherwise, a new SparkSession is created.

#Sample data is defined as a list of tuples. Each tuple represents a row of data, with elements corresponding to the columns: name, gender, and salary.

#The DataFrame is created using spark.createDataFrame(data, schema). Here the schema parameter is only a list of column names, so the column data types are not declared explicitly; Spark infers them from the data.

#The contents of the DataFrame are displayed using df.show(), which prints the first 20 rows by default (all five rows here) and truncates long cell values.

#The code demonstrates different approaches to perform conditional transformations on the gender column and create a new column new_gender based on specific conditions.

#Approach 1: Using withColumn() with the when() function chained with otherwise()
#Approach 2: Using select() with when()/otherwise() and alias()
#Approach 3: Using withColumn() with the expr() function and a SQL CASE WHEN expression
#Approach 4: Using select() with expr() (SQL CASE WHEN) and alias()
#The resulting DataFrames with the new new_gender column are displayed using df2.show() and df3.show().

#Lastly, the code demonstrates how to perform the same conditional transformation using Spark SQL. It creates a temporary view named "EMP" with df.createOrReplaceTempView("EMP") and then executes a SQL query on the temporary view using spark.sql(). The resulting DataFrame is displayed using .show().

#These different approaches allow for flexible conditional transformations on Spark DataFrames, providing options to derive new columns based on specific conditions.
