In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import when

# Obtain a SparkSession: reuses an already-active one, otherwise starts a new one.
spark = (
    SparkSession.builder
    .appName('SparkByExamples.com')
    .getOrCreate()
)

# Sample rows, one tuple per person, in the order given by `columns`.
# The last row deliberately has a missing (None) firstname.
data = [
    ('James', 'Smith', 'M', 3000),
    ('Anna', 'Rose', 'F', 4100),
    ('Robert', 'Williams', 'NA', 6200),
    (None, 'Rob', 'F', 6200),
]
columns = ["firstname", "lastname", "gender", "salary"]

# Base DataFrame; every derived frame below is built from `df`, not from
# one another, so each example is independent of the others.
df = spark.createDataFrame(data=data, schema=columns)
df.show()

# Example 1: overwrite an existing column with a computed value (salary x 3).
df2 = df.withColumn("salary", df["salary"] * 3)
df2.show()

# Example 2: conditional rewrite of a column. Values other than "M"/"F"
# fall through to otherwise() and keep their original value.
df3 = df.withColumn(
    "gender",
    (
        when(df["gender"] == "M", "Male")
        .when(df["gender"] == "F", "Female")
        .otherwise(df["gender"])
    ),
)
df3.show()

# Example 3: change a column's data type; printSchema() shows the result.
df4 = df.withColumn("salary", df["salary"].cast("String"))
df4.printSchema()

# Example 4: the same salary x 3 projection expressed in SQL against a
# temporary view registered over `df`.
df.createOrReplaceTempView("PER")
df5 = spark.sql("select firstname,gender,salary*3 as salary from PER")
df5.show()

+---------+--------+------+------+
|firstname|lastname|gender|salary|
+---------+--------+------+------+
|    James|   Smith|     M|  3000|
|     Anna|    Rose|     F|  4100|
|   Robert|Williams|    NA|  6200|
|     null|     Rob|     F|  6200|
+---------+--------+------+------+

+---------+--------+------+------+
|firstname|lastname|gender|salary|
+---------+--------+------+------+
|    James|   Smith|     M|  9000|
|     Anna|    Rose|     F| 12300|
|   Robert|Williams|    NA| 18600|
|     null|     Rob|     F| 18600|
+---------+--------+------+------+

+---------+--------+------+------+
|firstname|lastname|gender|salary|
+---------+--------+------+------+
|    James|   Smith|  Male|  3000|
|     Anna|    Rose|Female|  4100|
|   Robert|Williams|    NA|  6200|
|     null|     Rob|Female|  6200|
+---------+--------+------+------+

root
 |-- firstname: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: string (nullable = true)

In [0]:
#The code imports the necessary modules, including SparkSession from pyspark.sql.

#A SparkSession is created with the application name set to 'SparkByExamples.com'.

#The data list contains tuples representing input data for the DataFrame.

#The columns list defines the column names for the DataFrame.

#The DataFrame df is created using spark.createDataFrame() by passing the input data and schema.

#The contents of df are displayed using show().

#The DataFrame df2 is created with withColumn(), which returns a new DataFrame in which the existing "salary" column is replaced by its value multiplied by 3; df itself is left unchanged. The resulting DataFrame is displayed using show().

#The when() function from pyspark.sql.functions is used to conditionally rewrite the "gender" column of df: "M" becomes "Male", "F" becomes "Female", and any other value (such as "NA") is kept unchanged via otherwise(). The resulting DataFrame is assigned to df3 and displayed using show().

#The DataFrame df4 is created by casting the "salary" column of df to the string type using withColumn() and cast(). The schema of df4 is then printed using printSchema() to confirm the new column type.

#The DataFrame df is registered as a temporary view named "PER" using createOrReplaceTempView().

#A SQL query is executed using spark.sql() to select the "firstname", "gender", and calculated "salary" (salary multiplied by 3) from the "PER" view. The result is assigned to df5 and displayed using show().