In [0]:
# Demonstrates several ways to add a column to a PySpark DataFrame:
# withColumn(), select(), and Spark SQL over a temporary view.
# Every example displays a *new* DataFrame; `df` itself is never modified.
from pyspark.sql import SparkSession
from pyspark.sql.functions import concat_ws, current_date, lit, when

spark = (
    SparkSession.builder
    .appName('SparkByExamples.com')
    .getOrCreate()
)

data = [
    ('James', 'Smith', 'M', 3000),
    ('Anna', 'Rose', 'F', 4100),
    ('Robert', 'Williams', 'M', 6200),
]

columns = ["firstname", "lastname", "gender", "salary"]
df = spark.createDataFrame(data=data, schema=columns)
df.show()

# df.columns is a plain Python list of column names, so membership tests
# work directly. 'salary1' is absent here, so the branch runs.
if 'salary1' not in df.columns:
    print("aa")

# Add a new constant column
df.withColumn("bonus_percent", lit(0.3)).show()

# Add a column computed from an existing column
df.withColumn("bonus_amount", df.salary * 0.3).show()

# Add a column by concatenating existing columns with a separator
df.withColumn("name", concat_ws(",", "firstname", 'lastname')).show()

# Add the current date
df.withColumn("current_date", current_date()).show()

# Add a column whose value depends on conditions; branches are evaluated in
# order and otherwise() supplies the value when none of them match.
grade = (
    when(df.salary < 4000, lit("A"))
    .when((df.salary >= 4000) & (df.salary <= 5000), lit("B"))
    .otherwise(lit("C"))
)
df.withColumn("grade", grade).show()

# Add columns using select
df.select("firstname", "salary", lit(0.3).alias("bonus")).show()
# df.salary * 0.3 is already a Column expression, so no lit() wrapper is needed.
df.select("firstname", "salary", (df.salary * 0.3).alias("bonus_amount")).show()
df.select("firstname", "salary", current_date().alias("today_date")).show()

# Add columns using SQL
df.createOrReplaceTempView("PER")
# Numeric literal (not the string '0.3') so the column type matches lit(0.3) above.
spark.sql("select firstname,salary, 0.3 as bonus from PER").show()
spark.sql("select firstname,salary, salary * 0.3 as bonus_amount from PER").show()
spark.sql("select firstname,salary, current_date() as today_date from PER").show()
# Searched CASE ("case when <predicate>"): the simple form "case salary when
# salary < 4000" would compare salary against the boolean result of the
# predicate instead of evaluating the predicate itself.
spark.sql(
    "select firstname,salary, "
    "case when salary < 4000 then 'A' "
    "else 'B' END as grade from PER"
).show()






+---------+--------+------+------+
|firstname|lastname|gender|salary|
+---------+--------+------+------+
|    James|   Smith|     M|  3000|
|     Anna|    Rose|     F|  4100|
|   Robert|Williams|     M|  6200|
+---------+--------+------+------+

aa
+---------+--------+------+------+-------------+
|firstname|lastname|gender|salary|bonus_percent|
+---------+--------+------+------+-------------+
|    James|   Smith|     M|  3000|          0.3|
|     Anna|    Rose|     F|  4100|          0.3|
|   Robert|Williams|     M|  6200|          0.3|
+---------+--------+------+------+-------------+

+---------+--------+------+------+------------+
|firstname|lastname|gender|salary|bonus_amount|
+---------+--------+------+------+------------+
|    James|   Smith|     M|  3000|       900.0|
|     Anna|    Rose|     F|  4100|      1230.0|
|   Robert|Williams|     M|  6200|      1860.0|
+---------+--------+------+------+------------+

+---------+--------+------+------+---------------+
|firstname|lastname

In [0]:
#df.withColumn("bonus_percent", lit(0.3)) \                  #adds a new column and lit(0.3) provides a literal value

#df.withColumn("name", concat_ws(",","firstname",'lastname')) \                               #concatenates the given columns into one string, using the first argument (",") as the separator

#.when((df.salary >= 4000) & (df.salary <= 5000), lit("B")) \                           #adds another branch to the conditional expression: rows matching this condition (and no earlier branch) get the value "B"


In [0]:
#Import the necessary libraries, including SparkSession from pyspark.sql.
#Create a Spark session using SparkSession.builder.appName('SparkByExamples.com').getOrCreate().
#Define a list of tuples data representing the data for the DataFrame.
#Define the column names as a list columns.
#Create a PySpark DataFrame df using spark.createDataFrame(data=data, schema=columns).
#Show the contents of the DataFrame using df.show().
#Check if a column named 'salary1' exists in the DataFrame using an if condition and df.columns.
#Add a new constant column 'bonus_percent' with a fixed value of 0.3 to the DataFrame using lit() and df.withColumn().
#Show the result by calling .show() on the DataFrame returned by df.withColumn(); withColumn() returns a new DataFrame and leaves df itself unchanged.
#Add a new column 'bonus_amount' by performing a calculation based on an existing column ('salary') using df.withColumn().
#Show the DataFrame with the newly added column using df.show().
#Add a new column 'name' by concatenating two existing columns ('firstname' and 'lastname') using concat_ws() and df.withColumn().
#Show the DataFrame with the newly added column using df.show().
#Add a new column 'current_date' with the current date using current_date() and df.withColumn().
#Show the DataFrame with the newly added column using df.show().
#Add a new column 'grade' based on conditions using when(), otherwise(), and df.withColumn().
#Show the DataFrame with the newly added column using df.show().
#Add new columns using select() with lit() and column operations.
#Show the selected columns together with the newly added columns by calling .show() on the DataFrame returned by df.select().
#Add new columns using SQL queries on the DataFrame by creating a temporary view using createOrReplaceTempView() and querying the view with spark.sql().
#Show the SQL query results with the newly added columns by calling .show() on the DataFrame returned by spark.sql().
#The code demonstrates different methods to add columns to a PySpark DataFrame, including adding constant columns, performing calculations on existing columns, concatenating columns, adding the current date, deriving a column from conditions with when()/otherwise(), adding columns through select(), and using SQL queries.
