In [1]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.2.0.tar.gz (281.3 MB)
     |████████████████████████████████| 281.3 MB 36 kB/s 
Collecting py4j==0.10.9.2
  Downloading py4j-0.10.9.2-py2.py3-none-any.whl (198 kB)
     |████████████████████████████████| 198 kB 49.5 MB/s 
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.2.0-py2.py3-none-any.whl size=281805912 sha256=1c08001ff786f3774d5de9e1d5ddd3f17adad1c91eabb64a7a66774df91d0048
  Stored in directory: /root/.cache/pip/wheels/0b/de/d2/9be5d59d7331c6c2a7c1b6d1a4f463ce107332b1ecd4e80718
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.2 pyspark-3.2.0


In [21]:
from pyspark.sql import SparkSession
import pyspark
from pyspark.sql.functions import lit

In [3]:
# Session handle used by the cells below; the application is named "withColumn"
# and getOrCreate() hands back the session (creating it if needed).
spark = (
    SparkSession.builder
    .appName("withColumn")
    .getOrCreate()
)

In [5]:
# Sample person records; each tuple follows the field order listed in `column`.
# Empty strings stand in for a missing middle or last name, and the last row
# carries a salary of -1.
data = [
    ("James", "", "Smith", "1991-04-01", "M", 3000),
    ("Michael", "Rose", "", "2000-05-19", "M", 4000),
    ("Robert", "", "Williams", "1978-09-05", "M", 4000),
    ("Maria", "Anne", "Jones", "1967-12-01", "F", 4000),
    ("Jen", "Mary", "Brown", "1980-02-17", "F", -1),
]

# Field names, one per tuple position in `data`.
column = [
    "first_name",
    "middle_name",
    "last_name",
    "dob",
    "gender",
    "salary",
]

In [6]:
# Build the DataFrame from the in-memory rows, labelling fields by position with
# the names in `column`; only names are supplied, so column types are left to Spark.
df = spark.createDataFrame(data, schema=column)

In [7]:
# Print each column's name, type and nullability to confirm how the sample rows were typed.
df.printSchema()

root
 |-- first_name: string (nullable = true)
 |-- middle_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)



In [11]:
# Render the rows as a text table; this is the baseline the later withColumn examples start from.
df.show()

+----------+-----------+---------+----------+------+------+
|first_name|middle_name|last_name|       dob|gender|salary|
+----------+-----------+---------+----------+------+------+
|     James|           |    Smith|1991-04-01|     M|  3000|
|   Michael|       Rose|         |2000-05-19|     M|  4000|
|    Robert|           | Williams|1978-09-05|     M|  4000|
|     Maria|       Anne|    Jones|1967-12-01|     F|  4000|
|       Jen|       Mary|    Brown|1980-02-17|     F|    -1|
+----------+-----------+---------+----------+------+------+



**Change DataType using withColumn()**

In [13]:
# Reusing the existing name "salary" replaces that column with its float-cast version.
# The result is only displayed; `df` is not reassigned.
(
    df
    .withColumn("salary", df["salary"].cast("float"))
    .show()
)

+----------+-----------+---------+----------+------+------+
|first_name|middle_name|last_name|       dob|gender|salary|
+----------+-----------+---------+----------+------+------+
|     James|           |    Smith|1991-04-01|     M|3000.0|
|   Michael|       Rose|         |2000-05-19|     M|4000.0|
|    Robert|           | Williams|1978-09-05|     M|4000.0|
|     Maria|       Anne|    Jones|1967-12-01|     F|4000.0|
|       Jen|       Mary|    Brown|1980-02-17|     F|  -1.0|
+----------+-----------+---------+----------+------+------+



**Update the value of an existing column**

In [14]:
# Overwrite "salary" with a value computed from itself (salary * 100).
# Only the displayed result changes; `df` keeps the original salaries.
(
    df
    .withColumn("salary", df["salary"] * 100)
    .show()
)

+----------+-----------+---------+----------+------+------+
|first_name|middle_name|last_name|       dob|gender|salary|
+----------+-----------+---------+----------+------+------+
|     James|           |    Smith|1991-04-01|     M|300000|
|   Michael|       Rose|         |2000-05-19|     M|400000|
|    Robert|           | Williams|1978-09-05|     M|400000|
|     Maria|       Anne|    Jones|1967-12-01|     F|400000|
|       Jen|       Mary|    Brown|1980-02-17|     F|  -100|
+----------+-----------+---------+----------+------+------+



**Create new column from an existing column**

In [17]:
# "NewCol" is not an existing column name, so withColumn appends it (salary * 5)
# after the current columns instead of replacing one.
(
    df
    .withColumn("NewCol", df["salary"] * 5)
    .show()
)

+----------+-----------+---------+----------+------+------+------+
|first_name|middle_name|last_name|       dob|gender|salary|NewCol|
+----------+-----------+---------+----------+------+------+------+
|     James|           |    Smith|1991-04-01|     M|  3000| 15000|
|   Michael|       Rose|         |2000-05-19|     M|  4000| 20000|
|    Robert|           | Williams|1978-09-05|     M|  4000| 20000|
|     Maria|       Anne|    Jones|1967-12-01|     F|  4000| 20000|
|       Jen|       Mary|    Brown|1980-02-17|     F|    -1|    -5|
+----------+-----------+---------+----------+------+------+------+



**Add new columns (lit() supplies a constant value for a DataFrame column)**

In [23]:
# Chain two withColumn calls to append constant-valued columns: lit() wraps the
# Python literals 'Hello' and 99 so they can be used as column expressions.
(
    df
    .withColumn("New_Column", lit("Hello"))
    .withColumn("New_Column2", lit(99))
    .show()
)

+----------+-----------+---------+----------+------+------+----------+-----------+
|first_name|middle_name|last_name|       dob|gender|salary|New_Column|New_Column2|
+----------+-----------+---------+----------+------+------+----------+-----------+
|     James|           |    Smith|1991-04-01|     M|  3000|     Hello|         99|
|   Michael|       Rose|         |2000-05-19|     M|  4000|     Hello|         99|
|    Robert|           | Williams|1978-09-05|     M|  4000|     Hello|         99|
|     Maria|       Anne|    Jones|1967-12-01|     F|  4000|     Hello|         99|
|       Jen|       Mary|    Brown|1980-02-17|     F|    -1|     Hello|         99|
+----------+-----------+---------+----------+------+------+----------+-----------+



**Change a column name using withColumnRenamed()**

In [29]:
# Rename "salary" to "sal" and print the resulting schema to show that only the
# name changed; `df` itself is not reassigned.
(
    df
    .withColumnRenamed("salary", "sal")
    .printSchema()
)

root
 |-- first_name: string (nullable = true)
 |-- middle_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- sal: long (nullable = true)

