In [1]:
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql import SparkSession

In [2]:
# Session shared by every cell below.
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()

# Layout of one record: a nested `name` struct followed by three flat,
# nullable columns. `dob` is stored as a plain string, not a date type.
schema = (
    StructType()
    .add('name',
         StructType()
         .add('firstname', StringType(), True)
         .add('middlename', StringType(), True)
         .add('lastname', StringType(), True))
    .add('dob', StringType(), True)
    .add('gender', StringType(), True)
    .add('salary', IntegerType(), True)
)

# Sample rows shaped as ((firstname, middlename, lastname), dob, gender, salary).
dataDF = [
    (('James', '', 'Smith'), '1991-04-01', 'M', 3000),
    (('Michael', 'Rose', ''), '2000-05-19', 'M', 4000),
    (('Robert', '', 'Williams'), '1978-09-05', 'M', 4000),
    (('Maria', 'Anne', 'Jones'), '1967-12-01', 'F', 4000),
    (('Jen', 'Mary', 'Brown'), '1980-02-17', 'F', -1),
]

# Base DataFrame that every renaming example below starts from.
df = spark.createDataFrame(
    data=dataDF,
    schema=schema,
)
df.printSchema()

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)



# Rename single non-nested column

In [3]:
print('Rename single non-nested column')
# The rename yields a new DataFrame; `df` is not reassigned, so later cells
# still see the original `dob` column.
(
    df
    .withColumnRenamed(existing="dob", new="DateOfBirth")
    .printSchema()
)

Rename single non-nested column
root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- DateOfBirth: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)



# Rename multiple columns by chaining `withColumnRenamed` calls

In [4]:
print('Rename multiple columns with chaining the command')
# One withColumnRenamed call per column, applied left to right.
df2 = (
    df
    .withColumnRenamed("dob", "DateOfBirth")
    .withColumnRenamed("salary", "salary_amount")
)
df2.printSchema()

Rename multiple columns with chaining the command
root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- DateOfBirth: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary_amount: integer (nullable = true)



# Rename nested column

## Create new schema

In [8]:
print('Create new schema and update in existing DataFrame')
# Replacement struct type for `name`: same three string fields, with the
# first and last renamed (firstname -> fname, lastname -> lname).
schema2 = StructType([
    StructField(field_name, StringType())
    for field_name in ("fname", "middlename", "lname")
])

# Casting the struct to `schema2` renames its nested fields while keeping the
# nesting; replacing `name` in place leaves dob, gender and salary untouched
# and in their original positions.
df.withColumn("name", col("name").cast(schema2)).printSchema()

Create new schema and update in existing DataFrame
root
 |-- name: struct (nullable = true)
 |    |-- fname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)



## Using select to rename nested columns
<span style="color:red">Note: this approach flattens the result; the nested name struct is removed.</span>

In [12]:
print('Using select to rename nested columns')
# Each (nested field, flat alias) pair becomes one top-level column, so the
# result no longer contains the `name` struct.
df.select(
    *[
        col(f'name.{nested_field}').alias(flat_name)
        for nested_field, flat_name in (
            ('firstname', 'fname'),
            ('middlename', 'mname'),
            ('lastname', 'lname'),
        )
    ],
    col('dob'),
    col('gender'),
    col('salary'),
).printSchema()


Using select to rename nested columns
root
 |-- fname: string (nullable = true)
 |-- mname: string (nullable = true)
 |-- lname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)



## Using DataFrame withColumn to rename nested columns
<span style="color:red">Note: this approach flattens the result; the nested name struct is removed.</span>

In [16]:
print('Using DataFrame withColumn to rename nested columns. Will remove nested structure')
# Copy each nested field out to a new top-level column, then drop the
# original struct. The new columns are appended after the existing ones.
df4 = (
    df
    .withColumn('fname', col('name').getField('firstname'))
    .withColumn('mname', col('name').getField('middlename'))
    .withColumn('lname', col('name').getField('lastname'))
    .drop('name')
)
df4.printSchema()

Using DataFrame withColumn to rename nested columns. Will remove nested structure
root
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- fname: string (nullable = true)
 |-- mname: string (nullable = true)
 |-- lname: string (nullable = true)



# Using toDF() to rename all columns in a PySpark DataFrame

In [17]:
print('Using toDF() to change all columns in a PySpark DataFrame')
# One replacement name per top-level column (four here). Fields nested inside
# the struct keep their names, as the printed schema shows.
newColumns = [f"newCol{position}" for position in range(1, 5)]
renamed_all = df.toDF(*newColumns)
renamed_all.printSchema()

Using toDF() to change all columns in a PySpark DataFrame
root
 |-- newCol1: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- newCol2: string (nullable = true)
 |-- newCol3: string (nullable = true)
 |-- newCol4: integer (nullable = true)

