In [0]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()


In [0]:
dataDF = [(('James','','Smith'),'1991-04-01','M',3000),
  (('Michael','Rose',''),'2000-05-19','M',4000),
  (('Robert','','Williams'),'1978-09-05','M',4000),
  (('Maria','Anne','Jones'),'1967-12-01','F',4000),
  (('Jen','Mary','Brown'),'1980-02-17','F',-1)
]


from pyspark.sql.types import StructType,StructField, StringType, IntegerType

schema = StructType([
        StructField('name', StructType([
             StructField('firstname', StringType(), True),
             StructField('middlename', StringType(), True),
             StructField('lastname', StringType(), True)
             ])),
         StructField('dob', StringType(), True),
         StructField('gender', StringType(), True),
         StructField('salary', IntegerType(), True)
         ])

df = spark.createDataFrame(data = dataDF, schema = schema)
df.printSchema()
df.show()

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

+--------------------+----------+------+------+
|                name|       dob|gender|salary|
+--------------------+----------+------+------+
|    {James, , Smith}|1991-04-01|     M|  3000|
|   {Michael, Rose, }|2000-05-19|     M|  4000|
|{Robert, , Williams}|1978-09-05|     M|  4000|
|{Maria, Anne, Jones}|1967-12-01|     F|  4000|
|  {Jen, Mary, Brown}|1980-02-17|     F|    -1|
+--------------------+----------+------+------+



In [0]:
""" withColumnRenamed(existingName, newNam)
    existingName – The existing column name you want to change  
    newName – New name of the column  
    Returns a new DataFrame with a column renamed.
    """


df.withColumnRenamed("dob","DateOfBirth").printSchema()


root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- DateOfBirth: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)



In [0]:
df1 = df.withColumnRenamed("firstname","First Name").withColumnRenamed("gender","Gender") \
        .withColumnRenamed("name","Name").withColumnRenamed("dob","Birth Date")

df1.printSchema()

root
 |-- Name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- Birth Date: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- salary: integer (nullable = true)



In [0]:

schema2 = StructType([
    StructField("Fname",StringType()),
    StructField("Mname",StringType()),
    StructField("Lname",StringType())])


In [0]:
from pyspark.sql.functions import col

df.select(col("name").cast(schema2), \
     col("dob"), col("gender"), col("salary")) \
   .printSchema()  


root
 |-- name: struct (nullable = true)
 |    |-- Fname: string (nullable = true)
 |    |-- Mname: string (nullable = true)
 |    |-- Lname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)



In [0]:

from pyspark.sql.functions import *

df.printSchema()
df.show()

df.select(col("name.firstname").alias("fname"), \
          col("name.middlename").alias("mname"), \
          col("name.lastname").alias("lname"), \
          col("dob"),col("gender"),col("salary")) \
  .printSchema()
df.show()


root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

+--------------------+----------+------+------+
|                name|       dob|gender|salary|
+--------------------+----------+------+------+
|    {James, , Smith}|1991-04-01|     M|  3000|
|   {Michael, Rose, }|2000-05-19|     M|  4000|
|{Robert, , Williams}|1978-09-05|     M|  4000|
|{Maria, Anne, Jones}|1967-12-01|     F|  4000|
|  {Jen, Mary, Brown}|1980-02-17|     F|    -1|
+--------------------+----------+------+------+

root
 |-- fname: string (nullable = true)
 |-- mname: string (nullable = true)
 |-- lname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

+--------------------+----------+------+------

In [0]:

from pyspark.sql.functions import *

df4 = df.withColumn("fname",col("name.firstname")) \
      .withColumn("mname",col("name.middlename")) \
      .withColumn("lname",col("name.lastname")) \
      .drop("name")

df4.printSchema()


root
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- fname: string (nullable = true)
 |-- mname: string (nullable = true)
 |-- lname: string (nullable = true)



In [0]:

IN progress


[0;36m  File [0;32m"<command-3462211225775218>"[0;36m, line [0;32m1[0m
[0;31m    IN progress[0m
[0m       ^[0m
[0;31mSyntaxError[0m[0;31m:[0m invalid syntax


In [0]:

newColumns = ["newCol1","newCol2","newCol3","newCol4"]

df.toDF(*newColumns).printSchema()


root
 |-- newCol1: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- newCol2: string (nullable = true)
 |-- newCol3: string (nullable = true)
 |-- newCol4: integer (nullable = true)

