In [3]:
from pyspark import Row
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

**Create a session**

In [4]:
# Start (or reuse) a Spark session in local mode; "local[*]" asks for as many
# worker threads as there are logical cores on this machine.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('SparkByExamples.com')
    .getOrCreate()
)

**Create sample data: (department name, department id) pairs**

In [6]:
# Sample department records, one (department name, department id) tuple each.
data1 = [
    ("Finance", 10),
    ("Marketing", 20),
    ("Sales", 30),
    ("IT", 40),
]

**Create RDD using parallelize()**

In [7]:
# Turn the in-memory Python list into an RDD via the session's SparkContext.
rdd = (
    spark.sparkContext
    .parallelize(data1)
)

**Convert RDD to DF using toDF()**

In [8]:
# No column names are passed, so the resulting DataFrame gets positional
# column names (_1, _2) and inferred types (string, long).
df1 = rdd.toDF()
df1.printSchema()
df1.show()

root
 |-- _1: string (nullable = true)
 |-- _2: long (nullable = true)

+---------+---+
|       _1| _2|
+---------+---+
|  Finance| 10|
|Marketing| 20|
|    Sales| 30|
|       IT| 40|
+---------+---+



In [9]:
# Column names applied positionally to each tuple: first element -> name,
# second element -> id. Types are still inferred from the data.
columns = ["name", "id"]
df2 = rdd.toDF(schema=columns)
df2.printSchema()
df2.show()

root
 |-- name: string (nullable = true)
 |-- id: long (nullable = true)

+---------+---+
|     name| id|
+---------+---+
|  Finance| 10|
|Marketing| 20|
|    Sales| 30|
|       IT| 40|
+---------+---+



**Convert RDD to DF using createDataFrame()**

In [10]:
# Build the DataFrame through the session instead of the RDD method; with the
# same column-name list this yields the same schema and rows as rdd.toDF(columns).
df3 = spark.createDataFrame(data=rdd, schema=columns)
df3.printSchema()
df3.show()

root
 |-- name: string (nullable = true)
 |-- id: long (nullable = true)

+---------+---+
|     name| id|
+---------+---+
|  Finance| 10|
|Marketing| 20|
|    Sales| 30|
|       IT| 40|
+---------+---+



**Convert RDD to DF using createDataFrame() with StructType schema**

In [11]:
# Explicit schema: fixes both column names and column types up front, so
# Dept_id is an integer column rather than the long that inference produces.
schema = StructType(
    [
        StructField('Dept_name', StringType(), True),
        StructField('Dept_id', IntegerType(), True),
    ]
)

df4 = spark.createDataFrame(data=rdd, schema=schema)
df4.printSchema()
df4.show()

root
 |-- Dept_name: string (nullable = true)
 |-- Dept_id: integer (nullable = true)

+---------+-------+
|Dept_name|Dept_id|
+---------+-------+
|  Finance|     10|
|Marketing|     20|
|    Sales|     30|
|       IT|     40|
+---------+-------+



**Convert a DataFrame with a nested struct column to pandas**

In [12]:
# Sample person records shaped as ((first, middle, last), dob, gender, salary).
# Every leaf value is a string; missing parts are empty strings.
dataStruct = [
    (("James", "", "Smith"), "36636", "M", "3000"),
    (("Michael", "Rose", ""), "40288", "M", "4000"),
    (("Robert", "", "Williams"), "42114", "M", "4000"),
    (("Maria", "Anne", "Jones"), "39192", "F", "4000"),
    (("Jen", "Mary", "Brown"), "", "F", "-1"),
]

# "name" is a nested struct of three string fields; the other columns are
# flat string columns.
schemaStruct = StructType(
    [
        StructField(
            'name',
            StructType(
                [
                    StructField('firstname', StringType(), True),
                    StructField('middlename', StringType(), True),
                    StructField('lastname', StringType(), True),
                ]
            ),
        ),
        StructField('dob', StringType(), True),
        StructField('gender', StringType(), True),
        StructField('salary', StringType(), True),
    ]
)
df5 = spark.createDataFrame(data=dataStruct, schema=schemaStruct)
df5.printSchema()

# toPandas() loads the entire DataFrame into local (driver) memory, so it is
# only appropriate for small results such as this sample.
pandasDF2 = df5.toPandas()
print(pandasDF2)

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: string (nullable = true)

                   name    dob gender salary
0      (James, , Smith)  36636      M   3000
1     (Michael, Rose, )  40288      M   4000
2  (Robert, , Williams)  42114      M   4000
3  (Maria, Anne, Jones)  39192      F   4000
4    (Jen, Mary, Brown)             F     -1
