In [0]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, StructField, StructType


def _preview(frame):
    """Print the schema of ``frame``, then its rows without truncating values."""
    frame.printSchema()
    frame.show(truncate=False)


# getOrCreate() hands back an already-running session when there is one.
spark = (
    SparkSession.builder
    .appName('SparkByExamples.com')
    .getOrCreate()
)

# Sample rows: (department name, department id).
dept = [
    ("Finance", 10),
    ("Marketing", 20),
    ("Sales", 30),
    ("IT", 40),
]
rdd = spark.sparkContext.parallelize(dept)

# 1) RDD -> DataFrame with no schema argument: columns come out as _1 / _2.
df = rdd.toDF()
_preview(df)

# 2) RDD -> DataFrame with column names only; the types are still inferred.
deptColumns = ["dept_name", "dept_id"]
df2 = rdd.toDF(deptColumns)
_preview(df2)

# 3) Python list -> DataFrame with the same column names.
deptDF = spark.createDataFrame(dept, schema=deptColumns)
_preview(deptDF)

# 4) Python list -> DataFrame with an explicit StructType schema.
#    NOTE(review): dept_id is declared as a string although the sample ids are
#    integers, so the schema reports string here instead of long. Confirm this
#    is intended before relying on dept_id being numeric.
deptSchema = StructType([
    StructField('dept_name', StringType(), nullable=True),
    StructField('dept_id', StringType(), nullable=True),
])
deptDF1 = spark.createDataFrame(dept, schema=deptSchema)
_preview(deptDF1)

root
 |-- _1: string (nullable = true)
 |-- _2: long (nullable = true)

+---------+---+
|_1       |_2 |
+---------+---+
|Finance  |10 |
|Marketing|20 |
|Sales    |30 |
|IT       |40 |
+---------+---+

root
 |-- dept_name: string (nullable = true)
 |-- dept_id: long (nullable = true)

+---------+-------+
|dept_name|dept_id|
+---------+-------+
|Finance  |10     |
|Marketing|20     |
|Sales    |30     |
|IT       |40     |
+---------+-------+

root
 |-- dept_name: string (nullable = true)
 |-- dept_id: long (nullable = true)

+---------+-------+
|dept_name|dept_id|
+---------+-------+
|Finance  |10     |
|Marketing|20     |
|Sales    |30     |
|IT       |40     |
+---------+-------+

root
 |-- dept_name: string (nullable = true)
 |-- dept_id: string (nullable = true)

+---------+-------+
|dept_name|dept_id|
+---------+-------+
|Finance  |10     |
|Marketing|20     |
|Sales    |30     |
|IT       |40     |
+---------+-------+



In [0]:
#Importing necessary modules: The pyspark package is imported, and the SparkSession class is imported from the pyspark.sql module.

#Creating a SparkSession: The SparkSession is created through SparkSession.builder, with the application name set to 'SparkByExamples.com' via appName(). The final getOrCreate() call returns the existing SparkSession if there is one; otherwise, it creates a new one.

#Creating data: The dept variable is defined as a list of tuples. Each tuple represents a department and consists of a department name and a department ID.

#Creating an RDD: An RDD named rdd is created from the dept list using spark.sparkContext.parallelize(dept).

#Converting RDD to DataFrame: The RDD is converted to a DataFrame named df using rdd.toDF(). Since no schema is specified, Spark infers the column types from the data (string and long here) and assigns the default column names _1 and _2.

#Printing DataFrame schema and contents: The df.printSchema() method is used to print the schema of the DataFrame. The df.show(truncate=False) method is used to display the contents of the DataFrame without truncating the output.

#Creating a DataFrame with specified column names: Another DataFrame named df2 is created from the RDD using rdd.toDF(deptColumns), where deptColumns is a list containing the column names (dept_name and dept_id). The column types are still inferred from the data.

#Printing DataFrame schema and contents: The schema and contents of df2 are printed using the same methods as before.

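#Creating a DataFrame with specified column names using createDataFrame: A DataFrame named deptDF is created directly from the dept list using spark.createDataFrame(data=dept, schema=deptColumns). Because deptColumns contains only column names, the column types are still inferred from the data, so the result matches df2.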

#Printing DataFrame schema and contents: The schema and contents of deptDF are printed using the same methods as before.

#Defining a schema with StructType: The StructType, StructField, and StringType classes are imported from the pyspark.sql.types module. A deptSchema is defined using StructType and StructField, declaring both dept_name and dept_id as nullable string columns.

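#Creating a DataFrame with the custom schema: The dept list and deptSchema are used to create a DataFrame named deptDF1 with the specified schema using spark.createDataFrame(data=dept, schema=deptSchema). Because the schema declares dept_id as a string, printSchema() reports it as string instead of the inferred long.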

#Printing DataFrame schema and contents: The schema and contents of deptDF1 are printed using the same methods as before.