In [4]:
from pyspark import Row
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

**Create a session**

In [5]:
spark = SparkSession.builder.master("local[*]") \
                    .appName('SparkByExamples.com') \
                    .getOrCreate()

**Create data sample**

In [6]:
columns = ["language","users_count"]
data = [("Java", "20000"), ("Python", "100000"), ("Scala", "3000")]

**Create DF using toDF()**

In [8]:
dfFromRDD1 = spark.createDataFrame(data).toDF(*columns)
dfFromRDD1.printSchema()
dfFromRDD1.show()

root
 |-- language: string (nullable = true)
 |-- users_count: string (nullable = true)

+--------+-----------+
|language|users_count|
+--------+-----------+
|    Java|      20000|
|  Python|     100000|
|   Scala|       3000|
+--------+-----------+



**Create DF using createDataFrame() with Row type**

In [9]:
rowData = map(lambda x: Row(*x), data)
dfFromRDD2 = spark.createDataFrame(rowData, columns)
dfFromRDD2.printSchema()

root
 |-- language: string (nullable = true)
 |-- users_count: string (nullable = true)



**Create data sample 2**

In [10]:
data2 = [("James","","Smith","36636","M",3000),
    ("Michael","Rose","","40288","M",4000),
    ("Robert","","Williams","42114","M",4000),
    ("Maria","Anne","Jones","39192","F",4000),
    ("Jen","Mary","Brown","","F",-1)
  ]

**Create StructType schema**

In [11]:
schema = StructType([StructField("firstname", StringType(), True),
                     StructField("middlename", StringType(), True),
                     StructField("lastname", StringType(), True),
                     StructField("id", StringType(), True),
                     StructField("gender", StringType(), True),
                     StructField("salary", IntegerType(), True)])

**Create DF with schema**

In [12]:
dfFromRDD3 = spark.createDataFrame(data = data2, schema= schema)
dfFromRDD3.printSchema()
dfFromRDD3.show()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|   id|gender|salary|
+---------+----------+--------+-----+------+------+
|    James|          |   Smith|36636|     M|  3000|
|  Michael|      Rose|        |40288|     M|  4000|
|   Robert|          |Williams|42114|     M|  4000|
|    Maria|      Anne|   Jones|39192|     F|  4000|
|      Jen|      Mary|   Brown|     |     F|    -1|
+---------+----------+--------+-----+------+------+



In [23]:
print(dfFromRDD3.schema.contains(StructField("firstname",StringType,true)))

AttributeError: ignored

**Create DF from file csv**

In [15]:
dfFromFile1 = spark.read.csv('/content/drive/MyDrive/Learn_pyspark/resources/zipcodes.csv', header= True)
dfFromFile1.printSchema()
dfFromFile1.show(10)

root
 |-- RecordNumber: string (nullable = true)
 |-- Zipcode: string (nullable = true)
 |-- ZipCodeType: string (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- LocationType: string (nullable = true)
 |-- Lat: string (nullable = true)
 |-- Long: string (nullable = true)
 |-- Xaxis: string (nullable = true)
 |-- Yaxis: string (nullable = true)
 |-- Zaxis: string (nullable = true)
 |-- WorldRegion: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- LocationText: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Decommisioned: string (nullable = true)
 |-- TaxReturnsFiled: string (nullable = true)
 |-- EstimatedPopulation: string (nullable = true)
 |-- TotalWages: string (nullable = true)
 |-- Notes: string (nullable = true)

+------------+-------+-----------+-------------------+-----+--------------+-----+-------+-----+-----+-----+-----------+-------+--------------------+--------------------+------------

**Create DF from file json**

In [16]:
dfFromFile2 = spark.read.json('/content/drive/MyDrive/Learn_pyspark/resources/zipcode1.json')
dfFromFile2.printSchema()

root
 |-- City: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Decommisioned: boolean (nullable = true)
 |-- Lat: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- LocationText: string (nullable = true)
 |-- LocationType: string (nullable = true)
 |-- Long: double (nullable = true)
 |-- RecordNumber: long (nullable = true)
 |-- State: string (nullable = true)
 |-- WorldRegion: string (nullable = true)
 |-- Xaxis: double (nullable = true)
 |-- Yaxis: double (nullable = true)
 |-- Zaxis: double (nullable = true)
 |-- ZipCodeType: string (nullable = true)
 |-- Zipcode: long (nullable = true)



**Create empty RDD in pyspark**


*   Using emptyRDD()
*   Using parallelize()



In [17]:
emptyRDD = spark.sparkContext.emptyRDD()
emptyRDD1 = spark.sparkContext.parallelize([])

**Create empty DF with empty RDD**


*   Using createDataFrame()
*   Using toDF()



In [19]:
schema = StructType([StructField("firstName", StringType(), nullable= True),
                     StructField("middleName", StringType(), nullable= True),
                     StructField("lastName", StringType(), nullable= True),
                     StructField("gender", StringType(), nullable= True)])
emptyDF = spark.createDataFrame(emptyRDD, schema)
emptyDF1 = emptyRDD1.toDF(schema)
emptyDF.printSchema()
emptyDF1.printSchema()

root
 |-- firstName: string (nullable = true)
 |-- middleName: string (nullable = true)
 |-- lastName: string (nullable = true)
 |-- gender: string (nullable = true)

root
 |-- firstName: string (nullable = true)
 |-- middleName: string (nullable = true)
 |-- lastName: string (nullable = true)
 |-- gender: string (nullable = true)



**Create empty DF with schema without RDD**

In [20]:
emptyDF2 = spark.createDataFrame([], schema)
emptyDF2.printSchema()

root
 |-- firstName: string (nullable = true)
 |-- middleName: string (nullable = true)
 |-- lastName: string (nullable = true)
 |-- gender: string (nullable = true)



**Create empty DF without schema and RDD**

In [21]:
emptyDF3 = spark.createDataFrame([], StructType([]))
emptyDF3.printSchema()

root

