# Spark DataFrame Basics

In [1]:
# To work with Spark DataFrames we first need a SparkSession, so import the class:
from pyspark.sql import SparkSession

In [2]:
# Build a SparkSession for an application named "Basics".
# getOrCreate() reuses an already-running session if there is one; otherwise it creates a new one.
spark = SparkSession.builder.appName("Basics").getOrCreate()

In [3]:
# Read the JSON file "people.json" into a DataFrame.
# No schema is passed here, so Spark determines the column types itself.
df = spark.read.json("people.json")

In [4]:
# Print the rows of the df DataFrame as a text table:
df.show()
# Notice that the missing `age` value is displayed as `null`.

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [5]:
# Print the schema of the DataFrame (column names, types, and nullability):
df.printSchema()
# "long" is an integer type; it is what Spark chose for `age` when reading the JSON without a schema.

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [6]:
# List the column names:
df.columns  # Note: `columns` is an attribute, not a method, so there are no parentheses.

['age', 'name']

In [7]:
# Compute summary statistics (count, mean, stddev, min, max) for each column and display them:
df.describe().show()

+-------+------------------+-------+
|summary|               age|   name|
+-------+------------------+-------+
|  count|                 2|      3|
|   mean|              24.5|   null|
| stddev|7.7781745930520225|   null|
|    min|                19|   Andy|
|    max|                30|Michael|
+-------+------------------+-------+



### How to manually create/define the schema

(The Spark documentation does not explain this very clearly.)

In [8]:
from pyspark.sql.types import StructField, StringType, IntegerType, StructType

In [9]:
# Describe each column with a StructField, collected in a list:
#   name     - the column name (e.g. "age")
#   dataType - an instance of a Spark type class (e.g. IntegerType())
#   nullable - boolean: whether or not the field can be `null`
data_schema = [
    StructField(name="age", dataType=IntegerType(), nullable=True),
    StructField(name="name", dataType=StringType(), nullable=True),
]

In [10]:
# Wrap the list of fields in a StructType to obtain the complete schema:
final_struc = StructType(data_schema)

In [11]:
# Read the .json file again, this time passing final_struc as the schema
# so that the column types come from our definition.
df = spark.read.json("people.json", schema=final_struc)

In [12]:
df.printSchema()
# Notice that `age` is now reported as integer rather than long.

root
 |-- age: integer (nullable = true)
 |-- name: string (nullable = true)

