In [1]:
import findspark

In [2]:
import os

# Point findspark at the Spark installation. Prefer the SPARK_HOME environment
# variable when it is set and non-empty so the notebook is not tied to one
# machine's directory layout; otherwise fall back to the original default path.
findspark.init(os.environ.get('SPARK_HOME') or '/usr/local/spark')

In [3]:
from pyspark.sql import SparkSession

In [4]:
# Obtain the notebook's Spark session (created on first use, reused afterwards),
# registered under the application name "DF Intro".
spark = (
    SparkSession.builder
    .appName("DF Intro")
    .getOrCreate()
)

In [5]:
# Load people.json into a DataFrame; no explicit schema is supplied here.
df = spark.read.json("people.json")

In [6]:
# Print summary statistics (count, mean, stddev, min, max) for each column.
df.describe().show()

+-------+------------------+-------+
|summary|               age|   name|
+-------+------------------+-------+
|  count|                 2|      3|
|   mean|              24.5|   null|
| stddev|7.7781745930520225|   null|
|    min|                19|   Andy|
|    max|                30|Michael|
+-------+------------------+-------+



In [7]:
from pyspark.sql.types import StringType,StructField,StructType,IntegerType

In [8]:
# Field definitions for the explicit schema: column name, column type, nullable flag.
data_schema = [
    StructField('age', IntegerType(), True),
    StructField("name", StringType(), True),
]

In [9]:
# Display the list of field definitions built above.
data_schema

[StructField(age,IntegerType,true), StructField(name,StringType,true)]

In [10]:
# Wrap the field list in a StructType; this is the object later passed as the
# reader's schema.
final_schema = StructType(fields=data_schema)

In [11]:
# Read people.json again, this time applying the explicit schema instead of
# relying on whatever the reader would pick by default.
df1 = (
    spark.read
    .schema(final_schema)
    .json("people.json")
)

In [12]:
# Print the column names, types and nullability of df1 as a tree.
df1.printSchema()

root
 |-- age: integer (nullable = true)
 |-- name: string (nullable = true)



In [13]:
# Print the rows of df1 as a text table.
df1.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [14]:
# Project a single column; select() yields a DataFrame, which show() then prints.
df1.select('age').show()

+----+
| age|
+----+
|null|
|  30|
|  19|
+----+



In [15]:
# Project both columns, passing the column names as separate arguments.
df1.select('age', 'name').show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [16]:
# With no argument, head() returns the first row as a single Row object.
df1.head()

Row(age=None, name='Michael')

In [18]:
# With a count, head(n) returns a list holding the first n Row objects.
df1.head(2)

[Row(age=None, name='Michael'), Row(age=30, name='Andy')]

In [19]:
# Show df1 with 'age' renamed to 'Age'. The result is not assigned, so df1 itself
# keeps its original column names.
df1.withColumnRenamed('age','Age').show()

+----+-------+
| Age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [20]:
# Show df1 with 'name' renamed to 'Name'. The result is not assigned, so df1 itself
# keeps its original column names.
df1.withColumnRenamed('name','Name').show()

+----+-------+
| age|   Name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [22]:
# Register df1 as a temporary view named 'people' so it can be queried by name
# through spark.sql(); an existing view with that name is replaced.
df1.createOrReplaceTempView('people')

In [23]:
# Query the 'people' view with SQL; spark.sql() returns a DataFrame, printed by show().
spark.sql("SELECT * FROM people").show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [24]:
# Keep rows with age >= 19. The row whose age is null does not appear in the result.
spark.sql("SELECT * FROM people where age >= 19").show()

+---+------+
|age|  name|
+---+------+
| 30|  Andy|
| 19|Justin|
+---+------+



In [25]:
# Equality filter on age. No row in the data has age 20, so the result is an
# empty table (header only).
spark.sql("SELECT * FROM people where age == 20").show()

+---+----+
|age|name|
+---+----+
+---+----+



In [26]:
# Keep rows with age strictly below 20.
spark.sql("SELECT * FROM people where age < 20").show()

+---+------+
|age|  name|
+---+------+
| 19|Justin|
+---+------+

