# 这个Tutorial演示PySpark DataFrame与Spark SQL的基本操作


## 1. 通过SparkSession创建一个spark, 用于后续的动作。
## 2. 读取json文件
## 3. show()每一行。
## 4. printSchema()
## 5. select('name').show()每一行
## 6. select(df['name'],df['age']+1).show()每一行
## 7. filter(df['age']>=21).show()筛选show每一行
## 8. groupBy("age").count().show() 按age分组计数并show.

In [None]:
from pyspark.sql import SparkSession

In [24]:
# $example on:init_session$
# SparkSession is the entry point: every later call in this tutorial goes
# through the `spark` object built (or reused, via getOrCreate) here.
spark = (
    SparkSession.builder
    .appName("michael")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)
# $example off:init_session$

In [25]:
# $example on:create_df$
# `spark` is the SparkSession created earlier.
# Read the JSON file people.json into a DataFrame bound to `df`.
df = spark.read.json("people.json")

In [32]:
# Print the DataFrame's rows to stdout as a text table.
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [27]:
# $example on:untyped_ops$
# Reuses `spark` and `df` from the previous cells.
# Print the schema as a tree: one line per column with its type and
# nullability.
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [28]:
# Project the DataFrame down to just its "name" column, then print it.
names_only = df.select("name")
names_only.show()

+-------+
|   name|
+-------+
|Michael|
|   Andy|
| Justin|
+-------+



In [29]:
# Select every person's name together with their age incremented by 1.
# This only builds a new result for display; the values held in `df` are
# not modified. A null age stays null after the addition.
name_and_next_age = df.select(df['name'], df['age'] + 1)
name_and_next_age.show()

+-------+---------+
|   name|(age + 1)|
+-------+---------+
|Michael|     null|
|   Andy|       31|
| Justin|       20|
+-------+---------+



In [31]:
# Select people aged 21 or older (the comparison is inclusive).
# Rows whose age is null do not satisfy the condition and are dropped.
df.filter(df['age'] >= 21).show()

+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+



In [33]:
# Count how many people share each age: one output row per distinct age,
# with null forming its own group.
age_counts = df.groupBy("age").count()
age_counts.show()

+----+-----+
| age|count|
+----+-----+
|  19|    1|
|null|    1|
|  30|    1|
+----+-----+



In [37]:
# $example on:run_sql$
# Register the DataFrame as a SQL temporary view named "people" so it can be
# queried through spark.sql; the create-or-replace variant overwrites any
# existing view of that name, so the cell is safe to re-run.
df.createOrReplaceTempView("people")

In [38]:
# Query the "people" temporary view with SQL; the result is bound to sqlDF
# and printed with show().
sqlDF = spark.sql("SELECT * FROM people")
sqlDF.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [41]:
# $example on:global_temp_view$
# Register the DataFrame as a global temporary view named "people".
# createGlobalTempView raises if a view with that name is already registered
# (for example when this cell is re-run), so use the create-or-replace
# variant to keep the cell safe to execute repeatedly.
df.createOrReplaceGlobalTempView("people")

In [42]:
# Global temporary views are tied to the system-preserved database
# `global_temp`, so the view name is qualified with that database here.
spark.sql("SELECT * FROM global_temp.people").show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [43]:
# A global temporary view is cross-session: a freshly created session can
# query it just like the session that registered it.
other_session = spark.newSession()
other_session.sql("SELECT * FROM global_temp.people").show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [20]:
# Stop the SparkSession once the examples are finished.
spark.stop()