#### SparkSession创建

In [None]:
from pyspark.sql import SparkSession
# Obtain the notebook's SparkSession via the builder; every later cell
# reaches Spark through this `spark` handle.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

##### 一、导入
##### 导入后类型都是DataFrame

In [3]:
# Each statement below rebinds `df`; after the cell runs, `df` holds the
# result of the last load (users.orc).

# Plain load: no format is given, so Spark's default data source is used.
df = spark.read.load("examples/src/main/resources/users.parquet")
# Load with an explicit format.
df = spark.read.load("examples/src/main/resources/people.json", format="json")
# Load with an explicit format plus reader options.
# The sample people.csv shipped with Spark is semicolon-delimited
# ("name;age;job"), so the separator must be ";" -- with ":" every row is
# read as one single column.
df = spark.read.load(
    "examples/src/main/resources/people.csv",
    format="csv",
    sep=";",
    inferSchema="true",
    header="true",
)
# Load ORC through the format-specific reader.
df = spark.read.orc("examples/src/main/resources/users.orc")

##### 二、导出
##### mode参数控制目标已存在时的保存行为

In [None]:
# Notes
# mode = ['error' (fail if the target already exists), 'append' (add to it),
#         'overwrite' (replace it), 'ignore' (leave the existing data as is)]

# Plain save: no format is given, so Spark's default data source is used.
df.select("name", "favorite_color").write.save("namesAndFavColors.parquet")

# Save with an explicit format.
# NOTE(review): `df` is expected to be the users data set here (it is
# selected by "favorite_color" above and below), which carries no "age"
# column in the Spark sample data. The people data set is therefore loaded
# separately so that selecting "age" resolves -- confirm against your data.
people_df = spark.read.load("examples/src/main/resources/people.json", format="json")
people_df.select("name", "age").write.save("namesAndAges.parquet", format="parquet")

# Save as ORC, passing ORC writer options along with the write.
(
    df.write.format("orc")
    .option("orc.bloom.filter.columns", "favorite_color")
    .option("orc.dictionary.key.threshold", "1.0")
    .save("users_with_options.orc")
)

##### 三、持久化到表
##### 3.1 持久化到表后，即使重启Session，数据依然存在。

In [None]:
# Persist `df` as table 't', passing an explicit 'path' option to the writer.
table_writer = df.write.option('path', '/some/path')
table_writer.saveAsTable('t')

##### 3.2 Bucketing, Sorting and Partitioning后持久化到表

In [None]:
# Bucketing and sorting are applicable only to persistent tables
# NOTE(review): sorting by "age" needs the people data set; `df` is expected
# to be the users data set here (it is partitioned by "favorite_color"
# below), which has no "age" column in the Spark sample data. The people
# data is loaded separately so this write resolves -- confirm against your data.
people_df = spark.read.load("examples/src/main/resources/people.json", format="json")
people_df.write.bucketBy(42, "name").sortBy("age").saveAsTable("people_bucketed")

# Partitioning, unlike bucketing, can also be used for plain file output.
df.write.partitionBy("favorite_color").format("parquet").save("namesPartByColor.parquet")

# Partitioning and bucketing combined on a single persistent table.
(
    df.write.partitionBy("favorite_color")
    .bucketBy(42, "name")
    .saveAsTable("people_partitioned_bucketed")
)

##### 3.3 删除表

In [None]:
# Drop the bucketed demo table; IF EXISTS keeps the statement from failing
# when the table is absent.
drop_statement = "DROP TABLE IF EXISTS people_bucketed"
spark.sql(drop_statement)