In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build the SparkSession shared by every cell below (getOrCreate reuses an
# already-running session instead of starting a second one).
spark = (
    SparkSession.builder
    .appName("Python Spark SQL data source example")
    .getOrCreate()
)

### 通用的导入保存函数

In [8]:
# Read people.json through the generic reader, naming the source format explicitly.
df = spark.read.format('json').load('people.json')
print(type(df))
# Left disabled: the df.show() output in this notebook lists only `age` and
# `name` columns for this frame, so selecting "favorite_color" would fail.
# df.select("name", "favorite_color").write.save("namesAndFavoColors.parquet", mode='overwrite')

<class 'pyspark.sql.dataframe.DataFrame'>


#### 1. 人工指定参数
##### json, parquet, jdbc, orc, libsvm, csv, text都可以

In [None]:
# Generic load/save with manually specified options.

# Load a JSON file. `format` is optional.
df_json = spark.read.load("examples/src/main/resources/people.json", format="json")

# Save the selected columns as parquet. `format` is optional; `mode` is
# optional and is one of:
#   'error'     - raise if the target already exists
#   'append'    - add to the existing data
#   'overwrite' - replace the existing data
#   'ignore'    - skip the write if the target already exists
# The write uses df_json (loaded just above); the previous code referenced an
# undefined name `df_parquet` and raised NameError.
df_json.select("name", "age").write.save("namesAndAges.parquet", format="parquet", mode='ignore')

# Load a CSV file with a header row and schema inference.
# NOTE(review): the people.csv shipped with the Spark examples is
# semicolon-delimited, so the separator is ";" (it was ":") - confirm against
# your copy of the file.
df_csv = spark.read.load(
    "examples/src/main/resources/people.csv",
    format="csv", sep=";", inferSchema="true", header="true",
)

# Load an ORC file, then write it back with ORC-specific writer options.
# mode("overwrite") keeps the cell re-runnable once the output path exists.
df_orc = spark.read.orc("examples/src/main/resources/users.orc")
(
    df_orc.write
    .format("orc")
    .option("orc.bloom.filter.columns", "favorite_color")
    .option("orc.dictionary.key.threshold", "1.0")
    .mode("overwrite")
    .save("users_with_options.orc")
)

#### 2. Run SQL on files directly

In [20]:
# Query a parquet path directly with SQL; no table has to be registered first.
# NOTE(review): namesAndFavoColors.parquet must already exist on disk - the
# only write to it visible in this notebook is commented out.
names_colors_query = "SELECT * FROM parquet.`namesAndFavoColors.parquet`"
df_ = spark.sql(names_colors_query)
print(type(df_))

<class 'pyspark.sql.dataframe.DataFrame'>


#### 3. 持久化到Tables
##### 持久化到Hive metastore
##### 即使Session重新启动, 只要保持connection不变, 表仍然存在.
##### 持久化的表, 可以用SQL方法.
##### 可以指定路径, 把text/parquet/json等文件持久化到表. df.write.option('path', '/some/path').saveAsTable('t').
##### 指定了自定义路径时, 即使表被删除了, 该路径下的数据仍然在; 未指定路径时, 删除表会连同默认warehouse目录下的数据一起删除.

#### 4. Bucketing, Sorting and Partitioning
##### Bucketing, Sorting仅适用于持久化的Table (saveAsTable); Partitioning对save和saveAsTable都适用

In [9]:
# Display df (loaded from people.json above) as a text table.
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [17]:
# Persist df as the table "people_bucketed", bucketed into 42 buckets on
# `name` and sorted by `name`. Mode 'ignore' leaves an existing table untouched.
(
    df.write
    .bucketBy(42, "name")
    .sortBy("name")
    .mode('ignore')
    .saveAsTable("people_bucketed")
)

In [22]:
# Write df_ as parquet, partitioned by the favorite_color column.
# The default save mode raises when the target path already exists, which
# made this cell fail on every run after the first; "overwrite" keeps it
# re-runnable. The input (namesAndFavoColors.parquet) is a different path, so
# overwriting the output never touches the data being read.
(
    df_.write
    .partitionBy("favorite_color")
    .format("parquet")
    .mode("overwrite")
    .save("namesPartByColor.parquet")
)

In [None]:
# Load the sample users data and persist it as a table that is both
# partitioned (by favorite_color) and bucketed (42 buckets on name).
# NOTE: this rebinds `df`, which earlier cells use for the people.json frame.
df = spark.read.parquet("examples/src/main/resources/users.parquet")

# Without an explicit mode, saveAsTable raises if the table already exists,
# so the cell could only run once per metastore; "overwrite" makes it re-runnable.
(
    df.write
    .partitionBy("favorite_color")
    .bucketBy(42, "name")
    .mode("overwrite")
    .saveAsTable("people_partitioned_bucketed")
)

In [28]:
# Read the partitioned parquet output back with SQL-on-file and display it.
names_by_color = spark.sql("SELECT * FROM parquet.`namesPartByColor.parquet`")
names_by_color.show()

+------+--------------+
|  name|favorite_color|
+------+--------------+
|Alyssa|          null|
|   Ben|           red|
+------+--------------+



In [29]:
# Clean up the tables this notebook creates with saveAsTable. The second
# statement previously was a commented-out duplicate of the first, so
# people_partitioned_bucketed was never dropped. IF EXISTS makes both
# statements safe when a table was never created.
spark.sql("DROP TABLE IF EXISTS people_bucketed")
spark.sql("DROP TABLE IF EXISTS people_partitioned_bucketed")

DataFrame[]

In [21]:
# Display df_ (the result of the SQL-on-file query above) as a text table.
df_.show()

+------+--------------+
|  name|favorite_color|
+------+--------------+
|Alyssa|          null|
|   Ben|           red|
+------+--------------+

