In [1]:
# findspark must run before any pyspark import: init() locates the Spark
# installation and adds its Python bindings to sys.path.
import findspark
findspark.init()

from pyspark.sql import SparkSession

In [2]:
# Start (or reuse) a local Spark session named "test" that uses all local
# cores, and keep a handle on its SparkContext for the RDD examples.
spark = (
    SparkSession.builder
    .appName("test")
    .master("local[*]")
    .getOrCreate()
)
sc = spark.sparkContext

In [6]:
# Parse each "name, age" line of people.txt into a (name, int(age)) tuple.
# - Blank lines (e.g. a trailing newline) are skipped instead of raising
#   IndexError.
# - The line is split on its last comma and both parts are stripped, so
#   "name,age" without a space after the comma is parsed as well.
rdd = (
    sc.textFile("./people.txt")
    .filter(lambda line: line.strip())
    .map(lambda line: [field.strip() for field in line.rsplit(",", 1)])
    .map(lambda fields: (fields[0], int(fields[1])))
)

# 自动判别字段类型

In [8]:
# Only the column names are supplied here; the column types are inferred
# from the (name, age) tuples in the RDD.
df = spark.createDataFrame(rdd, schema=['name', 'age'])

In [9]:
# Inspect the column names, inferred types and nullability.
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)



In [10]:
# Display at most the first 10 rows.
df.show(10)

+-------+---+
|   name|age|
+-------+---+
|Michael| 29|
|   Andy| 30|
| Justin| 16|
+-------+---+



In [11]:
# Register the DataFrame as the temporary view `people` so it can be queried
# with SQL. createOrReplaceTempView keeps this cell re-runnable, whereas
# createTempView raises if a view with that name already exists.
df.createOrReplaceTempView('people')

In [16]:
# SQL style: query the `people` temp view for rows with age below 30.
under_30 = spark.sql("SELECT * FROM people WHERE age < 30")
under_30.show()

+-------+---+
|   name|age|
+-------+---+
|Michael| 29|
| Justin| 16|
+-------+---+



# 使用StructType构建DataFrame表

In [18]:
from pyspark.sql.types import StructType, StringType, IntegerType

In [20]:
# Explicit schema: `name` is a nullable string, `age` a non-nullable integer.
schema = (
    StructType()
    .add("name", StringType(), nullable=True)
    .add("age", IntegerType(), nullable=False)
)

In [21]:
# Build the DataFrame from the RDD using the explicit StructType schema
# rather than inferred types.
df = spark.createDataFrame(rdd, schema=schema)

In [22]:
# Display the rows of the DataFrame built with the explicit schema.
df.show()

+-------+---+
|   name|age|
+-------+---+
|Michael| 29|
|   Andy| 30|
| Justin| 16|
+-------+---+



In [23]:
# Check that the declared types were applied (age: integer, not nullable).
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = false)



# 使用RDD的toDF方法转换

In [26]:
# Convert the RDD directly with toDF, reusing the same StructType schema.
df = rdd.toDF(schema=schema)

In [27]:
# Display the rows of the DataFrame produced by rdd.toDF.
df.show()

+-------+---+
|   name|age|
+-------+---+
|Michael| 29|
|   Andy| 30|
| Justin| 16|
+-------+---+



# 源数据来自于Pandas的DataFrame

In [28]:
import pandas as pd

In [31]:
# Small in-memory pandas frame (one tuple per row) used as the source for a
# Spark DataFrame.
df_pd = pd.DataFrame(
    [(1, "Jack", 11), (2, "Justin", 13), (3, "Ala", 14)],
    columns=["id", "name", "age"],
)

In [34]:
# Convert the pandas DataFrame into a Spark DataFrame, naming the columns
# explicitly.
df = spark.createDataFrame(df_pd, schema=['id', 'name', 'age'])

In [35]:
# Display the rows of the Spark DataFrame created from pandas.
df.show()

+---+------+---+
| id|  name|age|
+---+------+---+
|  1|  Jack| 11|
|  2|Justin| 13|
|  3|   Ala| 14|
+---+------+---+



# 统一API从文件读取

In [49]:
# Text source: the schema is a single string column named `value`.
# A dedicated name is used so the (name, age) `schema` defined earlier is not
# overwritten, which keeps the createDataFrame / toDF cells above re-runnable.
text_schema = StructType().add("value", StringType(), True)
df = spark.read.format("text").schema(text_schema).load("./people.txt")

In [50]:
# Each line of the text file is one row in the single `value` column.
df.show()

+-----------+
|      value|
+-----------+
|Michael, 29|
|   Andy, 30|
| Justin, 16|
+-----------+



In [51]:
# No schema is supplied for the JSON source, so the column names and types
# shown by printSchema are inferred from the file.
df = spark.read.format("json").load("./people.json")  # JSON file
df.printSchema()
df.show()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  14|   Mike|
|  18|   Mike|
+----+-------+



In [52]:
# Schema for the CSV source: name, age, job.
# NOTE(review): file-based sources appear to treat user-supplied columns as
# nullable regardless of the flag given here -- confirm with df.printSchema().
schema = (
    StructType()
    .add("name", StringType(), nullable=True)
    .add("age", IntegerType(), nullable=False)
    .add("job", StringType(), nullable=False)
)

# Semicolon-separated, UTF-8 encoded CSV whose first line is a header row.
df = (
    spark.read.format("csv")
    .options(sep=";", header=True, encoding="utf-8")
    .schema(schema)
    .load("./people.csv")
)

In [53]:
# Display the rows loaded from the CSV file.
df.show()

+-----+---+---------+
| name|age|      job|
+-----+---+---------+
|Jorge| 30|Developer|
|  Bob| 32|  Manager|
|Alice|  9|Developer|
+-----+---+---------+



# DataFrame编程：DSL和SQL风格

In [54]:
# Show the DataFrame again as the starting point for the examples below.
df.show()

+-----+---+---------+
| name|age|      job|
+-----+---+---------+
|Jorge| 30|Developer|
|  Bob| 32|  Manager|
|Alice|  9|Developer|
+-----+---+---------+



In [57]:
# DSL style: project only the `name` and `age` columns.
df.select("name", "age").show()

+-----+---+
| name|age|
+-----+---+
|Jorge| 30|
|  Bob| 32|
|Alice|  9|
+-----+---+



In [58]:
# DSL filter with the condition written as a SQL expression string.
df.filter("age < 31").show()

+-----+---+---------+
| name|age|      job|
+-----+---+---------+
|Jorge| 30|Developer|
|Alice|  9|Developer|
+-----+---+---------+



In [60]:
# Same filter expressed with a Column object instead of a SQL string.
df.filter(df.age < 31).show()

+-----+---+---------+
| name|age|      job|
+-----+---+---------+
|Jorge| 30|Developer|
|Alice|  9|Developer|
+-----+---+---------+



In [65]:
# groupBy returns a GroupedData object: a structure that holds the rows
# together with their grouping relationship.
# It is followed by an aggregation API: sum, avg, count, min, max.
# Applying the aggregation returns a DataFrame again.
df.groupBy("name").count().show()

+-----+-----+
| name|count|
+-----+-----+
|  Bob|    1|
|Jorge|    1|
|Alice|    1|
+-----+-----+

