In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder\
    .appName("SparkSql")\
    .master('local[2]')\
    .getOrCreate()
#getOrCreate()创建SparkSession对象

In [6]:
df = spark.read.json('../user.json')

In [7]:
df.createTempView('people')

In [11]:
#【SQL风格】
spark.sql('select * from people where age > 18').show()

+---+--------------+-----+
|age|       hobbies| name|
+---+--------------+-----+
| 19|[游戏, 羽毛球]|张三1|
| 19|[电视, 羽毛球]|张三4|
| 19|[小说, 乒乓球]|张三7|
+---+--------------+-----+



In [12]:
#【DSL风格】
df.where("age > 18").select("name", "age").show()

+-----+---+
| name|age|
+-----+---+
|张三1| 19|
|张三4| 19|
|张三7| 19|
+-----+---+



In [14]:
sc = spark.sparkContext
rdd = sc.textFile('../People.csv').map(lambda e: e.split('，'))

In [15]:
rdd.collect()

[['张三', '17', '男'],
 ['李四', '18', '女'],
 ['王五', '17', '女'],
 ['王二', '19', '男'],
 ['麻子', '16', '女'],
 ['李华', '20', '女'],
 ['刘明', '18', '男']]

In [19]:
#创建临时视图
df1 = rdd.toDF(["name", "age", "sex"])

In [20]:
#查看数据的结构
df1.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- sex: string (nullable = true)



In [21]:
rdd2 = rdd.map(lambda e: [e[0], int(e[1]), e[2]])
df2 = rdd2.toDF(["name", "age", "sex"])

In [22]:
df2.printSchema()


root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- sex: string (nullable = true)



In [25]:
#基于RDD创建方式1：使用createDataFrame
# rdd.collect()
df1 = spark.createDataFrame(rdd, ["name", "age", "sex"])
df1.show()

+----+---+---+
|name|age|sex|
+----+---+---+
|张三| 17| 男|
|李四| 18| 女|
|王五| 17| 女|
|王二| 19| 男|
|麻子| 16| 女|
|李华| 20| 女|
|刘明| 18| 男|
+----+---+---+



In [27]:
from pyspark.sql.types import StructType, StringType,IntegerType

#基于RDD创建方式二：使用StructType类型添加Schema

schema = StructType()\
    .add("name", StringType(), nullable=False)\
    .add("age", IntegerType(), nullable=True)\
    .add("age", StringType(), nullable=False)\

#一个add方法 定义一个列的信息, 如果有3个列, 就写三个add
#add方法: 参数1: 列名称；参数2: 列类型；参数3: 是否允许为空

df2 = spark.createDataFrame(rdd2,schema)
df2.show()

+----+---+---+
|name|age|age|
+----+---+---+
|张三| 17| 男|
|李四| 18| 女|
|王五| 17| 女|
|王二| 19| 男|
|麻子| 16| 女|
|李华| 20| 女|
|刘明| 18| 男|
+----+---+---+

+----+---+---+
|name|age|age|
+----+---+---+
|张三| 17| 男|
|李四| 18| 女|
|王五| 17| 女|
|王二| 19| 男|
|麻子| 16| 女|
|李华| 20| 女|
|刘明| 18| 男|
+----+---+---+



In [28]:
# 基于RDD创建方式三：toDF
df3_1 = rdd2.toDF( ["name", "age", "sex"])
df3_1.show()

+----+---+---+
|name|age|sex|
+----+---+---+
|张三| 17| 男|
|李四| 18| 女|
|王五| 17| 女|
|王二| 19| 男|
|麻子| 16| 女|
|李华| 20| 女|
|刘明| 18| 男|
+----+---+---+



In [29]:
df3_2 = rdd2.toDF(schema)
df3_2.show()

+----+---+---+
|name|age|age|
+----+---+---+
|张三| 17| 男|
|李四| 18| 女|
|王五| 17| 女|
|王二| 19| 男|
|麻子| 16| 女|
|李华| 20| 女|
|刘明| 18| 男|
+----+---+---+

