In [1]:
import pandas as pd
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder\
    .appName("SparkDemo1")\
    .getOrCreate()
    # .config("spark.sql.shuffle.partitions", "2")\

In [3]:
spark.sql("SELECT * FROM global_temp.housing").show()

AnalysisException: Table or view not found: global_temp.housing; line 1 pos 14;
'Project [*]
+- 'UnresolvedRelation [global_temp, housing], [], false


In [2]:
pd_df = pd.DataFrame({
    "id": [1, 2, 3, 4],
     "name": ['哈哈', '啦啦', '嘻嘻', '嘿嘿'],
    "age":[ 15, 17, 12, 20]})

In [3]:
pd_df

Unnamed: 0,id,name,age
0,1,哈哈,15
1,2,啦啦,17
2,3,嘻嘻,12
3,4,嘿嘿,20


In [7]:
df1 = spark.createDataFrame(pd_df)

DataFrame[id: bigint, name: string, age: bigint]

In [13]:
df1.show()

+---+----+---+
| id|name|age|
+---+----+---+
|  1|哈哈| 15|
|  2|啦啦| 17|
|  3|嘻嘻| 12|
|  4|嘿嘿| 20|
+---+----+---+



In [19]:
df2_1 = spark.read.format('text')\
                .load('../People.csv')
#默认展示二十行
df2_1.show()

+------------+
|       value|
+------------+
|张三，17，男|
|李四，18，女|
|王五，17，女|
|王二，19，男|
|麻子，16，女|
|李华，20，女|
|刘明，18，男|
+------------+



In [26]:
from pyspark.sql.types import StructType, IntegerType, StringType
# 只是支持一行
schema = StructType()\
    .add("name", StringType(), nullable=True)
    # .add("age", IntegerType(), nullable=True)
df2_2 = spark.read\
    .format('text')\
    .schema(schema)\
    .load('../People.csv')
#默认展示二十行
df2_2.show()

+------------+
|        name|
+------------+
|张三，17，男|
|李四，18，女|
|王五，17，女|
|王二，19，男|
|麻子，16，女|
|李华，20，女|
|刘明，18，男|
+------------+



In [30]:
df2_3 = spark.read.json('../user.jsons')
df2_3.show()

+-------+---+--------------+-----+
|address|age|       hobbies| name|
+-------+---+--------------+-----+
|   null| 19|[游戏, 羽毛球]|张三1|
|   null| 18|[篮球, 羽毛球]|张三2|
|   null| 17|[小说, 羽毛球]|张三3|
|   null| 19|[电视, 羽毛球]|张三4|
|   null| 17|  [游戏, 电影]|张三5|
|   null| 18|  [游戏, 篮球]|张三6|
| [郑州]| 19|[小说, 乒乓球]|张三7|
+-------+---+--------------+-----+



In [14]:
df3_1 = spark.read.format('csv')\
    .option('sep', ',')\
    .load('../housing.csv')

In [15]:
df3_1.printSchema()
df3_1.show()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)
 |-- _c8: string (nullable = true)
 |-- _c9: string (nullable = true)

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|      _c0|     _c1|               _c2|        _c3|           _c4|       _c5|       _c6|          _c7|               _c8|            _c9|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|
|  -122.23|   37.88|              41.0|      880.0|         129.0|     322.0|     126.0|    

In [18]:
df4_1 = spark.read.format('json')\
    .load('../data/score_list.jsons')

In [19]:
df4_1.show()

+-----+--------+--------+-----+
|grade|    name| subject| year|
+-----+--------+--------+-----+
|   92| Michael| Chinese| 2020|
|   87|    Andy| Chinese| 2020|
|   75|  Justin| Chinese| 2020|
|   62|   Berta| Chinese| 2020|
|   96| Michael|    math| 2020|
|   98|    Andy|    math| 2020|
|   78|  Justin|    math| 2020|
|   87|   Berta|    math| 2020|
|   87| Michael| Chinese| 2019|
|   90|    Andy| Chinese| 2019|
|   76|  Justin| Chinese| 2019|
|   74|   Berta| Chinese| 2019|
|   68| Michael|    math| 2019|
|   95|    Andy|    math| 2019|
|   87|  Justin|    math| 2019|
|   81|   Berta|    math| 2019|
|   95| Michael| Chinese| 2018|
|   91|    Andy| Chinese| 2018|
|   85|  Justin| Chinese| 2018|
|   77|   Berta| Chinese| 2018|
+-----+--------+--------+-----+
only showing top 20 rows



In [33]:
df4_1.groupBy("name","subject").avg("grade").show()

+--------+--------+-----------------+
|    name| subject|       avg(grade)|
+--------+--------+-----------------+
|    Andy|    math|97.33333333333333|
|  Justin|    math|81.33333333333333|
| Michael|    math|75.66666666666667|
|    Andy| Chinese|89.33333333333333|
|  Justin| Chinese|78.66666666666667|
|   Berta|    math|84.33333333333333|
| Michael| Chinese|91.33333333333333|
|   Berta| Chinese|             71.0|
+--------+--------+-----------------+



In [31]:
df4_1.groupBy("name").pivot("subject").avg("grade").show()

+--------+-----------------+-----------------+
|    name|          Chinese|             math|
+--------+-----------------+-----------------+
|  Justin|78.66666666666667|81.33333333333333|
|   Berta|             71.0|84.33333333333333|
| Michael|91.33333333333333|75.66666666666667|
|    Andy|89.33333333333333|97.33333333333333|
+--------+-----------------+-----------------+



In [35]:
df4_1.select("name").show()

+--------+
|    name|
+--------+
| Michael|
|    Andy|
|  Justin|
|   Berta|
| Michael|
|    Andy|
|  Justin|
|   Berta|
| Michael|
|    Andy|
|  Justin|
|   Berta|
| Michael|
|    Andy|
|  Justin|
|   Berta|
| Michael|
|    Andy|
|  Justin|
|   Berta|
+--------+
only showing top 20 rows



In [42]:
df4_1.select("name", "grade").show()

+--------+-----+
|    name|grade|
+--------+-----+
| Michael|   92|
|    Andy|   87|
|  Justin|   75|
|   Berta|   62|
| Michael|   96|
|    Andy|   98|
|  Justin|   78|
|   Berta|   87|
| Michael|   87|
|    Andy|   90|
|  Justin|   76|
|   Berta|   74|
| Michael|   68|
|    Andy|   95|
|  Justin|   87|
|   Berta|   81|
| Michael|   95|
|    Andy|   91|
|  Justin|   85|
|   Berta|   77|
+--------+-----+
only showing top 20 rows



In [40]:
df4_1.select("name", df4_1["grade"] + 1).show()

+--------+-----------+
|    name|(grade + 1)|
+--------+-----------+
| Michael|         93|
|    Andy|         88|
|  Justin|         76|
|   Berta|         63|
| Michael|         97|
|    Andy|         99|
|  Justin|         79|
|   Berta|         88|
| Michael|         88|
|    Andy|         91|
|  Justin|         77|
|   Berta|         75|
| Michael|         69|
|    Andy|         96|
|  Justin|         88|
|   Berta|         82|
| Michael|         96|
|    Andy|         92|
|  Justin|         86|
|   Berta|         78|
+--------+-----------+
only showing top 20 rows

+--------+-----+
|    name|grade|
+--------+-----+
| Michael|   92|
|    Andy|   87|
|  Justin|   75|
|   Berta|   62|
| Michael|   96|
|    Andy|   98|
|  Justin|   78|
|   Berta|   87|
| Michael|   87|
|    Andy|   90|
|  Justin|   76|
|   Berta|   74|
| Michael|   68|
|    Andy|   95|
|  Justin|   87|
|   Berta|   81|
| Michael|   95|
|    Andy|   91|
|  Justin|   85|
|   Berta|   77|
+--------+-----+
only showing t

In [50]:
df4_1.filter(df4_1['grade'] > 90).show()

+-----+--------+--------+-----+
|grade|    name| subject| year|
+-----+--------+--------+-----+
|   92| Michael| Chinese| 2020|
|   96| Michael|    math| 2020|
|   98|    Andy|    math| 2020|
|   95|    Andy|    math| 2019|
|   95| Michael| Chinese| 2018|
|   91|    Andy| Chinese| 2018|
|   99|    Andy|    math| 2018|
+-----+--------+--------+-----+



In [53]:
df4_1.groupBy("grade").count().show()

+-----+-----+
|grade|count|
+-----+-----+
|   77|    1|
|   98|    1|
|   95|    2|
|   68|    1|
|   87|    4|
|   63|    1|
|   79|    1|
|   96|    1|
|   85|    2|
|   74|    1|
|   62|    1|
|   76|    1|
|   92|    1|
|   75|    1|
|   78|    1|
|   81|    1|
|   90|    1|
|   99|    1|
|   91|    1|
+-----+-----+



In [56]:
df4_1.collect()[0]

Row(grade=92, name=' Michael', subject=' Chinese', year=' 2020')

In [57]:
df4_1.columns

['grade', 'name', 'subject', 'year']

In [60]:
df4_1.describe().show()

+-------+------------------+--------+--------+-----------------+
|summary|             grade|    name| subject|             year|
+-------+------------------+--------+--------+-----------------+
|  count|                24|      24|      24|               24|
|   mean|            83.625|    null|    null|           2019.0|
| stddev|10.499741197638697|    null|    null|0.834057656228437|
|    min|                62|    Andy| Chinese|             2018|
|    max|                99| Michael|    math|             2020|
+-------+------------------+--------+--------+-----------------+



In [63]:
df4_1.describe('grade').show()

+-------+------------------+
|summary|             grade|
+-------+------------------+
|  count|                24|
|   mean|            83.625|
| stddev|10.499741197638697|
|    min|                62|
|    max|                99|
+-------+------------------+



In [66]:
df4_2 = df4_1.describe('grade')
df4_2.filter(df4_2["summary"] == "max").show()

+-------+-----+
|summary|grade|
+-------+-----+
|    max|   99|
+-------+-----+



In [79]:
# filter 参数可以传入一个字符串
df4_2.filter("summary = 'max' ").show()

+-------+-----+
|summary|grade|
+-------+-----+
|    max|   99|
+-------+-----+



In [68]:
df4_1.first()

Row(grade=92, name=' Michael', subject=' Chinese', year=' 2020')

In [72]:
# 默认一行
df4_1.head()

Row(grade=92, name=' Michael', subject=' Chinese', year=' 2020')

In [74]:
# 获取前n行记录
df4_1.take(2)

[Row(grade=92, name=' Michael', subject=' Chinese', year=' 2020'),
 Row(grade=87, name=' Andy', subject=' Chinese', year=' 2020')]

In [92]:
df4_1.filter("subject = 'math' or grade > 60").show(6)

+-----+--------+--------+-----+
|grade|    name| subject| year|
+-----+--------+--------+-----+
|   92| Michael| Chinese| 2020|
|   87|    Andy| Chinese| 2020|
|   75|  Justin| Chinese| 2020|
|   62|   Berta| Chinese| 2020|
|   96| Michael|    math| 2020|
|   98|    Andy|    math| 2020|
+-----+--------+--------+-----+
only showing top 6 rows



In [110]:
# math前面有个空格
df4_1.filter("subject = ' math' and grade > 60").show()

+-----+--------+-------+-----+
|grade|    name|subject| year|
+-----+--------+-------+-----+
|   96| Michael|   math| 2020|
|   98|    Andy|   math| 2020|
|   78|  Justin|   math| 2020|
|   87|   Berta|   math| 2020|
|   68| Michael|   math| 2019|
|   95|    Andy|   math| 2019|
|   87|  Justin|   math| 2019|
|   81|   Berta|   math| 2019|
|   63| Michael|   math| 2018|
|   99|    Andy|   math| 2018|
|   79|  Justin|   math| 2018|
|   85|   Berta|   math| 2018|
+-----+--------+-------+-----+



In [111]:
df4_1.filter(df4_1['subject'] == " Chinese").show()

+-----+--------+--------+-----+
|grade|    name| subject| year|
+-----+--------+--------+-----+
|   92| Michael| Chinese| 2020|
|   87|    Andy| Chinese| 2020|
|   75|  Justin| Chinese| 2020|
|   62|   Berta| Chinese| 2020|
|   87| Michael| Chinese| 2019|
|   90|    Andy| Chinese| 2019|
|   76|  Justin| Chinese| 2019|
|   74|   Berta| Chinese| 2019|
|   95| Michael| Chinese| 2018|
|   91|    Andy| Chinese| 2018|
|   85|  Justin| Chinese| 2018|
|   77|   Berta| Chinese| 2018|
+-----+--------+--------+-----+



In [108]:
df4_1.filter("subject=' Chinese'").show()

+-----+--------+--------+-----+
|grade|    name| subject| year|
+-----+--------+--------+-----+
|   92| Michael| Chinese| 2020|
|   87|    Andy| Chinese| 2020|
|   75|  Justin| Chinese| 2020|
|   62|   Berta| Chinese| 2020|
|   87| Michael| Chinese| 2019|
|   90|    Andy| Chinese| 2019|
|   76|  Justin| Chinese| 2019|
|   74|   Berta| Chinese| 2019|
|   95| Michael| Chinese| 2018|
|   91|    Andy| Chinese| 2018|
|   85|  Justin| Chinese| 2018|
|   77|   Berta| Chinese| 2018|
+-----+--------+--------+-----+



In [113]:
df4_1.select("name", "grade").show(3)

+--------+-----+
|    name|grade|
+--------+-----+
| Michael|   92|
|    Andy|   87|
|  Justin|   75|
+--------+-----+
only showing top 3 rows



In [208]:
# df4_1.select($'grade' + 10).show(2)
df4_1.select(df4_1.grade + 10).show(2)

TypeError: can only concatenate str (not "int") to str

In [116]:
# 返回一个新的DataFrame
df4_1.drop("name").show()

+-----+--------+-----+
|grade| subject| year|
+-----+--------+-----+
|   92| Chinese| 2020|
|   87| Chinese| 2020|
|   75| Chinese| 2020|
|   62| Chinese| 2020|
|   96|    math| 2020|
|   98|    math| 2020|
|   78|    math| 2020|
|   87|    math| 2020|
|   87| Chinese| 2019|
|   90| Chinese| 2019|
|   76| Chinese| 2019|
|   74| Chinese| 2019|
|   68|    math| 2019|
|   95|    math| 2019|
|   87|    math| 2019|
|   81|    math| 2019|
|   95| Chinese| 2018|
|   91| Chinese| 2018|
|   85| Chinese| 2018|
|   77| Chinese| 2018|
+-----+--------+-----+
only showing top 20 rows



In [120]:
# limit 和 head不同，limit不是一个行动算子
df4_1.limit(2).show()

+-----+--------+--------+-----+
|grade|    name| subject| year|
+-----+--------+--------+-----+
|   92| Michael| Chinese| 2020|
|   87|    Andy| Chinese| 2020|
+-----+--------+--------+-----+



In [122]:
df4_1.orderBy("grade").show()

+-----+--------+--------+-----+
|grade|    name| subject| year|
+-----+--------+--------+-----+
|   62|   Berta| Chinese| 2020|
|   63| Michael|    math| 2018|
|   68| Michael|    math| 2019|
|   74|   Berta| Chinese| 2019|
|   75|  Justin| Chinese| 2020|
|   76|  Justin| Chinese| 2019|
|   77|   Berta| Chinese| 2018|
|   78|  Justin|    math| 2020|
|   79|  Justin|    math| 2018|
|   81|   Berta|    math| 2019|
|   85|   Berta|    math| 2018|
|   85|  Justin| Chinese| 2018|
|   87|   Berta|    math| 2020|
|   87| Michael| Chinese| 2019|
|   87|    Andy| Chinese| 2020|
|   87|  Justin|    math| 2019|
|   90|    Andy| Chinese| 2019|
|   91|    Andy| Chinese| 2018|
|   92| Michael| Chinese| 2020|
|   95|    Andy|    math| 2019|
+-----+--------+--------+-----+
only showing top 20 rows



In [123]:
df4_1.orderBy(df4_1["grade"].desc()).show()

+-----+--------+--------+-----+
|grade|    name| subject| year|
+-----+--------+--------+-----+
|   99|    Andy|    math| 2018|
|   98|    Andy|    math| 2020|
|   96| Michael|    math| 2020|
|   95|    Andy|    math| 2019|
|   95| Michael| Chinese| 2018|
|   92| Michael| Chinese| 2020|
|   91|    Andy| Chinese| 2018|
|   90|    Andy| Chinese| 2019|
|   87|  Justin|    math| 2019|
|   87|   Berta|    math| 2020|
|   87|    Andy| Chinese| 2020|
|   87| Michael| Chinese| 2019|
|   85|  Justin| Chinese| 2018|
|   85|   Berta|    math| 2018|
|   81|   Berta|    math| 2019|
|   79|  Justin|    math| 2018|
|   78|  Justin|    math| 2020|
|   77|   Berta| Chinese| 2018|
|   76|  Justin| Chinese| 2019|
|   75|  Justin| Chinese| 2020|
+-----+--------+--------+-----+
only showing top 20 rows



In [124]:
df4_1.orderBy(- df4_1["grade"]).show()

+-----+--------+--------+-----+
|grade|    name| subject| year|
+-----+--------+--------+-----+
|   99|    Andy|    math| 2018|
|   98|    Andy|    math| 2020|
|   96| Michael|    math| 2020|
|   95|    Andy|    math| 2019|
|   95| Michael| Chinese| 2018|
|   92| Michael| Chinese| 2020|
|   91|    Andy| Chinese| 2018|
|   90|    Andy| Chinese| 2019|
|   87|  Justin|    math| 2019|
|   87|   Berta|    math| 2020|
|   87|    Andy| Chinese| 2020|
|   87| Michael| Chinese| 2019|
|   85|  Justin| Chinese| 2018|
|   85|   Berta|    math| 2018|
|   81|   Berta|    math| 2019|
|   79|  Justin|    math| 2018|
|   78|  Justin|    math| 2020|
|   77|   Berta| Chinese| 2018|
|   76|  Justin| Chinese| 2019|
|   75|  Justin| Chinese| 2020|
+-----+--------+--------+-----+
only showing top 20 rows



In [128]:
df4_1.join(df3_1, 'name').show()

+--------+-----+--------+-----+-----+--------+-----+
|    name|grade| subject| year|grade| subject| year|
+--------+-----+--------+-----+-----+--------+-----+
| Michael|   92| Chinese| 2020|   63|    math| 2018|
| Michael|   92| Chinese| 2020|   95| Chinese| 2018|
| Michael|   92| Chinese| 2020|   68|    math| 2019|
| Michael|   92| Chinese| 2020|   87| Chinese| 2019|
| Michael|   92| Chinese| 2020|   96|    math| 2020|
| Michael|   92| Chinese| 2020|   92| Chinese| 2020|
|    Andy|   87| Chinese| 2020|   99|    math| 2018|
|    Andy|   87| Chinese| 2020|   91| Chinese| 2018|
|    Andy|   87| Chinese| 2020|   95|    math| 2019|
|    Andy|   87| Chinese| 2020|   90| Chinese| 2019|
|    Andy|   87| Chinese| 2020|   98|    math| 2020|
|    Andy|   87| Chinese| 2020|   87| Chinese| 2020|
|  Justin|   75| Chinese| 2020|   79|    math| 2018|
|  Justin|   75| Chinese| 2020|   85| Chinese| 2018|
|  Justin|   75| Chinese| 2020|   87|    math| 2019|
|  Justin|   75| Chinese| 2020|   76| Chinese|

In [130]:
df4_1.join(df3_1, 'name', "inner").show()

+--------+-----+--------+-----+-----+--------+-----+
|    name|grade| subject| year|grade| subject| year|
+--------+-----+--------+-----+-----+--------+-----+
| Michael|   92| Chinese| 2020|   63|    math| 2018|
| Michael|   92| Chinese| 2020|   95| Chinese| 2018|
| Michael|   92| Chinese| 2020|   68|    math| 2019|
| Michael|   92| Chinese| 2020|   87| Chinese| 2019|
| Michael|   92| Chinese| 2020|   96|    math| 2020|
| Michael|   92| Chinese| 2020|   92| Chinese| 2020|
|    Andy|   87| Chinese| 2020|   99|    math| 2018|
|    Andy|   87| Chinese| 2020|   91| Chinese| 2018|
|    Andy|   87| Chinese| 2020|   95|    math| 2019|
|    Andy|   87| Chinese| 2020|   90| Chinese| 2019|
|    Andy|   87| Chinese| 2020|   98|    math| 2020|
|    Andy|   87| Chinese| 2020|   87| Chinese| 2020|
|  Justin|   75| Chinese| 2020|   79|    math| 2018|
|  Justin|   75| Chinese| 2020|   85| Chinese| 2018|
|  Justin|   75| Chinese| 2020|   87|    math| 2019|
|  Justin|   75| Chinese| 2020|   76| Chinese|

In [196]:
df4_2 = df4_1.describe("grade")
res = df4_2.filter("summary='mean'").select("grade")

In [204]:
res.show()

+------+
| grade|
+------+
|83.625|
+------+



In [205]:
sum_res = df4_1.select("grade").count() * 83.625

In [206]:

from  pyspark.sql.functions import when, lit, col
df4_1.withColumn("proportion", col("grade")/ sum_res).show()

+-----+--------+--------+-----+--------------------+
|grade|    name| subject| year|          proportion|
+-----+--------+--------+-----+--------------------+
|   92| Michael| Chinese| 2020|  0.0458395615346288|
|   87|    Andy| Chinese| 2020|0.043348281016442454|
|   75|  Justin| Chinese| 2020| 0.03736920777279522|
|   62|   Berta| Chinese| 2020| 0.03089187842551071|
|   96| Michael|    math| 2020| 0.04783258594917788|
|   98|    Andy|    math| 2020|0.048829098156452415|
|   78|  Justin|    math| 2020| 0.03886397608370702|
|   87|   Berta|    math| 2020|0.043348281016442454|
|   87| Michael| Chinese| 2019|0.043348281016442454|
|   90|    Andy| Chinese| 2019| 0.04484304932735426|
|   76|  Justin| Chinese| 2019|0.037867463876432486|
|   74|   Berta| Chinese| 2019| 0.03687095166915795|
|   68| Michael|    math| 2019| 0.03388141504733433|
|   95|    Andy|    math| 2019| 0.04733432984554061|
|   87|  Justin|    math| 2019|0.043348281016442454|
|   81|   Berta|    math| 2019| 0.040358744394

+-------+------------------+
|summary|             grade|
+-------+------------------+
|  count|                24|
|   mean|            83.625|
| stddev|10.499741197638697|
|    min|                62|
|    max|                99|
+-------+------------------+

