In [1]:
# findspark.init() has to run before the first pyspark import: its job is to
# locate the Spark installation and make its Python bindings importable.
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Start (or reuse) a session on the local machine; "local[*]" is the master URL.
spark = SparkSession.builder.master("local[*]").getOrCreate()
# SparkContext handle, used further down to build an RDD.
sc = spark.sparkContext

In [2]:
from pyspark.sql.types import *
from pyspark.sql import types

# Scalar columns of a student record.
id_ = StructField(name='id', dataType=LongType())
name = StructField(name='name', dataType=StringType())
class_ = StructField(name='class', dataType=IntegerType())

# Members of the nested `info` struct.
# Original note: "depends on the naming" -- presumably the field names must
# match the dict keys used for `info` in the row data; TODO confirm.
sex = StructField(name='sex', dataType=StringType())
age = StructField(name='age', dataType=IntegerType())
info = StructField(name='info', dataType=StructType([sex, age]))

# Collection columns: an array of strings and a string -> integer map.
interest = StructField(name='interest', dataType=ArrayType(elementType=StringType()))
score = StructField(
    name='score',
    dataType=MapType(keyType=StringType(), valueType=IntegerType()),
)

In [3]:
# Sample rows. Field order per row: id, name, class, info, interest, score.
data = [
    [
        1, "建国", 1,
        {'sex': '男', 'age': 17},
        ['篮球', '足球'],
        {'ch': 92, 'en': 75, 'math': 67},
    ],
    [
        2, "秀兰", 1,
        {'sex': '女', 'age': 17},
        ['排球', '爬山', '唱歌'],
        {'ch': 92, 'en': 45},
    ],
    [
        3, "小妹", 1,
        {'sex': '女', 'age': 15},
        ['篮球', '足球', '跳舞', 'kpop'],
        {'ch': 92, 'en': 75, 'math': 47},
    ],
    [
        4, "翠花", 1,
        {'sex': '女', 'age': 14},
        ['美术'],
        {'ch': 92},
    ],
]

In [4]:
# Combine the StructField objects into the row schema, in column order.
schema = StructType(fields=[id_, name, class_, info, interest, score])

# Build the DataFrame from the sample rows with that explicit schema and print it.
df = spark.createDataFrame(data, schema=schema)
df.show()

+---+----+-----+--------+------------------------+--------------------+
| id|name|class|    info|                interest|               score|
+---+----+-----+--------+------------------------+--------------------+
|  1|建国|    1|{男, 17}|            [篮球, 足球]|{en -> 75, math -...|
|  2|秀兰|    1|{女, 17}|      [排球, 爬山, 唱歌]|{ch -> 92, en -> 45}|
|  3|小妹|    1|{女, 15}|[篮球, 足球, 跳舞, kpop]|{en -> 75, math -...|
|  4|翠花|    1|{女, 14}|                  [美术]|          {ch -> 92}|
+---+----+-----+--------+------------------------+--------------------+



In [5]:
# Schema written as a DDL-style string: one "name type" pair per column, with
# struct<...>, array<...> and map<...> used for the nested columns.
schema = """
id long,
name string,
class integer,
info struct<sex:string, age:integer>,
interest array<string>,
score map<string, integer>
"""
# Rebuild the DataFrame from the same rows using the string schema, then print it.
df = spark.createDataFrame(data=data, schema=schema)
df.show()

+---+----+-----+--------+------------------------+--------------------+
| id|name|class|    info|                interest|               score|
+---+----+-----+--------+------------------------+--------------------+
|  1|建国|    1|{男, 17}|            [篮球, 足球]|{en -> 75, math -...|
|  2|秀兰|    1|{女, 17}|      [排球, 爬山, 唱歌]|{ch -> 92, en -> 45}|
|  3|小妹|    1|{女, 15}|[篮球, 足球, 跳舞, kpop]|{en -> 75, math -...|
|  4|翠花|    1|{女, 14}|                  [美术]|          {ch -> 92}|
+---+----+-----+--------+------------------------+--------------------+



In [6]:
from datetime import datetime, date
import pandas as pd
from pyspark.sql import Row

In [7]:
# Build a DataFrame from a list of tuples; column names and types come from
# the DDL-style schema string.
df = spark.createDataFrame(
    data=[
        (1, 'tony', 1.73, date(2000, 1, 1), datetime(2020, 1, 1, 12, 0)),
        (2, 'tony1', 1.74, date(2000, 1, 2), datetime(2020, 1, 2, 12, 0)),
        (3, 'tony2', 1.75, date(2000, 1, 3), datetime(2020, 1, 3, 12, 0)),
    ],
    schema='uid long, name string, height double, birth date, dt timestamp',
)
df.show()

+---+-----+------+----------+-------------------+
|uid| name|height|     birth|                 dt|
+---+-----+------+----------+-------------------+
|  1| tony|  1.73|2000-01-01|2020-01-01 12:00:00|
|  2|tony1|  1.74|2000-01-02|2020-01-02 12:00:00|
|  3|tony2|  1.75|2000-01-03|2020-01-03 12:00:00|
+---+-----+------+----------+-------------------+



In [8]:
# Column-oriented sample data held in a pandas DataFrame.
pandas_df = pd.DataFrame(
    {
        'uid': list(range(1, 4)),
        'name': ['tony', 'jack', 'mike'],
        'height': [1.73, 1.78, 1.83],
        # First day of January, February and March 2000.
        'birth': [date(2000, month, 1) for month in (1, 2, 3)],
        # Note the last timestamp is 12:01, not 12:00.
        'dt': [
            datetime(2020, 1, 1, 12, 0),
            datetime(2020, 1, 2, 12, 0),
            datetime(2020, 1, 3, 12, 1),
        ],
    }
)
# Convert the pandas DataFrame to a Spark DataFrame (no explicit schema is
# passed here) and print it.
df_from_pandas = spark.createDataFrame(data=pandas_df)
df_from_pandas.show()

+---+----+------+----------+-------------------+
|uid|name|height|     birth|                 dt|
+---+----+------+----------+-------------------+
|  1|tony|  1.73|2000-01-01|2020-01-01 12:00:00|
|  2|jack|  1.78|2000-02-01|2020-01-02 12:00:00|
|  3|mike|  1.83|2000-03-01|2020-01-03 12:01:00|
+---+----+------+----------+-------------------+



In [9]:
# Distribute the sample tuples as an RDD.
rdd = sc.parallelize(
    [
        (1, 'tony', 1.73, date(2000, 1, 1), datetime(2020, 1, 1, 12, 0)),
        (2, 'tony1', 1.74, date(2000, 1, 2), datetime(2020, 1, 2, 12, 0)),
        (3, 'tony2', 1.75, date(2000, 1, 3), datetime(2020, 1, 3, 12, 0)),
    ]
)

# Only column names are supplied here, not column types.
df_from_rdd = spark.createDataFrame(
    data=rdd,
    schema=['uid', 'name', 'height', 'birth', 'dt'],
)
df_from_rdd.show()

+---+-----+------+----------+-------------------+
|uid| name|height|     birth|                 dt|
+---+-----+------+----------+-------------------+
|  1| tony|  1.73|2000-01-01|2020-01-01 12:00:00|
|  2|tony1|  1.74|2000-01-02|2020-01-02 12:00:00|
|  3|tony2|  1.75|2000-01-03|2020-01-03 12:00:00|
+---+-----+------+----------+-------------------+



In [10]:
# Print each column's name, type and nullability for the RDD-derived DataFrame.
df_from_rdd.printSchema()

root
 |-- uid: long (nullable = true)
 |-- name: string (nullable = true)
 |-- height: double (nullable = true)
 |-- birth: date (nullable = true)
 |-- dt: timestamp (nullable = true)



In [11]:
# Enable eager evaluation so that a bare DataFrame expression at the end of a
# cell renders its rows instead of only a schema summary.
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)
# `df` is the most recently assigned DataFrame with that name (built from the
# tuple list with the DDL schema string).
df

uid,name,height,birth,dt
1,tony,1.73,2000-01-01,2020-01-01 12:00:00
2,tony1,1.74,2000-01-02,2020-01-02 12:00:00
3,tony2,1.75,2000-01-03,2020-01-03 12:00:00


In [12]:
# Bare expression: relies on the eager-evaluation setting to render the rows.
df_from_rdd

uid,name,height,birth,dt
1,tony,1.73,2000-01-01,2020-01-01 12:00:00
2,tony1,1.74,2000-01-02,2020-01-02 12:00:00
3,tony2,1.75,2000-01-03,2020-01-03 12:00:00


In [13]:
# Materialize every row as a Python list of Row objects.
df_from_rdd.collect()

[Row(uid=1, name='tony', height=1.73, birth=datetime.date(2000, 1, 1), dt=datetime.datetime(2020, 1, 1, 12, 0)),
 Row(uid=2, name='tony1', height=1.74, birth=datetime.date(2000, 1, 2), dt=datetime.datetime(2020, 1, 2, 12, 0)),
 Row(uid=3, name='tony2', height=1.75, birth=datetime.date(2000, 1, 3), dt=datetime.datetime(2020, 1, 3, 12, 0))]

In [14]:
# Project just the `name` column and print it.
df_from_rdd.select('name').show()

+-----+
| name|
+-----+
| tony|
|tony1|
|tony2|
+-----+



In [15]:
from pyspark.sql.functions import upper

# Append `upper_name`, an upper-cased copy of `name`, and print the result.
df_from_rdd.withColumn(
    colName='upper_name',
    col=upper(df_from_rdd['name']),
).show()

+---+-----+------+----------+-------------------+----------+
|uid| name|height|     birth|                 dt|upper_name|
+---+-----+------+----------+-------------------+----------+
|  1| tony|  1.73|2000-01-01|2020-01-01 12:00:00|      TONY|
|  2|tony1|  1.74|2000-01-02|2020-01-02 12:00:00|     TONY1|
|  3|tony2|  1.75|2000-01-03|2020-01-03 12:00:00|     TONY2|
+---+-----+------+----------+-------------------+----------+



In [16]:
# Keep only the rows whose uid is 1 and print them.
df_from_rdd.where(df_from_rdd['uid'] == 1).show()

+---+----+------+----------+-------------------+
|uid|name|height|     birth|                 dt|
+---+----+------+----------+-------------------+
|  1|tony|  1.73|2000-01-01|2020-01-01 12:00:00|
+---+----+------+----------+-------------------+



In [17]:
from pyspark.sql.functions import pandas_udf
# A decorator, loosely speaking, adds extra behaviour to an existing object.
# Here pandas_udf wraps the plain function as a Spark UDF whose declared
# return type is 'long'.
@pandas_udf('long')
def pandas_func(col: pd.Series) -> pd.Series:
    """Scale a numeric column by 100 and round to the nearest whole number.

    Args:
        col: one batch of column values as a pandas Series.

    Returns:
        A Series of the same length holding ``col * 100`` rounded to zero
        decimals.

    The explicit rounding matters because the declared return type is
    ``long``: a float product such as ``1.15 * 100`` (114.99999999999999)
    would otherwise lose its fractional part and come out as 114 rather
    than 115 when converted to an integer.
    """
    return (col * 100).round()

# Apply the pandas UDF to the `height` column and print the resulting column.
df_from_rdd.select(pandas_func(df_from_rdd['height'])).show()

+-------------------+
|pandas_func(height)|
+-------------------+
|                173|
|                174|
|                175|
+-------------------+



In [18]:
def pandas_func(DataFrame):
    """Yield, for each incoming batch, only the rows whose ``uid`` equals 1.

    Args:
        DataFrame: despite the name, an iterable of batches; each batch is
            indexed with a boolean mask on its ``uid`` attribute, i.e. it is
            used like a pandas DataFrame.

    Yields:
        One filtered batch per input batch (possibly with zero rows).
    """
    # NOTE(review): this definition rebinds `pandas_func`, which the notebook
    # also uses as the name of a pandas UDF in an earlier cell.
    wanted_uid = 1
    for batch in DataFrame:
        keep = batch.uid == wanted_uid
        yield batch[keep]
# Run the generator over the DataFrame's pandas batches; the output reuses the
# input DataFrame's schema, so the filtered rows keep the same columns.
df_from_rdd.mapInPandas(pandas_func, schema=df_from_rdd.schema).show()

+---+----+------+----------+-------------------+
|uid|name|height|     birth|                 dt|
+---+----+------+----------+-------------------+
|  1|tony|  1.73|2000-01-01|2020-01-01 12:00:00|
+---+----+------+----------+-------------------+

