In [1]:
import pyspark
from pyspark.sql import SparkSession
import findspark
findspark.init()
spark = SparkSession.builder.appName('test9').master('local[*]')\
        .enableHiveSupport().getOrCreate()  # sparkSQL 编程
sc = spark.sparkContext  # RDD编程

In [2]:
rdd = sc.parallelize([('jack', 20),
                       ('rose', 18)])

In [3]:
df = rdd.toDF(schema=['name', 'age'])

In [4]:
df.show()

+----+---+
|name|age|
+----+---+
|jack| 20|
|rose| 18|
+----+---+



In [5]:
rdd = df.rdd
rdd.collect()

[Row(name='jack', age=20), Row(name='rose', age=18)]

In [8]:
import pandas as pd
pd_df = pd.DataFrame(data=[('jack', 20), ('rose', 18)], columns=['name', 'age'])
pd_df.head()

Unnamed: 0,name,age
0,jack,20
1,rose,18


In [9]:
df = spark.createDataFrame(pd_df)
df.show()

+----+---+
|name|age|
+----+---+
|jack| 20|
|rose| 18|
+----+---+



In [10]:
pd_df = df.toPandas()
pd_df.head()

Unnamed: 0,name,age
0,jack,20
1,rose,18


In [11]:
data = [('jack', 20), ('rose', 18)]
df = spark.createDataFrame(data=data, schema=['name', 'age'])

In [12]:
df.show()

+----+---+
|name|age|
+----+---+
|jack| 20|
|rose| 18|
+----+---+



In [13]:
from datetime import datetime
data = [('jack', 20, datetime(2001, 1, 10)),
       ('rose', 18, datetime(2002, 1, 10)),
       ('tom', 20, datetime(2004, 1, 10)),]
rdd = sc.parallelize(data)
rdd.collect()

[('jack', 20, datetime.datetime(2001, 1, 10, 0, 0)),
 ('rose', 18, datetime.datetime(2002, 1, 10, 0, 0)),
 ('tom', 20, datetime.datetime(2004, 1, 10, 0, 0))]

In [14]:
df = spark.createDataFrame(rdd, schema=['name', 'age', 'birthday'])

In [15]:
df.show()

+----+---+-------------------+
|name|age|           birthday|
+----+---+-------------------+
|jack| 20|2001-01-10 00:00:00|
|rose| 18|2002-01-10 00:00:00|
| tom| 20|2004-01-10 00:00:00|
+----+---+-------------------+



In [17]:
df = rdd.toDF(schema=['name', 'age', 'birthday'])
# df.show()
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- birthday: timestamp (nullable = true)



In [19]:
schema = 'name string, age int, birth date'
df = spark.createDataFrame(rdd, schema=schema)
df.printSchema()
df.show()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- birth: date (nullable = true)

+----+---+----------+
|name|age|     birth|
+----+---+----------+
|jack| 20|2001-01-10|
|rose| 18|2002-01-10|
| tom| 20|2004-01-10|
+----+---+----------+



In [20]:
df = spark.read.json('json_data.json')
df.show()

+-----+------------+-----------+------------+
|label|petal_length|petal_width|sepal_length|
+-----+------------+-----------+------------+
|    0|         1.4|        0.2|         5.1|
+-----+------------+-----------+------------+



In [25]:
df = spark.read.csv('csv_data.csv', header=True, inferSchema=True)
df.show()
df.printSchema()

+------------+------------+-------------+------------+------+
|speal_length| sepal_width| petal_length| petal_width| label|
+------------+------------+-------------+------------+------+
|         5.1|         3.5|          1.4|         0.2|   0.0|
+------------+------------+-------------+------------+------+

root
 |-- speal_length: double (nullable = true)
 |--  sepal_width: double (nullable = true)
 |--  petal_length: double (nullable = true)
 |--  petal_width: double (nullable = true)
 |--  label: double (nullable = true)

