# Saving Data

## First, load some data ...

In [2]:
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
import os
# Create (or reuse) a Spark session that runs on the local master.
spark = (
    SparkSession.builder
    .master("local")
    .getOrCreate()
)

# Absolute path of the "data" directory that sits one level above the
# current working directory.
DATA_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir, "data"))

In [4]:
# Read both CSV files, treating the first row of each as the header.
# No schema is supplied, so the columns come back as strings
# (see the printed schema below).
users = (
    spark.read
    .options(header="true")
    .csv(f"{DATA_DIR}/users.csv")
)
messages = (
    spark.read
    .options(header="true")
    .csv(f"{DATA_DIR}/messages.csv")
)

# Preview up to 10 rows of each frame without truncating cell values.
users.show(n=10, truncate=False)
messages.show(n=10, truncate=False)
users.printSchema()

+---+---------+--------+----------+
|id |firstname|lastname|dob       |
+---+---------+--------+----------+
|1  |John     |Smith   |2001-01-01|
|2  |Kim      |Melly   |1998-08-28|
+---+---------+--------+----------+

+----------+-------+-----------+-------------+----+---------+
|date      |m_title|m_body     |m_attachments|user|recipient|
+----------+-------+-----------+-------------+----+---------+
|2022-01-03|Title  |Hello World|null         |2   |p@ch.com |
|2022-01-02|Title 2|Hello World|null         |3   |d@ch.com |
+----------+-------+-----------+-------------+----+---------+

root
 |-- id: string (nullable = true)
 |-- firstname: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)



## Write to parquet ...

In [6]:
users_path = f"{DATA_DIR}/parquet/users"
messages_path = f"{DATA_DIR}/parquet/messages"

# Users: keep the four columns, casting `dob` from string to a date.
users_typed = users.select(
    "id",
    "firstname",
    "lastname",
    col("dob").cast("date"),
)
users_typed.write.mode("overwrite").parquet(users_path)

# Messages: add `dt` as the date-typed version of `date`, then drop the
# original string column.
messages_typed = (
    messages
    .withColumn("dt", col("date").cast("date"))
    .drop("date")
)
messages_typed.write.mode("overwrite").parquet(messages_path)

In [7]:
# Read each parquet dataset back and print its schema to confirm the
# date columns were stored with the `date` type.
for parquet_path in (users_path, messages_path):
    spark.read.parquet(parquet_path).printSchema()

# List the files Spark produced for the users dataset.
os.listdir(users_path)

root
 |-- id: string (nullable = true)
 |-- firstname: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: date (nullable = true)

root
 |-- m_title: string (nullable = true)
 |-- m_body: string (nullable = true)
 |-- m_attachments: string (nullable = true)
 |-- user: string (nullable = true)
 |-- recipient: string (nullable = true)
 |-- dt: date (nullable = true)



['._SUCCESS.crc',
 '.part-00000-346f0b3a-e78e-4b9b-836a-c6c5ab84b430-c000.snappy.parquet.crc',
 '_SUCCESS',
 'part-00000-346f0b3a-e78e-4b9b-836a-c6c5ab84b430-c000.snappy.parquet']

## Writing parquet into partitions

Write the messages data partitioned by the message date column `dt`; each distinct `dt` value gets its own `dt=<value>` sub-directory.

In [11]:
write_path = f"{DATA_DIR}/tmp/messages"

# Reload the typed messages (with the `dt` date column) from parquet.
messages = spark.read.parquet(messages_path)

# Write one sub-directory per distinct `dt` value, replacing any previous output.
(
    messages.write
    .mode("overwrite")
    .partitionBy("dt")
    .parquet(write_path)
)

os.listdir(write_path)


['._SUCCESS.crc', '_SUCCESS', 'dt=2022-01-02', 'dt=2022-01-03']

In [12]:
# Read a single partition by pointing at its directory. The `dt` column
# is not part of the result (see the output below) because the path
# targets the partition directory itself.
partition_dir = f"{write_path}/dt=2022-01-02"
spark.read.parquet(partition_dir).show()

+-------+-----------+-------------+----+---------+
|m_title|     m_body|m_attachments|user|recipient|
+-------+-----------+-------------+----+---------+
|Title 2|Hello World|         null|   3| d@ch.com|
+-------+-----------+-------------+----+---------+



In [13]:
# Save the same DataFrame in other formats.
# mode("overwrite") keeps this cell re-runnable: without it the writer's
# default save mode raises an error when the target path already exists.

# json
json_path = f"{DATA_DIR}/tmp/messages-json"
messages.write.format("json").mode("overwrite").save(json_path)

# csv (kept in its own variable so `json_path` still points at the JSON output)
csv_path = f"{DATA_DIR}/tmp/messages-csv"
messages.write.format("csv").mode("overwrite").save(csv_path)

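The resulting JSON and CSV files look like this (note that the JSON output omits null fields such as `m_attachments`, and the CSV output has no header row because the `header` option was not set):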

```
{"m_title":"Title","m_body":"Hello World","user":"2","recipient":"p@ch.com","dt":"2022-01-03"}
{"m_title":"Title 2","m_body":"Hello World","user":"3","recipient":"d@ch.com","dt":"2022-01-02"}
```

and

```
Title,Hello World,"",2,p@ch.com,2022-01-03
Title 2,Hello World,"",3,d@ch.com,2022-01-02
```

respectively