# Saving Data

## First, load some data ...

In [1]:
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
import os
# Local-mode session for this walkthrough; getOrCreate() hands back an
# existing session when one is already running in the kernel.
spark = (
    SparkSession.builder
    .master("local")
    .getOrCreate()
)

In [2]:
# Absolute path of the "data" folder that sits beside the current working
# directory (i.e. one level up, then into "data").
DATA_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir, "data"))

In [17]:
# Both CSV files start with a header row. No schema is supplied, so the
# columns come back as strings (see the printed users schema below).
users = spark.read.csv(f"{DATA_DIR}/users.csv", header=True)
messages = spark.read.csv(f"{DATA_DIR}/messages.csv", header=True)

# Preview up to 10 untruncated rows of each frame, then the users schema.
users.show(n=10, truncate=False)
messages.show(n=10, truncate=False)
users.printSchema()

+---+---------+--------+----------+
|id |firstname|lastname|dob       |
+---+---------+--------+----------+
|1  |John     |Smith   |2001-01-01|
|2  |Kim      |Melly   |1998-08-28|
+---+---------+--------+----------+

+----------+-------+-----------+-------------+----+---------+
|date      |m_title|m_body     |m_attachments|user|recipient|
+----------+-------+-----------+-------------+----+---------+
|2022-01-03|Title  |Hello World|null         |1   |p@ch.com |
|2022-01-02|Title 2|Hello World|null         |1   |d@ch.com |
+----------+-------+-----------+-------------+----+---------+

root
 |-- id: string (nullable = true)
 |-- firstname: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)



## Write to parquet ...

In [25]:
# Target directories for the parquet copy of each dataset.
users_path = f"{DATA_DIR}/parquet/users"
messages_path = f"{DATA_DIR}/parquet/messages"

# users: keep the four columns, converting the string "dob" into a date.
users_typed = users.select(
    "id",
    "firstname",
    "lastname",
    col("dob").cast("date"),
)
users_typed.write.mode("overwrite").parquet(users_path)

# messages: add "dt" as the date-typed version of "date", then drop the
# original string column so only the typed one is persisted.
messages_typed = messages.withColumn("dt", col("date").cast("date")).drop("date")
messages_typed.write.mode("overwrite").parquet(messages_path)

In [28]:
# Read each dataset back and print its schema to confirm the date columns
# were stored with their type.
for parquet_path in (users_path, messages_path):
    spark.read.parquet(parquet_path).printSchema()

# Trailing expression: show the files Spark wrote for the users dataset.
os.listdir(users_path)

root
 |-- id: string (nullable = true)
 |-- firstname: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: date (nullable = true)

root
 |-- m_title: string (nullable = true)
 |-- m_body: string (nullable = true)
 |-- m_attachments: string (nullable = true)
 |-- user: string (nullable = true)
 |-- recipient: string (nullable = true)
 |-- dt: date (nullable = true)



['._SUCCESS.crc',
 '.part-00000-9764237b-2992-4dd5-ae43-fb5bc74b7d9d-c000.snappy.parquet.crc',
 '_SUCCESS',
 'part-00000-9764237b-2992-4dd5-ae43-fb5bc74b7d9d-c000.snappy.parquet']

## ... parquet with partitions ...