In [1]:
import findspark
findspark.init("/opt/manual/spark")
from pyspark.sql import SparkSession, functions as F
# Build (or reuse) a local SparkSession for this notebook, using the
# "local[2]" master.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("Caching and Persistence of Data")
    .getOrCreate()
)

<h2>Caching and Persistence of Data</h2>

<p>What is the difference between caching and persistence? In Spark <strong>they are synonymous</strong>.
Two API calls, cache() and persist(), offer these capabilities. The latter provides
more control over how and where your data is stored—in memory and on disk,
serialized and unserialized. Both contribute to better performance for frequently
accessed DataFrames or tables.</p>

In [2]:
# Ten million sequential ids plus a derived "square" column (id * id).
df = (
    spark.range(10_000_000)
    .toDF("id")
    .withColumn("square", F.col("id") * F.col("id"))
)

In [3]:
import datetime as dt

In [4]:
# Time the first action on df after marking it for caching. The measured
# interval covers both the cache() call and the count() that follows it.
start = dt.datetime.now()
df.cache()
df.count() # Materialize the cache
stop = dt.datetime.now()
# Elapsed wall-clock time in seconds.
print( (stop - start).total_seconds())

19.161103


In [5]:
# Time the same count() again, now that df has been cached by the previous
# cell, for comparison with the first run.
start = dt.datetime.now()
df.count()
stop = dt.datetime.now()
# Elapsed wall-clock time in seconds.
print( (stop - start).total_seconds())

0.558911


<img src="../images/spark_ui_storage_cached_data.png"  />

## cache() is lazy: the data is cached only when the first action runs.

In [6]:
# Twenty million sequential ids plus a derived "square" column (id * id).
df2 = (
    spark.range(20_000_000)
    .toDF("id")
    .withColumn("square", F.col("id") * F.col("id"))
)

In [7]:
from pyspark.storagelevel import StorageLevel

In [8]:
# Request disk-only storage for df2: the data is serialized and kept on disk.
df2.persist(storageLevel=StorageLevel.DISK_ONLY)

DataFrame[id: bigint, square: bigint]

In [9]:
# Time the first action on df2 after persist(StorageLevel.DISK_ONLY).
start = dt.datetime.now()
df2.count() # First action since persist(): this run populates the cache
stop = dt.datetime.now()
# Elapsed wall-clock time in seconds.
print( (stop - start).total_seconds())

17.079018


In [10]:
# Time the same count() on df2 a second time, after the previous cell
# populated the persisted copy.
start = dt.datetime.now()
df2.count() # Now get it from the cache
stop = dt.datetime.now()
# Elapsed wall-clock time in seconds.
print( (stop - start).total_seconds())

0.648386


<img src="../images/spark_persist_ui_size_on_disk.png" />

<p>Note that df2 is cached on disk, while df is cached in memory.</p>

In [11]:
# To unpersist your cached data

In [12]:
# Drop the persisted copy of df2; later actions on df2 will no longer use it.
df2.unpersist()

DataFrame[id: bigint, square: bigint]

In [13]:
# Time count() on df2 after unpersist() has been called on it.
start = dt.datetime.now()
df2.count() # df2 was unpersisted above, so this is no longer served from the cache
stop = dt.datetime.now()
# Elapsed wall-clock time in seconds.
print( (stop - start).total_seconds() )

1.274264


## DataFrame.persist()

<p>persist(StorageLevel.LEVEL) is nuanced, providing control over how your data is 
cached via StorageLevel</p>

<img src="../images/dataframe_persist_storage_level.png" />

Source: Learning Spark, O'Reilly, 2020

In [14]:
# Stop the SparkSession created at the top of the notebook.
spark.stop()