In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Name the application and run Spark in local mode; "local[4]" asks for
# four worker threads on this machine.
sessionBuilder = (
    SparkSession.builder
    .appName("PersistingDataApp")
    .master("local[4]")
)

# Dynamic allocation and adaptive query execution are both switched off.
# NOTE(review): presumably to keep the caching demo's execution predictable -- confirm.
sessionBuilder = sessionBuilder.config("spark.dynamicAllocation.enabled", "false")
sessionBuilder = sessionBuilder.config("spark.sql.adaptive.enabled", "false")

# Reuse an already-running session if there is one, otherwise create it.
spark = sessionBuilder.getOrCreate()

# Handle to the underlying SparkContext.
sc = spark.sparkContext

# Last expression of the cell: display the session.
spark

In [0]:
# Load the Yellow Taxi CSV: the first row supplies the column names and the
# column types are inferred from the data.
yellowTaxiDF = (
    spark.read
    .options(header="true", inferSchema="true")
    .csv("/FileStore/tables/YellowTaxis_202210.csv")
)

# Remove exact duplicate rows before aggregating.
dedupedTaxiDF = yellowTaxiDF.dropDuplicates()

# Total `total_amount` for each `PULocationID` group.
# `sum` here is pyspark.sql.functions.sum (brought in by the wildcard import
# at the top of the notebook), not Python's built-in sum.
totalAmountCol = sum("total_amount")
yellowTaxiGroupedDF = dedupedTaxiDF.groupBy("PULocationID").agg(totalAmountCol)

### Save data without caching

In [0]:
# Baseline write: persist() has not been called yet at this point in the
# notebook, so this action runs the whole read -> dropDuplicates -> groupBy/agg
# lineage. Any existing output at the target path is overwritten.
# NOTE(review): the grouped frame is displayed further down as (int, double),
# so `dateFormat` presumably has no effect on this output -- confirm.
(
    yellowTaxiGroupedDF.write
    .mode("overwrite")
    .options(header="true", dateFormat="yyyy-MM-dd HH:mm:ss.S")
    .csv("/FileStore/tables/CacheTest_WithoutEnabling.csv")
)

### Apply persist operation on DataFrame

`persist()` only marks the DataFrame for caching. It is a lazy operation, so no data is cached until an action runs.

In [0]:
import pyspark

# Storage level for the cache: MEMORY_AND_DISK.
storageLevel = pyspark.StorageLevel.MEMORY_AND_DISK

# Mark the aggregated frame for persistence. Last expression of the cell, so
# the returned DataFrame is displayed.
yellowTaxiGroupedDF.persist(storageLevel)


Out[9]: DataFrame[PULocationID: int, sum(total_amount): double]

### Save data with caching enabled

This write is the first action after `persist()`, so it both saves the result to disk and populates the cache.

In [0]:
# First action on the frame in this cell sequence after persist() was called:
# the result is written out as CSV with a header row, replacing any existing
# output at the target path.
(
    yellowTaxiGroupedDF.write
    .mode("overwrite")
    .options(header="true", dateFormat="yyyy-MM-dd HH:mm:ss.S")
    .csv("/FileStore/tables/CacheTest_EnabledFirstTime.csv")
)

### Save data to disk by using persisted data

This reuses the partitions cached by the previous write instead of recomputing them, and writes them to disk.

In [0]:
# Second write of the same persisted frame, to a different target path, again
# as CSV with a header row and overwriting any existing output.
(
    yellowTaxiGroupedDF.write
    .mode("overwrite")
    .options(header="true", dateFormat="yyyy-MM-dd HH:mm:ss.S")
    .csv("/FileStore/tables/CacheTest_EnabledAndCached.csv")
)

### Unpersist data

This marks the DataFrame as no longer persisted and removes its cached partitions.

In [0]:
# Release the cache for the aggregated frame. `blocking=False` is the default,
# spelled out here: the call does not wait for the cached blocks to be deleted.
# Last expression of the cell, so the returned DataFrame is displayed.
yellowTaxiGroupedDF.unpersist(blocking=False)

Out[12]: DataFrame[PULocationID: int, sum(total_amount): double]