In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DecimalType, IntegerType, DateType

In [0]:
# Obtain a Spark session named "RetailSales" (getOrCreate returns the
# already-active session when one exists).
spark = (
    SparkSession.builder
    .appName("RetailSales")
    .getOrCreate()
)

# User-defined schema for the retail sales CSV.
# Each entry is (column name, Spark data type); every column is declared
# nullable (third StructField argument is True).
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("sale_id", StringType()),
        ("product_id", StringType()),
        ("category", StringType()),
        ("price", DecimalType(10, 2)),   # up to 10 digits, 2 after the decimal point
        ("quantity", IntegerType()),
        ("sale_date", DateType()),
    )
])

# Path of the CSV file that the read cells below load.
file_path = "/FileStore/tables/retail_sales.csv"


In [0]:
# PERMISSIVE mode (the default parse mode): rows are kept and values that
# cannot be parsed against the schema are replaced with NULL.
df_permissive = (
    spark.read
    .schema(schema)                  # enforce the user-defined schema
    .option("mode", "PERMISSIVE")
    .option("header", "true")        # first line of the file holds column names
    .csv(file_path)
)

print("PERMISSIVE MODE:")
df_permissive.show()

PERMISSIVE MODE:
+-------+----------+-----------+------+--------+----------+
|sale_id|product_id|   category| price|quantity| sale_date|
+-------+----------+-----------+------+--------+----------+
|   S001|     P1001|Electronics|299.99|       2|2024-03-10|
|   S002|     P1002|   Clothing|  NULL|       1|2024-03-11|
|   S003|     P1003|  Furniture|  NULL|       3|2024-03-12|
|   S004|     P1004|Electronics|199.99|    NULL|2024-03-13|
|   S005|     P1005|   Clothing| 49.99|       5|2024-03-14|
|   S006|     P1006|    Grocery|  5.99|      10|2024-03-15|
|   S007|     P1007|     Beauty| 20.99|       2|2024-03-16|
|   S008|     P1008|Electronics|  NULL|       1|2024-03-17|
|   S009|     P1009|     Sports| 79.99|       4|2024-03-18|
|   S010|     P1010|  Furniture|399.99|    NULL|2024-03-19|
|   S011|     P1011|   Clothing| 25.49|       1|2024-03-20|
|   S012|     P1012|   Footwear| 59.99|       3|2024-03-21|
|   S013|     P1013|Electronics|  NULL|       2|2024-03-22|
|   S014|     P1014|   

In [0]:
# DROPMALFORMED mode: records that cannot be parsed against the schema are
# dropped from the result instead of being kept with NULLs.
df_dropMalformed = (
    spark.read
    .schema(schema)                    # enforce the user-defined schema
    .option("mode", "DROPMALFORMED")
    .option("header", "true")          # first line of the file holds column names
    .csv(file_path)
)

# Show the applied schema first, then the rows that survived parsing.
df_dropMalformed.printSchema()
print("DROPMALFORMED MODE:")
df_dropMalformed.show()

root
 |-- sale_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- category: string (nullable = true)
 |-- price: decimal(10,2) (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- sale_date: date (nullable = true)

DROPMALFORMED MODE:
+-------+----------+-----------+------+--------+----------+
|sale_id|product_id|   category| price|quantity| sale_date|
+-------+----------+-----------+------+--------+----------+
|   S001|     P1001|Electronics|299.99|       2|2024-03-10|
|   S002|     P1002|   Clothing|  NULL|       1|2024-03-11|
|   S004|     P1004|Electronics|199.99|    NULL|2024-03-13|
|   S005|     P1005|   Clothing| 49.99|       5|2024-03-14|
|   S006|     P1006|    Grocery|  5.99|      10|2024-03-15|
|   S007|     P1007|     Beauty| 20.99|       2|2024-03-16|
|   S009|     P1009|     Sports| 79.99|       4|2024-03-18|
|   S011|     P1011|   Clothing| 25.49|       1|2024-03-20|
|   S012|     P1012|   Footwear| 59.99|       3|2024-03-21|
|   S013|

In [0]:
# Remove every row whose price or quantity is NULL; the other columns are
# not considered. A new DataFrame is produced, df_dropMalformed is untouched.
df_dropMalformed_clean = df_dropMalformed.na.drop(
    how="any",
    subset=["price", "quantity"],
)
df_dropMalformed_clean.show()

+-------+----------+-----------+------+--------+----------+
|sale_id|product_id|   category| price|quantity| sale_date|
+-------+----------+-----------+------+--------+----------+
|   S001|     P1001|Electronics|299.99|       2|2024-03-10|
|   S005|     P1005|   Clothing| 49.99|       5|2024-03-14|
|   S006|     P1006|    Grocery|  5.99|      10|2024-03-15|
|   S007|     P1007|     Beauty| 20.99|       2|2024-03-16|
|   S009|     P1009|     Sports| 79.99|       4|2024-03-18|
|   S011|     P1011|   Clothing| 25.49|       1|2024-03-20|
|   S012|     P1012|   Footwear| 59.99|       3|2024-03-21|
|   S014|     P1014|    Grocery|  2.99|      20|2024-03-23|
|   S015|     P1015|     Beauty| 15.99|       1|2024-03-24|
|   S018|     P1018|Electronics|129.99|       2|2024-03-27|
|   S020|     P1020|   Footwear| 49.99|       1|2024-03-29|
+-------+----------+-----------+------+--------+----------+



In [0]:
# FAILFAST mode: the read fails on error instead of tolerating bad records.
# The failure is the point of this cell, so it is caught and printed rather
# than left to interrupt the notebook.
try:
    df_failfast = (
        spark.read
        .schema(schema)                # enforce the user-defined schema
        .option("mode", "FAILFAST")
        .option("header", "true")      # first line of the file holds column names
        .csv(file_path)
    )

    print("FAILFAST MODE:")
    # In the recorded run the header above is printed and the error is
    # raised from this show() call, not from the read definition.
    df_failfast.show()

except Exception as err:
    print("FAILFAST MODE ERROR:", err)

FAILFAST MODE:
FAILFAST MODE ERROR: An error occurred while calling o466.showString.
: org.apache.spark.SparkException: [FAILED_READ_FILE.NO_HINT] Error while reading file dbfs:/FileStore/tables/retail_sales.csv.  SQLSTATE: KD001
	at org.apache.spark.sql.errors.QueryExecutionErrors$.cannotReadFilesError(QueryExecutionErrors.scala:1095)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1$$anon$2.logErrorFileNameAndThrow(FileScanRDD.scala:784)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1$$anon$2.getNext(FileScanRDD.scala:739)
	at org.apache.spark.util.NextIterator.hasNext(NextIterator.scala:73)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.$anonfun$hasNext$1(FileScanRDD.scala:553)
	at scala.runtime.java8.JFunction0$mcZ$sp.apply(JFunction0$mcZ$sp.java:23)
	at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD