In [7]:
from pyspark.sql import SparkSession

# Obtain the SparkSession used by every cell below, registered under the app name 'SparkDataFrame'.
spark = (
    SparkSession.builder
    .appName('SparkDataFrame')
    .getOrCreate()
)

In [18]:
# List the files currently stored under /data/ in HDFS.
!hdfs dfs -ls /data/

Found 4 items
-rw-r--r--   2 root hadoop    1048576 2025-06-03 03:00 /data/customers10.csv
-rw-r--r--   2 root hadoop    1048576 2025-06-05 03:18 /data/customers150.csv
-rw-r--r--   2 root hadoop    1048576 2025-06-04 03:32 /data/customers500.csv
-rw-r--r--   2 root hadoop        511 2025-06-06 02:33 /data/malformed_customers.csv


In [36]:
# Print the raw contents of the deliberately malformed CSV so the bad fields are visible.
!hdfs dfs -cat /data/malformed_customers.csv

id,name,age,is_active,date
1,John,25,true,2024-06-01
2,Alice,30,false,2024-06-02
3,Bob,twenty,true,2024-06-03
4,Charlie,40,,2024-06-04
5,David,abc,true,not_a_date
6,Eve,,false,2024-06-06
7,,28,true,2024-06-07
8,Frank,35,yes,2024-06-08
9,Gina,45,true,2024-06-09
10,Hank,28,false,
11,Ivy,29,,2024-06-11
12,,thirty,false,2024-06-12
13,Jack,32,maybe,2024-06-13
14,Kim,27,true,
15,Luke,33,false,2024-06-15
16,Mia,31,true,2024-06-16
17,Nina,27,false,2024-06-17
18,Oscar,NaN,true,2024-06-18
19,Paul,29,true,2024-06-19


In [17]:
import os

import pandas as pd

# Column order of the fixture file.
columns = ["id", "name", "age", "is_active", "date"]

# Fixture rows: a mix of clean records and records with a missing or
# unparseable value (non-numeric ages, non-boolean flags, empty cells).
data = [
    [1, "John", 25, "true", "2024-06-01"],
    [2, "Alice", 30, "false", "2024-06-02"],
    [3, "Bob", "twenty", "true", "2024-06-03"],
    [4, "Charlie", 40, None, "2024-06-04"],
    [5, "David", "abc", "true", "not_a_date"],
    [6, "Eve", None, "false", "2024-06-06"],
    [7, None, 28, "true", "2024-06-07"],
    [8, "Frank", 35, "yes", "2024-06-08"],
    [9, "Gina", 45, "true", "2024-06-09"],
    [10, "Hank", 28, "false", None],
    [11, "Ivy", 29, None, "2024-06-11"],
    [12, None, "thirty", "false", "2024-06-12"],
    [13, "Jack", 32, "maybe", "2024-06-13"],
    [14, "Kim", 27, "true", None],
    [15, "Luke", 33, "false", "2024-06-15"],
    [16, "Mia", 31, "true", "2024-06-16"],
    [17, "Nina", 27, "false", "2024-06-17"],
    [18, "Oscar", "NaN", "true", "2024-06-18"],
    [19, "Paul", 29, "true", "2024-06-19"],
]

pdf = pd.DataFrame(data, columns=columns)

# Stage the frame as a CSV on the local filesystem (no index column).
local_path = "/tmp/my_customers.csv"
pdf.to_csv(local_path, index=False)

# Push the staged file into HDFS, overwriting any existing copy (-f).
# The shell exit status is the cell's displayed value (0 on success).
put_command = f"hdfs dfs -put -f {local_path} /data/malformed_customers.csv"
os.system(put_command)


0

In [60]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, BooleanType

# Explicit schema for the customers CSV. Every column is nullable, so an empty
# cell is acceptable; only values that cannot be parsed into the declared type
# make a row malformed.
#
# If we skip schema enforcement, Spark reads the malformed data without raising
# any error and without skipping any rows: every column is simply loaded as a
# string, so Spark has no way of knowing that anything is "malformed".
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in [
        ('id', IntegerType()),
        ('name', StringType()),
        ('age', IntegerType()),
        ('is_active', BooleanType()),
        ('date', StringType()),
    ]
])

In [61]:
# Read the CSV with the explicit schema; DROPMALFORMED discards every row that
# contains a value which cannot be parsed into its declared column type.
# NOTE(review): with CSV column pruning, Spark may only validate the columns an
# action actually requests — confirm row counts with a full-row action.
df = (
    spark.read
    .format('csv')
    .option('header', True)
    .schema(schema)
    .option('mode', 'DROPMALFORMED')
    .load('/data/malformed_customers.csv')
)

In [62]:
# Show the rows that survived DROPMALFORMED. Ids 3, 5, 8, 12, 13 and 18 are
# absent: each has a non-numeric age or a non-boolean is_active value.
# Rows whose cells are merely empty are kept, with NULL in that column.
df.show()

+---+-------+----+---------+----------+
| id|   name| age|is_active|      date|
+---+-------+----+---------+----------+
|  1|   John|  25|     true|2024-06-01|
|  2|  Alice|  30|    false|2024-06-02|
|  4|Charlie|  40|     NULL|2024-06-04|
|  6|    Eve|NULL|    false|2024-06-06|
|  7|   NULL|  28|     true|2024-06-07|
|  9|   Gina|  45|     true|2024-06-09|
| 10|   Hank|  28|    false|      NULL|
| 11|    Ivy|  29|     NULL|2024-06-11|
| 14|    Kim|  27|     true|      NULL|
| 15|   Luke|  33|    false|2024-06-15|
| 16|    Mia|  31|     true|2024-06-16|
| 17|   Nina|  27|    false|2024-06-17|
| 19|   Paul|  29|     true|2024-06-19|
+---+-------+----+---------+----------+



In [63]:
# Stop the Spark session now that the demo is finished.
spark.stop()

# Spark Read Modes

| Mode          | Behavior                                      | When to use                           |
| ------------- | --------------------------------------------- | ------------------------------------- |
| PERMISSIVE    | Loads all rows; malformed fields get `null`   | Load as much data as possible         |
| DROPMALFORMED | Skips malformed rows entirely                 | Ignore bad data and load good records |
| FAILFAST      | Throws an error and stops reading immediately | Ensure data is 100% clean and valid   |