#DATASET - 1

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Day 18 Exercise").getOrCreate()

In [17]:
from pyspark.sql.types import (
    StructType, StructField,
    StringType, IntegerType, LongType , ArrayType ,
    MapType
)

raw_users = [
("U001","Amit","29","Hyderabad","50000"),
("U002","Neha","Thirty Two","Delhi","62000"),
("U003","Ravi",None,"Bangalore","45k"),
("U004","Pooja","28","Mumbai",58000),
("U005",None,"31","Chennai","")
]

#Exercise 1 - Design a StructType schema for this data

In [26]:
raw_users_schema = StructType([
    StructField("user_id", StringType(), False),
    StructField("name", StringType(), True),
    StructField("age", StringType(), True),
    StructField("city", StringType(), True),
    StructField("salary", StringType(), True)
])


#Exercise 2 - Load the data using the schema

In [None]:
df_raw_users  =   spark.createDataFrame(data=raw_users, schema=raw_users_schema)
df_raw_users.show()

#Exercise 3 - Identify records that fail type conversion

In [None]:

from pyspark.sql.functions import expr, col
df_check = (
    df_raw_users
    .withColumn("age_int", expr("try_cast(age as int)"))
    .withColumn("salary_int", expr("try_cast(salary as int)"))
)
df_failed = df_check.filter(col("age_int").isNull() | col("salary_int").isNull())
df_failed.show()

#Exercise 4 - Convert age to integer safely

In [34]:
df_age_conversion = df_raw_users.withColumn("age_int", expr("try_cast(age as int)"))

#Exercise 5 - Normalize salary into integer (handle k )

#Exercise 6 - Replace missing names with "UNKNOWN"

In [None]:

from pyspark.sql.functions import trim , when, lit
df_named = df_check.withColumn(
    "name_fixed", when(col("name").isNull() , lit("UNKNOWN"))
    .otherwise(col("name"))
)
df_named.show()

#Exercise 7 - Drop records where age cannot be recovered

In [37]:
df_age_valid = df_named.filter(col("age_int").isNotNull())

#Exercise 8 - Produce a final clean DataFrame

In [None]:
df_clean = (
    df_age_valid
    .select(
        col("user_id").alias("user_id"),
        col("name_fixed").alias("name"),
        col("age_int").alias("age"),
        col("city").alias("city"),
        col("salary_int").alias("salary")
    )
)
df_clean.show(truncate=False)

#DATASET - 2

In [40]:
raw_orders = [
("O001","U001","Laptop,Mobile,Tablet",75000),
("O002","U002",["Mobile","Tablet"],32000),
("O003","U003","Laptop",72000),
("O004","U004",None,25000),
("O005","U005","Laptop|Mobile",68000)
]

#Exercise 1 - Define a schema with ArrayType

In [42]:
orders_schema = StructType([
    StructField("order_id", StringType(), False),
    StructField("user_id", StringType(), False),
    StructField("items", StringType(), True),
    StructField("amount", IntegerType(), True)
])

In [None]:
df_orders_raw = spark.createDataFrame(raw_orders, schema=orders_schema)
df_orders_raw.show(truncate=False)

#Exercise 2 - Normalize all item values into arrays

In [45]:
from pyspark.sql.functions import col, split, regexp_replace, coalesce, lit
# Replace null with empty string, normalize delimiters (| â†’ ,), then split
df_orders = (
    df_orders_raw
    .withColumn("items_clean",split(regexp_replace(coalesce(col("items"), lit("")), r"\|", ","), ",")
    )
)

#Exercise 3 - Handle multiple delimiters

In [47]:
from pyspark.sql.functions import col, regexp_replace, split, coalesce, lit
df_orders1 = (
    df_orders_raw
    .withColumn(
               "items_clean",
        split(
            regexp_replace(coalesce(col("items"), lit("")), r"[|]", ","),  # replace | with ,
            ","
        )
    )
)

#Exercise 4 - Replace null items with empty arrays

In [48]:

from pyspark.sql.functions import col, regexp_replace, split, coalesce, lit, array

df_orders2 = (
    df_orders_raw
    .withColumn(
               "items_clean",
        when(col("items").isNull(), array())  # empty array if NULL
        .otherwise(
            split(
                regexp_replace(col("items"), "[|]", ","),  # normalize delimiters
                ","
            )
        )
    )
)

#Exercise 5 - Explode items into one row per item

In [50]:

from pyspark.sql.functions import explode_outer

df_exploded = df_orders.select(
    "order_id", "user_id", "amount",
    explode_outer(col("items")).alias("item")
)
df_exploded.show()


{"ts": "2025-12-18 09:37:54.408", "level": "ERROR", "logger": "DataFrameQueryContextLogger", "msg": "[DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE] Cannot resolve \"explode(items)\" due to data type mismatch: The first parameter requires the (\"ARRAY\" or \"MAP\") type, however \"items\" has the type \"STRING\". SQLSTATE: 42K09", "context": {"file": "java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)", "line": "", "fragment": "explode_outer", "errorClass": "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE"}, "exception": {"class": "Py4JJavaError", "msg": "An error occurred while calling o494.select.\n: org.apache.spark.sql.AnalysisException: [DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE] Cannot resolve \"explode(items)\" due to data type mismatch: The first parameter requires the (\"ARRAY\" or \"MAP\") type, however \"items\" has the type \"STRING\". SQLSTATE: 42K09;\n'Project [order_id#261, user_id#262, amount#264, generatorouter(explode(items#263)) AS item#284]\n+- Project [

AnalysisException: [DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE] Cannot resolve "explode(items)" due to data type mismatch: The first parameter requires the ("ARRAY" or "MAP") type, however "items" has the type "STRING". SQLSTATE: 42K09;
'Project [order_id#261, user_id#262, amount#264, generatorouter(explode(items#263)) AS item#284]
+- Project [order_id#261, user_id#262, items#263, amount#264, split(regexp_replace(coalesce(items#263, ), \|, ,, 1), ,, -1) AS items_clean#279]
   +- LogicalRDD [order_id#261, user_id#262, items#263, amount#264], false
