In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
# Create the notebook's Spark session, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("DAY18")
    .getOrCreate()
)

# Sample rows with assorted data-quality problems, one tuple per user:
# (user_id, name, age, city, skills).
raw_users = [
    ("U001", "Amit", "28", "Hyderabad", "['AI','ML','Cloud']"),  # skills as a list-like string
    ("U002", "Neha", "Thirty", "Delhi", "AI,Testing"),           # non-numeric age, comma-separated skills
    ("U003", "Ravi", None, "Bangalore", ["Data", "Spark"]),      # missing age, skills as a Python list
    ("U004", "Pooja", "29", "Mumbai", None),                     # missing skills
    ("U005", "", "31", "Chennai", "['DevOps']"),                 # empty name
]


# Exercise 1 - Design an explicit schema using StructType

In [None]:
# Every column is declared as a nullable string so that malformed values
# (non-numeric ages, inconsistently formatted skills) load unchanged.
users_schema = StructType(
    [
        StructField(column_name, StringType(), True)
        for column_name in ("user_id", "name", "age", "city", "skills")
    ]
)

# Build the raw DataFrame and display its schema and contents.
df_users = spark.createDataFrame(raw_users, schema=users_schema)
df_users.printSchema()
df_users.show()

# Exercise 2 - Normalize age into IntegerType

In [11]:
from pyspark.sql.functions import col, expr

# try_cast returns NULL instead of raising when `age` is not a valid integer
# (e.g. "Thirty"), so bad values can be inspected rather than aborting the job.
df_named = df_users.withColumn("age_int", expr("try_cast(age as int)"))

# Rows without a usable integer age: values that failed the cast as well as
# ages that were already NULL in the input. The filter runs on the frame
# built just above, so the cell works on a fresh kernel.
df_failed = df_named.filter(col("age_int").isNull())
df_failed.show()

+-------+----+------+---------+-------------+-------+
|user_id|name|   age|     city|       skills|age_int|
+-------+----+------+---------+-------------+-------+
|   U002|Neha|Thirty|    Delhi|   AI,Testing|   NULL|
|   U003|Ravi|  NULL|Bangalore|[Data, Spark]|   NULL|
+-------+----+------+---------+-------------+-------+



# Exercise 3 - Normalize skills into ArrayType

In [None]:

from pyspark.sql.functions import array, col, regexp_replace, split, when

# Normalize `skills` into an array<string> column named `skills_clean`.
#
# * Builds on `df_named` (the frame that already carries `age_int`) so the
#   normalized age is still available when the final projection is made.
# * Writes to a new `skills_clean` column instead of overwriting `skills`,
#   matching the `name_clean` / `age_int` convention and the column name the
#   final select expects; the raw value stays available for comparison.
# * The regex strips brackets, single quotes and ALL spaces (including spaces
#   inside a skill name) before splitting on commas.
# * NULL input and input that is empty after stripping both become an empty
#   array rather than NULL or [""]. The empty array is cast explicitly so
#   both branches of the conditional have type array<string>.
df_users1 = (
    df_named
    .withColumn(
        "skills_clean",
        when(
            col("skills").isNull()
            | (regexp_replace(col("skills"), r"[\[\]' ]", "") == ""),
            array().cast("array<string>"),
        ).otherwise(
            split(regexp_replace(col("skills"), r"[\[\]' ]", ""), ",")
        ),
    )
)

# Display the normalized frame (not the raw input) to verify the result.
df_users1.show(truncate=False)


# Exercise 4 - Handle empty or missing names

In [7]:

from pyspark.sql.functions import when

# Keep the original name only when it is present and not whitespace-only;
# NULL, empty and blank names are all replaced by the "UNKNOWN" placeholder.
df_named = df_users1.withColumn(
    "name_clean",
    when(
        col("name").isNotNull() & (trim(col("name")) != ""),
        col("name"),
    ).otherwise(lit("UNKNOWN")),
)

# Exercise 5 - Produce a clean users_df

In [None]:

# Final projection: keep the identifying columns and expose the normalized
# columns under their plain names. Requires `df_named` to carry `name_clean`,
# `age_int` and `skills_clean`.
users_df = df_named.selectExpr(
    "user_id",
    "name_clean AS name",
    "age_int AS age",
    "city",
    "skills_clean AS skills",
)

# Show the cleaned rows first, then the resulting schema.
users_df.show(truncate=False)
users_df.printSchema()
