In [2]:
from pyspark.sql import SparkSession

# Single Spark entry point for the whole notebook, registered under the
# application name "Day 18".
spark = (
    SparkSession.builder
    .appName("Day 18")
    .getOrCreate()
)

In [None]:
from pyspark.sql.types import (
    StructType, StructField,
    StringType, IntegerType, LongType , ArrayType
)

# Sample user records, one tuple per user, laid out in the same order as the
# fields of user_schema: (user_id, name, age, city, salary).
raw_data = [
    ("U001", "Abhishek", 28, "Hyderabad", 50000),
    ("U002", "Neha", 32, "Delhi", 62000),
    ("U003", "Ravi", 25, "Bangalore", 45000),
    ("U004", "Pooja", 29, "Mumbai", 58000),
]

# Explicit schema for the user records. Only user_id is declared
# non-nullable; every other field accepts nulls.
user_schema = StructType([
    StructField("user_id", StringType(), False),
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
    StructField("city", StringType(), True),
    StructField("salary", LongType(), True),
])

# Build the DataFrame against that schema, then print its schema and rows.
df_users = spark.createDataFrame(data=raw_data, schema=user_schema)
df_users.printSchema()
df_users.show()

In [None]:
# Schema-enforcement demo: the age value "Thirty" is a str, but user_schema
# declares `age` as IntegerType, so createDataFrame is expected to reject the
# row while verifying it against the schema.
raw_data2 = [
    ("U005", "Ankit", "Thirty", "Chennai", 50000),
]

# Catch and display the failure so a Restart & Run All does not stop here.
# The result is bound to its own name so the valid df_users built from
# raw_data (and used by later cells) is never overwritten.
try:
    df_users_invalid = spark.createDataFrame(raw_data2, schema=user_schema)
except (TypeError, ValueError) as schema_error:
    df_users_invalid = None
    print(f"Schema validation failed: {schema_error}")

# Array Type

In [5]:
# One record per user: (user_id, list of interest names). The lists have
# different lengths on purpose, which is what an array column allows.
interest_data = [
    ("U001", ["AI", "ML", "Cloud"]),
    ("U002", ["Testing", "Automation"]),
    ("U003", ["Data Engineering", "Spark", "Kafka"]),
    ("U004", ["UI/UX"]),
]

In [None]:
# Schema with an array-of-strings column; both fields are declared
# non-nullable.
interest_schema = StructType([
    StructField("user_id", StringType(), nullable=False),
    StructField("interests", ArrayType(StringType()), nullable=False),
])

# Materialise the interest records and display them without truncating the
# array contents.
df_interests = spark.createDataFrame(data=interest_data, schema=interest_schema)
df_interests.printSchema()
df_interests.show(truncate=False)

In [None]:
from pyspark.sql.functions import explode

# explode() turns each element of the `interests` array into its own row,
# exposed here under the column name "interest".
df_interests.select("user_id", explode("interests").alias("interest")).show()

# Map Type

In [8]:
from pyspark.sql.types import MapType
# One record per user: (user_id, {device name: usage figure}). Each user has
# a different set of device keys; the unit of the usage figure is not stated.
device_data = [
    ("U001", {"mobile": 120, "laptop": 300}),
    ("U002", {"tablet": 80}),
    ("U003", {"mobile": 200, "desktop": 400}),
    ("U004", {"laptop": 250}),
]



In [9]:
# Schema with a map column (string keys -> long values); both fields are
# declared non-nullable.
device_schema = StructType([
    StructField("user_id", StringType(), nullable=False),
    StructField("device_usage", MapType(StringType(), LongType()), nullable=False),
])

In [None]:
# Build the device-usage DataFrame and display it without truncating the
# map contents.
df_devices = spark.createDataFrame(data=device_data, schema=device_schema)
df_devices.printSchema()
df_devices.show(truncate=False)

# Nested Data

In [11]:
# One record per user: (user_id, (city, state, pincode)). The inner tuple
# lines up with the fields of address_schema.
nested_data = [
    ("U001", ("Hyderabad", "Telangana", 500081)),
    ("U002", ("Delhi", "Delhi", 110001)),
    ("U003", ("Bangalore", "Karnataka", 560001)),
]

In [12]:
# Inner struct describing an address; all three fields are nullable.
address_schema = StructType([
    StructField("city", StringType(), nullable=True),
    StructField("state", StringType(), nullable=True),
    StructField("pincode", IntegerType(), nullable=True),
])

# Outer schema: a required user_id plus the address struct nested as a
# single nullable column.
profile_schema = StructType([
    StructField("user_id", StringType(), nullable=False),
    StructField("address", address_schema, nullable=True),
])

In [None]:
# Build the nested-profile DataFrame and display it without truncating the
# struct contents.
df_profiles = spark.createDataFrame(data=nested_data, schema=profile_schema)
df_profiles.printSchema()
df_profiles.show(truncate=False)

In [None]:
# Dotted paths reach into the nested `address` struct to pull out
# individual fields.
profile_columns = ["user_id", "address.city", "address.state"]
df_profiles.select(*profile_columns).show()

# Casting

In [None]:
from pyspark.sql.functions import col

# Add `salary_int`, a copy of `salary` (LongType in user_schema) cast to int.
# withColumn returns a new DataFrame, so the result is bound to its own name
# and displayed; df_users itself is left unchanged.
df_users_cast = df_users.withColumn("salary_int", col("salary").cast("int"))
df_users_cast.printSchema()
df_users_cast.show()

# To_Date


In [None]:
from pyspark.sql.functions import to_date

# Small sample of orders so this cell runs on a fresh kernel: df_orders is
# not created by any earlier cell. order_date starts out as a plain string.
orders_data = [
    ("O001", "U001", "2024-01-15"),
    ("O002", "U002", "2024-02-03"),
    ("O003", "U003", "2024-03-21"),
]
df_orders = spark.createDataFrame(orders_data, ["order_id", "user_id", "order_date"])

# Replace the string column with the result of parsing it using the
# yyyy-MM-dd pattern. The converted frame gets its own name and is displayed
# so the change of column type is visible.
df_orders_dated = df_orders.withColumn(
    "order_date",
    to_date("order_date", "yyyy-MM-dd"),
)
df_orders_dated.printSchema()
df_orders_dated.show()