In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Obtain the SparkSession for this notebook: getOrCreate() reuses an existing
# session when there is one, otherwise builds one named "Sales Analysis".
spark = (
    SparkSession.builder
    .appName("Sales Analysis")
    .getOrCreate()
)

# Raw sales rows, one tuple per transaction:
# (transaction_id, city, product_name, category, price, order_date, status)
# The values are intentionally messy (stray whitespace, mixed casing, several
# date layouts, empty / non-numeric / missing prices, a duplicated row) so the
# cleaning steps have something to work on -- do not "tidy" the literals.
sales_data = [
    ("TXN001", "Delhi ", "Laptop", "Electronics", "45000", "2024-01-05", "Completed"),      # trailing space in city
    ("TXN002", "Mumbai", "Mobile ", "electronics", "32000", "05/01/2024", "Completed"),     # slash-separated date
    ("TXN003", "Bangalore", "Tablet", " Electronics ", "30000", "2024/01/06", "Completed"), # padded category
    ("TXN004", "Delhi", "Laptop", "Electronics", "", "2024-01-07", "Cancelled"),            # empty price
    ("TXN005", "Chennai", "Mobile", "Electronics", "invalid", "2024-01-08", "Completed"),   # non-numeric price
    ("TXN006", "Mumbai", "Tablet", "Electronics", None, "2024-01-08", "Completed"),         # missing price
    ("TXN007", "Delhi", "Laptop", "electronics", "45000", "09-01-2024", "Completed"),       # dash-separated, year last
    ("TXN008", "Bangalore", "Mobile", "Electronics", "28000", "2024-01-09", "Completed"),
    ("TXN009", "Mumbai", "Laptop", "Electronics", "55000", "2024-01-10", "Completed"),
    ("TXN009", "Mumbai", "Laptop", "Electronics", "55000", "2024-01-10", "Completed"),      # exact duplicate of the row above
]

In [2]:
# Customer reference rows: (customer_id, city, membership_status)
customer_data = [
    ("C001", "Delhi", "Premium"),
    ("C002", "Mumbai", "Standard"),
    ("C003", "Bangalore", "Premium"),
    ("C004", "Chennai", "Standard"),
    ("C005", "Mumbai", "Premium"),
]

In [3]:
# City -> tier lookup rows: (city, tier)
city_lookup = [
    ("Delhi", "Tier-1"),
    ("Mumbai", "Tier-1"),
    ("Bangalore", "Tier-1"),
    ("Chennai", "Tier-2"),
]

#1. Create schemas explicitly for all datasets

In [6]:
# Explicit schema for the raw sales rows. Every column -- including price and
# order_date -- is declared as a string; only transaction_id is non-nullable.
sales_schema = (
    StructType()
    .add("transaction_id", StringType(), nullable=False)
    .add("city", StringType(), nullable=True)
    .add("product_name", StringType(), nullable=True)
    .add("category", StringType(), nullable=True)
    .add("price", StringType(), nullable=True)
    .add("order_date", StringType(), nullable=True)
    .add("status", StringType(), nullable=True)
)

# Explicit schema for the customer rows: three nullable string columns.
customer_schema = (
    StructType()
    .add("customer_id", StringType(), nullable=True)
    .add("city", StringType(), nullable=True)
    .add("membership_status", StringType(), nullable=True)
)

# Explicit schema for the city lookup rows: two nullable string columns.
city_schema = (
    StructType()
    .add("city", StringType(), nullable=True)
    .add("tier", StringType(), nullable=True)
)

#2. Load raw data into DataFrames

In [7]:
# Turn each in-memory list of tuples into a DataFrame, applying its explicit
# schema rather than letting Spark infer one.
raw_sales_df = spark.createDataFrame(data=sales_data, schema=sales_schema)
raw_customer_df = spark.createDataFrame(data=customer_data, schema=customer_schema)
raw_city_df = spark.createDataFrame(data=city_lookup, schema=city_schema)



#3. Handle incorrect data types gracefully

In [10]:

from pyspark.sql import functions as F, types as T

# --- Price -> DECIMAL(10,2) -------------------------------------------------
# Strip everything except digits and the decimal point, so values such as "",
# "invalid" or NULL collapse to an empty/NULL string.
price_clean = F.regexp_replace(F.col("price"), r"[^0-9.]", "")

# Only cast when what is left is a well-formed number ("123", "123.", "123.45",
# ".45"). Anything else -- empty string, a lone ".", "1.2.3", NULL -- falls
# through the WHEN with no match and becomes NULL without reaching the cast.
# NOTE(review): values too large for DECIMAL(10,2) are not filtered here.
amount_decimal = F.when(
    price_clean.rlike(r"^([0-9]+\.?[0-9]*|\.[0-9]+)$"),
    price_clean.cast(T.DecimalType(10, 2)),
)

# --- Order date -> DATE -----------------------------------------------------
# The raw column mixes several layouts. Each entry pairs a regex describing the
# layout with the matching Spark datetime pattern; to_date is only applied to a
# value whose layout matches, so parsing never depends on how the running Spark
# version treats a string that does not fit the pattern.
# "yyyy/MM/dd" is required for rows like "2024/01/06", which none of the other
# three patterns can parse.
# NOTE(review): nn/nn/yyyy and nn-nn-yyyy are read day-first, as in the original
# code -- confirm this matches the source system.
order_date_raw = F.trim(F.col("order_date"))
order_date_layouts = [
    (r"^[0-9]{4}-[0-9]{2}-[0-9]{2}$", "yyyy-MM-dd"),
    (r"^[0-9]{4}/[0-9]{2}/[0-9]{2}$", "yyyy/MM/dd"),
    (r"^[0-9]{2}/[0-9]{2}/[0-9]{4}$", "dd/MM/yyyy"),
    (r"^[0-9]{2}-[0-9]{2}-[0-9]{4}$", "dd-MM-yyyy"),
]
# The layouts are mutually exclusive, so at most one branch is non-NULL and
# coalesce simply picks it; unrecognised layouts and NULLs stay NULL.
order_date_clean = F.coalesce(*[
    F.when(order_date_raw.rlike(layout_regex), F.to_date(order_date_raw, date_pattern))
    for layout_regex, date_pattern in order_date_layouts
])

# Add the typed columns next to the raw ones; the original string columns are
# kept so rejected values can still be inspected.
df = (
    raw_sales_df
    .withColumn("amount_decimal", amount_decimal)
    .withColumn("txn_date_clean", order_date_clean)
)
