In [0]:
# Exploratory load of the raw order-items CSV: the first row supplies the
# column names and Spark infers each column's type from the data.
df_order_items = (
    spark.read
    .format("csv")
    .options(header=True, inferSchema=True)
    .load("/Volumes/workspace/retail_schema/raw/OrderItems.csv")
)

In [0]:
# Preview the first five rows of the loaded frame.
df_order_items.show(n=5)

+------------+------------+------------+------+----------------+
|    order_id|  product_id|   seller_id| price|shipping_charges|
+------------+------------+------------+------+----------------+
|Axfy13Hk4PIk|90K0C1fIyQUf|ZWM05J9LcBSF|223.51|           84.65|
|v6px92oS8cLG|qejhpMGGVcsl|IjlpYfhUbRQs| 170.8|           23.79|
|Ulpf9skrhjfm|qUS5d2pEAyxJ|77p2EYxcM9MD|  64.4|           17.38|
|bwJVWupf2keN|639iGvMyv0De|jWzS0ayv9TGf| 264.5|           30.72|
|Dd0QnrMk9Cj5|1lycYGcsic2F|l1pYW6GBnPMr| 779.9|           30.66|
+------------+------------+------------+------+----------------+
only showing top 5 rows


In [0]:
# Print the column names, types and nullability of the current frame.
df_order_items.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- price: double (nullable = true)
 |-- shipping_charges: double (nullable = true)



In [0]:
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, IntegerType, DecimalType, DoubleType, FloatType

# Explicit schema for OrderItems.csv: three string identifiers followed by two
# double-precision amounts, all declared non-nullable.
# NOTE(review): the printSchema() output after the schema-based load still
# reports nullable = true for every column, so the nullable=False flags do not
# appear to be enforced on read — confirm before relying on them.
order_items_schema = (
    StructType()
    .add("order_id", StringType(), nullable=False)
    .add("product_id", StringType(), nullable=False)
    .add("seller_id", StringType(), nullable=False)
    .add("price", DoubleType(), nullable=False)
    .add("shipping_charges", DoubleType(), nullable=False)
)


In [0]:
# Re-read the raw CSV using the hand-written schema instead of inference;
# the first row is still treated as the header.
df_order_items = spark.read.csv(
    "/Volumes/workspace/retail_schema/raw/OrderItems.csv",
    schema=order_items_schema,
    header=True,
)


In [0]:
# Preview the first five rows of the frame read with the explicit schema.
df_order_items.show(n=5)

+------------+------------+------------+------+----------------+
|    order_id|  product_id|   seller_id| price|shipping_charges|
+------------+------------+------------+------+----------------+
|Axfy13Hk4PIk|90K0C1fIyQUf|ZWM05J9LcBSF|223.51|           84.65|
|v6px92oS8cLG|qejhpMGGVcsl|IjlpYfhUbRQs| 170.8|           23.79|
|Ulpf9skrhjfm|qUS5d2pEAyxJ|77p2EYxcM9MD|  64.4|           17.38|
|bwJVWupf2keN|639iGvMyv0De|jWzS0ayv9TGf| 264.5|           30.72|
|Dd0QnrMk9Cj5|1lycYGcsic2F|l1pYW6GBnPMr| 779.9|           30.66|
+------------+------------+------------+------+----------------+
only showing top 5 rows


In [0]:
# Print the column names, types and nullability of the current frame.
# NOTE(review): the output below reports nullable = true for every column even
# though order_items_schema declares nullable=False — presumably the CSV reader
# does not enforce that flag; confirm before relying on it.
df_order_items.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- price: double (nullable = true)
 |-- shipping_charges: double (nullable = true)



In [0]:
from pyspark.sql.functions import col

# Keep only rows that carry an order_id; rows with a null order_id are dropped.
# NOTE(review): only order_id is checked here — the other columns declared
# non-nullable in the schema are not filtered; confirm that is intended.
has_order_id = col("order_id").isNotNull()
df_order_items = df_order_items.where(has_order_id)


In [0]:
# Persist the filtered frame to the staging location in Delta format.
# "overwrite" replaces whatever is already stored at the path, and
# overwriteSchema is set so the stored schema may be replaced as well.
(
    df_order_items.write
    .mode("overwrite")
    .format("delta")
    .option("overwriteSchema", "true")
    .save("/Volumes/workspace/retail_schema/staging/OrderItems")
)
