In [2]:
df = spark.read.format("csv").option("header", "true").load("Files/Bronze/*.csv")
display(df.limit(10))

StatementMeta(, 7e1ac899-501a-4639-b5d4-f9c6628b81b6, 4, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 07174261-706b-494a-829f-081639ee829e)

In [6]:
from pyspark.sql.types import *
 
orderSchema = StructType([
     StructField("SalesOrderNumber", StringType()),
     StructField("SalesOrderLineNumber", IntegerType()),
     StructField("OrderDate", DateType()),
     StructField("CustomerName", StringType()),
     StructField("Email", StringType()),
     StructField("Item", StringType()),
     StructField("Quantity", IntegerType()),
     StructField("UnitPrice", FloatType()),
     StructField("Tax", FloatType())
     ])
    
 # Import all files from bronze folder of lakehouse
df = spark.read.format("csv").option("header", "true").schema(orderSchema).load("Files/Bronze/*.csv")

StatementMeta(, 7e1ac899-501a-4639-b5d4-f9c6628b81b6, 8, Finished, Available, Finished)

In [7]:
display(df.limit(10))

StatementMeta(, 7e1ac899-501a-4639-b5d4-f9c6628b81b6, 9, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 2c1793bf-ac42-4b0f-944b-95e605b3c17c)

In [8]:
# Summary statistics
df.describe(["Quantity", "UnitPrice", "Tax"]).show()

StatementMeta(, 7e1ac899-501a-4639-b5d4-f9c6628b81b6, 10, Finished, Available, Finished)

+-------+--------------------+------------------+-----------------+
|summary|            Quantity|         UnitPrice|              Tax|
+-------+--------------------+------------------+-----------------+
|  count|               32715|             32715|            32715|
|   mean|                 1.0|   639.51526513217|51.16122212253651|
| stddev|3.709011772679747...|1071.9454951780597|85.75564001513405|
|    min|                   1|              2.29|           0.1832|
|    max|                   1|           3578.27|         286.2616|
+-------+--------------------+------------------+-----------------+



In [9]:
# Distinct items
distinct_items = df.select("Item").distinct()
distinct_items.show()

StatementMeta(, 7e1ac899-501a-4639-b5d4-f9c6628b81b6, 11, Finished, Available, Finished)

+--------------------+
|                Item|
+--------------------+
|Mountain-200 Blac...|
|Touring-1000 Yell...|
|Touring-1000 Blue...|
|Short-Sleeve Clas...|
|Women's Mountain ...|
|Long-Sleeve Logo ...|
|Mountain-400-W Si...|
|     Racing Socks, M|
|Mountain-200 Silv...|
|  Road-750 Black, 58|
|Half-Finger Glove...|
|Road-350-W Yellow...|
|Mountain-400-W Si...|
|Mountain Bottle Cage|
|Touring-1000 Blue...|
|Mountain-500 Silv...|
|    HL Mountain Tire|
|Mountain-400-W Si...|
|Bike Wash - Disso...|
|Road-550-W Yellow...|
+--------------------+
only showing top 20 rows



In [10]:
from pyspark.sql.functions import col, sum as _sum

# Total sales per customer
df = df.withColumn("TotalSales", col("Quantity") * col("UnitPrice"))
total_sales_per_customer = df.groupBy("CustomerName").agg(_sum("TotalSales").alias("TotalSales"))
total_sales_per_customer.show()

StatementMeta(, 7e1ac899-501a-4639-b5d4-f9c6628b81b6, 12, Finished, Available, Finished)

+----------------+------------------+
|    CustomerName|        TotalSales|
+----------------+------------------+
| Brianna Sanchez|  4366.40966796875|
|      Peter Deng|         2181.5625|
|   Kaitlyn James|1347.9699802398682|
|       Edwin Lal|  782.989990234375|
|       Jorge Sun| 2071.419677734375|
|    Jocelyn Wood| 2071.419677734375|
|     Brandy Arun| 776.3299751281738|
|  Amanda Sanders| 2487.890095949173|
|  Clarence Huang| 3498.777599334717|
|  Elizabeth Wood|2881.7396545410156|
|   Christine Pal| 6121.710115671158|
|    Luke Coleman|119.95000076293945|
|Jessica Peterson| 81.93999862670898|
|      Roger Chen|48.970001220703125|
|       Austin Li|  12.9399995803833|
|    Jasmine Long|161.28999996185303|
|      Bruce Suri| 3964.469980239868|
|      Eric Adams| 71.58000016212463|
|   Isabella Ward|  8346.28007888794|
|     Anna Flores|13.979999542236328|
+----------------+------------------+
only showing top 20 rows

