**Using PySpark functions:**

In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

# Build (or reuse) the Spark session for this job.
spark = SparkSession.builder.appName("customer_orders").getOrCreate()

try:
    # The CSV file does not carry a schema (column names and types), so it is
    # declared explicitly here and handed to the reader.
    # NOTE(review): prices are read as FloatType; the summed totals show binary
    # rounding noise — consider DoubleType/DecimalType if exact cents matter.
    schema = StructType([StructField("CustomerID", IntegerType(), nullable=True),
                         StructField("ItemID", IntegerType(), nullable=True),
                         StructField("ItemPrice", FloatType(), nullable=True)])

    # Forward slash keeps the relative path valid on Windows, Linux and macOS
    # (a backslash is only a separator on Windows).
    df = spark.read.schema(schema).csv("Datasets/customer-orders.csv")
    df.printSchema()

    # Total amount spent per customer, sorted ascending by that total.
    # The result is cached because it is evaluated twice below (count + show);
    # without the cache the whole read/aggregate/sort job would run twice.
    total_spent = (
        df.select("CustomerID", "ItemPrice")
        .groupBy("CustomerID")
        .sum("ItemPrice")
        .sort("sum(ItemPrice)")
        .cache()
    )

    # Pass the row count as `n` so every row is displayed instead of the
    # default first 20.
    total_spent.show(n=total_spent.count())
finally:
    # Always release the session, even if reading or aggregating fails.
    spark.stop()

root
 |-- CustomerID: integer (nullable = true)
 |-- ItemID: integer (nullable = true)
 |-- ItemPrice: float (nullable = true)

+----------+------------------+
|CustomerID|    sum(ItemPrice)|
+----------+------------------+
|        45|3309.3800055980682|
|        79| 3790.569982469082|
|        96|3924.2299877405167|
|        23| 4042.650001913309|
|        99| 4172.290024012327|
|        75| 4178.499995291233|
|        36| 4278.049998521805|
|        98| 4297.259994864464|
|        47| 4316.299998342991|
|        77| 4327.730022907257|
|        13| 4367.619992315769|
|        48| 4384.329996109009|
|        49| 4394.599998474121|
|        94| 4475.569978475571|
|        67|  4505.78999453038|
|        50| 4517.269991545007|
|        78| 4524.510001778603|
|         5|4561.0700044333935|
|        57| 4628.399988114834|
|        83| 4635.799997210503|
|        91| 4642.259980916977|
|        74| 4647.129976034164|
|        84| 4652.939991295338|
|         3| 4659.629958629608|
|       

**Using SQL code:**

In [6]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
# Build (or reuse) the Spark session for this job.
spark = SparkSession.builder.appName("customer_orders").getOrCreate()

try:
    # Explicit schema: column names and types are supplied to the CSV reader
    # rather than taken from the file.
    schema = StructType([StructField("CustomerID", IntegerType(), nullable=True),
                         StructField("ItemID", IntegerType(), nullable=True),
                         StructField("ItemPrice", FloatType(), nullable=True)])

    # Forward slash keeps the relative path valid on Windows, Linux and macOS
    # (a backslash is only a separator on Windows).
    df = spark.read.csv("Datasets/customer-orders.csv", schema=schema)

    # Register the DataFrame as a temporary view named "customer_orders" so it
    # can be referenced as a table from SQL.
    df.createOrReplaceTempView("customer_orders")

    # Total amount spent per customer, highest spender first.
    total_spent = spark.sql("SELECT CustomerID, SUM(ItemPrice) as TotalSpent FROM customer_orders GROUP BY CustomerID ORDER BY TotalSpent DESC")

    # show() with no arguments prints only the first 20 rows.
    total_spent.show()
finally:
    # Always release the session, even if reading or querying fails.
    spark.stop()

+----------+------------------+
|CustomerID|        TotalSpent|
+----------+------------------+
|        68| 6375.450028181076|
|        73| 6206.199985742569|
|        39| 6193.109993815422|
|        54| 6065.390002984554|
|        71| 5995.659991919994|
|         2| 5994.589979887009|
|        97| 5977.190007060766|
|        46| 5963.110011339188|
|        42| 5696.840004444122|
|        59| 5642.890004396439|
|        41| 5637.619991332293|
|         0| 5524.950008839369|
|         8|5517.2399980425835|
|        85|  5503.42998456955|
|        61| 5497.479998707771|
|        32| 5496.049998283386|
|        58| 5437.730004191399|
|        63| 5415.150004655123|
|        15| 5413.510010659695|
|         6| 5397.880012750626|
+----------+------------------+
only showing top 20 rows

