In [4]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
# Entry point for the notebook: reuse the active SparkSession if there is one,
# otherwise build a new one with default settings.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [6]:
# Explicit schema for the order file: two integer ids followed by a float cost.
# Every column is declared nullable (third StructField argument is True).
schema = StructType([
    StructField("customer_id", IntegerType(), True),
    StructField("Item_id", IntegerType(), True),
    StructField("cost", FloatType(), True),
])

# Read the CSV into a DataFrame using the schema above. No header option is
# set, so the reader's default applies and the first line is treated as data.
df = spark.read.csv("customer-orders.csv", schema=schema)

# Preview the first five rows.
df.show(5)

+-----------+-------+-----+
|customer_id|Item_id| cost|
+-----------+-------+-----+
|         44|   8602|37.19|
|         35|   5368|65.89|
|          2|   3391|40.64|
|         47|   6694|14.98|
|         29|    680|13.08|
+-----------+-------+-----+
only showing top 5 rows



In [29]:
## Spend by customer
# Step 1: total the cost column per customer. Spark names the aggregate
# column 'sum(cost)'.
cost_per_customer = df.groupBy('customer_id').sum('cost')

# Step 2: expose the total as 'cost' rounded to 2 decimals, keep only the id
# and the total, and order from the biggest spender down.
# The DataFrame is assigned BEFORE calling show(): show() only prints and
# returns None, so chaining it into the assignment would leave
# cost_per_customer bound to None instead of the result.
cost_per_customer = (
    cost_per_customer
    .withColumn('cost', func.round(func.col('sum(cost)'), 2))
    .select('customer_id', 'cost')
    .sort('cost', ascending=False)
)

# Display the top five spenders.
cost_per_customer.show(5)

+-----------+-------+
|customer_id|   cost|
+-----------+-------+
|         68|6375.45|
|         73| 6206.2|
|         39|6193.11|
|         54|6065.39|
|         71|5995.66|
+-----------+-------+
only showing top 5 rows



In [35]:
# Better way: do the sum, the rounding and the renaming inside a single agg()
# call, so no intermediate 'sum(cost)' column has to be renamed or dropped.
(
    df.groupBy('customer_id')
    .agg(func.round(func.sum('cost'), 2).alias('cost'))
    .orderBy('cost', ascending=False)  # highest total first
    .show(5)
)

+-----------+-------+
|customer_id|   cost|
+-----------+-------+
|         68|6375.45|
|         73| 6206.2|
|         39|6193.11|
|         54|6065.39|
|         71|5995.66|
+-----------+-------+
only showing top 5 rows



In [37]:
# Stop the Spark session now that the analysis is finished; cells above that
# use `spark` or `df` will not run again until the session is recreated.
spark.stop()