### Alias (별칭) 지정과 다수의 집계 함수 사용하기

In [21]:
from pyspark.sql import SparkSession
from pyspark import SparkConf

# Spark settings for this notebook: the application name and a local master
# ('local[*]' runs Spark in-process on the local machine).
conf = SparkConf().setAll([
    ('spark.app.name', 'PySpark DataFrame 2'),
    ('spark.master', 'local[*]'),
])

# Obtain the session, creating it with the configuration above if none exists.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [None]:
from pyspark.sql.types import StructField, StructType, StringType, FloatType

# Explicit schema for the CSV: three nullable columns, declared up front so
# Spark does not have to infer the types.
# NOTE(review): FloatType is a 32-bit float; the aggregated sums printed
# further down show rounding artifacts (e.g. 4975.219970226288). Consider
# DoubleType/DecimalType if exact totals matter.
schema = StructType(fields=[
    StructField('cust_id', StringType(), True),
    StructField('item_id', StringType(), True),
    StructField('amount_spent', FloatType(), True),
])

# No header option is set, so every line of the file is read as a data row.
df = spark.read.load('customer-orders.csv', format='csv', schema=schema)

In [26]:
# Print the column names, types and nullability that were applied on load.
df.printSchema()

# Preview the first five rows of the loaded data.
df.show(n=5)

root
 |-- cust_id: string (nullable = true)
 |-- item_id: string (nullable = true)
 |-- amount_spent: float (nullable = true)

+-------+-------+------------+
|cust_id|item_id|amount_spent|
+-------+-------+------------+
|     44|   8602|       37.19|
|     35|   5368|       65.89|
|      2|   3391|       40.64|
|     47|   6694|       14.98|
|     29|    680|       13.08|
+-------+-------+------------+
only showing top 5 rows



In [32]:
# Sum amount_spent per cust_id. Without an alias the result column keeps
# Spark's generated name, sum(amount_spent).
(
    df.groupBy('cust_id')
    .sum('amount_spent')
    .show(n=5)
)

+-------+-----------------+
|cust_id|sum(amount_spent)|
+-------+-----------------+
|     51|4975.219970226288|
|      7|4755.070008277893|
|     15|5413.510010659695|
|     54|6065.390002984554|
|     11|5152.289969373494|
+-------+-----------------+
only showing top 5 rows



In [33]:
# Same per-customer total, but the generated column name is replaced
# afterwards with the shorter name 'sum'.
(
    df.groupBy('cust_id')
    .sum('amount_spent')
    .withColumnRenamed('sum(amount_spent)', 'sum')
    .show(n=5)
)

+-------+-----------------+
|cust_id|              sum|
+-------+-----------------+
|     51|4975.219970226288|
|      7|4755.070008277893|
|     15|5413.510010659695|
|     54|6065.390002984554|
|     11|5152.289969373494|
+-------+-----------------+
only showing top 5 rows



In [35]:
import pyspark.sql.functions as f

# Name the aggregate directly with alias() inside agg(), so no separate
# rename step is needed.
(
    df.groupBy('cust_id')
    .agg(f.sum('amount_spent').alias('sum'))
    .show(n=5)
)

+-------+-----------------+
|cust_id|              sum|
+-------+-----------------+
|     51|4975.219970226288|
|      7|4755.070008277893|
|     15|5413.510010659695|
|     54|6065.390002984554|
|     11|5152.289969373494|
+-------+-----------------+
only showing top 5 rows



In [36]:
# agg() accepts several aggregate expressions at once; each one is given
# its own alias so the result has the columns sum, avg and max.
(
    df.groupBy('cust_id')
    .agg(
        f.sum('amount_spent').alias('sum'),
        f.avg('amount_spent').alias('avg'),
        f.max('amount_spent').alias('max'),
    )
    .show(n=5)
)

+-------+-----------------+-----------------+-----+
|cust_id|              sum|              avg|  max|
+-------+-----------------+-----------------+-----+
|     51|4975.219970226288|48.77666637476753|97.61|
|      7|4755.070008277893|50.58585115189248| 98.6|
|     15|5413.510010659695|52.05298087172783|99.57|
|     54|6065.390002984554|49.31211384540288|99.23|
|     11|5152.289969373494|47.70638860531013|99.11|
+-------+-----------------+-----------------+-----+
only showing top 5 rows



In [38]:
# Register the DataFrame as a temporary view so it can be queried by name
# from Spark SQL.
df.createOrReplaceTempView('customer_orders')

# SQL form of the grouped aggregation: sum, average and maximum of
# amount_spent per cust_id, with AS supplying the column aliases.
aggregate_query = """
    SELECT cust_id, SUM(amount_spent) AS sum, AVG(amount_spent) AS avg, MAX(amount_spent) AS max
    FROM customer_orders
    GROUP BY cust_id
"""

df_sql = spark.sql(aggregate_query)

In [39]:
# Show the first five rows returned by the SQL query.
df_sql.show(n=5)

+-------+-----------------+-----------------+-----+
|cust_id|              sum|              avg|  max|
+-------+-----------------+-----------------+-----+
|     51|4975.219970226288|48.77666637476753|97.61|
|      7|4755.070008277893|50.58585115189248| 98.6|
|     15|5413.510010659695|52.05298087172783|99.57|
|     54|6065.390002984554|49.31211384540288|99.23|
|     11|5152.289969373494|47.70638860531013|99.11|
+-------+-----------------+-----------------+-----+
only showing top 5 rows



In [40]:
# List the tables/views known to the session catalog; the temporary view
# registered as 'customer_orders' appears here.
spark.catalog.listTables()

[Table(name='customer_orders', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True)]