In [1]:
from pyspark.sql import SparkSession

# Obtain the notebook's SparkSession: reuse the active one if it exists,
# otherwise build a new session with default settings.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)


22/11/01 14:33:16 WARN Utils: Your hostname, kevin resolves to a loopback address: 127.0.1.1; using 192.168.1.6 instead (on interface wlp0s20f3)
22/11/01 14:33:16 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/11/01 14:33:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Load every CSV file in the retail-data directory into one DataFrame.
# The first line of each file is treated as the header and column types
# are inferred from the data; the result is coalesced into 5 partitions.
df = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .option('inferSchema', 'true')
    .load('/home/kevin/Desktop/Big-Data-with-Pyspark/data/retail-data/all/*.csv')
    .coalesce(5)
)

# Mark the DataFrame for caching so the aggregations below can reuse it.
# As the last expression of the cell this also displays the schema.
df.cache()

                                                                                

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: string, UnitPrice: double, CustomerID: int, Country: string]

# Aggregate Functions

## count

In [3]:
from pyspark.sql.functions import count

# Count the StockCode values over the whole DataFrame (a single-row result).
(
    df
    .select(count('StockCode'))
    .show()
)



+----------------+
|count(StockCode)|
+----------------+
|          541909|
+----------------+



                                                                                

## countDistinct

In [5]:
from pyspark.sql.functions import countDistinct

# Exact number of distinct StockCode values.
(
    df
    .select(countDistinct('StockCode'))
    .show()
)

+-------------------------+
|count(DISTINCT StockCode)|
+-------------------------+
|                     4070|
+-------------------------+



## approx_count_distinct

In [7]:
from pyspark.sql.functions import approx_count_distinct

# Approximate number of distinct StockCode values. `rsd` is the maximum
# relative standard deviation tolerated for the estimate (0.05 here).
(
    df
    .select(approx_count_distinct('StockCode', rsd=0.05))
    .show()
)

+--------------------------------+
|approx_count_distinct(StockCode)|
+--------------------------------+
|                            3804|
+--------------------------------+



## first and last

In [8]:
from pyspark.sql.functions import first, last

# First and last StockCode as encountered in the DataFrame.
# NOTE(review): no ordering is imposed here, so the values reflect whatever
# row order the read produced — confirm this is the intended demonstration.
(
    df
    .select(
        first('StockCode'),
        last('StockCode'),
    )
    .show()
)

+----------------+---------------+
|first(StockCode)|last(StockCode)|
+----------------+---------------+
|          85123A|          22138|
+----------------+---------------+



# Grouping

In [10]:
# Number of rows per (invoice, customer) pair; display the first 5 groups.
(
    df
    .groupBy('InvoiceNo', 'CustomerId')
    .count()
    .show(5)
)

+---------+----------+-----+
|InvoiceNo|CustomerId|count|
+---------+----------+-----+
|   536846|     14573|   76|
|   537026|     12395|   12|
|   537883|     14437|    5|
|   538068|     17978|   12|
|   538279|     14952|    7|
+---------+----------+-----+
only showing top 5 rows



## Grouping with Expressions

In [19]:
from pyspark.sql.functions import expr, stddev_pop, avg

# The same per-invoice aggregate written two ways: once with the `count`
# column function (aliased to `quan`) and once as a SQL expression string.
# `count` is the function imported in the earlier "count" cell.
(
    df
    .groupBy('InvoiceNo')
    .agg(
        count('Quantity').alias('quan'),
        expr('count(Quantity)'),
    )
    .show(5)
)

+---------+----+---------------+
|InvoiceNo|quan|count(Quantity)|
+---------+----+---------------+
|   536596|   6|              6|
|   536938|  14|             14|
|   537252|   1|              1|
|   537691|  20|             20|
|   538041|   1|              1|
+---------+----+---------------+
only showing top 5 rows



## Grouping with Maps

In [20]:

# Per-invoice mean and population standard deviation of Quantity, both
# expressed as SQL expression strings; display the first 4 invoices.
(
    df
    .groupBy('InvoiceNo')
    .agg(
        expr('avg(Quantity)'),
        expr('stddev_pop(Quantity)'),
    )
    .show(4)
)

+---------+------------------+--------------------+
|InvoiceNo|     avg(Quantity)|stddev_pop(Quantity)|
+---------+------------------+--------------------+
|   536596|               1.5|  1.1180339887498947|
|   536938|33.142857142857146|  20.698023172885524|
|   537252|              31.0|                 0.0|
|   537691|              8.15|   5.597097462078001|
+---------+------------------+--------------------+
only showing top 4 rows



## Window functions

In [27]:
# NOTE: importing `max` here shadows Python's built-in max() for the rest of
# the notebook; later cells rely on this Spark version of `max`.
from pyspark.sql.functions import col, to_date, desc, max, dense_rank, rank

# InvoiceDate is a string such as '12/1/2010 8:26' (no zero padding on the
# day or hour, and presumably none on the month either). Single-letter 'M',
# 'd' and 'H' accept one- or two-digit values, whereas 'MM' demands a
# zero-padded two-digit month and would fail on single-digit months under
# Spark 3's datetime parser. The year is written out as a full 'yyyy'.
dfWithDate = df.withColumn(
    'date',
    to_date(col('InvoiceDate'), 'M/d/yyyy H:mm'),
)
dfWithDate.show(5)

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+----------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|      date|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+----------+
|   536365|   85123A|WHITE HANGING HEA...|       6|12/1/2010 8:26|     2.55|     17850|United Kingdom|2010-12-01|
|   536365|    71053| WHITE METAL LANTERN|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|2010-12-01|
|   536365|   84406B|CREAM CUPID HEART...|       8|12/1/2010 8:26|     2.75|     17850|United Kingdom|2010-12-01|
|   536365|   84029G|KNITTED UNION FLA...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|2010-12-01|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|2010-12-01|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+----------+

In [23]:
from pyspark.sql.window import Window

# One partition per (customer, date), rows ordered from the largest to the
# smallest Quantity, with a frame that runs from the start of the partition
# up to and including the current row.
windowSpec = (
    Window
    .partitionBy('CustomerId', 'date')
    .orderBy(desc('Quantity'))
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

In [28]:
# Position of each purchase's Quantity within its window partition:
# rank() leaves gaps after ties, dense_rank() does not.
purchaseRank = rank().over(windowSpec)
purchaseDenseRank = dense_rank().over(windowSpec)

# Maximum Quantity seen within the window frame. windowSpec partitions by
# customer and date, so this is a per-customer, per-date value rather than
# a maximum across all dates.
maxPurchaseQuantity = (
    max(col('Quantity'))
    .over(windowSpec)
)

In [31]:
# Build the per-customer purchase ranking for rows that have a customer id.
# The sort is applied AFTER the window columns are selected: evaluating the
# window functions repartitions and re-sorts the data, so an orderBy placed
# before the select would not be reflected in the result.
# No action is called here, so the cell only displays the resulting schema.
(
    dfWithDate
    .where('CustomerId IS NOT NULL')
    .select(
        col('customerId'),
        col('Quantity'),
        purchaseRank.alias('quantityRank'),
        purchaseDenseRank.alias('quantityDenseRank'),
        maxPurchaseQuantity.alias('maxPurchaseQuantity'),
    )
    .orderBy('customerId')
)

DataFrame[customerId: int, Quantity: int, quantityRank: int, quantityDenseRank: int, maxPurchaseQuantity: int]

In [32]:
# Stop the SparkSession created in the first cell now that the notebook is done.
spark.stop()