#### This notebook covers advanced transformations and window functions in PySpark

In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext

In [3]:
# Absolute locations of individual part files in the retail_db warehouse
# (products, order items, orders), bound in a single tuple assignment.
productPath, orderitemPath, ordersPath = (
    "/home/solverbot/spark-warehouse/retail_db/products/part-00000",
    "/home/solverbot/spark-warehouse/retail_db/order_items/part-00000",
    "/home/solverbot/spark-warehouse/retail_db/orders/part-00000.txt",
)

In [4]:
# Session vs. context: only the SparkSession is built explicitly here; the
# SparkContext is obtained from it afterwards (via `spark.sparkContext`).
# getOrCreate() reuses an already-running session instead of starting a second one.
session_builder = SparkSession.builder.appName('Adv Transformations')
spark = session_builder.getOrCreate()

22/11/22 07:07:21 WARN Utils: Your hostname, codeStation resolves to a loopback address: 127.0.1.1; using 172.17.0.1 instead (on interface docker0)
22/11/22 07:07:21 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/11/22 07:07:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Take the SparkContext exposed by the session; the bare `sc` on the last
# line makes the notebook display it.
sc = spark.sparkContext
sc

In [6]:
from pyspark.sql.functions import *

In [7]:
# Read everything under the order_items directory as CSV with schema inference.
# No header option is passed, so the columns are named positionally through toDF.
orderItemDF = (
    spark.read
    .csv("/home/solverbot/spark-warehouse/retail_db/order_items/", inferSchema=True)
    .toDF(
        "order_item_id",
        "order_item_order_id",
        "product_id",
        "qty",
        "product_cost",
        "order_subtotal",
    )
)

                                                                                

In [15]:
# Read everything under the orders directory as CSV with schema inference.
# No header option is passed, so the columns are named positionally through toDF.
orderDF = (
    spark.read
    .csv("/home/solverbot/spark-warehouse/retail_db/orders", inferSchema=True)
    .toDF(
        "order_id",
        "order_date",
        "order_customer_id",
        "order_status",
    )
)

In [16]:
# Preview the first two orders to confirm the column names applied by toDF.
orderDF.show(2)

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
+--------+-------------------+-----------------+---------------+
only showing top 2 rows



In [26]:
# Preview the first two order items to confirm the column names applied by toDF.
orderItemDF.show(2)

+-------------+-------------------+----------+---+------------+--------------+
|order_item_id|order_item_order_id|product_id|qty|product_cost|order_subtotal|
+-------------+-------------------+----------+---+------------+--------------+
|            1|                  1|       957|  1|      299.98|        299.98|
|            2|                  2|      1073|  1|      199.99|        199.99|
+-------------+-------------------+----------+---+------------+--------------+
only showing top 2 rows



In [9]:
from pyspark.sql.window import *

In [10]:
# Exploratory: print the documentation of the pyspark.sql.window module
# (it lists the Window and WindowSpec classes).
help(pyspark.sql.window)

Help on module pyspark.sql.window in pyspark.sql:

NAME
    pyspark.sql.window

DESCRIPTION
    # Licensed to the Apache Software Foundation (ASF) under one or more
    # contributor license agreements.  See the NOTICE file distributed with
    # this work for additional information regarding copyright ownership.
    # The ASF licenses this file to You under the Apache License, Version 2.0
    # (the "License"); you may not use this file except in compliance with
    # the License.  You may obtain a copy of the License at
    #
    #    http://www.apache.org/licenses/LICENSE-2.0
    #
    # Unless required by applicable law or agreed to in writing, software
    # distributed under the License is distributed on an "AS IS" BASIS,
    # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    # See the License for the specific language governing permissions and
    # limitations under the License.
    #

CLASSES
    builtins.object
        Window
        WindowSpec
   

In [11]:
# Exploratory: print the Window class documentation, including its notes on
# the default frame with and without an ordering.
help(Window)

Help on class Window in module pyspark.sql.window:

class Window(builtins.object)
 |  Utility functions for defining window in DataFrames.
 |  
 |  .. versionadded:: 1.4
 |  
 |  Notes
 |  -----
 |  When ordering is not defined, an unbounded window frame (rowFrame,
 |  unboundedPreceding, unboundedFollowing) is used by default. When ordering is defined,
 |  a growing window frame (rangeFrame, unboundedPreceding, currentRow) is used by default.
 |  
 |  Examples
 |  --------
 |  >>> # ORDER BY date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
 |  >>> window = Window.orderBy("date").rowsBetween(Window.unboundedPreceding, Window.currentRow)
 |  
 |  >>> # PARTITION BY country ORDER BY date RANGE BETWEEN 3 PRECEDING AND 3 FOLLOWING
 |  >>> window = Window.orderBy("date").partitionBy("country").rangeBetween(-3, 3)
 |  
 |  Static methods defined here:
 |  
 |  orderBy(*cols: Union[ForwardRef('ColumnOrName'), List[ForwardRef('ColumnOrName_')]]) -> 'WindowSpec'
 |      Creates a :class:`W

In [12]:
# Exploratory: print the WindowSpec class documentation (orderBy, partitionBy, ...).
help(WindowSpec)

Help on class WindowSpec in module pyspark.sql.window:

class WindowSpec(builtins.object)
 |  WindowSpec(jspec: py4j.java_gateway.JavaObject) -> None
 |  
 |  A window specification that defines the partitioning, ordering,
 |  and frame boundaries.
 |  
 |  Use the static methods in :class:`Window` to create a :class:`WindowSpec`.
 |  
 |  .. versionadded:: 1.4.0
 |  
 |  Methods defined here:
 |  
 |  __init__(self, jspec: py4j.java_gateway.JavaObject) -> None
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  orderBy(self, *cols: Union[ForwardRef('ColumnOrName'), List[ForwardRef('ColumnOrName_')]]) -> 'WindowSpec'
 |      Defines the ordering columns in a :class:`WindowSpec`.
 |      
 |      .. versionadded:: 1.4.0
 |      
 |      Parameters
 |      ----------
 |      cols : str, :class:`Column` or list
 |          names of columns or expressions
 |  
 |  partitionBy(self, *cols: Union[ForwardRef('ColumnOrName'), List[ForwardRef('ColumnOrName_')]]) -> '

In [19]:
# Short aliases for the orders and order-items DataFrames.
o, oi = orderDF.alias("o"), orderItemDF.alias('oi')

In [22]:
# Register total revenue per product as the SQL temp view 'product_revenue'.
# `sum` here is pyspark.sql.functions.sum (from the star import), not the builtin.
# createOrReplaceTempView is used instead of createTempView so the cell can be
# re-run in the same session: createTempView raises when the view already exists.
oi.groupBy("product_id") \
    .agg(sum('order_subtotal').alias('product_revn')) \
    .createOrReplaceTempView('product_revenue')

In [25]:
# Read the temp view back through Spark SQL and preview two rows.
spark.sql("SELECT * FROM product_revenue").show(2)

+----------+-----------------+
|product_id|     product_revn|
+----------+-----------------+
|       897|7022.189999999962|
|       804| 6136.92999999996|
+----------+-----------------+
only showing top 2 rows



In [26]:
# Total order_subtotal per product_id as a DataFrame aliased "order_revenue".
# `sum` is pyspark.sql.functions.sum (from the star import), not the builtin.
revenue_per_product = sum('order_subtotal').alias('product_revn')
order_revenue = oi.groupBy("product_id").agg(revenue_per_product).alias("order_revenue")

In [27]:
# Preview two rows of the per-product revenue DataFrame.
order_revenue.show(2)

+----------+-----------------+
|product_id|     product_revn|
+----------+-----------------+
|       897|7022.189999999962|
|       804| 6136.92999999996|
+----------+-----------------+
only showing top 2 rows



In [41]:
# Read the products directory as CSV, name the six columns positionally, then
# discard the placeholder column "c3" in the same chain.
products = (
    spark.read
    .csv("/home/solverbot/spark-warehouse/retail_db/products/", inferSchema=True)
    .toDF("p_id", "categ_id", "pdt_name", "c3", "pdt_cost", "pdt_link")
    .drop("c3")
)
products.show(2)

+----+--------+--------------------+--------+--------------------+
|p_id|categ_id|            pdt_name|pdt_cost|            pdt_link|
+----+--------+--------------------+--------+--------------------+
|   1|       2|Quest Q64 10 FT. ...|   59.98|http://images.acm...|
|   2|       2|Under Armour Men'...|  129.99|http://images.acm...|
+----+--------+--------------------+--------+--------------------+
only showing top 2 rows



In [43]:
# Attach product attributes to each product's revenue. The join uses the default
# join type; p_id is dropped afterwards because it duplicates product_id.
join_condition = products.p_id == order_revenue.product_id
joinPdtOrder = order_revenue.join(products, join_condition).drop('p_id')
joinPdtOrder.show(2)

+----------+-----------------+--------+--------------------+--------+--------------------+
|product_id|     product_revn|categ_id|            pdt_name|pdt_cost|            pdt_link|
+----------+-----------------+--------+--------------------+--------+--------------------+
|       897|7022.189999999962|      40|Team Golf New Eng...|   24.99|http://images.acm...|
|       804| 6136.92999999996|      36|Glove It Women's ...|   19.99|http://images.acm...|
+----------+-----------------+--------+--------------------+--------+--------------------+
only showing top 2 rows



In [60]:
# Count the product rows in each category, largest categories first.
product_count = count(col("product_id")).alias('CategCount')
categoryTables = (
    joinPdtOrder
    .groupBy('categ_id')
    .agg(product_count)
    .orderBy(col('CategCount').desc())
)

In [61]:
# Number of rows in categoryTables, i.e. one per categ_id group.
categoryTables.count()

33

In [66]:
# Window over all rows sharing a categ_id. No ordering is defined, so (per the
# Window docs) the default frame is the whole partition.
spec = Window.partitionBy(joinPdtOrder["categ_id"])

In [67]:
# Check what kind of object Window.partitionBy returned.
type(spec)

pyspark.sql.window.WindowSpec

In [70]:
# Inspect the rows of a single category (categ_id = 2) as a reference point
# for the per-category window results.
joinPdtOrder.where('categ_id = 2').show()

+----------+-----------------+--------+--------------------+--------+--------------------+
|product_id|     product_revn|categ_id|            pdt_name|pdt_cost|            pdt_link|
+----------+-----------------+--------+--------------------+--------+--------------------+
|        19| 7999.35999999999|       2|Nike Men's Finger...|  124.99|http://images.acm...|
|        24|5919.259999999989|       2|Elevation Trainin...|   79.99|http://images.acm...|
+----------+-----------------+--------+--------------------+--------+--------------------+





In [73]:
# Sum product revenue over each category window. The window total is attached to
# every row, so distinct() collapses the repeated (categ_id, total) pairs before
# sorting by the total.
categ_revenue = sum(joinPdtOrder.product_revn).over(spec).alias('categ_window')
(
    joinPdtOrder
    .select("categ_id", categ_revenue)
    .distinct()
    .orderBy('categ_window')
    .show()
)

+--------+------------------+
|categ_id|      categ_window|
+--------+------------------+
|      16|  6799.31999999999|
|      34| 10369.38999999999|
|       5|13296.569999999952|
|       2|13918.619999999979|
|       6|14756.719999999943|
|      11|16461.909999999953|
|       7|22660.729999999996|
|      32| 23295.58999999994|
|      31|24188.189999999973|
|      44| 24329.16999999994|
|      36| 25924.24999999983|
|       4| 27099.32999999999|
|      12| 28365.02999999991|
|      30|33337.169999999955|
|      41| 35362.25999999991|
|       3| 37863.67999999992|
|      13| 38688.72999999983|
|      33|           40873.0|
|      40| 44482.19999999975|
|      26|50141.999999999956|
+--------+------------------+
only showing top 20 rows



In [80]:
# Add the category-level total and average revenue next to each product row.
# `round` and `sum` are the pyspark.sql.functions versions (star import); round
# is called without a scale, so the values are rounded to whole numbers.
categ_sum_col = round(sum(joinPdtOrder.product_revn).over(spec))
categ_avg_col = round(avg(joinPdtOrder.product_revn).over(spec))
(
    joinPdtOrder
    .withColumn("categ_sum", categ_sum_col)
    .withColumn("categ_avg", categ_avg_col)
    .show(3)
)

+----------+------------------+--------+--------------------+--------+--------------------+---------+---------+
|product_id|      product_revn|categ_id|            pdt_name|pdt_cost|            pdt_link|categ_sum|categ_avg|
+----------+------------------+--------+--------------------+--------+--------------------+---------+---------+
|        19|  7999.35999999999|       2|Nike Men's Finger...|  124.99|http://images.acm...|  13919.0|   6959.0|
|        24| 5919.259999999989|       2|Elevation Trainin...|   79.99|http://images.acm...|  13919.0|   6959.0|
|        44|18296.949999999964|       3|adidas Men's F10 ...|   59.99|http://images.acm...|  37864.0|  12621.0|
+----------+------------------+--------+--------------------+--------+--------------------+---------+---------+
only showing top 3 rows



In [81]:
# Look at the order items again before building a per-order window.
orderItemDF.show(5)

+-------------+-------------------+----------+---+------------+--------------+
|order_item_id|order_item_order_id|product_id|qty|product_cost|order_subtotal|
+-------------+-------------------+----------+---+------------+--------------+
|            1|                  1|       957|  1|      299.98|        299.98|
|            2|                  2|      1073|  1|      199.99|        199.99|
|            3|                  2|       502|  5|       250.0|          50.0|
|            4|                  2|       403|  1|      129.99|        129.99|
|            5|                  4|       897|  2|       49.98|         24.99|
+-------------+-------------------+----------+---+------------+--------------+
only showing top 5 rows



In [82]:
# Per-order window, highest subtotal first.
# Items in the same order can share a subtotal, so ordering on order_subtotal
# alone leaves the relative position of tied rows undefined. order_item_id is
# added as a tie-breaker so that row order (and therefore lead/lag/row-based
# results) is deterministic across runs.
order_spec = (
    Window
    .partitionBy('order_item_order_id')
    .orderBy(
        orderItemDF.order_subtotal.desc(),
        orderItemDF.order_item_id.asc(),
    )
)

In [83]:
# For each item, fetch the subtotal of the following item in the same order
# (as ordered by order_spec); the last item of an order gets null.
next_revenue_col = lead('order_subtotal').over(order_spec)
orderItemDF.withColumn('next_revenue', next_revenue_col).show()



+-------------+-------------------+----------+---+------------+--------------+------------+
|order_item_id|order_item_order_id|product_id|qty|product_cost|order_subtotal|next_revenue|
+-------------+-------------------+----------+---+------------+--------------+------------+
|            1|                  1|       957|  1|      299.98|        299.98|        null|
|            9|                  5|       957|  1|      299.98|        299.98|      299.98|
|           12|                  5|       957|  1|      299.98|        299.98|      129.99|
|           13|                  5|       403|  1|      129.99|        129.99|       59.99|
|           10|                  5|       365|  5|      299.95|         59.99|       49.98|
|           11|                  5|      1014|  2|       99.96|         49.98|        null|
|           34|                 12|       957|  1|      299.98|        299.98|       99.99|
|           37|                 12|       191|  5|      499.95|         99.99|  

                                                                                

In [86]:
# Per-category window ordered by product revenue. No .desc() is applied, so the
# ordering is ascending: rank 1 is the LOWEST-revenue product in a category.
# NOTE(review): use .desc() on product_revn if top sellers are wanted instead.
rank_spec = (
    Window
    .partitionBy(joinPdtOrder["categ_id"])
    .orderBy(joinPdtOrder["product_revn"])
)

In [84]:
# Re-check the joined product/revenue frame before ranking it.
joinPdtOrder.show(2)

+----------+-----------------+--------+--------------------+--------+--------------------+
|product_id|     product_revn|categ_id|            pdt_name|pdt_cost|            pdt_link|
+----------+-----------------+--------+--------------------+--------+--------------------+
|       897|7022.189999999962|      40|Team Golf New Eng...|   24.99|http://images.acm...|
|       804| 6136.92999999996|      36|Glove It Women's ...|   19.99|http://images.acm...|
+----------+-----------------+--------+--------------------+--------+--------------------+
only showing top 2 rows



In [87]:
# Append each product's rank within its category, following rank_spec's ordering.
revenue_rank = rank().over(rank_spec)
pdtOrderRevnueRanked = joinPdtOrder.withColumn("rnk", revenue_rank)

In [90]:
# Preview the ranked frame; the new rnk column is appended last.
pdtOrderRevnueRanked.show(2)

+----------+-----------------+--------+--------------------+--------+--------------------+---+
|product_id|     product_revn|categ_id|            pdt_name|pdt_cost|            pdt_link|rnk|
+----------+-----------------+--------+--------------------+--------+--------------------+---+
|        24|5919.259999999989|       2|Elevation Trainin...|   79.99|http://images.acm...|  1|
|        19| 7999.35999999999|       2|Nike Men's Finger...|  124.99|http://images.acm...|  2|
+----------+-----------------+--------+--------------------+--------+--------------------+---+
only showing top 2 rows



In [92]:
# Keep ranks 1 and 2 in every category. rank_spec orders revenue ascending, so
# these are the two lowest-revenue products per category.
pdtOrderRevnueRanked.where('rnk <= 2').show(5)

+----------+-----------------+--------+--------------------+--------+--------------------+---+
|product_id|     product_revn|categ_id|            pdt_name|pdt_cost|            pdt_link|rnk|
+----------+-----------------+--------+--------------------+--------+--------------------+---+
|        24|5919.259999999989|       2|Elevation Trainin...|   79.99|http://images.acm...|  1|
|        19| 7999.35999999999|       2|Nike Men's Finger...|  124.99|http://images.acm...|  2|
|        37|9167.379999999963|       3|adidas Kids' F5 M...|   34.99|http://images.acm...|  1|
|        35|10399.34999999999|       3|adidas Brazuca 20...|  159.99|http://images.acm...|  2|
|        61|8399.719999999996|       4|Diamondback Girls...|  299.99|http://images.acm...|  1|
+----------+-----------------+--------+--------------------+--------+--------------------+---+
only showing top 5 rows

