#### This notebook is about the Transformation to be done on DFs

In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext

In [2]:
employees = [
    (1, "Scott", "Tiger", 1000.0, 
      "united states", "+1 123 456 7890", "123 45 6789"
    ),
     (2, "Henry", "Ford", 1250.0, 
      "India", "+91 234 567 8901", "456 78 9123"
     ),
     (3, "Nick", "Junior", 750.0, 
      "united KINGDOM", "+44 111 111 1111", "222 33 4444"
     ),
     (4, "Bill", "Gomes", 1500.0, 
      "AUSTRALIA", "+61 987 654 3210", "789 12 6118"
     )
]

In [3]:
productPath = "/home/solverbot/spark-warehouse/retail_db/products/part-00000"
orderitemPath = "/home/solverbot/spark-warehouse/retail_db/order_items/part-00000"
ordersPath = "/home/solverbot/spark-warehouse/retail_db/orders/part-00000.txt"

In [4]:
#What is the difference between Session and Context?
#SC is part of the Spark session that is established above
spark = SparkSession.builder.appName('DF Tranformations').getOrCreate()

22/11/22 04:41:09 WARN Utils: Your hostname, codeStation resolves to a loopback address: 127.0.1.1; using 172.17.0.1 instead (on interface docker0)
22/11/22 04:41:09 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/11/22 04:41:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
sc = spark.sparkContext
sc

In [7]:
employeesDF = spark. \
    createDataFrame(employees,
                    schema="""employee_id INT, first_name STRING, 
                    last_name STRING, salary FLOAT, nationality STRING,
                    phone_number STRING, ssn STRING"""
                   )

In [8]:
from pyspark.sql.functions import *

In [9]:
employeesDF. \
    groupBy(upper(col("nationality")).alias('upper_nationality')). \
    count(). \
    show()



+-----------------+-----+
|upper_nationality|count|
+-----------------+-----+
|    UNITED STATES|    1|
|            INDIA|    1|
|   UNITED KINGDOM|    1|
|        AUSTRALIA|    1|
+-----------------+-----+



                                                                                

In [10]:
orderItemDF = spark.read.csv("/home/solverbot/spark-warehouse/retail_db/order_items/",inferSchema=True).toDF("order_item_id","order_item_order_id","product_id", "qty","product_cost","order_subtotal")

                                                                                

In [11]:
orderDF = spark.read.csv("/home/solverbot/spark-warehouse/retail_db/orders",inferSchema=True) \
            .toDF("order_id","order_date","order_customer_id","order_status")

                                                                                

In [27]:
orderDF.show(2)

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
+--------+-------------------+-----------------+---------------+
only showing top 2 rows



In [26]:
orderItemDF.show(2)

+-------------+-------------------+----------+---+------------+--------------+
|order_item_id|order_item_order_id|product_id|qty|product_cost|order_subtotal|
+-------------+-------------------+----------+---+------------+--------------+
|            1|                  1|       957|  1|      299.98|        299.98|
|            2|                  2|      1073|  1|      199.99|        199.99|
+-------------+-------------------+----------+---+------------+--------------+
only showing top 2 rows



In [28]:
#Note, even the numbers are read as strings when the DF is created
orderDF.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: timestamp (nullable = true)
 |-- order_customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



In [29]:
orderItemDF.printSchema()

root
 |-- order_item_id: integer (nullable = true)
 |-- order_item_order_id: integer (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- qty: integer (nullable = true)
 |-- product_cost: double (nullable = true)
 |-- order_subtotal: double (nullable = true)



In [13]:
help(orderDF.filter)

Help on method filter in module pyspark.sql.dataframe:

filter(condition: 'ColumnOrName') -> 'DataFrame' method of pyspark.sql.dataframe.DataFrame instance
    Filters rows using the given condition.
    
    :func:`where` is an alias for :func:`filter`.
    
    .. versionadded:: 1.3.0
    
    Parameters
    ----------
    condition : :class:`Column` or str
        a :class:`Column` of :class:`types.BooleanType`
        or a string of SQL expression.
    
    Examples
    --------
    >>> df.filter(df.age > 3).collect()
    [Row(age=5, name='Bob')]
    >>> df.where(df.age == 2).collect()
    [Row(age=2, name='Alice')]
    
    >>> df.filter("age > 3").collect()
    [Row(age=5, name='Bob')]
    >>> df.where("age = 2").collect()
    [Row(age=2, name='Alice')]



In [20]:
filterOrder = orderDF.filter(orderDF.order_status != "COMPLETE")
filterOrder.show(2)

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
+--------+-------------------+-----------------+---------------+
only showing top 2 rows



In [81]:
orderDF.where('order_status IN ("COMPLETE","CLOSED")').show(2)

+--------+-------------------+-----------------+------------+
|order_id|         order_date|order_customer_id|order_status|
+--------+-------------------+-----------------+------------+
|       1|2013-07-25 00:00:00|            11599|      CLOSED|
|       3|2013-07-25 00:00:00|            12111|    COMPLETE|
+--------+-------------------+-----------------+------------+
only showing top 2 rows



In [86]:
orderDF.selectExpr('order_status ="COMPLETE"').alias('filtered').show(2)

+-------------------------+
|(order_status = COMPLETE)|
+-------------------------+
|                    false|
|                    false|
+-------------------------+
only showing top 2 rows



In [88]:
help(orderDF.order_status)

Help on Column in module pyspark.sql.column object:

class Column(builtins.object)
 |  Column(jc: py4j.java_gateway.JavaObject) -> None
 |  
 |  A column in a DataFrame.
 |  
 |  :class:`Column` instances can be created by::
 |  
 |      # 1. Select a column out of a DataFrame
 |  
 |      df.colName
 |      df["colName"]
 |  
 |      # 2. Create from an expression
 |      df.colName + 1
 |      1 / df.colName
 |  
 |  .. versionadded:: 1.3.0
 |  
 |  Methods defined here:
 |  
 |  __add__ = _(self: 'Column', other: Union[ForwardRef('Column'), ForwardRef('LiteralType'), ForwardRef('DecimalLiteral'), ForwardRef('DateTimeLiteral')]) -> 'Column'
 |      binary operator
 |  
 |  __and__ = _(self: 'Column', other: Union[ForwardRef('Column'), ForwardRef('LiteralType'), ForwardRef('DecimalLiteral'), ForwardRef('DateTimeLiteral')]) -> 'Column'
 |      binary operator
 |  
 |  __bool__ = __nonzero__(self) -> None
 |  
 |  __contains__(self, item: Any) -> None
 |      # container operators
 |  


In [87]:
orderDF.where((orderDF.order_status == 'COMPLETE').__or__(orderDF.order_status=='CLOSED')).show(2)

+--------+-------------------+-----------------+------------+
|order_id|         order_date|order_customer_id|order_status|
+--------+-------------------+-----------------+------------+
|       1|2013-07-25 00:00:00|            11599|      CLOSED|
|       3|2013-07-25 00:00:00|            12111|    COMPLETE|
+--------+-------------------+-----------------+------------+
only showing top 2 rows



In [17]:
#There is dateformat which needs to be understood
orderDF.where((orderDF.order_status == 'COMPLETE').__and__(orderDF.order_date.startswith('2013-12'))).show(2)

+--------+-------------------+-----------------+------------+
|order_id|         order_date|order_customer_id|order_status|
+--------+-------------------+-----------------+------------+
|   20919|2013-12-01 00:00:00|              383|    COMPLETE|
|   20922|2013-12-01 00:00:00|             9720|    COMPLETE|
+--------+-------------------+-----------------+------------+
only showing top 2 rows



In [21]:
orderDF.where((orderDF.order_status=='COMPLETE').__and__(orderDF.order_date.like("2014-01%"))).show(2)

+--------+-------------------+-----------------+------------+
|order_id|         order_date|order_customer_id|order_status|
+--------+-------------------+-----------------+------------+
|   25882|2014-01-01 00:00:00|             4598|    COMPLETE|
|   25888|2014-01-01 00:00:00|             6735|    COMPLETE|
+--------+-------------------+-----------------+------------+
only showing top 2 rows



In [31]:
orderItemDF.where(orderItemDF.order_subtotal != orderItemDF.qty * orderItemDF.product_cost).show(2)

+-------------+-------------------+----------+---+------------+--------------+
|order_item_id|order_item_order_id|product_id|qty|product_cost|order_subtotal|
+-------------+-------------------+----------+---+------------+--------------+
|            3|                  2|       502|  5|       250.0|          50.0|
|            5|                  4|       897|  2|       49.98|         24.99|
+-------------+-------------------+----------+---+------------+--------------+
only showing top 2 rows



In [33]:
orderItemDF.filter(orderItemDF.order_subtotal != orderItemDF.qty * orderItemDF.product_cost).show(2)

+-------------+-------------------+----------+---+------------+--------------+
|order_item_id|order_item_order_id|product_id|qty|product_cost|order_subtotal|
+-------------+-------------------+----------+---+------------+--------------+
|            3|                  2|       502|  5|       250.0|          50.0|
|            5|                  4|       897|  2|       49.98|         24.99|
+-------------+-------------------+----------+---+------------+--------------+
only showing top 2 rows



In [15]:
orderDF.selectExpr("CASE WHEN order_status IN ('COMPLETE','CLOSED') THEN 'COMPLETED' ELSE 'PENDING' END").show(2)

+------------------------------------------------------------------------------+
|CASE WHEN (order_status IN (COMPLETE, CLOSED)) THEN COMPLETED ELSE PENDING END|
+------------------------------------------------------------------------------+
|                                                                     COMPLETED|
|                                                                       PENDING|
+------------------------------------------------------------------------------+
only showing top 2 rows



In [21]:
help(orderDF.join)

Help on method join in module pyspark.sql.dataframe:

join(other: 'DataFrame', on: Union[str, List[str], pyspark.sql.column.Column, List[pyspark.sql.column.Column], NoneType] = None, how: Optional[str] = None) -> 'DataFrame' method of pyspark.sql.dataframe.DataFrame instance
    Joins with another :class:`DataFrame`, using the given join expression.
    
    .. versionadded:: 1.3.0
    
    Parameters
    ----------
    other : :class:`DataFrame`
        Right side of the join
    on : str, list or :class:`Column`, optional
        a string for the join column name, a list of column names,
        a join expression (Column), or a list of Columns.
        If `on` is a string or a list of strings indicating the name of the join column(s),
        the column(s) must exist on both sides, and this performs an equi-join.
    how : str, optional
        default ``inner``. Must be one of: ``inner``, ``cross``, ``outer``,
        ``full``, ``fullouter``, ``full_outer``, ``left``, ``leftouter``,

In [12]:
orderJoined = orderDF.join(orderItemDF, orderDF.order_id == orderItemDF.order_item_id, 'outer')

In [13]:
orderJoined.take(2)

                                                                                

[Row(order_id=1, order_date=datetime.datetime(2013, 7, 25, 0, 0), order_customer_id=11599, order_status='CLOSED', order_item_id=1, order_item_order_id=1, product_id=957, qty=1, product_cost=299.98, order_subtotal=299.98),
 Row(order_id=1, order_date=datetime.datetime(2013, 7, 25, 0, 0), order_customer_id=11599, order_status='CLOSED', order_item_id=1, order_item_order_id=1, product_id=957, qty=1, product_cost=299.98, order_subtotal=299.98)]

In [27]:
help(orderDF.groupBy)

Help on method groupBy in module pyspark.sql.dataframe:

groupBy(*cols: 'ColumnOrName') -> 'GroupedData' method of pyspark.sql.dataframe.DataFrame instance
    Groups the :class:`DataFrame` using the specified columns,
    so we can run aggregation on them. See :class:`GroupedData`
    for all the available aggregate functions.
    
    :func:`groupby` is an alias for :func:`groupBy`.
    
    .. versionadded:: 1.3.0
    
    Parameters
    ----------
    cols : list, str or :class:`Column`
        columns to group by.
        Each element should be a column name (string) or an expression (:class:`Column`).
    
    Examples
    --------
    >>> df.groupBy().avg().collect()
    [Row(avg(age)=3.5)]
    >>> sorted(df.groupBy('name').agg({'age': 'mean'}).collect())
    [Row(name='Alice', avg(age)=2.0), Row(name='Bob', avg(age)=5.0)]
    >>> sorted(df.groupBy(df.name).avg().collect())
    [Row(name='Alice', avg(age)=2.0), Row(name='Bob', avg(age)=5.0)]
    >>> sorted(df.groupBy(['name', df

In [58]:
orderJoinedGroup= orderJoined.groupBy('order_id').agg(sum(col("order_subtotal")).alias('order_total'))

In [59]:
orderJoinedGroup.show(2)

+--------+-----------+
|order_id|order_total|
+--------+-----------+
|     148|      99.99|
|     463|     399.98|
+--------+-----------+
only showing top 2 rows



In [35]:
help(orderJoinedGroup.sort)

Help on method sort in module pyspark.sql.dataframe:

sort(*cols: Union[str, pyspark.sql.column.Column, List[Union[str, pyspark.sql.column.Column]]], **kwargs: Any) -> 'DataFrame' method of pyspark.sql.dataframe.DataFrame instance
    Returns a new :class:`DataFrame` sorted by the specified column(s).
    
    .. versionadded:: 1.3.0
    
    Parameters
    ----------
    cols : str, list, or :class:`Column`, optional
         list of :class:`Column` or column names to sort by.
    
    Other Parameters
    ----------------
    ascending : bool or list, optional
        boolean or list of boolean (default ``True``).
        Sort ascending vs. descending. Specify list for multiple sort orders.
        If a list is specified, length of the list must equal length of the `cols`.
    
    Examples
    --------
    >>> df.sort(df.age.desc()).collect()
    [Row(age=5, name='Bob'), Row(age=2, name='Alice')]
    >>> df.sort("age", ascending=False).collect()
    [Row(age=5, name='Bob'), Row(age=

In [61]:
orderJoinedGroup.sort(col("order_total").desc()).dropna().show(20)



+--------+-----------+
|order_id|order_total|
+--------+-----------+
|       1|     599.96|
|       9|     599.96|
|   18498|     399.98|
|   26623|     399.98|
|   26708|     399.98|
|   27760|     399.98|
|   38723|     399.98|
|   30361|     399.98|
|    1645|     399.98|
|   30970|     399.98|
|    3749|     399.98|
|   29719|     399.98|
|    6336|     399.98|
|   10206|     399.98|
|   30654|     399.98|
|   13840|     399.98|
|     463|     399.98|
|   31912|     399.98|
|    1829|     399.98|
|   32396|     399.98|
+--------+-----------+
only showing top 20 rows



                                                                                

In [62]:
help(orderDF.select)

Help on method select in module pyspark.sql.dataframe:

select(*cols: 'ColumnOrName') -> 'DataFrame' method of pyspark.sql.dataframe.DataFrame instance
    Projects a set of expressions and returns a new :class:`DataFrame`.
    
    .. versionadded:: 1.3.0
    
    Parameters
    ----------
    cols : str, :class:`Column`, or list
        column names (string) or expressions (:class:`Column`).
        If one of the column names is '*', that column is expanded to include all columns
        in the current :class:`DataFrame`.
    
    Examples
    --------
    >>> df.select('*').collect()
    [Row(age=2, name='Alice'), Row(age=5, name='Bob')]
    >>> df.select('name', 'age').collect()
    [Row(name='Alice', age=2), Row(name='Bob', age=5)]
    >>> df.select(df.name, (df.age + 10).alias('age')).collect()
    [Row(name='Alice', age=12), Row(name='Bob', age=15)]



In [64]:
orderDF.select(col('order_id'),col('order_status')).show(2)

+--------+---------------+
|order_id|   order_status|
+--------+---------------+
|       1|         CLOSED|
|       2|PENDING_PAYMENT|
+--------+---------------+
only showing top 2 rows



In [65]:
orderDF.select('order_id','order_status').show(2)

+--------+---------------+
|order_id|   order_status|
+--------+---------------+
|       1|         CLOSED|
|       2|PENDING_PAYMENT|
+--------+---------------+
only showing top 2 rows



In [75]:
orderDF.select(orderDF.order_id, orderDF.order_date).show(2)

+--------+-------------------+
|order_id|         order_date|
+--------+-------------------+
|       1|2013-07-25 00:00:00|
|       2|2013-07-25 00:00:00|
+--------+-------------------+
only showing top 2 rows



In [37]:
orderDF.rdd.getNumPartitions()

2

In [40]:
orderDF.createTempView("sqlView")

In [41]:
#spark sql can work on only views, which are equivalent to the views 
spark.sql("SELECT order_status FROM sqlView").show(2)

+---------------+
|   order_status|
+---------------+
|         CLOSED|
|PENDING_PAYMENT|
+---------------+
only showing top 2 rows



In [44]:
orderDF.cube("order_status").count().show()

+---------------+-----+
|   order_status|count|
+---------------+-----+
|         CLOSED| 7558|
|PENDING_PAYMENT|15033|
|       COMPLETE|22903|
|           null|68893|
|        ON_HOLD| 3798|
|     PROCESSING| 8276|
| PAYMENT_REVIEW|  729|
|        PENDING| 7610|
|SUSPECTED_FRAUD| 1558|
|       CANCELED| 1428|
+---------------+-----+



In [45]:
help(pyspark.sql.functions)

Help on module pyspark.sql.functions in pyspark.sql:

NAME
    pyspark.sql.functions - A collections of builtin functions

FUNCTIONS
    abs(col: 'ColumnOrName') -> pyspark.sql.column.Column
        Computes the absolute value.
        
        .. versionadded:: 1.3
    
    acos(col: 'ColumnOrName') -> pyspark.sql.column.Column
        Computes inverse cosine of the input column.
        
        .. versionadded:: 1.4.0
        
        Returns
        -------
        :class:`~pyspark.sql.Column`
            inverse cosine of `col`, as if computed by `java.lang.Math.acos()`
    
    acosh(col: 'ColumnOrName') -> pyspark.sql.column.Column
        Computes inverse hyperbolic cosine of the input column.
        
        .. versionadded:: 3.1.0
        
        Returns
        -------
        :class:`~pyspark.sql.Column`
    
    add_months(start: 'ColumnOrName', months: Union[ForwardRef('ColumnOrName'), int]) -> pyspark.sql.column.Column
        Returns the date that is `months` months a

In [46]:
help(date_format)

Help on function date_format in module pyspark.sql.functions:

date_format(date: 'ColumnOrName', format: str) -> pyspark.sql.column.Column
    Converts a date/timestamp/string to a value of string in the format specified by the date
    format given by the second argument.
    
    A pattern could be for instance `dd.MM.yyyy` and could return a string like '18.03.1993'. All
    pattern letters of `datetime pattern`_. can be used.
    
    .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
    
    .. versionadded:: 1.5.0
    
    Notes
    -----
    Whenever possible, use specialized functions like `year`.
    
    Examples
    --------
    >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])
    >>> df.select(date_format('dt', 'MM/dd/yyy').alias('date')).collect()
    [Row(date='04/08/2015')]



In [50]:
orderDF.select(date_format('order_date','yyyyMM').alias("order_month")).show(2)

+-----------+
|order_month|
+-----------+
|     201307|
|     201307|
+-----------+
only showing top 2 rows



In [54]:
help(orderDF.withColumn)

Help on method withColumn in module pyspark.sql.dataframe:

withColumn(colName: str, col: pyspark.sql.column.Column) -> 'DataFrame' method of pyspark.sql.dataframe.DataFrame instance
    Returns a new :class:`DataFrame` by adding a column or replacing the
    existing column that has the same name.
    
    The column expression must be an expression over this :class:`DataFrame`; attempting to add
    a column from some other :class:`DataFrame` will raise an error.
    
    .. versionadded:: 1.3.0
    
    Parameters
    ----------
    colName : str
        string, name of the new column.
    col : :class:`Column`
        a :class:`Column` expression for the new column.
    
    Notes
    -----
    This method introduces a projection internally. Therefore, calling it multiple
    times, for instance, via loops in order to add multiple columns can generate big
    plans which can cause performance issues and even `StackOverflowException`.
    To avoid this, use :func:`select` with the m

In [65]:
help(orderDF.alias)

Help on method alias in module pyspark.sql.dataframe:

alias(alias: str) -> 'DataFrame' method of pyspark.sql.dataframe.DataFrame instance
    Returns a new :class:`DataFrame` with an alias set.
    
    .. versionadded:: 1.3.0
    
    Parameters
    ----------
    alias : str
        an alias name to be set for the :class:`DataFrame`.
    
    Examples
    --------
    >>> from pyspark.sql.functions import *
    >>> df_as1 = df.alias("df_as1")
    >>> df_as2 = df.alias("df_as2")
    >>> joined_df = df_as1.join(df_as2, col("df_as1.name") == col("df_as2.name"), 'inner')
    >>> joined_df.select("df_as1.name", "df_as2.name", "df_as2.age")                 .sort(desc("df_as1.name")).collect()
    [Row(name='Bob', name='Bob', age=5), Row(name='Alice', name='Alice', age=2)]



In [67]:
o = orderDF.alias('o')
oi = orderItemDF.alias('oi')

In [69]:
joinedOrders=o.where("order_status IN ('COMPLETE','CLOSED')").join(oi,o.order_id == oi.order_item_order_id)

In [71]:
joinedOrders.select('order_id','order_date','order_customer_id','order_item_id','product_id').show(2)

+--------+-------------------+-----------------+-------------+----------+
|order_id|         order_date|order_customer_id|order_item_id|product_id|
+--------+-------------------+-----------------+-------------+----------+
|       1|2013-07-25 00:00:00|            11599|            1|       957|
|       1|2013-07-25 00:00:00|            11599|            1|       957|
+--------+-------------------+-----------------+-------------+----------+
only showing top 2 rows



In [75]:
o.where("order_status IN ('COMPLETE','CLOSED')"). \
    join(oi,o.order_id == oi.order_item_order_id, 'left'). \
    where(orderItemDF.order_item_order_id.isNull()). \
    select('order_id','product_id','product_cost','order_subtotal','order_date').show(2)



+--------+----------+------------+--------------+-------------------+
|order_id|product_id|product_cost|order_subtotal|         order_date|
+--------+----------+------------+--------------+-------------------+
|       3|      null|        null|          null|2013-07-25 00:00:00|
|       6|      null|        null|          null|2013-07-25 00:00:00|
+--------+----------+------------+--------------+-------------------+
only showing top 2 rows



In [78]:
leftJoin = o.where("order_status IN ('COMPLETE','CLOSED')"). \
    join(oi,o.order_id == oi.order_item_order_id, 'left'). \
    where(orderItemDF.order_item_order_id.isNotNull()). \
    select('order_id','product_id','product_cost','order_subtotal','order_date')

In [81]:
o.where("order_status IN ('COMPLETE','CLOSED')"). \
    join(oi,o.order_id == oi.order_item_order_id, 'left'). \
    where(orderItemDF.order_item_order_id.isNull()). \
    select('order_id','product_id','product_cost','order_subtotal','order_date'). \
    where('order_id == 3').show()

+--------+----------+------------+--------------+-------------------+
|order_id|product_id|product_cost|order_subtotal|         order_date|
+--------+----------+------------+--------------+-------------------+
|       3|      null|        null|          null|2013-07-25 00:00:00|
|       3|      null|        null|          null|2013-07-25 00:00:00|
+--------+----------+------------+--------------+-------------------+



In [84]:
o.where('order_id = 6').show()

+--------+-------------------+-----------------+------------+
|order_id|         order_date|order_customer_id|order_status|
+--------+-------------------+-----------------+------------+
|       6|2013-07-25 00:00:00|             7130|    COMPLETE|
|       6|2013-07-25 00:00:00|             7130|    COMPLETE|
+--------+-------------------+-----------------+------------+



In [82]:
o.where("order_status IN ('COMPLETE','CLOSED')"). \
    join(oi,o.order_id == oi.order_item_order_id, 'left'). \
    where(orderItemDF.order_item_order_id.isNull()). \
    select('order_id','product_id','product_cost','order_subtotal','order_date'). \
    where('order_id == 6').show()

+--------+----------+------------+--------------+-------------------+
|order_id|product_id|product_cost|order_subtotal|         order_date|
+--------+----------+------------+--------------+-------------------+
|       6|      null|        null|          null|2013-07-25 00:00:00|
|       6|      null|        null|          null|2013-07-25 00:00:00|
+--------+----------+------------+--------------+-------------------+



In [89]:
leftJoin.groupBy('product_id').agg(round(sum('order_subtotal'),1).alias('product_revn')).show(2)

+----------+------------+
|product_id|product_revn|
+----------+------------+
|       897|      3473.6|
|       804|      2658.7|
+----------+------------+
only showing top 2 rows





In [106]:
orderReven = leftJoin.groupBy('order_id','product_id').agg(round(sum('order_subtotal'),1).alias('product_revn'))

In [107]:
orderReven.show(2)



+--------+----------+------------+
|order_id|product_id|product_revn|
+--------+----------+------------+
|     236|       365|        60.0|
|     256|       365|       120.0|
+--------+----------+------------+
only showing top 2 rows



                                                                                

In [93]:
productRevn = leftJoin.groupBy('product_id','order_date').agg(round(sum('order_subtotal'),1).alias('product_revn'))

In [99]:
productRevn.orderBy(col('product_revn').desc()).show()



+----------+-------------------+------------+
|product_id|         order_date|product_revn|
+----------+-------------------+------------+
|      1004|2014-03-04 00:00:00|     16399.2|
|      1004|2013-12-06 00:00:00|     15999.2|
|      1004|2014-04-08 00:00:00|     15999.2|
|      1004|2013-11-03 00:00:00|     15599.2|
|      1004|2013-11-30 00:00:00|     15599.2|
|      1004|2013-08-17 00:00:00|     15199.2|
|      1004|2014-01-30 00:00:00|     14799.3|
|      1004|2014-07-20 00:00:00|     14799.3|
|      1004|2013-12-11 00:00:00|     14799.3|
|      1004|2013-09-05 00:00:00|     14399.3|
|      1004|2014-05-12 00:00:00|     13999.3|
|      1004|2013-12-22 00:00:00|     13999.3|
|      1004|2014-02-06 00:00:00|     13999.3|
|      1004|2013-11-07 00:00:00|     13599.3|
|      1004|2014-04-02 00:00:00|     13599.3|
|      1004|2014-06-19 00:00:00|     13599.3|
|      1004|2014-01-11 00:00:00|     13599.3|
|      1004|2013-10-06 00:00:00|     13599.3|
|      1004|2014-05-16 00:00:00|  

                                                                                

In [110]:
productRevn.orderBy(col('product_id').desc(),col('order_date').desc()).show()

+----------+-------------------+------------+
|product_id|         order_date|product_revn|
+----------+-------------------+------------+
|      1073|2014-07-24 00:00:00|      4199.8|
|      1073|2014-07-23 00:00:00|      2599.9|
|      1073|2014-07-22 00:00:00|      4399.8|
|      1073|2014-07-21 00:00:00|      7799.6|
|      1073|2014-07-20 00:00:00|      5799.7|
|      1073|2014-07-19 00:00:00|      4399.8|
|      1073|2014-07-18 00:00:00|      2999.8|
|      1073|2014-07-17 00:00:00|      2799.9|
|      1073|2014-07-16 00:00:00|      4399.8|
|      1073|2014-07-15 00:00:00|      6799.7|
|      1073|2014-07-14 00:00:00|      2999.8|
|      1073|2014-07-13 00:00:00|      3399.8|
|      1073|2014-07-12 00:00:00|      3599.8|
|      1073|2014-07-11 00:00:00|      2399.9|
|      1073|2014-07-10 00:00:00|      5599.7|
|      1073|2014-07-09 00:00:00|      3799.8|
|      1073|2014-07-08 00:00:00|      4799.8|
|      1073|2014-07-07 00:00:00|      2799.9|
|      1073|2014-07-06 00:00:00|  

In [108]:
orderReven.orderBy(col('product_revn').desc(),col('order_id')).show()

+--------+----------+------------+
|order_id|product_id|product_revn|
+--------+----------+------------+
|   68703|       208|      2000.0|
|   68724|       208|      2000.0|
|   68736|       208|      2000.0|
|   68778|       208|      2000.0|
|   68806|       208|      2000.0|
|   68821|       208|      2000.0|
|   68837|       208|      2000.0|
|   68848|       208|      2000.0|
|   68858|       208|      2000.0|
|   68859|       208|      2000.0|
|   68883|       208|      2000.0|
|    9084|      1004|      1599.9|
|   11105|      1004|      1599.9|
|   14539|      1004|      1599.9|
|   30299|      1004|      1599.9|
|   44891|      1004|      1599.9|
|       5|       957|      1199.9|
|     730|      1004|      1199.9|
|    2760|      1004|      1199.9|
|    3531|       957|      1199.9|
+--------+----------+------------+
only showing top 20 rows



In [102]:
o.groupBy('order_status').agg(count('order_status').alias('stat_count')).show(2)

+---------------+----------+
|   order_status|stat_count|
+---------------+----------+
|PENDING_PAYMENT|     15033|
|       COMPLETE|     22903|
+---------------+----------+
only showing top 2 rows



In [103]:
o.groupBy('order_status').agg(count('order_status').alias('stat_count')). \
    orderBy(col('stat_count').desc()).show()

+---------------+----------+
|   order_status|stat_count|
+---------------+----------+
|       COMPLETE|     22903|
|PENDING_PAYMENT|     15033|
|     PROCESSING|      8276|
|        PENDING|      7610|
|         CLOSED|      7558|
|        ON_HOLD|      3798|
|SUSPECTED_FRAUD|      1558|
|       CANCELED|      1428|
| PAYMENT_REVIEW|       729|
+---------------+----------+



In [111]:
productRevn.write.csv('product_revenue.csv')

[Stage 130:>                                                        (0 + 1) / 1]                                                                                