#### This notebook is about the Transformation to be done on DFs

In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext

In [41]:
employees = [
    (1, "Scott", "Tiger", 1000.0, 
      "united states", "+1 123 456 7890", "123 45 6789"
    ),
     (2, "Henry", "Ford", 1250.0, 
      "India", "+91 234 567 8901", "456 78 9123"
     ),
     (3, "Nick", "Junior", 750.0, 
      "united KINGDOM", "+44 111 111 1111", "222 33 4444"
     ),
     (4, "Bill", "Gomes", 1500.0, 
      "AUSTRALIA", "+61 987 654 3210", "789 12 6118"
     )
]

In [3]:
productPath = "/home/solverbot/spark-warehouse/retail_db/products/part-00000"
orderitemPath = "/home/solverbot/spark-warehouse/retail_db/order_items/part-00000"
ordersPath = "/home/solverbot/spark-warehouse/retail_db/orders/part-00000.txt"

In [4]:
#What is the difference between Session and Context?
spark = SparkSession.builder.appName('DF transformations').getOrCreate()

22/11/21 20:25:17 WARN Utils: Your hostname, codeStation resolves to a loopback address: 127.0.1.1; using 192.168.102.83 instead (on interface wlo1)
22/11/21 20:25:17 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/11/21 20:25:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [42]:
employeesDF = spark. \
    createDataFrame(employees,
                    schema="""employee_id INT, first_name STRING, 
                    last_name STRING, salary FLOAT, nationality STRING,
                    phone_number STRING, ssn STRING"""
                   )

In [44]:
from pyspark.sql.functions import *

In [46]:
employeesDF. \
    groupBy(upper(col("nationality")).alias('upper_nationality')). \
    count(). \
    show()

+-----------------+-----+
|upper_nationality|count|
+-----------------+-----+
|    UNITED STATES|    1|
|            INDIA|    1|
|   UNITED KINGDOM|    1|
|        AUSTRALIA|    1|
+-----------------+-----+



In [6]:
orderItemDF = spark.read.csv("/home/solverbot/spark-warehouse/retail_db/order_items/",inferSchema=True).toDF("order_item_id","order_item_order_id","product_id", "qty","product_cost","order_subtotal")



[Row(order_item_id=1, order_item_order_id=1, product_id=957, qty=1, product_cost=299.98, order_subtotal=299.98),
 Row(order_item_id=2, order_item_order_id=2, product_id=1073, qty=1, product_cost=199.99, order_subtotal=199.99)]

In [8]:
orderDF = spark.read.csv("/home/solverbot/spark-warehouse/retail_db/orders",inferSchema=True) \
            .toDF("order_id","order_date","order_customer_id","order_status")

In [9]:
orderDF.show(2)

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
+--------+-------------------+-----------------+---------------+
only showing top 2 rows



In [10]:
orderItemDF.show(2)

+-------------+-------------------+----------+---+------------+--------------+
|order_item_id|order_item_order_id|product_id|qty|product_cost|order_subtotal|
+-------------+-------------------+----------+---+------------+--------------+
|            1|                  1|       957|  1|      299.98|        299.98|
|            2|                  2|      1073|  1|      199.99|        199.99|
+-------------+-------------------+----------+---+------------+--------------+
only showing top 2 rows



In [11]:
#Note, even the numbers are read as strings when the DF is created
orderDF.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: timestamp (nullable = true)
 |-- order_customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



In [12]:
orderItemDF.printSchema()

root
 |-- order_item_id: integer (nullable = true)
 |-- order_item_order_id: integer (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- qty: integer (nullable = true)
 |-- product_cost: double (nullable = true)
 |-- order_subtotal: double (nullable = true)



In [13]:
help(orderDF.filter)

Help on method filter in module pyspark.sql.dataframe:

filter(condition: 'ColumnOrName') -> 'DataFrame' method of pyspark.sql.dataframe.DataFrame instance
    Filters rows using the given condition.
    
    :func:`where` is an alias for :func:`filter`.
    
    .. versionadded:: 1.3.0
    
    Parameters
    ----------
    condition : :class:`Column` or str
        a :class:`Column` of :class:`types.BooleanType`
        or a string of SQL expression.
    
    Examples
    --------
    >>> df.filter(df.age > 3).collect()
    [Row(age=5, name='Bob')]
    >>> df.where(df.age == 2).collect()
    [Row(age=2, name='Alice')]
    
    >>> df.filter("age > 3").collect()
    [Row(age=5, name='Bob')]
    >>> df.where("age = 2").collect()
    [Row(age=2, name='Alice')]



In [20]:
filterOrder = orderDF.filter(orderDF.order_status != "COMPLETE")
filterOrder.show(2)

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
+--------+-------------------+-----------------+---------------+
only showing top 2 rows



In [81]:
orderDF.where('order_status IN ("COMPLETE","CLOSED")').show(2)

+--------+-------------------+-----------------+------------+
|order_id|         order_date|order_customer_id|order_status|
+--------+-------------------+-----------------+------------+
|       1|2013-07-25 00:00:00|            11599|      CLOSED|
|       3|2013-07-25 00:00:00|            12111|    COMPLETE|
+--------+-------------------+-----------------+------------+
only showing top 2 rows



In [86]:
orderDF.selectExpr('order_status ="COMPLETE"').alias('filtered').show(2)

+-------------------------+
|(order_status = COMPLETE)|
+-------------------------+
|                    false|
|                    false|
+-------------------------+
only showing top 2 rows



In [88]:
help(orderDF.order_status)

Help on Column in module pyspark.sql.column object:

class Column(builtins.object)
 |  Column(jc: py4j.java_gateway.JavaObject) -> None
 |  
 |  A column in a DataFrame.
 |  
 |  :class:`Column` instances can be created by::
 |  
 |      # 1. Select a column out of a DataFrame
 |  
 |      df.colName
 |      df["colName"]
 |  
 |      # 2. Create from an expression
 |      df.colName + 1
 |      1 / df.colName
 |  
 |  .. versionadded:: 1.3.0
 |  
 |  Methods defined here:
 |  
 |  __add__ = _(self: 'Column', other: Union[ForwardRef('Column'), ForwardRef('LiteralType'), ForwardRef('DecimalLiteral'), ForwardRef('DateTimeLiteral')]) -> 'Column'
 |      binary operator
 |  
 |  __and__ = _(self: 'Column', other: Union[ForwardRef('Column'), ForwardRef('LiteralType'), ForwardRef('DecimalLiteral'), ForwardRef('DateTimeLiteral')]) -> 'Column'
 |      binary operator
 |  
 |  __bool__ = __nonzero__(self) -> None
 |  
 |  __contains__(self, item: Any) -> None
 |      # container operators
 |  


In [87]:
orderDF.where((orderDF.order_status == 'COMPLETE').__or__(orderDF.order_status=='CLOSED')).show(2)

+--------+-------------------+-----------------+------------+
|order_id|         order_date|order_customer_id|order_status|
+--------+-------------------+-----------------+------------+
|       1|2013-07-25 00:00:00|            11599|      CLOSED|
|       3|2013-07-25 00:00:00|            12111|    COMPLETE|
+--------+-------------------+-----------------+------------+
only showing top 2 rows



In [21]:
help(orderDF.join)

Help on method join in module pyspark.sql.dataframe:

join(other: 'DataFrame', on: Union[str, List[str], pyspark.sql.column.Column, List[pyspark.sql.column.Column], NoneType] = None, how: Optional[str] = None) -> 'DataFrame' method of pyspark.sql.dataframe.DataFrame instance
    Joins with another :class:`DataFrame`, using the given join expression.
    
    .. versionadded:: 1.3.0
    
    Parameters
    ----------
    other : :class:`DataFrame`
        Right side of the join
    on : str, list or :class:`Column`, optional
        a string for the join column name, a list of column names,
        a join expression (Column), or a list of Columns.
        If `on` is a string or a list of strings indicating the name of the join column(s),
        the column(s) must exist on both sides, and this performs an equi-join.
    how : str, optional
        default ``inner``. Must be one of: ``inner``, ``cross``, ``outer``,
        ``full``, ``fullouter``, ``full_outer``, ``left``, ``leftouter``,

In [25]:
orderJoined = orderDF.join(orderItemDF, orderDF.order_id == orderItemDF.order_item_id, 'outer')

In [26]:
orderJoined.take(2)

                                                                                

[Row(order_id=1, order_date=datetime.datetime(2013, 7, 25, 0, 0), order_customer_id=11599, order_status='CLOSED', order_item_id=1, order_item_order_id=1, product_id=957, qty=1, product_cost=299.98, order_subtotal=299.98),
 Row(order_id=1, order_date=datetime.datetime(2013, 7, 25, 0, 0), order_customer_id=11599, order_status='CLOSED', order_item_id=1, order_item_order_id=1, product_id=957, qty=1, product_cost=299.98, order_subtotal=299.98)]

In [27]:
help(orderDF.groupBy)

Help on method groupBy in module pyspark.sql.dataframe:

groupBy(*cols: 'ColumnOrName') -> 'GroupedData' method of pyspark.sql.dataframe.DataFrame instance
    Groups the :class:`DataFrame` using the specified columns,
    so we can run aggregation on them. See :class:`GroupedData`
    for all the available aggregate functions.
    
    :func:`groupby` is an alias for :func:`groupBy`.
    
    .. versionadded:: 1.3.0
    
    Parameters
    ----------
    cols : list, str or :class:`Column`
        columns to group by.
        Each element should be a column name (string) or an expression (:class:`Column`).
    
    Examples
    --------
    >>> df.groupBy().avg().collect()
    [Row(avg(age)=3.5)]
    >>> sorted(df.groupBy('name').agg({'age': 'mean'}).collect())
    [Row(name='Alice', avg(age)=2.0), Row(name='Bob', avg(age)=5.0)]
    >>> sorted(df.groupBy(df.name).avg().collect())
    [Row(name='Alice', avg(age)=2.0), Row(name='Bob', avg(age)=5.0)]
    >>> sorted(df.groupBy(['name', df

In [58]:
orderJoinedGroup= orderJoined.groupBy('order_id').agg(sum(col("order_subtotal")).alias('order_total'))

In [59]:
orderJoinedGroup.show(2)

+--------+-----------+
|order_id|order_total|
+--------+-----------+
|     148|      99.99|
|     463|     399.98|
+--------+-----------+
only showing top 2 rows



In [35]:
help(orderJoinedGroup.sort)

Help on method sort in module pyspark.sql.dataframe:

sort(*cols: Union[str, pyspark.sql.column.Column, List[Union[str, pyspark.sql.column.Column]]], **kwargs: Any) -> 'DataFrame' method of pyspark.sql.dataframe.DataFrame instance
    Returns a new :class:`DataFrame` sorted by the specified column(s).
    
    .. versionadded:: 1.3.0
    
    Parameters
    ----------
    cols : str, list, or :class:`Column`, optional
         list of :class:`Column` or column names to sort by.
    
    Other Parameters
    ----------------
    ascending : bool or list, optional
        boolean or list of boolean (default ``True``).
        Sort ascending vs. descending. Specify list for multiple sort orders.
        If a list is specified, length of the list must equal length of the `cols`.
    
    Examples
    --------
    >>> df.sort(df.age.desc()).collect()
    [Row(age=5, name='Bob'), Row(age=2, name='Alice')]
    >>> df.sort("age", ascending=False).collect()
    [Row(age=5, name='Bob'), Row(age=

In [61]:
orderJoinedGroup.sort(col("order_total").desc()).dropna().show(20)



+--------+-----------+
|order_id|order_total|
+--------+-----------+
|       1|     599.96|
|       9|     599.96|
|   18498|     399.98|
|   26623|     399.98|
|   26708|     399.98|
|   27760|     399.98|
|   38723|     399.98|
|   30361|     399.98|
|    1645|     399.98|
|   30970|     399.98|
|    3749|     399.98|
|   29719|     399.98|
|    6336|     399.98|
|   10206|     399.98|
|   30654|     399.98|
|   13840|     399.98|
|     463|     399.98|
|   31912|     399.98|
|    1829|     399.98|
|   32396|     399.98|
+--------+-----------+
only showing top 20 rows



                                                                                

In [62]:
help(orderDF.select)

Help on method select in module pyspark.sql.dataframe:

select(*cols: 'ColumnOrName') -> 'DataFrame' method of pyspark.sql.dataframe.DataFrame instance
    Projects a set of expressions and returns a new :class:`DataFrame`.
    
    .. versionadded:: 1.3.0
    
    Parameters
    ----------
    cols : str, :class:`Column`, or list
        column names (string) or expressions (:class:`Column`).
        If one of the column names is '*', that column is expanded to include all columns
        in the current :class:`DataFrame`.
    
    Examples
    --------
    >>> df.select('*').collect()
    [Row(age=2, name='Alice'), Row(age=5, name='Bob')]
    >>> df.select('name', 'age').collect()
    [Row(name='Alice', age=2), Row(name='Bob', age=5)]
    >>> df.select(df.name, (df.age + 10).alias('age')).collect()
    [Row(name='Alice', age=12), Row(name='Bob', age=15)]



In [64]:
orderDF.select(col('order_id'),col('order_status')).show(2)

+--------+---------------+
|order_id|   order_status|
+--------+---------------+
|       1|         CLOSED|
|       2|PENDING_PAYMENT|
+--------+---------------+
only showing top 2 rows



In [65]:
orderDF.select('order_id','order_status').show(2)

+--------+---------------+
|order_id|   order_status|
+--------+---------------+
|       1|         CLOSED|
|       2|PENDING_PAYMENT|
+--------+---------------+
only showing top 2 rows



In [75]:
orderDF.select(orderDF.order_id, orderDF.order_date).show(2)

+--------+-------------------+
|order_id|         order_date|
+--------+-------------------+
|       1|2013-07-25 00:00:00|
|       2|2013-07-25 00:00:00|
+--------+-------------------+
only showing top 2 rows



In [71]:
help(date_format())

Help on Column in module pyspark.sql.column object:

class Column(builtins.object)
 |  Column(jc: py4j.java_gateway.JavaObject) -> None
 |  
 |  A column in a DataFrame.
 |  
 |  :class:`Column` instances can be created by::
 |  
 |      # 1. Select a column out of a DataFrame
 |  
 |      df.colName
 |      df["colName"]
 |  
 |      # 2. Create from an expression
 |      df.colName + 1
 |      1 / df.colName
 |  
 |  .. versionadded:: 1.3.0
 |  
 |  Methods defined here:
 |  
 |  __add__ = _(self: 'Column', other: Union[ForwardRef('Column'), ForwardRef('LiteralType'), ForwardRef('DecimalLiteral'), ForwardRef('DateTimeLiteral')]) -> 'Column'
 |      binary operator
 |  
 |  __and__ = _(self: 'Column', other: Union[ForwardRef('Column'), ForwardRef('LiteralType'), ForwardRef('DecimalLiteral'), ForwardRef('DateTimeLiteral')]) -> 'Column'
 |      binary operator
 |  
 |  __bool__ = __nonzero__(self) -> None
 |  
 |  __contains__(self, item: Any) -> None
 |      # container operators
 |  


In [79]:
orderDF.selectExpr("date_format(order_date, 'YYYYMM') AS order_month").show(2)

SparkUpgradeException: You may get a different result due to the upgrading to Spark >= 3.0: Fail to recognize 'YYYYMM' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html

In [9]:
type(orders_df)

pyspark.sql.dataframe.DataFrame

In [10]:
orders_df.take(2)

[Row(order_id=1, order_date=datetime.datetime(2013, 7, 25, 0, 0), order_customer_id=11599, order_status='CLOSED'),
 Row(order_id=2, order_date=datetime.datetime(2013, 7, 25, 0, 0), order_customer_id=256, order_status='PENDING_PAYMENT')]

In [11]:
orders_df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: timestamp (nullable = true)
 |-- order_customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



In [12]:
orders_df.show(2)

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
+--------+-------------------+-----------------+---------------+
only showing top 2 rows



In [36]:
url = "jdbc:postgresql://localhost/"

In [32]:
help(sparkSQL.read.jdbc)

Help on method jdbc in module pyspark.sql.readwriter:

jdbc(url: str, table: str, column: Optional[str] = None, lowerBound: Union[str, int, NoneType] = None, upperBound: Union[str, int, NoneType] = None, numPartitions: Optional[int] = None, predicates: Optional[List[str]] = None, properties: Optional[Dict[str, str]] = None) -> 'DataFrame' method of pyspark.sql.readwriter.DataFrameReader instance
    Construct a :class:`DataFrame` representing the database table named ``table``
    accessible via JDBC URL ``url`` and connection ``properties``.
    
    Partitions of the table will be retrieved in parallel if either ``column`` or
    ``predicates`` is specified. ``lowerBound``, ``upperBound`` and ``numPartitions``
    is needed when ``column`` is specified.
    
    If both ``column`` and ``predicates`` are specified, ``column`` will be used.
    
    .. versionadded:: 1.4.0
    
    Parameters
    ----------
    table : str
        the name of the table
    column : str, optional
      

In [39]:
orders_df.write.csv('ordersdb')

                                                                                

In [44]:
#Remember to give the alias to the table that is created with select
ordersQuery = sparkSQL.read.format('jdbc') \
        .option('url',"jdbc:postgresql://localhost/postgres") \
        .option('dbtable','(SELECT * FROM orders LIMIT 5) q') \
        .option('user','postgres') \
        .option('password',1234) \
        .option("driver", "org.postgresql.Driver") \
        .load()

In [45]:
ordersQuery.show()

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
+--------+-------------------+-----------------+---------------+



In [46]:
import pyspark.sql.functions as sf

In [48]:
from pyspark.sql.functions import *

In [49]:
orderitems= sparkSQL.read.format('jdbc') \
        .option('url',"jdbc:postgresql://localhost/postgres") \
        .option('dbtable','order_items') \
        .option('user','postgres') \
        .option('password',1234) \
        .option("driver", "org.postgresql.Driver") \
        .load()

In [51]:
orderitems.printSchema()

root
 |-- order_item_id: integer (nullable = true)
 |-- order_item_order_id: integer (nullable = true)
 |-- order_item_product_id: integer (nullable = true)
 |-- order_item_quantity: integer (nullable = true)
 |-- order_item_subtotal: double (nullable = true)
 |-- order_item_product_price: double (nullable = true)



In [54]:
orderitems.select("order_item_quantity", "order_item_subtotal").show()

+-------------------+-------------------+
|order_item_quantity|order_item_subtotal|
+-------------------+-------------------+
|                  1|             299.98|
|                  1|             199.99|
|                  5|              250.0|
|                  1|             129.99|
|                  2|              49.98|
|                  5|             299.95|
|                  3|              150.0|
|                  4|             199.92|
|                  1|             299.98|
|                  5|             299.95|
|                  2|              99.96|
|                  1|             299.98|
|                  1|             129.99|
|                  1|             199.99|
|                  1|             299.98|
|                  5|              79.95|
|                  3|             179.97|
|                  5|             299.95|
|                  4|             199.92|
|                  1|               50.0|
+-------------------+-------------

In [55]:
productTable = sparkSQL.read.format('jdbc') \
        .option('url',"jdbc:postgresql://localhost/postgres") \
        .option('dbtable','products') \
        .option('user','postgres') \
        .option('password',1234) \
        .option("driver", "org.postgresql.Driver") \
        .load()

In [56]:
productTable.show(2)

+----------+-------------------+--------------------+-------------------+-------------+--------------------+
|product_id|product_category_id|        product_name|product_description|product_price|       product_image|
+----------+-------------------+--------------------+-------------------+-------------+--------------------+
|         1|                  2|Quest Q64 10 FT. ...|                   |        59.98|http://images.acm...|
|         2|                  2|Under Armour Men'...|                   |       129.99|http://images.acm...|
+----------+-------------------+--------------------+-------------------+-------------+--------------------+
only showing top 2 rows



In [61]:
orders_df.selectExpr("CASE WHEN order_status IN ('COMPLETE','CLOSED') THEN 'COMPLETED' ELSE 'PENDING' END").show(2)

+------------------------------------------------------------------------------+
|CASE WHEN (order_status IN (COMPLETE, CLOSED)) THEN COMPLETED ELSE PENDING END|
+------------------------------------------------------------------------------+
|                                                                     COMPLETED|
|                                                                       PENDING|
+------------------------------------------------------------------------------+
only showing top 2 rows

