In [4]:
# Run findspark before importing pyspark so this cell also works on a fresh
# kernel (it was originally executed after the findspark cell further down).
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Reuse the active session if there is one; otherwise start a new one.
spark = SparkSession.builder.getOrCreate()
spark

In [1]:

import findspark
findspark.init()
import pyspark

In [5]:
import  pyspark.sql.functions as F

In [6]:
file_location = "2010_12_01-1.csv"

In [12]:
# Read the CSV with a header row and let Spark infer the column types.
# NOTE: the path is spelled out here; the `file_location` variable defined in
# an earlier cell is not used by this load.
df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("2010-12-01.csv")
)

#df.cache()
# Register the frame as a temp view so it can also be queried with Spark SQL.
df.createOrReplaceTempView("dfTable")
df.show()

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:00|     2.75|   17850.0|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|       2|2010-12-01 08:26:00|     7.65|   17850.0|United Kingdom|
|   536365|    21730|GLASS S

In [0]:
df.selectExpr('max(InvoiceDate)', 'min(InvoiceDate)').show()

+-------------------+-------------------+
|   max(InvoiceDate)|   min(InvoiceDate)|
+-------------------+-------------------+
|2010-12-01 17:35:00|2010-12-01 08:26:00|
+-------------------+-------------------+



In [0]:
df.select(F.count('InvoiceNo'), F.countDistinct('InvoiceNo')).show()

+----------------+-------------------------+
|count(InvoiceNo)|count(DISTINCT InvoiceNo)|
+----------------+-------------------------+
|            3108|                      143|
+----------------+-------------------------+



In [0]:
df.select(F.count('CustomerID'), F.countDistinct('CustomerID')).show()

+-----------------+--------------------------+
|count(CustomerID)|count(DISTINCT CustomerID)|
+-----------------+--------------------------+
|             1968|                        98|
+-----------------+--------------------------+



In [13]:
df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



In [14]:
df.count()

3108

In [15]:
# Count the nulls of every column in a single Spark job instead of launching
# one filter-and-count job per column. `when` without `otherwise` yields null
# for non-matching rows, and `count` skips nulls, so each aggregate counts
# exactly the rows where the column is null.
null_counts = df.select(
    [F.count(F.when(F.col(thecol).isNull(), 1)).alias(thecol) for thecol in df.columns]
).first()

for thecol in df.columns:
  print('The count of null rows in column %s' %thecol, null_counts[thecol])

The count of null rows in column InvoiceNo 0
The count of null rows in column StockCode 0
The count of null rows in column Description 10
The count of null rows in column Quantity 0
The count of null rows in column InvoiceDate 0
The count of null rows in column UnitPrice 0
The count of null rows in column CustomerID 1140
The count of null rows in column Country 0


# Grouping Types

- **group by**:
    - one or more keys 
    - one or more aggregation functions to transform the value columns

- **window**:
    - one or more keys 
    - one or more aggregation functions to transform the value columns    
    - rows input to the function are related to the current row.

- **grouping set**
    - Aggregate at multiple different levels. 
    - Primitive in SQL 
    - Rollups and Cubes in DataFrames.
        - **rollup** 
            - one or more keys 
            - one or more aggregation 
            - summarized hierarchically.
        - **cube** 
            - summarized across all combinations of columns.
    - Each grouping returns a RelationalGroupedDataset on which we specify our aggregations.
    
| Tables   |      Are      |  Cool |
|----------|:-------------:|------:|
| col 1 is |  left-aligned | $1600 |
| col 2 is |    centered   |   $12 |
| col 3 is | right-aligned |    $1 |

- Without grouping, an aggregation function acts on **all** the values in a single column.
- With grouping, an aggregation function acts on the subset of a column's values that corresponds to a **group** in another column.

In [16]:
groups = df.groupBy("InvoiceNo")
print (groups)

GroupedData[grouping expressions: [InvoiceNo], value: [InvoiceNo: string, StockCode: string ... 6 more fields], type: GroupBy]


In [17]:
df.groupBy("InvoiceNo", "CustomerId").count().limit(5).show()

+---------+----------+-----+
|InvoiceNo|CustomerId|count|
+---------+----------+-----+
|   536596|      NULL|    6|
|   536530|   17905.0|   23|
|   536414|      NULL|    1|
|   536400|   13448.0|    1|
|   536550|      NULL|    1|
+---------+----------+-----+



In [18]:
df.filter("InvoiceNo ='536596'").show()

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536596|    21624|VINTAGE UNION JAC...|       1|2010-12-01 17:29:00|     5.95|      NULL|United Kingdom|
|   536596|    22900| SET 2 TEA TOWELS...|       1|2010-12-01 17:29:00|     2.95|      NULL|United Kingdom|
|   536596|    22114|HOT WATER BOTTLE ...|       1|2010-12-01 17:29:00|     3.95|      NULL|United Kingdom|
|   536596|    21967|PACK OF 12 SKULL ...|       1|2010-12-01 17:29:00|     0.29|      NULL|United Kingdom|
|   536596|   84926A|WAKE UP COCKEREL ...|       4|2010-12-01 17:29:00|     1.25|      NULL|United Kingdom|
|   536596|    22802|FAUX FUR CHOCOLAT...|       1|2010-12-01 17:29:00|    19.95|      NULL|United Kingdom|
+---------+---------+-------

## Grouping with Col/Expressions using `agg`

In [19]:
from pyspark.sql.functions import count, expr

# The same per-invoice count expressed twice: once with the Column function
# (renamed to "quan") and once as a SQL expression string.
(
    df.groupBy("InvoiceNo")
    .agg(
        count("Quantity").alias("quan"),
        expr("count(Quantity)"),
    )
    .limit(5)
    .toPandas()
)

Unnamed: 0,InvoiceNo,quan,count(Quantity)
0,536596,6,6
1,536597,28,28
2,536414,1,1
3,536550,1,1
4,536460,14,14


## breaking up aggregation for readability
```SQL
SELECT avg(Quantity), stddev_pop(Quantity), InvoiceNo FROM dfTable
GROUP BY InvoiceNo
```

In [20]:
from pyspark.sql.functions import stddev_pop
avg    = expr("avg(Quantity)").alias("The Mean")
stddev = stddev_pop(df.Quantity).alias("The Standard Deviation")

dfGrouped = df.groupBy("InvoiceNo").agg(avg, stddev)

print([type(dfGrouped),type(avg), type(stddev)])
dfGrouped.limit(5).toPandas()

[<class 'pyspark.sql.dataframe.DataFrame'>, <class 'pyspark.sql.column.Column'>, <class 'pyspark.sql.column.Column'>]


Unnamed: 0,InvoiceNo,The Mean,The Standard Deviation
0,536596,1.5,1.118034
1,536597,2.535714,2.744893
2,536414,56.0,0.0
3,536550,1.0,0.0
4,536460,11.285714,8.802829


In [21]:
# Per-invoice summary statistics of Quantity. `percentile` is called with an
# array of percentages, so it returns an array; [0] picks out the single value.
quantity_stats = [
    F.min('Quantity').alias('min_Quantity'),
    F.max('Quantity').alias('max_Quantity'),
    F.count('Quantity').alias('count_Quantity'),
    F.avg('Quantity').alias('avg_Quantity'),
    F.expr('percentile(Quantity, array(0.25))')[0].alias('Quantity_ptile_25'),
    F.expr('percentile(Quantity, array(0.75))')[0].alias('Quantity_ptile_75'),
]

df.groupBy('InvoiceNo').agg(*quantity_stats).take(5)


[Row(InvoiceNo='536365', min_Quantity=2, max_Quantity=8, count_Quantity=7, avg_Quantity=5.714285714285714, Quantity_ptile_25=6.0, Quantity_ptile_75=6.0),
 Row(InvoiceNo='536366', min_Quantity=6, max_Quantity=6, count_Quantity=2, avg_Quantity=6.0, Quantity_ptile_25=6.0, Quantity_ptile_75=6.0),
 Row(InvoiceNo='536367', min_Quantity=2, max_Quantity=32, count_Quantity=12, avg_Quantity=6.916666666666667, Quantity_ptile_25=3.0, Quantity_ptile_75=6.0),
 Row(InvoiceNo='536368', min_Quantity=3, max_Quantity=6, count_Quantity=4, avg_Quantity=3.75, Quantity_ptile_25=3.0, Quantity_ptile_75=3.75),
 Row(InvoiceNo='536369', min_Quantity=3, max_Quantity=3, count_Quantity=1, avg_Quantity=3.0, Quantity_ptile_25=3.0, Quantity_ptile_75=3.0)]

## Grouping with Maps

In [22]:
# NOTE: stddev_pop is imported here but not used by this cell.
from pyspark.sql.functions import stddev_pop
# Dict form of agg: the key is the column name, the value the aggregate name.
# "stddev" is Spark's sample standard deviation (an alias of stddev_samp), not
# the population version used in the previous example, which is why invoices
# with a single row show an empty value in the output below.
dfGrouped = df.groupBy("InvoiceNo").agg({"Quantity":"stddev"})
dfGrouped.limit(5).toPandas()

Unnamed: 0,InvoiceNo,stddev(Quantity)
0,536596,1.224745
1,536597,2.795262
2,536414,
3,536550,
4,536460,9.135127


# Window Functions 
- In normal grouping by a column
    - for each group, `agg` applies aggregations on **all values in the group**
- In Window grouping
    - for each group, for each row in the group:
        - we apply an aggregation on **a subset of values** related to the current row

## Toy Example
- First, we need a window specification, which includes:
    * **PARTITION**: column(s) to group by
    * **ORDER**: column used for sorting within the partition
    * **RowsBetween/RangeBetween**: window boundaries by position or by value relative to the current row
- Second, we call the **over** method on an aggregation column to apply the window specification

In [23]:
data = [("Banha",2019,400),("Banha",2018,200),("Cairo",2018,250), ("Aswan",2018,300),("Banha",2016,200),
         ("Banha",2015,0),("Aswan",2015,600),("Banha",2017,1500),("Cairo",2017,800),("Aswan",2016,50)]
header = ("city", "year", "sales")
dfA = spark.createDataFrame(data,header)
dfA.toPandas()

Unnamed: 0,city,year,sales
0,Banha,2019,400
1,Banha,2018,200
2,Cairo,2018,250
3,Aswan,2018,300
4,Banha,2016,200
5,Banha,2015,0
6,Aswan,2015,600
7,Banha,2017,1500
8,Cairo,2017,800
9,Aswan,2016,50


## e.g. Cumulative Sales

In [24]:
from pyspark.sql.window import Window
# NOTE: importing `max` and `sum` from pyspark.sql.functions rebinds the Python
# builtins of the same name to Spark column functions in this kernel.
from pyspark.sql.functions import desc,max,col,sum,rank,asc

# Frame: within each city, ordered by year, from the first row of the
# partition up to and including the current row.
windowSpec = Window\
  .partitionBy("city")\
  .orderBy(asc("year"))\
  .rowsBetween(Window.unboundedPreceding, Window.currentRow)

# Summing `sales` over that frame gives a running (cumulative) total per city.
dfA.withColumn("cumSales", sum("sales").over(windowSpec)).toPandas() #.orderBy("city", "year").toPandas()

Unnamed: 0,city,year,sales,cumSales
0,Aswan,2015,600,600
1,Aswan,2016,50,650
2,Aswan,2018,300,950
3,Banha,2015,0,0
4,Banha,2016,200,200
5,Banha,2017,1500,1700
6,Banha,2018,200,1900
7,Banha,2019,400,2300
8,Cairo,2017,800,800
9,Cairo,2018,250,1050


### Clarifying Window boundaries: `rowsBetween`

In [25]:
from pyspark.sql.window import Window
from pyspark.sql.functions import desc,max,col,sum,rank,asc,collect_list

# 1- Window spec: rows of the same city, ordered by year. No frame is set here;
#    each column below picks its own.
windowSpec = Window\
  .partitionBy("city")\
  .orderBy(asc("year"))

# 2- Aggregation over the window. collect_list makes the contents of each
#    frame visible. The result columns get their own names so that the imported
#    `col` function is not overwritten by a Column object.
sales_list = collect_list("sales")

# No explicit frame: with an orderBy, Spark defaults to
# rangeBetween(Window.unboundedPreceding, Window.currentRow). Years are unique
# within a city in this data, so it matches the equivalent rowsBetween frame.
preceding_to_cur = sales_list.over(windowSpec)
cur_to_following = sales_list.over(windowSpec.rowsBetween(Window.currentRow, Window.unboundedFollowing))
prev1_to_cur = sales_list.over(windowSpec.rowsBetween(-1,Window.currentRow))
cur_to_next2 = sales_list.over(windowSpec.rowsBetween(Window.currentRow,2))

dfA.show()
dfA.withColumn("allPreceding-->cur", preceding_to_cur)\
   .withColumn("cur-->allFollowing", cur_to_following)\
   .withColumn("1Previous-->cur", prev1_to_cur)\
   .withColumn("cur-->2Following", cur_to_next2)\
   .orderBy("city", "year").toPandas()

+-----+----+-----+
| city|year|sales|
+-----+----+-----+
|Banha|2019|  400|
|Banha|2018|  200|
|Cairo|2018|  250|
|Aswan|2018|  300|
|Banha|2016|  200|
|Banha|2015|    0|
|Aswan|2015|  600|
|Banha|2017| 1500|
|Cairo|2017|  800|
|Aswan|2016|   50|
+-----+----+-----+



Unnamed: 0,city,year,sales,allPreceding-->cur,cur-->allFollowing,1Previous-->cur,cur-->2Following
0,Aswan,2015,600,[600],"[600, 50, 300]",[600],"[600, 50, 300]"
1,Aswan,2016,50,"[600, 50]","[50, 300]","[600, 50]","[50, 300]"
2,Aswan,2018,300,"[600, 50, 300]",[300],"[50, 300]",[300]
3,Banha,2015,0,[0],"[0, 200, 1500, 200, 400]",[0],"[0, 200, 1500]"
4,Banha,2016,200,"[0, 200]","[200, 1500, 200, 400]","[0, 200]","[200, 1500, 200]"
5,Banha,2017,1500,"[0, 200, 1500]","[1500, 200, 400]","[200, 1500]","[1500, 200, 400]"
6,Banha,2018,200,"[0, 200, 1500, 200]","[200, 400]","[1500, 200]","[200, 400]"
7,Banha,2019,400,"[0, 200, 1500, 200, 400]",[400],"[200, 400]",[400]
8,Cairo,2017,800,[800],"[800, 250]",[800],"[800, 250]"
9,Cairo,2018,250,"[800, 250]",[250],"[800, 250]",[250]


### Window boundaries: `rangeBetween`

In [26]:
data = [("Banha",2019,400),("Banha",2018,250),("Cairo",2018,250), ("Aswan",2018,300),("Banha",2016,200),
         ("Banha",2015,0),("Aswan",2015,600),("Banha",2017,1500),("Cairo",2017,800),("Aswan",2016,50)]
header = ("city", "year", "sales")
dfA = spark.createDataFrame(data,header)
dfA.toPandas()

Unnamed: 0,city,year,sales
0,Banha,2019,400
1,Banha,2018,250
2,Cairo,2018,250
3,Aswan,2018,300
4,Banha,2016,200
5,Banha,2015,0
6,Aswan,2015,600
7,Banha,2017,1500
8,Cairo,2017,800
9,Aswan,2016,50


In [27]:
from pyspark.sql.window import Window
from pyspark.sql.functions import desc,max,col,sum,rank,asc,collect_list


# 1- Window spec: rangeBetween offsets are applied to the orderBy value, so the
#    window is ordered by the numeric column the range refers to (sales).
windowSpec = Window\
  .partitionBy("city")\
  .orderBy(asc("sales"))

# 2- Aggregation over value-based frames. The result columns get their own
#    names so that the imported `col` function is not overwritten.
sales_list = collect_list("sales")
# sales within [current, current + 200]
up_to_200_above = sales_list.over(windowSpec.rangeBetween(0,200))
# sales within [current - 200, current]
down_to_200_below = sales_list.over(windowSpec.rangeBetween(-200,0))
# sales within [current - 200, current + 200]
within_200 = sales_list.over(windowSpec.rangeBetween(-200,200))

dfA.show()
dfA.withColumn("0-->200", up_to_200_above)\
   .withColumn("cur-->allprecedding", down_to_200_below)\
   .withColumn("Previous-->following", within_200)\
   .sortWithinPartitions("sales").toPandas()

+-----+----+-----+
| city|year|sales|
+-----+----+-----+
|Banha|2019|  400|
|Banha|2018|  250|
|Cairo|2018|  250|
|Aswan|2018|  300|
|Banha|2016|  200|
|Banha|2015|    0|
|Aswan|2015|  600|
|Banha|2017| 1500|
|Cairo|2017|  800|
|Aswan|2016|   50|
+-----+----+-----+



Unnamed: 0,city,year,sales,0-->200,cur-->allprecedding,Previous-->following
0,Banha,2015,0,"[0, 200]",[0],"[0, 200]"
1,Aswan,2016,50,[50],[50],[50]
2,Banha,2016,200,"[200, 250, 400]","[0, 200]","[0, 200, 250, 400]"
3,Banha,2018,250,"[250, 400]","[200, 250]","[200, 250, 400]"
4,Cairo,2018,250,[250],[250],[250]
5,Aswan,2018,300,[300],[300],[300]
6,Banha,2019,400,[400],"[200, 250, 400]","[200, 250, 400]"
7,Aswan,2015,600,[600],[600],[600]
8,Cairo,2017,800,[800],[800],[800]
9,Banha,2017,1500,[1500],[1500],[1500]


### rank/dense_rank (e.g. 2nd Best Sales)

In [28]:
from pyspark.sql.window import Window
from pyspark.sql.functions import desc,max,col,sum,rank,asc

windowSpec = Window\
  .partitionBy("city")\
  .orderBy(desc("sales"))\
  .rowsBetween(Window.unboundedPreceding, Window.currentRow)

#2nd best sales
dfA.withColumn("salesRank", rank().over(windowSpec))\
.toPandas()

#Try also
#.where("salesRank=2").toPandas() 
#.orderBy("city", "year")

Unnamed: 0,city,year,sales,salesRank
0,Aswan,2015,600,1
1,Aswan,2018,300,2
2,Aswan,2016,50,3
3,Banha,2017,1500,1
4,Banha,2019,400,2
5,Banha,2018,250,3
6,Banha,2016,200,4
7,Banha,2015,0,5
8,Cairo,2017,800,1
9,Cairo,2018,250,2


### other window functions 

- Ranking functions:	`rank, dense_rank, percent_rank, ntile, row_number`
- Analytic functions:	`cume_dist, first_value, last_value, lag, lead`

In [29]:
# Each city's rows ordered from highest to lowest sales.
windowSpec = Window.partitionBy("city").orderBy(desc("sales"))


# lag("sales", 1) is the sales value of the previous row in that ordering
# (null for the first row of each city).
dfA.withColumn("lag",F.lag("sales",1).over(windowSpec)).show()

+-----+----+-----+----+
| city|year|sales| lag|
+-----+----+-----+----+
|Aswan|2015|  600|NULL|
|Aswan|2018|  300| 600|
|Aswan|2016|   50| 300|
|Banha|2017| 1500|NULL|
|Banha|2019|  400|1500|
|Banha|2018|  250| 400|
|Banha|2016|  200| 250|
|Banha|2015|    0| 200|
|Cairo|2017|  800|NULL|
|Cairo|2018|  250| 800|
+-----+----+-----+----+



## Book Example

In [30]:
from pyspark.sql.functions import col, to_date

# InvoiceDate was inferred as a timestamp when the CSV was loaded (see the
# printed schema), so no parse pattern is needed: to_date just keeps the
# calendar-date part.
dfWithDate = df.withColumn("date", to_date(col("InvoiceDate")))
dfWithDate.createOrReplaceTempView("dfWithDate")
dfWithDate.limit(5).toPandas()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,date
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,2010-12-01
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,2010-12-01
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,2010-12-01
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,2010-12-01
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,2010-12-01


### Window Spec

In [31]:
from pyspark.sql.window import Window
from pyspark.sql.functions import desc

# One window per (customer, day), largest quantities first; the frame runs
# from the start of the partition up to the current row.
windowSpec = (
    Window.partitionBy("CustomerId", "date")
    .orderBy(desc("Quantity"))
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

### column OVER

In [32]:
from pyspark.sql.functions import max
maxPurchaseQuantity = max(col("Quantity")).over(windowSpec)

In [33]:
type(maxPurchaseQuantity)

pyspark.sql.column.Column

### window aggregation function: `Rank`

In [34]:
from pyspark.sql.functions import dense_rank, rank, col, asc
purchaseDenseRank = dense_rank().over(windowSpec)
purchaseRank = rank().over(windowSpec)

In [35]:
type(purchaseDenseRank)

pyspark.sql.column.Column

In [36]:
# For every purchase line with a known customer, show its rank and dense rank
# by quantity together with the window's max quantity (first 10 rows only).
# NOTE(review): the orderBy runs before the window columns are computed;
# nothing in this cell guarantees that sort order survives into the final
# result — confirm, or sort after the select instead.
dfWithDate.where("CustomerId IS NOT NULL").orderBy("CustomerId")\
  .select(
    col("CustomerId"),
    col("date"),
    col("Quantity"),
    purchaseRank.alias("quantityRank"),
    purchaseDenseRank.alias("quantityDenseRank"),
    maxPurchaseQuantity.alias("maxPurchaseQuantity")).limit(10).toPandas()

Unnamed: 0,CustomerId,date,Quantity,quantityRank,quantityDenseRank,maxPurchaseQuantity
0,12431.0,2010-12-01,24,1,1,24
1,12431.0,2010-12-01,24,1,1,24
2,12431.0,2010-12-01,12,3,2,24
3,12431.0,2010-12-01,8,4,3,24
4,12431.0,2010-12-01,6,5,4,24
5,12431.0,2010-12-01,6,5,4,24
6,12431.0,2010-12-01,6,5,4,24
7,12431.0,2010-12-01,4,8,5,24
8,12431.0,2010-12-01,4,8,5,24
9,12431.0,2010-12-01,4,8,5,24


# Multi-level grouping

## Toy Example

### Grouping Sets

In [37]:
data = [("Aswan",2017,800),("Banha",2018,100),("Cairo",2018,200), ("Aswan",2018,300),("Banha",2016,200),
         ("Banha",2015,0),("Aswan",2015,600),("Banha",2017,1500),("Cairo",2017,800),("Aswan",2016,200)]
header = ("city", "year", "sales")
dfA = spark.createDataFrame(data,header)
dfA.createOrReplaceTempView("tblSales")

q  = """ select sum(sales) from tblSales"""

q1 = """ select city, sum(sales) 
    from tblSales
    group by city
    order by city desc
"""

q2 = """ 
  select year, sum(sales) 
    from tblSales
    group by year
    order by year asc
"""

q3 = """ 
  select city, year, sum(sales) 
    from tblSales
    group by city, year
    order by city desc, year asc
"""

q4 = """ 
  select city, year, sum(sales), avg(sales) 
    from tblSales
    group by city, year GROUPING SETS((city, year), city, year, ())
    order by city desc nulls last, year asc nulls last
"""
spark.sql(q).show()
spark.sql(q1).show()
spark.sql(q2).show()
spark.sql(q3).show()
spark.sql(q4).show(50)

+----------+
|sum(sales)|
+----------+
|      4700|
+----------+

+-----+----------+
| city|sum(sales)|
+-----+----------+
|Cairo|      1000|
|Banha|      1800|
|Aswan|      1900|
+-----+----------+

+----+----------+
|year|sum(sales)|
+----+----------+
|2015|       600|
|2016|       400|
|2017|      3100|
|2018|       600|
+----+----------+

+-----+----+----------+
| city|year|sum(sales)|
+-----+----+----------+
|Cairo|2017|       800|
|Cairo|2018|       200|
|Banha|2015|         0|
|Banha|2016|       200|
|Banha|2017|      1500|
|Banha|2018|       100|
|Aswan|2015|       600|
|Aswan|2016|       200|
|Aswan|2017|       800|
|Aswan|2018|       300|
+-----+----+----------+

+-----+----+----------+------------------+
| city|year|sum(sales)|        avg(sales)|
+-----+----+----------+------------------+
|Cairo|2017|       800|             800.0|
|Cairo|2018|       200|             200.0|
|Cairo|NULL|      1000|             500.0|
|Banha|2015|         0|               0.0|
|Banha|2016|     

###  Rollups

In [38]:
data = [("Aswan",2017,800),("Banha",2018,100),("Cairo",2018,200), ("Aswan",2018,300),("Banha",2016,200),
         ("Banha",2015,0),("Aswan",2015,600),("Banha",2017,1500),("Cairo",2017,800),("Aswan",2016,200)]
header = ("city", "year", "sales")
dfA = spark.createDataFrame(data,header)
dfA.toPandas()

Unnamed: 0,city,year,sales
0,Aswan,2017,800
1,Banha,2018,100
2,Cairo,2018,200
3,Aswan,2018,300
4,Banha,2016,200
5,Banha,2015,0
6,Aswan,2015,600
7,Banha,2017,1500
8,Cairo,2017,800
9,Aswan,2016,200


In [39]:
from pyspark.sql.functions import collect_list,count
rollup = dfA.rollup("city","year").agg(collect_list("sales"),count("sales"))
rollup.createOrReplaceTempView("R")
spark.sql("select * from R order by city desc nulls last, year nulls last").show(50)

+-----+----+--------------------+------------+
| city|year| collect_list(sales)|count(sales)|
+-----+----+--------------------+------------+
|Cairo|2017|               [800]|           1|
|Cairo|2018|               [200]|           1|
|Cairo|NULL|          [200, 800]|           2|
|Banha|2015|                 [0]|           1|
|Banha|2016|               [200]|           1|
|Banha|2017|              [1500]|           1|
|Banha|2018|               [100]|           1|
|Banha|NULL| [100, 200, 0, 1500]|           4|
|Aswan|2015|               [600]|           1|
|Aswan|2016|               [200]|           1|
|Aswan|2017|               [800]|           1|
|Aswan|2018|               [300]|           1|
|Aswan|NULL|[800, 300, 600, 200]|           4|
| NULL|NULL|[800, 100, 200, 3...|          10|
+-----+----+--------------------+------------+



In [0]:
# Use the Spark aggregates through the `F` alias so this cell does not depend
# on `sum` / `count` having been imported over the builtins in an earlier cell.
rollup2 = dfA.rollup("city","year").agg(F.sum("sales"),F.count("sales"))
# Re-registers the view name "R", replacing the previous rollup's view.
rollup2.createOrReplaceTempView("R")
spark.sql("select * from R order by city desc nulls last, year nulls last").show(50)

+-----+----+----------+------------+
| city|year|sum(sales)|count(sales)|
+-----+----+----------+------------+
|Cairo|2017|       800|           1|
|Cairo|2018|       200|           1|
|Cairo|null|      1000|           2|
|Banha|2015|         0|           1|
|Banha|2016|       200|           1|
|Banha|2017|      1500|           1|
|Banha|2018|       100|           1|
|Banha|null|      1800|           4|
|Aswan|2015|       600|           1|
|Aswan|2016|       200|           1|
|Aswan|2017|       800|           1|
|Aswan|2018|       300|           1|
|Aswan|null|      1900|           4|
| null|null|      4700|          10|
+-----+----+----------+------------+



### Cube

In [0]:
from pyspark.sql.functions import expr
cube = dfA.cube("city", "year").agg(collect_list("sales"), count("sales"))
cube.createOrReplaceTempView("cube")
spark.sql("select * from cube order by city desc nulls last, year nulls last").show(50)

+-----+----+--------------------+------------+
| city|year| collect_list(sales)|count(sales)|
+-----+----+--------------------+------------+
|Cairo|2017|               [800]|           1|
|Cairo|2018|               [200]|           1|
|Cairo|null|          [200, 800]|           2|
|Banha|2015|                 [0]|           1|
|Banha|2016|               [200]|           1|
|Banha|2017|              [1500]|           1|
|Banha|2018|               [100]|           1|
|Banha|null| [100, 200, 0, 1500]|           4|
|Aswan|2015|               [600]|           1|
|Aswan|2016|               [200]|           1|
|Aswan|2017|               [800]|           1|
|Aswan|2018|               [300]|           1|
|Aswan|null|[800, 300, 600, 200]|           4|
| null|2015|            [0, 600]|           2|
| null|2016|          [200, 200]|           2|
| null|2017|    [800, 1500, 800]|           3|
| null|2018|     [100, 200, 300]|           3|
| null|null|[800, 100, 200, 3...|          10|
+-----+----+-

# Pivot

## Simple Pivot

In [0]:
data = [(200,"Gas","Car","oct"),
(15,"lunch","Food","oct"),
(15,"lunch","Food","oct"),
(80,"Dinner","Food","oct"),
(400,"Phone","Phone","sep"),
(500,"Maintenace","Car","sep"),
(300,"parking","Car","sep"),
(15,"lunch","Food","sep")]

expDF = spark.createDataFrame(data, ["Amount", "Item", "Category", "Month"])
expDF.toPandas()

Unnamed: 0,Amount,Item,Category,Month
0,200,Gas,Car,oct
1,15,lunch,Food,oct
2,15,lunch,Food,oct
3,80,Dinner,Food,oct
4,400,Phone,Phone,sep
5,500,Maintenace,Car,sep
6,300,parking,Car,sep
7,15,lunch,Food,sep


In [0]:
pivotDF = expDF.groupBy("Category")\
               .pivot("Month", ['oct','sep'])\
               .sum("Amount")\
               .fillna(0)
pivotDF.show()


In [0]:
the_months = ["oct","sep"]
pivotDF = expDF.groupBy("Category").pivot("Month", the_months).sum("Amount")
pivotDF.show()

In [0]:
# Build the argument list for Spark SQL's stack(): every pivoted month
# contributes a quoted label followed by the column of the same name,
# e.g. 'oct',oct,'sep',sep.
months = ["oct", "sep"]
stack_args = ",".join("'{0}',{0}".format(month) for month in months)

print(stack_args)

In [0]:
pivotDF.createOrReplaceTempView("pivotDF")

In [0]:
spark.sql("SELECT Category, stack(2, 'oct',oct,'sep',sep) as  (Month, Amount) from pivotDF ").show()

In [0]:
def ReturnInt(M):
    """Map a month abbreviation to its month number.

    Args:
        M: month abbreviation as stored in the `Month` column ('oct' or 'sep').

    Returns:
        10 for 'oct', 9 for 'sep', and None for any other value (including
        None), so that unknown months are not silently reported as 9.
    """
    month_numbers = {'oct': 10, 'sep': 9}
    return month_numbers.get(M)

In [0]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

ReturnInt_udf = udf(ReturnInt, IntegerType())

In [0]:
expDF=expDF.withColumn('IntMonth',ReturnInt_udf('Month'))

In [0]:
expDF.withColumn('IntMonth2', F.when(F.col('Month')=='oct',10).\
                when(F.col('Month')=='sep',9)).show()


In [0]:
expDF.show()

In [0]:
#### Assignment: Write the same function in Scala and call it in Python

In [0]:
data = [
(300,"oct"),
(450,"oct"),
(342,"oct"),(700,"oct"),
(400,"Nov"),
(500,"Nov"),
(300,"Dec")]
NewSales = spark.createDataFrame(data, ["NewSales", "NewMonth"])

In [0]:
expDF.show()

In [0]:
NewSales.show()

In [0]:
Inner=expDF.join(NewSales,expDF['Month']==NewSales['NewMonth'],'inner')
Inner.show()

In [0]:
test=expDF.join(NewSales,expDF['Month']==NewSales['NewMonth'],'leftanti')
test.show()

In [0]:
test=NewSales.join(expDF,expDF['Month']==NewSales['NewMonth'],'leftanti')
test.show()

+--------+--------+
|NewSales|NewMonth|
+--------+--------+
|     300|     Dec|
|     400|     Nov|
|     500|     Nov|
+--------+--------+



In [0]:
from pyspark.sql.functions import broadcast

Inner_broadcast =expDF.join(broadcast(NewSales),expDF['Month']==NewSales['NewMonth'],'inner')
Inner_broadcast.show()

In [0]:
# `sc` is not defined anywhere in this notebook; take the SparkContext from the
# active session before using it.
sc = spark.sparkContext
rdd = sc.parallelize([2,3,4,5])
