In [1]:
import pandas as pd
import random
import numpy as np

In [2]:
# Load the per-province time series CSV into a Spark DataFrame. The header row
# supplies the column names and column types are inferred from the data.
# NOTE(review): `spark` is expected to be an existing SparkSession; it is not
# created in the cells visible here. The path is absolute and machine-specific.
timeprovince = (
    spark.read
    .format("csv")
    .options(sep=",", inferSchema="true", header="true")
    .load("/home/rahul/projects/sparkdf/coronavirusdataset/TimeProvince.csv")
)
# Print the leading rows as a quick check of the parsed columns.
timeprovince.show()


+-------------------+----+-----------------+---------+--------+--------+
|               date|time|         province|confirmed|released|deceased|
+-------------------+----+-----------------+---------+--------+--------+
|2020-01-20 00:00:00|  16|            Seoul|        0|       0|       0|
|2020-01-20 00:00:00|  16|            Busan|        0|       0|       0|
|2020-01-20 00:00:00|  16|            Daegu|        0|       0|       0|
|2020-01-20 00:00:00|  16|          Incheon|        1|       0|       0|
|2020-01-20 00:00:00|  16|          Gwangju|        0|       0|       0|
|2020-01-20 00:00:00|  16|          Daejeon|        0|       0|       0|
|2020-01-20 00:00:00|  16|            Ulsan|        0|       0|       0|
|2020-01-20 00:00:00|  16|           Sejong|        0|       0|       0|
|2020-01-20 00:00:00|  16|      Gyeonggi-do|        0|       0|       0|
|2020-01-20 00:00:00|  16|       Gangwon-do|        0|       0|       0|
|2020-01-20 00:00:00|  16|Chungcheongbuk-do|       

# Ranking

In [7]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# One partition per province, rows ordered from the highest `confirmed` value down.
windowSpec = Window.partitionBy("province").orderBy(F.col("confirmed").desc())

# rank() gives tied `confirmed` values the same rank and then skips ahead
# (the recorded output shows 1, 1, 1, 1, 5 for Sejong).
(
    timeprovince
    .withColumn("rank", F.rank().over(windowSpec))
    .show()
)

+-------------------+----+--------+---------+--------+--------+----+
|               date|time|province|confirmed|released|deceased|rank|
+-------------------+----+--------+---------+--------+--------+----+
|2020-05-11 00:00:00|   0|  Sejong|       47|      45|       0|   1|
|2020-05-12 00:00:00|   0|  Sejong|       47|      45|       0|   1|
|2020-05-13 00:00:00|   0|  Sejong|       47|      46|       0|   1|
|2020-05-14 00:00:00|   0|  Sejong|       47|      46|       0|   1|
|2020-03-29 00:00:00|   0|  Sejong|       46|      11|       0|   5|
|2020-03-30 00:00:00|   0|  Sejong|       46|      12|       0|   5|
|2020-03-31 00:00:00|   0|  Sejong|       46|      12|       0|   5|
|2020-04-01 00:00:00|   0|  Sejong|       46|      12|       0|   5|
|2020-04-02 00:00:00|   0|  Sejong|       46|      12|       0|   5|
|2020-04-03 00:00:00|   0|  Sejong|       46|      12|       0|   5|
|2020-04-04 00:00:00|   0|  Sejong|       46|      15|       0|   5|
|2020-04-05 00:00:00|   0|  Sejong

# Lag Variables

In [8]:
from pyspark.sql.window import Window

# Order each province's rows chronologically so lag() looks back along the timeline.
windowSpec = Window.partitionBy("province").orderBy("date")

# lag_7 carries the `confirmed` value from 7 rows earlier within the same province.
timeprovinceWithLag = timeprovince.withColumn(
    "lag_7",
    F.lag("confirmed", 7).over(windowSpec),
)

# NOTE(review): the recorded output includes the 2020-03-10 00:00:00 row even
# though this is a strict `>`; presumably the timestamp/string comparison is
# Spark-version dependent -- confirm against the version in use.
timeprovinceWithLag.filter(timeprovinceWithLag["date"] > '2020-03-10').show()

+-------------------+----+--------+---------+--------+--------+-----+
|               date|time|province|confirmed|released|deceased|lag_7|
+-------------------+----+--------+---------+--------+--------+-----+
|2020-03-10 00:00:00|   0|  Sejong|        8|       0|       0|    1|
|2020-03-11 00:00:00|   0|  Sejong|       10|       0|       0|    1|
|2020-03-12 00:00:00|   0|  Sejong|       15|       0|       0|    1|
|2020-03-13 00:00:00|   0|  Sejong|       32|       0|       0|    1|
|2020-03-14 00:00:00|   0|  Sejong|       38|       0|       0|    2|
|2020-03-15 00:00:00|   0|  Sejong|       39|       0|       0|    3|
|2020-03-16 00:00:00|   0|  Sejong|       40|       0|       0|    6|
|2020-03-17 00:00:00|   0|  Sejong|       40|       0|       0|    8|
|2020-03-18 00:00:00|   0|  Sejong|       41|       0|       0|   10|
|2020-03-19 00:00:00|   0|  Sejong|       41|       0|       0|   15|
|2020-03-20 00:00:00|   0|  Sejong|       41|       0|       0|   32|
|2020-03-21 00:00:00

# Rolling Aggregations

In [9]:
from pyspark.sql.window import Window

# Frame = the current row plus the 6 rows before it, per province in date order.
# The frame is row-based, so it is a 7-day window only if there is exactly one
# row per province per day -- TODO confirm for this dataset.
windowSpec = Window.partitionBy("province").orderBy("date").rowsBetween(-6, 0)

# roll_7_confirmed is the mean of `confirmed` over that 7-row frame.
timeprovinceWithRoll = timeprovince.withColumn(
    "roll_7_confirmed",
    F.mean("confirmed").over(windowSpec),
)

# Filter on this DataFrame's own `date` column. The previous version borrowed
# the column from `timeprovinceWithLag`, so this cell silently depended on the
# lag cell having been executed first.
timeprovinceWithRoll.filter(timeprovinceWithRoll.date > '2020-03-10').show()

+-------------------+----+--------+---------+--------+--------+------------------+
|               date|time|province|confirmed|released|deceased|  roll_7_confirmed|
+-------------------+----+--------+---------+--------+--------+------------------+
|2020-03-10 00:00:00|   0|  Sejong|        8|       0|       0| 3.142857142857143|
|2020-03-11 00:00:00|   0|  Sejong|       10|       0|       0| 4.428571428571429|
|2020-03-12 00:00:00|   0|  Sejong|       15|       0|       0| 6.428571428571429|
|2020-03-13 00:00:00|   0|  Sejong|       32|       0|       0|10.857142857142858|
|2020-03-14 00:00:00|   0|  Sejong|       38|       0|       0|              16.0|
|2020-03-15 00:00:00|   0|  Sejong|       39|       0|       0|21.142857142857142|
|2020-03-16 00:00:00|   0|  Sejong|       40|       0|       0|              26.0|
|2020-03-17 00:00:00|   0|  Sejong|       40|       0|       0|30.571428571428573|
|2020-03-18 00:00:00|   0|  Sejong|       41|       0|       0|              35.0|
|202

## Running Totals

In [11]:
from pyspark.sql.window import Window

# Frame = every row from the start of the province's partition up to and
# including the current row, in date order.
windowSpec = (
    Window.partitionBy("province")
    .orderBy("date")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# cumulative_confirmed is the running sum of `confirmed` over that frame.
# NOTE(review): in the recorded outputs `confirmed` itself looks like a running
# count (it never decreases for Sejong) -- confirm that summing it is intended.
# NOTE(review): this rebinds `timeprovinceWithRoll`, replacing the rolling-mean
# DataFrame assigned under "Rolling Aggregations".
timeprovinceWithRoll = timeprovince.withColumn(
    "cumulative_confirmed",
    F.sum("confirmed").over(windowSpec),
)

# Filter on this DataFrame's own `date` column. The previous version borrowed
# the column from `timeprovinceWithLag`, so this cell silently depended on the
# lag cell having been executed first.
timeprovinceWithRoll.filter(timeprovinceWithRoll.date > '2020-03-10').show()

+-------------------+----+--------+---------+--------+--------+--------------------+
|               date|time|province|confirmed|released|deceased|cumulative_confirmed|
+-------------------+----+--------+---------+--------+--------+--------------------+
|2020-03-10 00:00:00|   0|  Sejong|        8|       0|       0|                  33|
|2020-03-11 00:00:00|   0|  Sejong|       10|       0|       0|                  43|
|2020-03-12 00:00:00|   0|  Sejong|       15|       0|       0|                  58|
|2020-03-13 00:00:00|   0|  Sejong|       32|       0|       0|                  90|
|2020-03-14 00:00:00|   0|  Sejong|       38|       0|       0|                 128|
|2020-03-15 00:00:00|   0|  Sejong|       39|       0|       0|                 167|
|2020-03-16 00:00:00|   0|  Sejong|       40|       0|       0|                 207|
|2020-03-17 00:00:00|   0|  Sejong|       40|       0|       0|                 247|
|2020-03-18 00:00:00|   0|  Sejong|       41|       0|       0|  