In [1]:
%run utilities.ipynb

In [2]:
df_hospital_admissions = spark.read.csv("Datasets/hospital_admissions.csv", header=True, inferSchema=True)
df_country_lookup = spark.read.csv("Datasets/country_lookup.csv", header=True, inferSchema=True)

In [51]:
df_hospital_admissions.show()

+-------+--------------------+----------+---------+------+---------------+--------------------+
|country|           indicator|      date|year_week| value|         source|                 url|
+-------+--------------------+----------+---------+------+---------------+--------------------+
|Austria|Daily hospital oc...|2020-04-02| 2020-W14|1057.0|   Surveillance|https://www.sozia...|
|Austria|Daily hospital oc...|2020-04-08| 2020-W15|1096.0|   Surveillance|https://www.sozia...|
|Austria|Daily hospital oc...|2020-04-15| 2020-W16|1001.0|   Surveillance|https://info.gesu...|
|Austria|Daily hospital oc...|2020-04-16| 2020-W16| 967.0|   Surveillance|https://www.sozia...|
|Austria|Daily hospital oc...|2020-04-17| 2020-W16| 909.0|   Surveillance|https://www.sozia...|
|Austria|Daily hospital oc...|2020-04-19| 2020-W16| 817.0|   Surveillance|https://www.sozia...|
|Austria|Daily hospital oc...|2020-04-21| 2020-W17| 756.0|   Surveillance|https://www.sozia...|
|Austria|Daily hospital oc...|2020-04-22

In [52]:
df_hospital_admissions.dtypes

[('country', 'string'),
 ('indicator', 'string'),
 ('date', 'date'),
 ('year_week', 'string'),
 ('value', 'double'),
 ('source', 'string'),
 ('url', 'string')]

In [3]:
df_hospital_admissions = df_hospital_admissions.drop('url')

In [54]:
df_hospital_admissions.show()

+-------+--------------------+----------+---------+------+---------------+
|country|           indicator|      date|year_week| value|         source|
+-------+--------------------+----------+---------+------+---------------+
|Austria|Daily hospital oc...|2020-04-02| 2020-W14|1057.0|   Surveillance|
|Austria|Daily hospital oc...|2020-04-08| 2020-W15|1096.0|   Surveillance|
|Austria|Daily hospital oc...|2020-04-15| 2020-W16|1001.0|   Surveillance|
|Austria|Daily hospital oc...|2020-04-16| 2020-W16| 967.0|   Surveillance|
|Austria|Daily hospital oc...|2020-04-17| 2020-W16| 909.0|   Surveillance|
|Austria|Daily hospital oc...|2020-04-19| 2020-W16| 817.0|   Surveillance|
|Austria|Daily hospital oc...|2020-04-21| 2020-W17| 756.0|   Surveillance|
|Austria|Daily hospital oc...|2020-04-22| 2020-W17| 700.0|   Surveillance|
|Austria|Daily hospital oc...|2020-04-23| 2020-W17| 677.0|   Surveillance|
|Austria|Daily hospital oc...|2020-04-24| 2020-W17| 651.0|   Surveillance|
|Austria|Daily hospital o

In [55]:
df_country_lookup.show()

+--------------------+--------------------+--------------------+---------+----------+
|             country|country_code_2_digit|country_code_3_digit|continent|population|
+--------------------+--------------------+--------------------+---------+----------+
|               Aruba|                  AW|                 ABW|  America|    106766|
|         Afghanistan|                  AF|                 AFG|     Asia|  38928341|
|              Angola|                  AO|                 AGO|   Africa|  32866268|
|            Anguilla|                  AI|                 AIA|  America|     15002|
|             Albania|                  AL|                 ALB|   Europe|   2862427|
|             Andorra|                  AD|                 AND|   Europe|     76177|
|United Arab Emirates|                  AE|                 ARE|     Asia|   9890400|
|           Argentina|                  AR|                 ARG|  America|  45195777|
|             Armenia|                  AM|           

In [56]:
df_country_lookup.dtypes

[('country', 'string'),
 ('country_code_2_digit', 'string'),
 ('country_code_3_digit', 'string'),
 ('continent', 'string'),
 ('population', 'int')]

In [4]:
condition = [df_hospital_admissions.country == df_country_lookup.country]

fields = [
    df_hospital_admissions['country']
    ,'country_code_2_digit'
    ,'country_code_3_digit'
    ,'indicator'
    ,'reported_date'
    ,'reported_year_week'
    ,'population'
    ,'value'
    ,'source'
]

df_hospital_admissions = (
    df_hospital_admissions
    .join(df_country_lookup, on=condition, how='left')
    .withColumnRenamed('date', 'reported_date')
    .withColumnRenamed('year_week', 'reported_year_week')
    .select(fields)
)

In [58]:
df_hospital_admissions.show()

+-------+--------------------+--------------------+--------------------+-------------+------------------+----------+------+---------------+
|country|country_code_2_digit|country_code_3_digit|           indicator|reported_date|reported_year_week|population| value|         source|
+-------+--------------------+--------------------+--------------------+-------------+------------------+----------+------+---------------+
|Austria|                  AT|                 AUT|Daily hospital oc...|   2020-04-02|          2020-W14|   8858775|1057.0|   Surveillance|
|Austria|                  AT|                 AUT|Daily hospital oc...|   2020-04-08|          2020-W15|   8858775|1096.0|   Surveillance|
|Austria|                  AT|                 AUT|Daily hospital oc...|   2020-04-15|          2020-W16|   8858775|1001.0|   Surveillance|
|Austria|                  AT|                 AUT|Daily hospital oc...|   2020-04-16|          2020-W16|   8858775| 967.0|   Surveillance|
|Austria|           

In [59]:
df_hospital_admissions.count()

8977

In [60]:
df_hospital_admissions.select('indicator').distinct().collect()

[Row(indicator='Daily hospital occupancy'),
 Row(indicator='Daily ICU occupancy'),
 Row(indicator='Weekly new hospital admissions per 100k'),
 Row(indicator='Weekly new ICU admissions per 100k')]

In [5]:
# DERIVED DATASETS

df_daily_hospital_admissions = (
    df_hospital_admissions
    .where((col('indicator') == 'Daily hospital occupancy') | (col('indicator') == 'Daily ICU occupancy'))
)

df_weekly_hospital_admissions = (
    df_hospital_admissions
    .where((col('indicator') == 'Weekly new hospital admissions per 100k') | (col('indicator') == 'Weekly new ICU admissions per 100k'))
)

In [41]:
df_daily_hospital_admissions.show()

+-------+--------------------+--------------------+--------------------+-------------+------------------+----------+------+---------------+
|country|country_code_2_digit|country_code_3_digit|           indicator|reported_date|reported_year_week|population| value|         source|
+-------+--------------------+--------------------+--------------------+-------------+------------------+----------+------+---------------+
|Austria|                  AT|                 AUT|Daily hospital oc...|   2020-04-02|          2020-W14|   8858775|1057.0|   Surveillance|
|Austria|                  AT|                 AUT|Daily hospital oc...|   2020-04-08|          2020-W15|   8858775|1096.0|   Surveillance|
|Austria|                  AT|                 AUT|Daily hospital oc...|   2020-04-15|          2020-W16|   8858775|1001.0|   Surveillance|
|Austria|                  AT|                 AUT|Daily hospital oc...|   2020-04-16|          2020-W16|   8858775| 967.0|   Surveillance|
|Austria|           

In [38]:
df_daily_hospital_admissions.count()

8110

In [42]:
df_weekly_hospital_admissions.show()

+-------+--------------------+--------------------+--------------------+-------------+------------------+----------+----------------+--------------------+
|country|country_code_2_digit|country_code_3_digit|           indicator|reported_date|reported_year_week|population|           value|              source|
+-------+--------------------+--------------------+--------------------+-------------+------------------+----------+----------------+--------------------+
|Belgium|                  BE|                 BEL|Weekly new hospit...|         NULL|          2020-W06|  11455519|            NULL|TESSy COVID-19, n...|
|Belgium|                  BE|                 BEL|Weekly new hospit...|         NULL|          2020-W07|  11455519|            NULL|TESSy COVID-19, n...|
|Belgium|                  BE|                 BEL|Weekly new hospit...|         NULL|          2020-W08|  11455519|            NULL|TESSy COVID-19, n...|
|Belgium|                  BE|                 BEL|Weekly new hospit..

In [39]:
df_weekly_hospital_admissions.count()

867

In [6]:
df_dim_date = spark.read.csv("Datasets/dim_date.csv", header=True, inferSchema=True)

In [3]:
df_dim_date.show()

+--------+----------+----+-----+---+---------+-----------+-------------+------------+----------+----------+---------+
|date_key|      date|year|month|day| day_name|day_of_year|week_of_month|week_of_year|month_name|year_month|year_week|
+--------+----------+----+-----+---+---------+-----------+-------------+------------+----------+----------+---------+
|20200101|2020-01-01|2020|    1|  1|Wednesday|          1|            1|           1|   January|    202001|   202001|
|20200102|2020-01-02|2020|    1|  2| Thursday|          2|            1|           1|   January|    202001|   202001|
|20200103|2020-01-03|2020|    1|  3|   Friday|          3|            1|           1|   January|    202001|   202001|
|20200104|2020-01-04|2020|    1|  4| Saturday|          4|            1|           1|   January|    202001|   202001|
|20200105|2020-01-05|2020|    1|  5|   Sunday|          5|            2|           2|   January|    202001|   202002|
|20200106|2020-01-06|2020|    1|  6|   Monday|          

In [7]:
fields = [
    'ecdc_year_week'
    ,'start_week_date'
    ,'end_week_date'
]

aggregated_dim_date = (
    df_dim_date
    .withColumn('ecdc_year_week', concat('year',lit('-W'), lpad('week_of_year', 2, '0')))
    .groupBy('ecdc_year_week')
    .agg(
        min('date').alias('start_week_date')
        ,max('date').alias('end_week_date')
    )
    .select(fields)
)

In [24]:
aggregated_dim_date.show()

+--------------+---------------+-------------+
|ecdc_year_week|start_week_date|end_week_date|
+--------------+---------------+-------------+
|      2020-W32|     2020-08-02|   2020-08-08|
|      2022-W39|     2022-09-18|   2022-09-24|
|      2020-W47|     2020-11-15|   2020-11-21|
|      2020-W10|     2020-03-01|   2020-03-07|
|      2020-W13|     2020-03-22|   2020-03-28|
|      2021-W53|     2021-12-26|   2021-12-31|
|      2020-W12|     2020-03-15|   2020-03-21|
|      2021-W42|     2021-10-10|   2021-10-16|
|      2021-W45|     2021-10-31|   2021-11-06|
|      2020-W44|     2020-10-25|   2020-10-31|
|      2022-W38|     2022-09-11|   2022-09-17|
|      2021-W22|     2021-05-23|   2021-05-29|
|      2020-W08|     2020-02-16|   2020-02-22|
|      2020-W34|     2020-08-16|   2020-08-22|
|      2022-W17|     2022-04-17|   2022-04-23|
|      2021-W51|     2021-12-12|   2021-12-18|
|      2021-W43|     2021-10-17|   2021-10-23|
|      2020-W26|     2020-06-21|   2020-06-27|
|      2020-W

In [40]:
df_daily_hospital_admissions.show()

+-------+--------------------+--------------------+--------------------+-------------+------------------+----------+------+---------------+
|country|country_code_2_digit|country_code_3_digit|           indicator|reported_date|reported_year_week|population| value|         source|
+-------+--------------------+--------------------+--------------------+-------------+------------------+----------+------+---------------+
|Austria|                  AT|                 AUT|Daily hospital oc...|   2020-04-02|          2020-W14|   8858775|1057.0|   Surveillance|
|Austria|                  AT|                 AUT|Daily hospital oc...|   2020-04-08|          2020-W15|   8858775|1096.0|   Surveillance|
|Austria|                  AT|                 AUT|Daily hospital oc...|   2020-04-15|          2020-W16|   8858775|1001.0|   Surveillance|
|Austria|                  AT|                 AUT|Daily hospital oc...|   2020-04-16|          2020-W16|   8858775| 967.0|   Surveillance|
|Austria|           

In [8]:
condition = [df_daily_hospital_admissions.reported_year_week == aggregated_dim_date.ecdc_year_week]

fields = [
    'country'
    ,'country_code_2_digit'
    ,'country_code_3_digit'
    ,'indicator'
    ,'reported_date'
    ,'population'
    ,'value'
    ,'source'
    ,'ecdc_year_week'
    ,'start_week_date'
    ,'end_week_date'
]

df_daily_hospital_admissions = (
    df_daily_hospital_admissions
    .join(aggregated_dim_date, on=condition, how='left')
    .select(fields)
)

In [9]:
df_daily_hospital_admissions.show()

+-------+--------------------+--------------------+--------------------+-------------+----------+------+---------------+--------------+---------------+-------------+
|country|country_code_2_digit|country_code_3_digit|           indicator|reported_date|population| value|         source|ecdc_year_week|start_week_date|end_week_date|
+-------+--------------------+--------------------+--------------------+-------------+----------+------+---------------+--------------+---------------+-------------+
|Austria|                  AT|                 AUT|Daily hospital oc...|   2020-04-02|   8858775|1057.0|   Surveillance|      2020-W14|     2020-03-29|   2020-04-04|
|Austria|                  AT|                 AUT|Daily hospital oc...|   2020-04-08|   8858775|1096.0|   Surveillance|      2020-W15|     2020-04-05|   2020-04-11|
|Austria|                  AT|                 AUT|Daily hospital oc...|   2020-04-15|   8858775|1001.0|   Surveillance|      2020-W16|     2020-04-12|   2020-04-18|
|Aus