In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

In [2]:
spark = SparkSession.builder \
    .appName('transform_testing') \
    .master('local[*]') \
    .getOrCreate()


In [3]:
df_testing = spark.read \
    .options(header=True, sep=',', inferSchema=True) \
    .csv('testing.csv')
df_testing.show()

+-------+------------+---------+---------+----------+----------+----------------+-----------------+-------------------+
|country|country_code|year_week|new_cases|tests_done|population|    testing_rate|  positivity_rate|testing_data_source|
+-------+------------+---------+---------+----------+----------+----------------+-----------------+-------------------+
|Austria|          AT| 2020-W15|     2041|     12339|   8858775|139.285623576623| 16.5410487073507| Manual webscraping|
|Austria|          AT| 2020-W16|      855|     58488|   8858775|660.226724349586| 1.46183832581042| Manual webscraping|
|Austria|          AT| 2020-W17|      472|     33443|   8858775|377.512692217603| 1.41135663666537| Manual webscraping|
|Austria|          AT| 2020-W18|      336|     26598|   8858775|300.244672655079|  1.2632528761561|    Country website|
|Austria|          AT| 2020-W19|      307|     42153|   8858775|475.833283947273|0.728299290679193|    Country website|
|Austria|          AT| 2020-W20|      36

In [4]:
df_testing.printSchema()

root
 |-- country: string (nullable = true)
 |-- country_code: string (nullable = true)
 |-- year_week: string (nullable = true)
 |-- new_cases: integer (nullable = true)
 |-- tests_done: integer (nullable = true)
 |-- population: integer (nullable = true)
 |-- testing_rate: double (nullable = true)
 |-- positivity_rate: double (nullable = true)
 |-- testing_data_source: string (nullable = true)



In [12]:
df_dim_date = spark.read \
    .options(header=True, sep=',', inferSchema=True) \
    .csv('dim_date.csv')
df_dim_date = df_dim_date.withColumn('date', f.expr("cast(date as DATE)"))
df_dim_date.printSchema()
df_dim_date.show()

root
 |-- date_key: integer (nullable = true)
 |-- date: date (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- day_name: string (nullable = true)
 |-- day_of_year: integer (nullable = true)
 |-- week_of_month: integer (nullable = true)
 |-- week_of_year: integer (nullable = true)
 |-- month_name: string (nullable = true)
 |-- year_month: integer (nullable = true)
 |-- year_week: integer (nullable = true)

+--------+----------+----+-----+---+---------+-----------+-------------+------------+----------+----------+---------+
|date_key|      date|year|month|day| day_name|day_of_year|week_of_month|week_of_year|month_name|year_month|year_week|
+--------+----------+----+-----+---+---------+-----------+-------------+------------+----------+----------+---------+
|20200101|2020-01-01|2020|    1|  1|Wednesday|          1|            1|           1|   January|    202001|   202001|
|20200102|2020-01-02|2020|    1|  

In [6]:
df_dim_country = spark.read \
    .options(header=True, sep=',', inferSchema=True) \
    .csv('country_lookup.csv')
df_dim_country.printSchema()
df_dim_country.show()

root
 |-- country: string (nullable = true)
 |-- country_code_2_digit: string (nullable = true)
 |-- country_code_3_digit: string (nullable = true)
 |-- continent: string (nullable = true)
 |-- population: integer (nullable = true)

+--------------------+--------------------+--------------------+---------+----------+
|             country|country_code_2_digit|country_code_3_digit|continent|population|
+--------------------+--------------------+--------------------+---------+----------+
|               Aruba|                  AW|                 ABW|  America|    106766|
|         Afghanistan|                  AF|                 AFG|     Asia|  38928341|
|              Angola|                  AO|                 AGO|   Africa|  32866268|
|            Anguilla|                  AI|                 AIA|  America|     15002|
|             Albania|                  AL|                 ALB|   Europe|   2862427|
|             Andorra|                  AD|                 AND|   Europe|     

In [13]:
df_testing.createOrReplaceTempView('testing')
df_dim_date.createOrReplaceTempView('dim_date')
df_dim_country.createOrReplaceTempView('dim_country')

In [14]:
df_proccessed_testing = spark.sql("""
SELECT t.country  ,
      c.country_code_2_digit   ,
      c.country_code_3_digit   ,
      t.year_week   ,
      MIN(d.date) AS week_start_date, 
      MAX(d.date) AS week_end_date ,
      t.new_cases   ,
      t.tests_done  ,
      t.population  , 
      t.testing_rate   ,
      t.positivity_rate   ,
      t.testing_data_source  
FROM testing t JOIN dim_date d ON t.year_week = concat(concat(d.year, '-W'), LPAD(d.week_of_year, 2, '0'))
    JOIN dim_country c ON t.country_code = c.country_code_2_digit
GROUP BY t.country  ,
      c.country_code_2_digit   ,
      c.country_code_3_digit   ,
      t.year_week   ,
      t.new_cases   ,
      t.tests_done  ,
      t.population  , 
      t.testing_rate   ,
      t.positivity_rate   ,
      t.testing_data_source  ;
""")
df_proccessed_testing.show()

+--------------+--------------------+--------------------+---------+---------------+-------------+---------+----------+----------+-----------------+-----------------+-------------------+
|       country|country_code_2_digit|country_code_3_digit|year_week|week_start_date|week_end_date|new_cases|tests_done|population|     testing_rate|  positivity_rate|testing_data_source|
+--------------+--------------------+--------------------+---------+---------------+-------------+---------+----------+----------+-----------------+-----------------+-------------------+
|       Finland|                  FI|                 FIN| 2020-W36|     2020-08-30|   2020-09-05|      212|    102435|   5517919| 1856.40637348972|0.206960511543906|        Country API|
|       Ireland|                  IE|                 IRL| 2020-W42|     2020-10-11|   2020-10-17|     6964|    112134|   4904240| 2286.47048268437| 6.21042681077996|              TESSy|
|        Poland|                  PL|                 POL| 2020-W

In [15]:
df_proccessed_testing.printSchema()

root
 |-- country: string (nullable = true)
 |-- country_code_2_digit: string (nullable = true)
 |-- country_code_3_digit: string (nullable = true)
 |-- year_week: string (nullable = true)
 |-- week_start_date: date (nullable = true)
 |-- week_end_date: date (nullable = true)
 |-- new_cases: integer (nullable = true)
 |-- tests_done: integer (nullable = true)
 |-- population: integer (nullable = true)
 |-- testing_rate: double (nullable = true)
 |-- positivity_rate: double (nullable = true)
 |-- testing_data_source: string (nullable = true)

