In [0]:
from pyspark.sql import functions as f

In [0]:
# Every dataset this notebook reads or writes lives under /mnt/<storage_acc_name>.
storage_acc_name = "covidminhdl"
_mount_root = f"/mnt/{storage_acc_name}"

# Inputs: the raw testing data and the two lookup datasets it is joined with.
testing_path = f"{_mount_root}/raw/ecdc/testing"
dim_date_path = f"{_mount_root}/lookup/dim_date"
dim_country_path = f"{_mount_root}/lookup/dim_country"

# Output: where the processed testing data is written.
sink_path = f"{_mount_root}/processed/ecdc/testing"

In [0]:
# Load the raw testing CSV: the first line is a header row, fields are
# comma-separated, and column types are inferred from the data.
df_testing = spark.read.csv(
    testing_path,
    header=True,
    sep=',',
    inferSchema=True,
)

In [0]:
# Load the date lookup CSV (header row, comma-separated, inferred types) and
# force its `date` column to DATE in the same expression; the query cell
# aggregates that column with MIN/MAX.
df_dim_date = (
    spark.read
    .csv(dim_date_path, header=True, sep=',', inferSchema=True)
    .withColumn('date', f.expr("cast(date as DATE)"))
)

In [0]:
# Load the country lookup CSV: header row, comma-separated, inferred types.
df_dim_country = spark.read.csv(
    dim_country_path,
    header=True,
    sep=',',
    inferSchema=True,
)

In [0]:
# Register each DataFrame as a temp view under the table name the SQL query
# cell refers to (re-running replaces any existing view of the same name).
for view_name, frame in (
    ('testing', df_testing),
    ('dim_date', df_dim_date),
    ('dim_country', df_dim_country),
):
    frame.createOrReplaceTempView(view_name)

In [0]:
# Build the processed testing dataset.
#
# * dim_date is joined on the week key: each dim_date row is turned into a
#   '<year>-W<2-digit week_of_year>' string (e.g. '2020-W05') and matched
#   against testing.year_week, so one testing row fans out to every date of
#   its week.
# * GROUP BY over all the carried-through testing/country columns collapses
#   that fan-out again, leaving MIN/MAX(d.date) as the first and last date
#   of the week.
# * dim_country supplies the 2- and 3-digit country codes; it is matched on the
#   2-digit code. Both joins are inner joins, so testing rows without a
#   matching week or country are dropped.
#
# The statement deliberately has no trailing ';': spark.sql() takes a single
# statement, and Spark releases before 3.0 reject a terminator with a
# ParseException.
#
# NOTE: the variable name (including its spelling) is kept as-is because the
# write cell references it.
df_proccessed_testing = spark.sql("""
SELECT t.country,
       c.country_code_2_digit,
       c.country_code_3_digit,
       t.year_week,
       MIN(d.date) AS week_start_date,
       MAX(d.date) AS week_end_date,
       t.new_cases,
       t.tests_done,
       t.population,
       t.testing_rate,
       t.positivity_rate,
       t.testing_data_source
FROM testing t
JOIN dim_date d
  ON t.year_week = concat(d.year, '-W', LPAD(d.week_of_year, 2, '0'))
JOIN dim_country c
  ON t.country_code = c.country_code_2_digit
GROUP BY t.country,
         c.country_code_2_digit,
         c.country_code_3_digit,
         t.year_week,
         t.new_cases,
         t.tests_done,
         t.population,
         t.testing_rate,
         t.positivity_rate,
         t.testing_data_source
""")

In [0]:
# Write the processed testing data to the sink path as comma-separated CSV
# with a header row, replacing whatever a previous run left there.
#
# Uses Spark's built-in "csv" source: "com.databricks.spark.csv" is the name of
# the external spark-csv package from the Spark 1.x era, which has been part of
# Spark itself since 2.0.
(
    df_proccessed_testing.write
    .format("csv")
    .options(header=True, sep=',')
    .mode('overwrite')
    .save(sink_path)
)