In [0]:
from pyspark.sql import functions as f

In [0]:
# Storage account name; it is used as the directory under /mnt in the
# input and output paths built in this notebook.
storage_acc_name = "covidminhdl"
# Location of the raw population file(s) that the next cell reads as CSV.
population_path = f"/mnt/{storage_acc_name}/raw/population"

In [0]:
# Load the raw population extract as a DataFrame. The first line of the file
# is used as the header and the fields are tab-delimited (Spark's CSV reader
# interprets the two-character sequence \t as a tab).
df = (
    spark.read
    .option("header", True)
    .option("sep", r'\t')
    .csv(population_path)
)

In [0]:
# The first source column packs two comma-separated parts into one string;
# it is aliased to 'code' and split once so both parts can be reused below.
code_parts = f.split('code', ',')

df1 = (
    df.select(
        f.col(r'indic_de,geo\time').alias('code'),
        # The source header for the 2019 column carries a trailing space.
        f.col('2019 ').alias('Y2019'),
    )
    # Second part of 'code' -> country code.
    .withColumn('country_code', code_parts[1])
    # First part of 'code', keeping only what follows the first 'Y'
    # (presumably the age bracket, e.g. '0_14' -- confirm against the raw file).
    .withColumn('ages_group', f.split(code_parts[0], 'Y')[1])
)

# Expose the result to Spark SQL under the name 'population'.
df1.createOrReplaceTempView('population')

In [0]:
# Age brackets that become the pivoted columns. Listing them explicitly
# (a) saves Spark the extra job it otherwise runs to discover the distinct
# values of 'ages_group', and (b) pins the output schema: each bracket column
# always exists (null when the data has no rows for it) and unexpected or
# null 'ages_group' values no longer create stray columns.
age_groups = ['0_14', '15_24', '25_49', '50_64', '65_79', '80_MAX']

# 1. SQL step: strip lowercase letters from the 2019 value (presumably flag
#    suffixes attached to the numbers -- confirm against the raw file), cast
#    it to decimal(10,2), and keep only rows whose country code is exactly
#    two characters long.
# 2. Pivot step: one row per country_code, one column per age bracket holding
#    the maximum Y2019 value for that bracket.
# 3. 'sum' adds the six bracket columns; it is null if any bracket is null.
df_population_pivot = (
    spark.sql("""
SELECT cast(regexp_replace(Y2019, '[a-z]', '') as decimal(10,2)) as Y2019, country_code, ages_group
FROM population
WHERE length(country_code) == 2 
""")
    .groupBy('country_code')
    .pivot('ages_group', age_groups)
    .max('Y2019')
    .withColumn(
        'sum',
        f.col('0_14') + f.col('15_24') + f.col('25_49')
        + f.col('50_64') + f.col('65_79') + f.col('80_MAX'),
    )
    .orderBy('country_code')
)

# Expose the pivoted result to Spark SQL under the name 'population_pivot'.
df_population_pivot.createOrReplaceTempView('population_pivot')

In [0]:
# Load the country lookup data (comma-separated, first line is the header).
# NOTE(review): 'dim_country/' is a relative path, unlike the /mnt/... paths
# used elsewhere in this notebook -- confirm it resolves to the intended
# location in the target environment.
df_country_lookup = spark.read.csv('dim_country/', header=True, sep=',')

# Expose the lookup to Spark SQL under the name 'country_lookup'.
df_country_lookup.createOrReplaceTempView('country_lookup')

In [0]:
# Inner-join the pivoted population figures with the country lookup on the
# two-character country code, rename the age-bracket columns to
# age_group_<bracket>, and order the rows by country name.
processed_population_query = """
SELECT c.country,
       c.country_code_2_digit,
       c.country_code_3_digit,
       c.population,
       p.0_14  AS age_group_0_14,
       p.15_24 AS age_group_15_24,
       p.25_49 AS age_group_25_49,
       p.50_64 AS age_group_50_64, 
       p.65_79 AS age_group_65_79,
       p.80_MAX AS age_group_80_max
FROM population_pivot p JOIN country_lookup c ON p.country_code = c.country_code_2_digit
ORDER BY c.country
"""

df_processed_population = spark.sql(processed_population_query)

# Print a preview of the joined result.
df_processed_population.show()

In [0]:
# Persist the processed population data as comma-separated CSV with a header
# row, replacing any previous output at the target path.
# Uses Spark's built-in CSV writer rather than the legacy
# "com.databricks.spark.csv" format alias (a leftover name from the Spark 1.x
# external package), matching the built-in .csv(...) readers used above.
(
    df_processed_population.write
    .options(header=True, sep=',')
    .mode("overwrite")
    .csv(f"/mnt/{storage_acc_name}/processed/population")
)