In [0]:
# Load the raw hospital admissions CSV from the bronze mount. The first row is
# used for column names and column types are inferred from the data.
bronze_csv_path = "/mnt/bronze/hospital_admissions.csv"
hospital_admissions = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(bronze_csv_path)
)
hospital_admissions.show()

In [0]:
# Keep only the columns used by the rest of the notebook.
columns_needed = [
    'country',
    'indicator',
    'date',
    'year_week',
    'value',
]
hospital_admissions_df = hospital_admissions.select(*columns_needed)
hospital_admissions_df.show()

In [0]:
from pyspark.sql.functions import col

# Derive "week" from year_week: two characters starting at 1-based position 7.
# Assumes year_week is 8 characters long (e.g. "2020-W10"), so these are its
# last two characters -- TODO confirm against the source data.
week_number = col("year_week").substr(7, 2)
hospital_admissions_df = hospital_admissions_df.withColumn("week", week_number)

# Preview while year_week is still present so both columns can be compared.
hospital_admissions_df.show()

# year_week is no longer needed once "week" exists.
hospital_admissions_df = hospital_admissions_df.drop("year_week")

In [0]:
from pyspark.sql import functions as F

# One row per "week" value, carrying the earliest date seen for that week.
# NOTE(review): "week" holds only the two-character week number (the year part
# of year_week was dropped earlier). If the data spans more than one year, the
# minimum is taken across all years sharing that week number -- confirm.
earliest_date = F.min("date").alias("week_start_date")
week_start_df = hospital_admissions_df.groupBy("week").agg(earliest_date)

# Preview the week -> week_start_date lookup.
week_start_df.show()

In [0]:
# Attach week_start_date to every admissions row by matching on "week".
# A left join keeps all admissions rows even when no lookup row matches.
hospital_admissions_df = hospital_admissions_df.join(week_start_df, "week", "left")

In [0]:
# Reshape from long to wide: one row per (country, date, week), with one column
# per distinct "indicator" value holding the first "value" found in that group.
# NOTE(review): only the grouping keys and the pivoted indicator columns are
# carried into the result, so any other column (such as week_start_date) is
# not present afterwards -- confirm this is intended.
pivot_keys = ["country", "date", "week"]
hospital_admissions_df = (
    hospital_admissions_df
    .groupBy(*pivot_keys)
    .pivot("indicator")
    .agg({"value": "first"})
)

In [0]:
# Remove the two right-most columns of the pivoted frame, selected by position.
# NOTE(review): which columns these are depends on the column order produced by
# the pivot; presumably they are the indicators not wanted in the output --
# verify against the actual column names.
all_columns = hospital_admissions_df.columns
columns_to_drop = all_columns[-2:]

hospital_admissions_df = hospital_admissions_df.drop(*columns_to_drop)

In [0]:
# Save the transformed data into the silver layer of the data lake.
output_path = "abfss://ecdc@adlsmzubac125.dfs.core.windows.net/silver/dailyhospitaladmissions.csv"

# Write the DataFrame as CSV.
# - header=True writes the column names as the first row; without it the
#   pivoted indicator column names are not stored in the output.
# - mode="overwrite" replaces any existing output at this path; the default
#   save mode raises an error if the path already exists, which makes the
#   notebook fail when it is run a second time.
hospital_admissions_df.write.csv(output_path, mode="overwrite", header=True)