In [0]:
%fs
ls "mnt/bronze"

In [0]:
# Load the raw hospital admissions CSV from the bronze mount.
# The first row is used as the header and column types are inferred from the data.
hospital_admissions = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/mnt/bronze/hospital_admissions.csv")
)

In [0]:
# Preview the raw data as loaded, before any columns are selected or derived
hospital_admissions.show()

In [0]:
# Keep only the columns that the rest of the notebook works with
columns_needed = ['country','indicator','date','year_week','value']
hospital_admissions_df = hospital_admissions.select(*columns_needed)

hospital_admissions_df.show()

Next, format the dates to make them easier to use.

In [0]:
from pyspark.sql.functions import col

# Derive a "week" column from characters 7-8 of year_week (substr is 1-based).
# Presumably year_week looks like "2020-W17", making these the week number — confirm.
hospital_admissions_df = hospital_admissions_df.withColumn(
    "week",
    col("year_week").substr(7, 2),
)

hospital_admissions_df.show()

# year_week is dropped once the week has been extracted.
# NOTE(review): this also discards the year part of year_week.
hospital_admissions_df = hospital_admissions_df.drop("year_week")

In [0]:
# List every distinct indicator present in the data
unique_indicator = (
    hospital_admissions_df
    .select("indicator")
    .distinct()
)
unique_indicator.show()

In [0]:
# Spot-check a single country and week ("week" is a string column, hence '17')
filtered_data = hospital_admissions_df.where(
    (col("country") == 'Austria') & (col("week") == '17')
)
filtered_data.show()

I think it makes sense to have both a daily table and a weekly table, with the weekly table built by summing the data for each week. I will build the weekly table in a separate notebook so that I can run it weekly in ADF.

In [0]:
from pyspark.sql import functions as F

# For every week value, take the earliest date seen and treat it as that week's start.
# NOTE(review): "week" holds only the two-character week number, so if the data covers
# more than one year, the same week number from different years is pooled here — confirm.
week_start_df = (
    hospital_admissions_df
    .groupBy("week")
    .agg(F.min("date").alias("week_start_date"))
)

# Inspect the week -> week_start_date mapping
week_start_df.show()

In [0]:
# Attach week_start_date to every row by matching on "week".
# A left join keeps every row of the main DataFrame.
hospital_admissions_df = hospital_admissions_df.join(week_start_df, "week", "left")

hospital_admissions_df.show()

In [0]:
# Build the weekly totals: one row per indicator / country / week.
# sum() with no arguments aggregates every numeric column; the later pivot cells
# rely on the resulting column being named "sum(value)".
weekly_hospital_admissions = (
    hospital_admissions_df
    .groupBy("indicator", "country", "week", "week_start_date")
    .sum()
)
weekly_hospital_admissions.show()

In [0]:
# Spot-check the weekly totals for a single country and week
filtered_weekly = weekly_hospital_admissions.where(
    (col("country") == 'Austria') & (col("week") == '17')
)
filtered_weekly.show()

I want to pivot the data so that ICU occupancy and hospital occupancy each become their own column, and then get rid of the indicator column.

In [0]:
# Trial run of the pivot on the single-country/single-week sample:
# each distinct "indicator" value becomes its own column holding that group's sum(value).
pivoted_df = (
    filtered_weekly
    .groupBy("country", "week")
    .pivot("indicator")
    .agg({"sum(value)": "first"})
)

pivoted_df.show()

In [0]:
# Apply the same pivot to the full weekly data: one row per country/week,
# one column per indicator. Grouping only by country and week means
# week_start_date is not carried into the result.
weekly_hospital_admissions = (
    weekly_hospital_admissions
    .groupBy("country", "week")
    .pivot("indicator")
    .agg({"sum(value)": "first"})
)

weekly_hospital_admissions.show()

In [0]:
# Keep only the identifying columns and the two occupancy indicators
weekly_hospital_admissions = weekly_hospital_admissions.select(
    "country", "week", "Daily ICU occupancy", "Daily hospital occupancy"
)

# Rename the pivoted indicator columns to weekly names.
# withColumnRenamed does nothing (and raises no error) when the source column
# name does not exist, so the names must match the pivot output exactly; the
# previous version had a stray backtick in each name and therefore renamed nothing.
weekly_hospital_admissions = (
    weekly_hospital_admissions
    .withColumnRenamed("Daily ICU occupancy", "weekly_ICU_occupancy")
    .withColumnRenamed("Daily hospital occupancy", "weekly_hospital_occupancy")
)

weekly_hospital_admissions.show()

In [0]:
# Save the transformed data into our silver container
output_path = "abfss://ecdc@adlsmzubac125.dfs.core.windows.net/silver/weeklyhospitaladmissions.csv"

# Write the DataFrame as CSV to the data lake.
# - mode("overwrite"): the default save mode raises an error when the path already
#   exists, which would make every scheduled re-run after the first one fail.
# - header=True: without it the column names (including the renamed weekly_* columns)
#   are not written to the files.
# Note that Spark writes a directory of part files at output_path, not a single file.
(
    weekly_hospital_admissions.write
    .mode("overwrite")
    .option("header", True)
    .csv(output_path)
)