In [2]:
# -----------------------------------------------
# Day 1: From Raw CSV to Clean Delta Table
# Microsoft Fabric | PySpark Notebook
# -----------------------------------------------

# STEP 1: Load raw CSV from OneLake Files
df = spark.read.option("header", True).csv("Files/covid_data/owid-covid-data.csv")

# STEP 2: Initial exploration
print("Original Schema:")
df.printSchema()

print("Sample Records:")
df.show(5)

StatementMeta(, 7095b4dc-6367-463b-b4b9-ef025051b548, 4, Finished, Available, Finished)

Original Schema:
root
 |-- iso_code: string (nullable = true)
 |-- continent: string (nullable = true)
 |-- location: string (nullable = true)
 |-- date: string (nullable = true)
 |-- total_cases: string (nullable = true)
 |-- new_cases: string (nullable = true)
 |-- new_cases_smoothed: string (nullable = true)
 |-- total_deaths: string (nullable = true)
 |-- new_deaths: string (nullable = true)
 |-- new_deaths_smoothed: string (nullable = true)
 |-- total_cases_per_million: string (nullable = true)
 |-- new_cases_per_million: string (nullable = true)
 |-- new_cases_smoothed_per_million: string (nullable = true)
 |-- total_deaths_per_million: string (nullable = true)
 |-- new_deaths_per_million: string (nullable = true)
 |-- new_deaths_smoothed_per_million: string (nullable = true)
 |-- reproduction_rate: string (nullable = true)
 |-- icu_patients: string (nullable = true)
 |-- icu_patients_per_million: string (nullable = true)
 |-- hosp_patients: string (nullable = true)
 |-- hosp_pat

In [3]:
from pyspark.sql.types import DoubleType

numeric_cols = [
    "total_cases", "new_cases", "total_deaths", "new_deaths",
    "total_vaccinations", "people_vaccinated", "people_fully_vaccinated",
    "new_vaccinations"
]

for col_name in numeric_cols:
    df = df.withColumn(col_name, df[col_name].cast(DoubleType()))

df.printSchema

StatementMeta(, 7095b4dc-6367-463b-b4b9-ef025051b548, 5, Finished, Available, Finished)

<bound method DataFrame.printSchema of DataFrame[iso_code: string, continent: string, location: string, date: string, total_cases: double, new_cases: double, new_cases_smoothed: string, total_deaths: double, new_deaths: double, new_deaths_smoothed: string, total_cases_per_million: string, new_cases_per_million: string, new_cases_smoothed_per_million: string, total_deaths_per_million: string, new_deaths_per_million: string, new_deaths_smoothed_per_million: string, reproduction_rate: string, icu_patients: string, icu_patients_per_million: string, hosp_patients: string, hosp_patients_per_million: string, weekly_icu_admissions: string, weekly_icu_admissions_per_million: string, weekly_hosp_admissions: string, weekly_hosp_admissions_per_million: string, total_tests: string, new_tests: string, total_tests_per_thousand: string, new_tests_per_thousand: string, new_tests_smoothed: string, new_tests_smoothed_per_thousand: string, positive_rate: string, tests_per_case: string, tests_units: string

In [4]:
df = df.fillna({
    "total_cases": 0,
    "new_cases": 0,
    "total_deaths": 0,
    "new_deaths": 0,
    "total_vaccinations": 0,
    "new_vaccinations": 0
})


StatementMeta(, 7095b4dc-6367-463b-b4b9-ef025051b548, 6, Finished, Available, Finished)

In [8]:
from pyspark.sql.functions import to_date
df = df.withColumn("date", to_date("date","yyyy-MM-dd"))

StatementMeta(, 7095b4dc-6367-463b-b4b9-ef025051b548, 10, Finished, Available, Finished)

In [9]:
from pyspark.sql.functions import year, month
df = df.withColumn("year", year("date")).withColumn("month", month("date"))

StatementMeta(, 7095b4dc-6367-463b-b4b9-ef025051b548, 11, Finished, Available, Finished)

In [11]:
from pyspark.sql.functions import when, col
df = df.withColumn(
    "death_rate_pct",
    when(col("total_cases") > 0, (col("total_deaths") / col("total_cases")) * 100).otherwise(None) 
)

StatementMeta(, 7095b4dc-6367-463b-b4b9-ef025051b548, 13, Finished, Available, Finished)

In [12]:
df = df.withColumn(
    "vaccinated_per_100",
    when( col("population") > 0, (col("total_vaccinations") / col("population")) * 100).otherwise(None)
)

StatementMeta(, 7095b4dc-6367-463b-b4b9-ef025051b548, 14, Finished, Available, Finished)

In [14]:
spark.sql("DROP TABLE IF EXISTS cleaned_covid_data")
df.write.mode("overwrite").saveAsTable("cleaned_covid_data")


StatementMeta(, 7095b4dc-6367-463b-b4b9-ef025051b548, 16, Finished, Available, Finished)