In [0]:
import pyspark
from pyspark.sql.functions import *

In [0]:
dbutils.secrets.listScopes()

[SecretScope(name='healthcarescope')]

In [0]:
dbutils.secrets.list(scope = 'healthcarescope')

[SecretMetadata(key='blobaccesskey')]

In [0]:

secret_name = dbutils.secrets.get(scope = "healthcarescope", key = "blobaccesskey")

In [0]:
print(secret_name)

[REDACTED]


In [0]:
spark.conf.set(
    "fs.azure.account.key.healthcareprojectblob.dfs.core.windows.net",
    dbutils.secrets.get(scope = "healthcarescope", key = "blobaccesskey"))


In [0]:
display(dbutils.fs.ls("abfss://rawhealthdata@healthcareprojectblob.dfs.core.windows.net"))

path,name,size,modificationTime
abfss://rawhealthdata@healthcareprojectblob.dfs.core.windows.net/Patient_records.csv,Patient_records.csv,5110,1725312200000
abfss://rawhealthdata@healthcareprojectblob.dfs.core.windows.net/claims.csv,claims.csv,5766,1725461306000
abfss://rawhealthdata@healthcareprojectblob.dfs.core.windows.net/disease.csv,disease.csv,1489,1725312200000
abfss://rawhealthdata@healthcareprojectblob.dfs.core.windows.net/group.csv,group.csv,4390,1725312200000
abfss://rawhealthdata@healthcareprojectblob.dfs.core.windows.net/hospital.csv,hospital.csv,1328,1725312200000
abfss://rawhealthdata@healthcareprojectblob.dfs.core.windows.net/subgroup.csv,subgroup.csv,561,1725312200000
abfss://rawhealthdata@healthcareprojectblob.dfs.core.windows.net/subscriber.csv,subscriber.csv,12061,1725312201000


In [0]:
# # Mount the Azure Blob Storage container as a DBFS path
# dbutils.fs.mount(
#   source="wasbs://rawhealthdata@healthcareprojectblob.blob.core.windows.net",
#   mount_point="/mnt/rawhealthdata",
#   extra_configs={
#     "fs.azure.account.key.healthcareprojectblob.blob.core.windows.net": "0izVsgjt8yavZcTHY3jKLiEiVe0Nu7jqnlYlYXCi3eAF/WsbNH1eY0Cvvd59kRgOIpunsgWDknLx+ASteulWxw=="
#   }
# )

# # List the files in the mounted DBFS path
# dbutils.fs.ls("/mnt/rawhealthdata")

In [0]:
# Unmount the Azure Blob Storage container
# dbutils.fs.unmount("/mnt/rawhealthdata")

In [0]:
data = spark.read.csv("abfss://rawhealthdata@healthcareprojectblob.dfs.core.windows.net/disease.csv", header=True)

In [0]:
display(data)

subgrp_id,disease_id,disease_name
S101,110001,Beriberi
S101,110002,Scurvy
S101,110003,Goitre
S101,110004,Osteoporosis
S101,110005,Rickets
S101,110006,Anaemia
S102,110007,Fractures
S102,110008,Heart Attack
S102,110009,Burns
S102,110010,Choking


In [0]:
# to do - patient name, null values/check for duplicates.

In [0]:
data.columns

['subgrp_id', 'disease_id', 'disease_name']

In [0]:
data.groupby(['subgrp_id', 'disease_id', 'disease_name']).count().where("count > 1").show(15, False)

+---------+----------+------------+-----+
|subgrp_id|disease_id|disease_name|count|
+---------+----------+------------+-----+
+---------+----------+------------+-----+



In [0]:
# check null value for all the columns
data.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in data.columns]).show()

+---------+----------+------------+
|subgrp_id|disease_id|disease_name|
+---------+----------+------------+
|        0|         0|           0|
+---------+----------+------------+



In [0]:
data.select("*").filter(col("disease_name").isin(["Flu", "Bladder cancer"])).show() 

+---------+----------+--------------+
|subgrp_id|disease_id|  disease_name|
+---------+----------+--------------+
|     S107|    110039|Bladder cancer|
|     S110|    110059|           Flu|
+---------+----------+--------------+



In [0]:
# # Now we need to write this data into stagging area for the next step.
# # boilerplate code (standard code for certain operation)

# Define the output staging path
output_stagging_path = "abfss://stagginglayerhealthdata@healthcareprojectblob.dfs.core.windows.net"

# Write the data to the staging area
data.coalesce(1).write.mode("append").format("com.databricks.spark.csv").option("header", "true").option("format", "csv").save(output_stagging_path)

# List all files in the output staging path
files = dbutils.fs.ls(output_stagging_path)

# Identify part files and non-part files
part_files = [x for x in files if x.name.startswith("part-")]
non_part_files = [x for x in files if x.name.startswith("_")]

# Move the part file to the desired location
if part_files:
    dbutils.fs.mv(part_files[0].path, f"{output_stagging_path}/diseasestagging.csv")

# Remove non-part files
for file in non_part_files:
    dbutils.fs.rm(file.path)