In [0]:
import pyspark
from pyspark.sql.functions import *

In [0]:
dbutils.secrets.listScopes()

[SecretScope(name='healthcarescope')]

In [0]:
dbutils.secrets.list(scope = 'healthcarescope')

[SecretMetadata(key='blobaccesskey')]

In [0]:

secret_name = dbutils.secrets.get(scope = "healthcarescope", key = "blobaccesskey")

In [0]:
print(secret_name)

[REDACTED]


In [0]:
spark.conf.set(
    "fs.azure.account.key.healthcareprojectblob.dfs.core.windows.net",
    dbutils.secrets.get(scope = "healthcarescope", key = "blobaccesskey"))


In [0]:
display(dbutils.fs.ls("abfss://rawhealthdata@healthcareprojectblob.dfs.core.windows.net"))

path,name,size,modificationTime
abfss://rawhealthdata@healthcareprojectblob.dfs.core.windows.net/Patient_records.csv,Patient_records.csv,5110,1725312200000
abfss://rawhealthdata@healthcareprojectblob.dfs.core.windows.net/claims.csv,claims.csv,5766,1725643041000
abfss://rawhealthdata@healthcareprojectblob.dfs.core.windows.net/disease.csv,disease.csv,1489,1725312200000
abfss://rawhealthdata@healthcareprojectblob.dfs.core.windows.net/group.csv,group.csv,4390,1725312200000
abfss://rawhealthdata@healthcareprojectblob.dfs.core.windows.net/hospital.csv,hospital.csv,1328,1725312200000
abfss://rawhealthdata@healthcareprojectblob.dfs.core.windows.net/subgroup.csv,subgroup.csv,561,1725312200000
abfss://rawhealthdata@healthcareprojectblob.dfs.core.windows.net/subscriber.csv,subscriber.csv,12061,1725312201000


In [0]:
# # Mount the Azure Blob Storage container as a DBFS path
# dbutils.fs.mount(
#   source="wasbs://rawhealthdata@healthcareprojectblob.blob.core.windows.net",
#   mount_point="/mnt/rawhealthdata",
#   extra_configs={
#     "fs.azure.account.key.healthcareprojectblob.blob.core.windows.net": "0izVsgjt8yavZcTHY3jKLiEiVe0Nu7jqnlYlYXCi3eAF/WsbNH1eY0Cvvd59kRgOIpunsgWDknLx+ASteulWxw=="
#   }
# )

# # List the files in the mounted DBFS path
# dbutils.fs.ls("/mnt/rawhealthdata")

In [0]:
# Unmount the Azure Blob Storage container
# dbutils.fs.unmount("/mnt/rawhealthdata")

In [0]:
data = spark.read.csv("abfss://rawhealthdata@healthcareprojectblob.dfs.core.windows.net/Patient_records.csv", header=True, inferSchema=True)

In [0]:
display(data)

Patient_id,Patient_name,patient_gender,patient_birth_date,patient_phone,disease_name,city,hospital_id
187158,Harbir,Female,1924-06-30,+91 0112009318,Galactosemia,Rourkela,H1001
112766,Brahmdev,Female,1948-12-20,+91 1727749552,Bladder cancer,Tiruvottiyur,H1016
199252,Ujjawal,Male,1980-04-16,+91 8547451606,Kidney cancer,Berhampur,H1009
133424,Ballari,Female,1969-09-25,+91 0106026841,Suicide,Bihar Sharif,H1017
172579,Devnath,Female,1946-05-01,+91 1868774631,Food allergy,Bidhannagar,H1019
171320,Atasi,Male,1967-10-02,+91 9747336855,Whiplash,Amravati,H1013
107794,Manish,Male,1967-06-06,+91 4354294043,Sunbathing,Panvel,H1004
130339,Aakar,Female,1925-03-05,+91 2777633911,Drug consumption,Bihar Sharif,H1000
110377,Gurudas,Male,1945-05-06,+91 1232859381,Dengue,Kamarhati,H1001
149367,,Male,1925-06-12,+91 1780763280,Head banging,Bangalore,H1013


In [0]:
data.printSchema()

root
 |-- Patient_id: integer (nullable = true)
 |-- Patient_name: string (nullable = true)
 |-- patient_gender: string (nullable = true)
 |-- patient_birth_date: date (nullable = true)
 |-- patient_phone: string (nullable = true)
 |-- disease_name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- hospital_id: string (nullable = true)



In [0]:
# to do - patient name, null values/check for duplicates.

In [0]:
data.columns

['Patient_id',
 'Patient_name',
 'patient_gender',
 'patient_birth_date',
 'patient_phone',
 'disease_name',
 'city',
 'hospital_id']

In [0]:
data.groupby(['Patient_id', 'Patient_name', 'patient_gender', 'patient_birth_date', 'patient_phone', 'disease_name', 'city', 'hospital_id']) \
    .count() \
    .where("count > 1").show(truncate=False) 

+----------+------------+--------------+------------------+-------------+------------+----+-----------+-----+
|Patient_id|Patient_name|patient_gender|patient_birth_date|patient_phone|disease_name|city|hospital_id|count|
+----------+------------+--------------+------------------+-------------+------------+----+-----------+-----+
+----------+------------+--------------+------------------+-------------+------------+----+-----------+-----+



In [0]:
# above output indicates there are no duplicates in any column

In [0]:
# Create a DataFrame with the count of null values for each column
null_counts = data.select(
    [count(when(isnull(col(c)) | col(c).isNull(), c)).alias(c) for c in data.columns]
)

# Display the DataFrame
display(null_counts)

Patient_id,Patient_name,patient_gender,patient_birth_date,patient_phone,disease_name,city,hospital_id
0,17,0,0,2,0,0,0


In [0]:
# We have null values in Patient_name and patient_phone columns
# We can replace null with N/A in patient_phone and Patient_name column

In [0]:
data = data.fillna({"patient_phone" : "NA", "Patient_name" : "Guest/NA"})

In [0]:
# check whether the null values are removed or not.
null_counts = data.select(
    [count(when(isnull(col(c)) | col(c).isNull(), c)).alias(c) for c in data.columns]
)

# Display the DataFrame
display(null_counts)

Patient_id,Patient_name,patient_gender,patient_birth_date,patient_phone,disease_name,city,hospital_id
0,0,0,0,0,0,0,0


In [0]:
# Null values are removed.
# Now we can export/store this cleaned data into stagging container.

In [0]:
data.select("*").filter(col("Patient_id").isin(["134184", "121783"])).show() 

+----------+------------+--------------+------------------+--------------+--------------+--------+-----------+
|Patient_id|Patient_name|patient_gender|patient_birth_date| patient_phone|  disease_name|    city|hospital_id|
+----------+------------+--------------+------------------+--------------+--------------+--------+-----------+
|    134184|     Prakash|        Female|        1923-09-15|+91 9268324471|           Flu|Kottayam|      H1001|
|    121783|     Paridhi|        Female|        1959-03-27|+91 2139280879|Bladder cancer|Jabalpur|      H1013|
+----------+------------+--------------+------------------+--------------+--------------+--------+-----------+



In [0]:
# # Now we need to write this data into stagging area for the next step.
# # boilerplate code (standard code for certain operation)

# Define the output staging path
output_stagging_path = "abfss://stagginglayerhealthdata@healthcareprojectblob.dfs.core.windows.net"

# Write the data to the staging area
data.coalesce(1).write.mode("append").format("com.databricks.spark.csv").option("header", "true").option("format", "csv").save(output_stagging_path)

# List all files in the output staging path
files = dbutils.fs.ls(output_stagging_path)

# Identify part files and non-part files
part_files = [x for x in files if x.name.startswith("part-")]
non_part_files = [x for x in files if x.name.startswith("_")]

# Move the part file to the desired location
if part_files:
    dbutils.fs.mv(part_files[0].path, f"{output_stagging_path}/patientstagging.csv")

# Remove non-part files
for file in non_part_files:
    dbutils.fs.rm(file.path)