In [0]:
import pyspark
from pyspark.sql.functions import *

In [0]:
dbutils.secrets.listScopes()

[SecretScope(name='healthcarescope')]

In [0]:
dbutils.secrets.list(scope = 'healthcarescope')

[SecretMetadata(key='blobaccesskey')]

In [0]:

secret_name = dbutils.secrets.get(scope = "healthcarescope", key = "blobaccesskey")

In [0]:
print(secret_name)

[REDACTED]


# I have used access key token to connect to the container.

In [0]:
spark.conf.set(
    "fs.azure.account.key.healthcareprojectblob.dfs.core.windows.net",
    dbutils.secrets.get(scope = "healthcarescope", key = "blobaccesskey"))


In [0]:
display(dbutils.fs.ls("abfss://rawhealthdata@healthcareprojectblob.dfs.core.windows.net"))

path,name,size,modificationTime
abfss://rawhealthdata@healthcareprojectblob.dfs.core.windows.net/Patient_records.csv,Patient_records.csv,5110,1725312200000
abfss://rawhealthdata@healthcareprojectblob.dfs.core.windows.net/claims.csv,claims.csv,5766,1725646846000
abfss://rawhealthdata@healthcareprojectblob.dfs.core.windows.net/disease.csv,disease.csv,1489,1725312200000
abfss://rawhealthdata@healthcareprojectblob.dfs.core.windows.net/group.csv,group.csv,4390,1725312200000
abfss://rawhealthdata@healthcareprojectblob.dfs.core.windows.net/hospital.csv,hospital.csv,1328,1725312200000
abfss://rawhealthdata@healthcareprojectblob.dfs.core.windows.net/subgroup.csv,subgroup.csv,561,1725312200000
abfss://rawhealthdata@healthcareprojectblob.dfs.core.windows.net/subscriber.csv,subscriber.csv,12061,1725312201000


In [0]:
# Importing the CSV file.
data = spark.read.csv("abfss://rawhealthdata@healthcareprojectblob.dfs.core.windows.net/subgroup.csv", header=True, inferSchema=True)

In [0]:
display(data)

subgrp_sk,subgrp_name,monthly_premium,subgrp_id
S101,Deficiency Diseases,3000,"GRP101,GRP105"
S102,Accident,1000,"GRP110,GRP150,GRP136"
S103,Physiology,2000,"GRP122,GRP108,GRP138,GRP148"
S104,Therapy,1500,"GRP103,GRP113,GRP123,GRP133,GRP143"
S105,Allergies,2300,"GRP153,GRP104,GRP114,GRP124"
S106,Self inflicted,1200,"GRP117,GRP127,GRP137,GRP147,GRP157"
S107,Cancer,3200,"GRP151,GRP131,GRP141,GRP121"
S108,Infectious disease,1500,"GRP130,GRP104,GRP109"
S109,Hereditary,2000,"GRP102,GRP112,GRP132,GRP142,GRP152"
S110,Viral,1000,"GRP143,GRP147,GRP126"


In [0]:
# check null value for subgrp_sk
data.filter("subgrp_sk IS NULL").count()

0

In [0]:
# OR
null_count = data.filter(data["subgrp_sk"].isNull()).show()
print(null_count)

+---------+-----------+---------------+---------+
|subgrp_sk|subgrp_name|monthly_premium|subgrp_id|
+---------+-----------+---------------+---------+
+---------+-----------+---------------+---------+

None


In [0]:
# check null value for all the columns
data.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in data.columns]).show()

+---------+-----------+---------------+---------+
|subgrp_sk|subgrp_name|monthly_premium|subgrp_id|
+---------+-----------+---------------+---------+
|        0|          0|              0|        0|
+---------+-----------+---------------+---------+



In [0]:
# Check the duplicates
data.groupby(['subgrp_sk','subgrp_name','monthly_premium','subgrp_id']).count().where("count > 1").show(15, False)

+---------+-----------+---------------+---------+-----+
|subgrp_sk|subgrp_name|monthly_premium|subgrp_id|count|
+---------+-----------+---------------+---------+-----+
+---------+-----------+---------------+---------+-----+



In [0]:
# To drop duplicates
data = data.dropDuplicates()

In [0]:
# data.show(5, False)

In [0]:
# Let's create seperate rows for each values in subgrp_id.
data = data.withColumn("subgrp_id", explode(split(data.subgrp_id, ",")))

In [0]:
data.select("*").filter(col("subgrp_sk").isin(["S107", "S110"])).show()

+---------+-----------+---------------+---------+
|subgrp_sk|subgrp_name|monthly_premium|subgrp_id|
+---------+-----------+---------------+---------+
|     S107|     Cancer|           3200|   GRP151|
|     S107|     Cancer|           3200|   GRP131|
|     S107|     Cancer|           3200|   GRP141|
|     S107|     Cancer|           3200|   GRP121|
|     S110|      Viral|           1000|   GRP143|
|     S110|      Viral|           1000|   GRP147|
|     S110|      Viral|           1000|   GRP126|
+---------+-----------+---------------+---------+



In [0]:
# Explode --> transforms each value in a list into a separate row.
# data.show(5, False)

In [0]:
# data.count(), len(data.columns)

In [0]:
# # Now we need to write this data into stagging area for the next step.
# # boilerplate code (standard code for certain operation)

# Define the output staging path
output_stagging_path = "abfss://stagginglayerhealthdata@healthcareprojectblob.dfs.core.windows.net"

# Write the data to the staging area
data.coalesce(1).write.mode("append").format("com.databricks.spark.csv").option("header", "true").option("format", "csv").save(output_stagging_path)

# List all files in the output staging path
files = dbutils.fs.ls(output_stagging_path)

# Identify part files and non-part files
part_files = [x for x in files if x.name.startswith("part-")]
non_part_files = [x for x in files if x.name.startswith("_")]

# Move the part file to the desired location
dbutils.fs.mv(part_files[0].path, f"{output_stagging_path}/subgroupstagging.csv")

# Remove non-part files
for file in non_part_files:
    dbutils.fs.rm(file.path)