In [0]:
import pyspark
from pyspark.sql.functions import *

In [0]:
# List the secret scopes available to this notebook to confirm the scope name.
dbutils.secrets.listScopes()

[SecretScope(name='healthcarescope')]

In [0]:
# List the keys registered in the 'healthcarescope' scope (key metadata only).
dbutils.secrets.list(scope = 'healthcarescope')

[SecretMetadata(key='blobaccesskey')]

In [0]:

# Look up the 'blobaccesskey' entry in the 'healthcarescope' scope.
# NOTE: despite its name, secret_name holds the value returned by the lookup,
# not the name of the secret.
secret_name = dbutils.secrets.get(scope = "healthcarescope", key = "blobaccesskey")

In [0]:
# NOTE(review): this prints the value fetched from the secret scope. The
# recorded output shows [REDACTED], but echoing credentials is best avoided;
# consider removing this cell once the lookup has been verified.
print(secret_name)

[REDACTED]


In [0]:
# Register the storage account key with the Spark session so that abfss://
# paths on the healthcareprojectblob account can be read and written.
storage_key_conf = "fs.azure.account.key.healthcareprojectblob.dfs.core.windows.net"
storage_account_key = dbutils.secrets.get(scope="healthcarescope", key="blobaccesskey")
spark.conf.set(storage_key_conf, storage_account_key)


In [0]:
# Show the contents of the raw data container to confirm storage access works.
raw_container_uri = "abfss://rawhealthdata@healthcareprojectblob.dfs.core.windows.net"
raw_container_files = dbutils.fs.ls(raw_container_uri)
display(raw_container_files)

path,name,size,modificationTime
abfss://rawhealthdata@healthcareprojectblob.dfs.core.windows.net/Patient_records.csv,Patient_records.csv,5110,1725312200000
abfss://rawhealthdata@healthcareprojectblob.dfs.core.windows.net/claims.csv,claims.csv,5766,1725461306000
abfss://rawhealthdata@healthcareprojectblob.dfs.core.windows.net/disease.csv,disease.csv,1489,1725312200000
abfss://rawhealthdata@healthcareprojectblob.dfs.core.windows.net/group.csv,group.csv,4390,1725312200000
abfss://rawhealthdata@healthcareprojectblob.dfs.core.windows.net/hospital.csv,hospital.csv,1328,1725312200000
abfss://rawhealthdata@healthcareprojectblob.dfs.core.windows.net/subgroup.csv,subgroup.csv,561,1725312200000
abfss://rawhealthdata@healthcareprojectblob.dfs.core.windows.net/subscriber.csv,subscriber.csv,12061,1725312201000


In [0]:
# Load group.csv from the raw container; the first row supplies the column names.
group_csv_uri = "abfss://rawhealthdata@healthcareprojectblob.dfs.core.windows.net/group.csv"
data = spark.read.option("header", True).csv(group_csv_uri)

In [0]:
# Render the loaded group DataFrame for a visual sanity check.
display(data)

country,premium_written,zip_code,grp_id,grp_name,grp_type,city
India,72000,482018,GRP101,Life Insurance Corporation of India,Govt.,Mumbai
India,45000,482049,GRP102,HDFC Standard Life Insurance Co. Ltd.,Private,Mumbai
India,64000,482030,GRP103,Max Life Insurance Co. Ltd.,Private,Delhi
India,59000,482028,GRP104,ICICI Prudential Life Insurance Co. Ltd.,Private,Mumbai
India,37000,482014,GRP105,Kotak Mahindra Life Insurance Co. Ltd.,Private,Mumbai
India,89000,482011,GRP106,Aditya Birla Sun Life Insurance Co. Ltd.,Private,Mumbai
India,70000,482006,GRP107,TATA AIG Life Insurance Co. Ltd.,Private,Mumbai
India,52000,482034,GRP108,SBI Life Insurance Co. Ltd.,Private,Mumbai
India,78000,482032,GRP109,Exide Life Insurance Co. Ltd.,Private,Bangalore
India,48000,482015,GRP110,Bajaj Allianz Life Insurance Co. Ltd.,Private,Pune


In [0]:
# TODO: patient name, null values, check for duplicates.
# Show the column names of the loaded DataFrame.
data.columns

['country',
 'premium_written',
 'zip_code',
 'grp_id',
 'grp_name',
 'grp_type',
 'city']

In [0]:
# Look for fully duplicated rows: group on every column and keep only the
# groups that occur more than once.
duplicate_key_columns = [
    'country',
    'premium_written',
    'zip_code',
    'grp_id',
    'grp_name',
    'grp_type',
    'city',
]
duplicate_groups = data.groupBy(*duplicate_key_columns).count().filter(col("count") > 1)
duplicate_groups.show(n=15, truncate=False)

+-------+---------------+--------+------+--------+--------+----+-----+
|country|premium_written|zip_code|grp_id|grp_name|grp_type|city|count|
+-------+---------------+--------+------+--------+--------+----+-----+
+-------+---------------+--------+------+--------+--------+----+-----+



In [0]:
# Keep a single copy of every row (no column subset, so whole rows are compared).
data = data.distinct()

In [0]:
# Count missing values for every column.
# isnan() is only defined for floating-point data, so it is applied to
# float/double columns only. Applying it to every column forces a
# string-to-double cast (which fails under ANSI mode on values such as
# "India") and is rejected outright for date/timestamp columns.
missing_counts = []
for column_name, column_type in data.dtypes:
    is_missing = col(column_name).isNull()
    if column_type in ("float", "double"):
        is_missing = is_missing | isnan(col(column_name))
    # when() yields a non-null literal for missing rows and NULL otherwise,
    # so count() tallies exactly the missing rows.
    missing_counts.append(count(when(is_missing, column_name)).alias(column_name))
data.select(missing_counts).show()

+-------+---------------+--------+------+--------+--------+----+
|country|premium_written|zip_code|grp_id|grp_name|grp_type|city|
+-------+---------------+--------+------+--------+--------+----+
|      0|              0|       0|     0|       0|       0|   0|
+-------+---------------+--------+------+--------+--------+----+



In [0]:
# Write the cleaned group data to the staging layer as a single, predictably
# named CSV file (groupstagging.csv) for the next pipeline step.

# Define the output staging path
output_stagging_path = "abfss://stagginglayerhealthdata@healthcareprojectblob.dfs.core.windows.net"

# Spark always writes a *directory* of part files plus marker files. Writing
# into a scratch directory that belongs only to this dataset (instead of
# appending to the shared container root) keeps re-runs from piling up part
# files and guarantees the part file picked up below is the one just written.
scratch_path = f"{output_stagging_path}/groupstagging_tmp"
final_file_name = "groupstagging.csv"
final_file_path = f"{output_stagging_path}/{final_file_name}"

# coalesce(1) forces a single part file; overwrite makes the cell re-runnable.
(
    data.coalesce(1)
    .write.mode("overwrite")
    .format("csv")
    .option("header", "true")
    .save(scratch_path)
)

# Locate the single part file produced by the write
part_files = [x for x in dbutils.fs.ls(scratch_path) if x.name.startswith("part-")]
if not part_files:
    # Fail loudly: the next step depends on this file existing.
    raise RuntimeError(f"No part file was written to {scratch_path}")

# Replace the file left behind by a previous run, if there is one
if any(f.name == final_file_name for f in dbutils.fs.ls(output_stagging_path)):
    dbutils.fs.rm(final_file_path)

# Move the part file to the desired location
dbutils.fs.mv(part_files[0].path, final_file_path)

# Remove the scratch directory together with Spark's marker files
# (_SUCCESS, _committed_*, ...); nothing else in the container is touched.
dbutils.fs.rm(scratch_path, recurse=True)