# Hospital Readmission Data Analysis
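This notebook uses PySpark to load raw patient records from Azure Data Lake Storage (ADLS), clean them, write the cleaned data back to ADLS as Parquet, and run exploratory aggregations on hospital readmissions. The `abfss://` paths below contain `<container>` and `<storage_account>` placeholders that must be replaced before running.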

In [None]:
# Create (or reuse) the Spark session that every later cell relies on
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("HospitalReadmissionAnalysis")
    .getOrCreate()
)

In [None]:
# Read the raw patient CSV from ADLS, using the first row as column names.
# The <container> / <storage_account> placeholders must be replaced with real values.
raw_path = "abfss://<container>@<storage_account>.dfs.core.windows.net/hospital-readmission/raw/patient_data.csv"
df_raw = spark.read.csv(raw_path, header=True)

# Quick sanity check: print the column layout, then preview the first five rows
df_raw.printSchema()
df_raw.show(n=5)

In [None]:
# Data cleaning: remove incomplete rows, then cast numeric columns to integers
from pyspark.sql.functions import col

df_clean = (
    df_raw
    # Keep only rows where every key column is non-null
    .dropna(subset=["readmitted", "age", "gender", "diag_1"])
    # Give the numeric columns an integer type
    .withColumn("num_lab_procedures", col("num_lab_procedures").cast("int"))
    .withColumn("time_in_hospital", col("time_in_hospital").cast("int"))
)

In [None]:
# Persist the cleaned DataFrame to ADLS as Parquet, replacing any existing output.
# The <container> / <storage_account> placeholders must be replaced with real values.
processed_path = "abfss://<container>@<storage_account>.dfs.core.windows.net/hospital-readmission/processed/cleaned_data.parquet"
df_clean.write.parquet(processed_path, mode="overwrite")

In [None]:
# Exploratory analysis: readmission rate by age group.
# For every age group, count the rows per `readmitted` value and express each
# count as a share of that age group's total. No assumption is made about how
# `readmitted` is encoded; every distinct value gets its own row.
from pyspark.sql import Window
from pyspark.sql import functions as F

# Window covering all rows of the same age group, used to get the group total
age_group_window = Window.partitionBy("age")

readmission_by_age = (
    df_clean
    .groupBy("age", "readmitted")
    .count()
    # rate = rows with this readmitted value / all rows in the age group
    .withColumn(
        "rate",
        F.round(F.col("count") / F.sum("count").over(age_group_window), 4),
    )
    .orderBy("age", "readmitted")
)

# NOTE: show() prints at most 20 rows by default; pass a larger n if the
# number of (age, readmitted) combinations exceeds that.
readmission_by_age.show()

In [None]:
# Readmission by diagnosis: the ten most frequent (diag_1, readmitted)
# combinations, ranked by record count (counts, not rates)
(
    df_clean
    .groupBy("diag_1", "readmitted")
    .count()
    .orderBy(col("count").desc())
    .show(n=10)
)