In [1]:
import pathlib

import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
from pandas.plotting import scatter_matrix
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Obtain the Spark session for this notebook; getOrCreate() hands back an
# already-running session when there is one instead of starting a second.
spark = (
    SparkSession.builder
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/09 16:21:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Inputs

In [2]:
# Location of the training data, relative to the notebook's working directory.
train_path = pathlib.Path("healthcare/train_data.csv")

# Treat the first line as column names and let Spark infer each column's type
# from the data rather than reading everything as strings.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(str(train_path))
)

# Peek at the first three records.
df.head(3)

[Row(case_id=1, Hospital_code=8, Hospital_type_code='c', City_Code_Hospital=3, Hospital_region_code='Z', Available Extra Rooms in Hospital=3, Department='radiotherapy', Ward_Type='R', Ward_Facility_Code='F', Bed Grade=2.0, patientid=31397, City_Code_Patient=7.0, Type of Admission='Emergency', Severity of Illness='Extreme', Visitors with Patient=2, Age='51-60', Admission_Deposit=4911.0, Stay='0-10'),
 Row(case_id=2, Hospital_code=2, Hospital_type_code='c', City_Code_Hospital=5, Hospital_region_code='Z', Available Extra Rooms in Hospital=2, Department='radiotherapy', Ward_Type='S', Ward_Facility_Code='F', Bed Grade=2.0, patientid=31397, City_Code_Patient=7.0, Type of Admission='Trauma', Severity of Illness='Extreme', Visitors with Patient=2, Age='51-60', Admission_Deposit=5954.0, Stay='41-50'),
 Row(case_id=3, Hospital_code=10, Hospital_type_code='e', City_Code_Hospital=1, Hospital_region_code='X', Available Extra Rooms in Hospital=2, Department='anesthesia', Ward_Type='S', Ward_Facility

# Outputs

In [3]:
# Root directory for figures.
figure_path = pathlib.Path("figures")

# Sub-directory under the figure root; create it (and any missing parents) up
# front, and do nothing if it already exists.
univariate_figure_path = figure_path.joinpath("univariate_figure_path")
univariate_figure_path.mkdir(exist_ok=True, parents=True)

In [4]:
# List the column names of the training frame under a short heading.
columns_message = f"Training data columns:\n{df.columns}"
print(columns_message)

Training data columns:
['case_id', 'Hospital_code', 'Hospital_type_code', 'City_Code_Hospital', 'Hospital_region_code', 'Available Extra Rooms in Hospital', 'Department', 'Ward_Type', 'Ward_Facility_Code', 'Bed Grade', 'patientid', 'City_Code_Patient', 'Type of Admission', 'Severity of Illness', 'Visitors with Patient', 'Age', 'Admission_Deposit', 'Stay']


# Null Count Distributions

In [5]:
# One 0/1 indicator expression per column: 1 where that field is null.
null_indicator_cols = [
    F.when(F.col(name).isNull(), 1).otherwise(0) for name in df.columns
]

# Python's built-in sum() (not F.sum) adds the indicator expressions together,
# giving a single column holding the number of null fields in each row.
null_count_per_row = df.select(sum(null_indicator_cols).alias("null_count"))

In [6]:
# How many rows have 0 nulls, 1 null, 2 nulls, ... ordered by null count.
number_distinct_null_counts = (
    null_count_per_row
    .groupBy("null_count")
    .count()
    .orderBy("null_count")
)

# Show at most the first five buckets.
number_distinct_null_counts.head(5)

[Row(null_count=0, count=313793), Row(null_count=1, count=4645)]

In [7]:
# Single-row aggregate: for every column, the total number of null values,
# reported under the column's own name.
null_count_per_column = df.select(
    *(
        F.sum(F.when(F.col(name).isNull(), 1).otherwise(0)).alias(name)
        for name in df.columns
    )
)

In [8]:
# Display the per-column null totals as a single Row.
null_count_per_column.first()

Row(case_id=0, Hospital_code=0, Hospital_type_code=0, City_Code_Hospital=0, Hospital_region_code=0, Available Extra Rooms in Hospital=0, Department=0, Ward_Type=0, Ward_Facility_Code=0, Bed Grade=113, patientid=0, City_Code_Patient=4532, Type of Admission=0, Severity of Illness=0, Visitors with Patient=0, Age=0, Admission_Deposit=0, Stay=0)

In [9]:
# `null_count_per_column` is a lazy aggregate, so each action on it
# re-runs the aggregation over the source data.  Fetch its single result row
# once rather than launching a separate filter/count job for every column.
null_counts_row = null_count_per_column.first()

# Names of the columns that contain at least one null value.
# A count of None (what F.sum yields when there are no input rows) is treated
# as "no nulls", matching what a `> 0` filter would have kept.
cols_with_nulls = [
    c
    for c, n_nulls in zip(null_count_per_column.columns, null_counts_row or ())
    if n_nulls is not None and n_nulls > 0
]

In [10]:
# Show which columns contain at least one null value.
print(cols_with_nulls)

['Bed Grade', 'City_Code_Patient']


In [11]:
# Share of columns that contain at least one null, expressed as a percentage
# (despite the "prop" in the variable name, the value is on a 0-100 scale).
n_cols_with_nulls = len(cols_with_nulls)
n_cols_total = len(df.columns)
prop_null_cols = 100 * n_cols_with_nulls / n_cols_total

print(f"Percentage of columns with nulls = {prop_null_cols:.1f}%")

Percentage of columns with nulls = 11.1%
