In [143]:
import os
import sys
import yaml
import platform
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

from PIL import Image
import cv2
from pathlib import Path
from omegaconf import OmegaConf
from sklearn.preprocessing import StandardScaler

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from functools import reduce
from operator import or_

# Set plotting style
plt.style.use("seaborn-v0_8-paper")
sns.set_palette("husl")
plt.rcParams['figure.dpi'] = 300
plt.rcParams['savefig.dpi'] = 300
plt.rcParams['font.size'] = 10
plt.rcParams['axes.labelsize'] = 11
plt.rcParams['axes.titlesize'] = 12
plt.rcParams['xtick.labelsize'] = 9
plt.rcParams['ytick.labelsize'] = 9
plt.rcParams['legend.fontsize'] = 9

# Set random seed for reproducibility
np.random.seed(1)

# Suppress warnings
def warn(*args, **kwargs):
    pass

import warnings
warnings.warn = warn
warnings.filterwarnings('ignore')

spark = ( SparkSession.builder
            .appName("CheXpert-EDA")
            .getOrCreate()
    )

print("Python version: {}".format(platform.python_version()))
print("Torch version: {}".format(torch.__version__))
print("OpenCV version %s" %(cv2.__version__))

Python version: 3.10.19
Torch version: 2.2.1+cu121
OpenCV version 4.13.0


### What is CheXpert?
CheXpert is a large dataset of `chest X-rays` and competition for automated chext x-ray interpretation, which features uncertainty and radiologist-labeled reference standard evaluation sets.

### Why CheXpert?
Chest radiography is the most common imaging examination globally, critical for `screening`, `diagnosis`, and `management of many life threatening diseases`. Automated chest radiograph interpretation at the level of practicing radiologists could provide substantial benefit in many medical settings, from improved workflow prioritization and clinical decision support to large-scale screening and global population health initiatives. For progress in both development and validation of automated algorithms, we realized there was a need for a labeled dataset that (1) was large, (2) had strong reference standards, and (3) provided expert human performance metrics for comparison (`stanfordmlgroup.github.io/competitions/chexpert`)

In [86]:
"""
Load raw data for EDA
"""
class LoadDataAssests:
    """Loads both the CSV and the associated img data"""
    def __init__(self, CONFIG_PATH: str = "../configs/EDA.yml"):

        self.config = self.load_config_file(CONFIG_PATH)
        # Accessing Paths
        self.csv_file_path = self.config.data.raw.CheXpert.csv_train
        self.img_data_path = self.config.data.raw.CheXpert.img_data

    def load_config_file(self, PATH: str):
        """ Load config file."""
        return OmegaConf.load(PATH)
    
    def get_csv_path(self):
        """Get CSV file path"""
        return self.csv_file_path

    def get_img_data_path(self):
        """Get Image data file path"""
        return self.img_data_path

CheXpert_data = LoadDataAssests()

# Load train CSV file
df_train = spark.read.csv(
    CheXpert_data.get_csv_path(),
    header=True,
    inferSchema=True
)

In [90]:
# Show sample data
df_train.show(5, truncate=False)

+---------------------------------------------------------------+------+---+---------------+-----+----------+--------------------------+------------+------------+-----------+-----+-------------+---------+-----------+------------+----------------+-------------+--------+---------------+
|Path                                                           |Sex   |Age|Frontal/Lateral|AP/PA|No Finding|Enlarged Cardiomediastinum|Cardiomegaly|Lung Opacity|Lung Lesion|Edema|Consolidation|Pneumonia|Atelectasis|Pneumothorax|Pleural Effusion|Pleural Other|Fracture|Support Devices|
+---------------------------------------------------------------+------+---+---------------+-----+----------+--------------------------+------------+------------+-----------+-----+-------------+---------+-----------+------------+----------------+-------------+--------+---------------+
|CheXpert-v1.0-small/train/patient00001/study1/view1_frontal.jpg|Female|68 |Frontal        |AP   |1.0       |NULL                      |NULL  

In [89]:
# Summary Statistics
df_train.describe().show(5, truncate=False)

[Stage 21:>                                                         (0 + 6) / 6]

+-------+---------------------------------------------------------------+-------+------------------+---------------+------+----------+--------------------------+-------------------+------------------+------------------+------------------+--------------------+--------------------+---------------------+-------------------+------------------+-------------------+------------------+------------------+
|summary|Path                                                           |Sex    |Age               |Frontal/Lateral|AP/PA |No Finding|Enlarged Cardiomediastinum|Cardiomegaly       |Lung Opacity      |Lung Lesion       |Edema             |Consolidation       |Pneumonia           |Atelectasis          |Pneumothorax       |Pleural Effusion  |Pleural Other      |Fracture          |Support Devices   |
+-------+---------------------------------------------------------------+-------+------------------+---------------+------+----------+--------------------------+-------------------+------------------+

                                                                                

In [80]:
# Train data columns
df_train.printSchema()

root
 |-- Path: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Frontal/Lateral: string (nullable = true)
 |-- AP/PA: string (nullable = true)
 |-- No Finding: double (nullable = true)
 |-- Enlarged Cardiomediastinum: double (nullable = true)
 |-- Cardiomegaly: double (nullable = true)
 |-- Lung Opacity: double (nullable = true)
 |-- Lung Lesion: double (nullable = true)
 |-- Edema: double (nullable = true)
 |-- Consolidation: double (nullable = true)
 |-- Pneumonia: double (nullable = true)
 |-- Atelectasis: double (nullable = true)
 |-- Pneumothorax: double (nullable = true)
 |-- Pleural Effusion: double (nullable = true)
 |-- Pleural Other: double (nullable = true)
 |-- Fracture: double (nullable = true)
 |-- Support Devices: double (nullable = true)



In [92]:
# Count entries in the dataset
df_train.count()

223414

In [None]:
# Null counts per column
null_counts = df_train.select([
    F.count(F.when(F.col(c).isNull(), c)).alias(c)
    for c in df_train.columns
]).show(truncate=False)

+----+---+---+---------------+-----+----------+--------------------------+------------+------------+-----------+------+-------------+---------+-----------+------------+----------------+-------------+--------+---------------+
|Path|Sex|Age|Frontal/Lateral|AP/PA|No Finding|Enlarged Cardiomediastinum|Cardiomegaly|Lung Opacity|Lung Lesion|Edema |Consolidation|Pneumonia|Atelectasis|Pneumothorax|Pleural Effusion|Pleural Other|Fracture|Support Devices|
+----+---+---+---------------+-----+----------+--------------------------+------------+------------+-----------+------+-------------+---------+-----------+------------+----------------+-------------+--------+---------------+
|0   |0  |0  |0              |32387|201033    |178575                    |177211      |105636      |211470     |137458|152792       |195806   |154971     |144480      |90203           |216922       |211220  |100197         |
+----+---+---+---------------+-----+----------+--------------------------+------------+------------+

In [105]:
# Distinct Values for Categorical columns
for c in ["Sex", "Frontal/Lateral", "AP/PA"]:
    df_train.select(c).distinct().show(truncate=False)

+-------+
|Sex    |
+-------+
|Female |
|Male   |
|Unknown|
+-------+

+---------------+
|Frontal/Lateral|
+---------------+
|Lateral        |
|Frontal        |
+---------------+

+-----+
|AP/PA|
+-----+
|LL   |
|PA   |
|AP   |
|RL   |
|NULL |
+-----+



#### Label semantics exploration
CheXpert labels are:
+ 1.0 -> `positive`
+ 0.0 -> `negative`
+ -1.0 -> `uncertain`
+ null -> `not mentioned`

In [108]:
# Frequency distribution per label
## Define label columns

label_cols = [
    "No Finding", "Enlarged Cardiomediastinum", "Cardiomegaly", "Lung Opacity", "Lung Lesion", "Edema", "Consolidation",
    "Pneumonia", "Atelectasis", "Pneumothorax", "Pleural Effusion", "Pleural Other", "Fracture", "Support Devices"
]

for label in label_cols:
    print("\n{}".format(label))
    df_train.groupBy(label).count().orderBy(label).show(truncate=False)


No Finding
+----------+------+
|No Finding|count |
+----------+------+
|NULL      |201033|
|1.0       |22381 |
+----------+------+


Enlarged Cardiomediastinum
+--------------------------+------+
|Enlarged Cardiomediastinum|count |
+--------------------------+------+
|NULL                      |178575|
|-1.0                      |12403 |
|0.0                       |21638 |
|1.0                       |10798 |
+--------------------------+------+


Cardiomegaly
+------------+------+
|Cardiomegaly|count |
+------------+------+
|NULL        |177211|
|-1.0        |8087  |
|0.0         |11116 |
|1.0         |27000 |
+------------+------+


Lung Opacity
+------------+------+
|Lung Opacity|count |
+------------+------+
|NULL        |105636|
|-1.0        |5598  |
|0.0         |6599  |
|1.0         |105581|
+------------+------+


Lung Lesion
+-----------+------+
|Lung Lesion|count |
+-----------+------+
|NULL       |211470|
|-1.0       |1488  |
|0.0        |1270  |
|1.0        |9186  |
+-------

In [110]:
# Positive Prevalence %
total = df_train.count()

for label in label_cols:
    positive = df_train.filter(F.col(label) == 1.0).count()
    print(label, positive / total)

No Finding 0.10017724941140663
Enlarged Cardiomediastinum 0.048331796574968444
Cardiomegaly 0.12085187141360881
Lung Opacity 0.47258005317482343
Lung Lesion 0.041116492252052245
Edema 0.2338528471805706
Consolidation 0.0661686375965696
Pneumonia 0.027030535239510507
Atelectasis 0.14939081704817067
Pneumothorax 0.08704915537969868
Pleural Effusion 0.38577260153795195
Pleural Other 0.01576893122185718
Fracture 0.040462996947371245
Support Devices 0.5192199235500013


In [112]:
# Uncertainty ratio per label
for label in label_cols:
    uncertain = df_train.filter(F.col(label) == -1.0).count()
    print(label, uncertain / total)

No Finding 0.0
Enlarged Cardiomediastinum 0.05551576893122186
Cardiomegaly 0.03619737348599461
Lung Opacity 0.025056621339754895
Lung Lesion 0.006660280913461108
Edema 0.058116322164233215
Consolidation 0.12417305987986428
Pneumonia 0.08401443060864583
Atelectasis 0.15101560331939806
Pneumothorax 0.014077005022066656
Pleural Effusion 0.05204687262212753
Pleural Other 0.011874815365196452
Fracture 0.002873588942501365
Support Devices 0.0048295988613068115


In [114]:
# Multi label density per image
positive_sum_expr = " + ".join([
    f"IF(`{l}`= 1.0, 1, 0)" for l in label_cols 
])

df_train = df_train.withColumn(
    "num_positive", F.expr(positive_sum_expr)
)

df_train.groupBy("num_positive").count()

DataFrame[num_positive: int, count: bigint]

In [118]:
# Co-occurrence (Edema & Cardiomegaly)
df_train.filter(
    (F.col("Edema") == 1.0) & (F.col("Cardiomegaly") == 1.0)
).count()

11659

Demographic Confounders

In [123]:
df_train.groupBy("Age").count().orderBy("Age").show(truncate=False)

+---+-----+
|Age|count|
+---+-----+
|0  |3    |
|18 |766  |
|19 |1167 |
|20 |1287 |
|21 |1279 |
|22 |1370 |
|23 |1229 |
|24 |1388 |
|25 |1326 |
|26 |1338 |
|27 |1535 |
|28 |1344 |
|29 |1312 |
|30 |1637 |
|31 |1493 |
|32 |1327 |
|33 |1400 |
|34 |1415 |
|35 |1534 |
|36 |1549 |
+---+-----+
only showing top 20 rows


In [124]:
df_train.select("Age").describe().show()

+-------+------------------+
|summary|               Age|
+-------+------------------+
|  count|            223414|
|   mean|60.430653405784774|
| stddev|  17.8209246640007|
|    min|                 0|
|    max|                90|
+-------+------------------+



In [129]:
# Disease prevalence by sex
df_train.groupBy("Sex").agg(
    F.sum(F.when(F.col("Pneumothorax") == 1.0, 1).otherwise(0)).alias("positive")
).show(truncate=False)

+-------+--------+
|Sex    |positive|
+-------+--------+
|Female |7503    |
|Male   |11945   |
|Unknown|0       |
+-------+--------+



In [131]:
# Distribution of views
df_train.groupBy("Frontal/Lateral").count().show()
df_train.groupBy("AP/PA").count().show()

+---------------+------+
|Frontal/Lateral| count|
+---------------+------+
|        Lateral| 32387|
|        Frontal|191027|
+---------------+------+

+-----+------+
|AP/PA| count|
+-----+------+
| NULL| 32387|
|   LL|    16|
|   PA| 29420|
|   AP|161590|
|   RL|     1|
+-----+------+



In [132]:
# Disease prevalence by view
df_train.groupBy("AP/PA").agg(
    F.sum(F.when(F.col("Edema") == 1.0, 1).otherwise(0)).alias("edema_pos")
).show()

+-----+---------+
|AP/PA|edema_pos|
+-----+---------+
| NULL|     2571|
|   LL|        1|
|   PA|     1709|
|   AP|    47965|
|   RL|        0|
+-----+---------+



In [145]:
""" No finding `Semantics Check` """

# Create a list of boolean expressions
other_labels = [(F.col(l) == 1.0) for l in label_cols if l != "No Finding"]

df_train.filter(
    (F.col("No Finding") == 1.0) & 
    reduce(or_, other_labels)
).count()

8808

In [150]:
# Path column inspection
df_train = df_train.withColumn("patient_id", F.split(F.col("Path"), "/").getItem(1))
df_train.groupBy("patient_id").count().orderBy("count", ascending=False).show()

+----------+------+
|patient_id| count|
+----------+------+
|     train|223414|
+----------+------+

