In [72]:
import os
import platform
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
import torch
import yaml
from matplotlib import pyplot as plt
from omegaconf import OmegaConf
from PIL import Image
from sklearn.preprocessing import StandardScaler

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Plot appearance: paper style sheet, "husl" palette, 300-dpi figures, compact fonts
plt.style.use("seaborn-v0_8-paper")
sns.set_palette("husl")
plt.rcParams.update({
    'figure.dpi': 300,
    'savefig.dpi': 300,
    'font.size': 10,
    'axes.labelsize': 11,
    'axes.titlesize': 12,
    'xtick.labelsize': 9,
    'ytick.labelsize': 9,
    'legend.fontsize': 9,
})

# Seed NumPy's global random state for reproducibility
np.random.seed(1)

# Suppress warnings
import warnings


def warn(*_args, **_kwargs):
    """Do-nothing stand-in for ``warnings.warn``: accepts any arguments and discards them."""
    return None


# Swap out the module-level emitter, then add a catch-all "ignore" filter on top
warnings.warn = warn
warnings.filterwarnings("ignore")

# Get the existing Spark session or create one, named for this EDA
spark = (
    SparkSession.builder
    .appName("CheXpert-EDA")
    .getOrCreate()
)

# Report interpreter and library versions alongside the results.
# `torch` is imported with the other dependencies at the top of this cell;
# without that import this line raises NameError on a fresh kernel.
print(f"Python version: {platform.python_version()}")
print(f"Torch version: {torch.__version__}")

Python version: 3.10.19
Torch version: 2.2.1+cu121


#### Data Extraction
##### Data Path `../data/raw/CheXpert`

In [76]:
class LoadDataAssests:
    """Resolve the CheXpert CSV and image-data locations from a config file.

    The loaded config is kept on ``self.config``; the two paths are read from
    ``data.raw.CheXpert.csv_path`` and ``data.raw.CheXpert.img_data`` once,
    at construction time.
    """

    def __init__(self, CONFIG_PATH: str = "../configs/EDA.yml"):
        # Load the config first; both paths below are looked up in it.
        self.config = self.load_config_file(CONFIG_PATH)

        chexpert_cfg = self.config.data.raw.CheXpert
        self.csv_file_path = chexpert_cfg.csv_path
        self.img_data_path = chexpert_cfg.img_data

    def load_config_file(self, PATH: str):
        """Load the config at ``PATH`` with ``OmegaConf.load`` and return it."""
        loaded = OmegaConf.load(PATH)
        return loaded

    def get_csv_path(self):
        """Return the CSV file path read from the config."""
        return self.csv_file_path

    def get_img_data_path(self):
        """Return the image-data path read from the config."""
        return self.img_data_path

# Resolve dataset locations from the default config file
CheXpert_data = LoadDataAssests()

# Load train CSV file: first row is the header, column types are inferred
df_train = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(CheXpert_data.get_csv_path())
)

In [78]:
# Preview the first five rows of the training table
df_train.show(n=5)

+--------------------+------+---+---------------+-----+----------+--------------------------+------------+------------+-----------+-----+-------------+---------+-----------+------------+----------------+-------------+--------+---------------+
|                Path|   Sex|Age|Frontal/Lateral|AP/PA|No Finding|Enlarged Cardiomediastinum|Cardiomegaly|Lung Opacity|Lung Lesion|Edema|Consolidation|Pneumonia|Atelectasis|Pneumothorax|Pleural Effusion|Pleural Other|Fracture|Support Devices|
+--------------------+------+---+---------------+-----+----------+--------------------------+------------+------------+-----------+-----+-------------+---------+-----------+------------+----------------+-------------+--------+---------------+
|CheXpert-v1.0-sma...|Female| 68|        Frontal|   AP|       1.0|                      NULL|        NULL|        NULL|       NULL| NULL|         NULL|     NULL|       NULL|         0.0|            NULL|         NULL|    NULL|            1.0|
|CheXpert-v1.0-sma...|Female

In [79]:
# Summary statistics of the training table
train_summary = df_train.describe()
train_summary.show()

26/02/18 16:24:47 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 11:>                                                         (0 + 1) / 1]

+-------+--------------------+-------+------------------+---------------+------+----------+--------------------------+-------------------+------------------+------------------+------------------+--------------------+--------------------+--------------------+-------------------+------------------+-------------------+------------------+------------------+
|summary|                Path|    Sex|               Age|Frontal/Lateral| AP/PA|No Finding|Enlarged Cardiomediastinum|       Cardiomegaly|      Lung Opacity|       Lung Lesion|             Edema|       Consolidation|           Pneumonia|         Atelectasis|       Pneumothorax|  Pleural Effusion|      Pleural Other|          Fracture|   Support Devices|
+-------+--------------------+-------+------------------+---------------+------+----------+--------------------------+-------------------+------------------+------------------+------------------+--------------------+--------------------+--------------------+-------------------+----------

                                                                                