In [None]:
# # data download

# import kagglehub

# # Download latest version
# covid_path = kagglehub.dataset_download("andyczhao/covidx-cxr2")

In [4]:
import pandas as pd

train_df = pd.read_csv("../../../.cache/kagglehub/datasets/andyczhao/covidx-cxr2/versions/9/train.txt", header=None, names=["Patient ID", "Path", "Label", "Data Source"], sep=" ")
val_df = pd.read_csv("../../../.cache/kagglehub/datasets/andyczhao/covidx-cxr2/versions/9/val.txt", header=None, names=["Patient ID", "Path", "Label", "Data Source"], sep=" ")
test_df = pd.read_csv("../../../.cache/kagglehub/datasets/andyczhao/covidx-cxr2/versions/9/test.txt", header=None, names=["Patient ID", "Path", "Label", "Data Source"], sep=" ")

for df in [train_df, val_df, test_df]:
    # for Label column, 1 if positive, 0 if negative
    df["Label"] = df["Label"].apply(lambda x: 1 if x == "positive" else 0)

train_df.to_csv("../data/covid_train_split.csv", index=False)
val_df.to_csv("../data/covid_val_split.csv", index=False)
test_df.to_csv("../data/covid_test_split.csv", index=False)

In [6]:
import pandas as pd
import numpy as np

def analyze_dataframe(df, path_col="Image Index"):
    print("========== BASIC INFO ==========")
    print("Total samples:", len(df))
    print("Columns:", list(df.columns))
    print()

    if path_col in df.columns:
        print("========== PATH COLUMN CHECK ==========")
        print("Missing paths:", df[path_col].isna().sum())
        print("Unique paths:", df[path_col].nunique())
        print("Example path:", df[path_col].iloc[0])
        print()

    # Identify label columns (everything except path)
    label_cols = [c for c in df.columns if c != path_col]
    print("Label columns:", label_cols)
    print()

    print("========== LABEL SUMMARY ==========")
    print(df[label_cols].describe(include="all"))
    print()

    print("========== NULL VALUES ==========")
    print(df[label_cols].isna().sum())
    print()

    print("========== VALUE COUNTS PER LABEL ==========")
    for col in label_cols:
        print(f"\n--- {col} ---")
        counts = df[col].value_counts(dropna=False)
        print(counts)
        print("Positives (%):", round((df[col] == 1).mean() * 100, 3))
        print("Negatives (%):", round((df[col] == 0).mean() * 100, 3))
        if (-1 in counts.index):
            print("Uncertain (%):", round((df[col] == -1).mean() * 100, 3))
    print()

    print("========== CLASS IMBALANCE RATIO ==========")
    imbalance = (df[label_cols] == 1).sum() / (df[label_cols] == 0).sum()
    print(imbalance)
    print()

    print("========== TOTAL POSITIVES PER LABEL ==========")
    print((df[label_cols] == 1).sum())
    print()

    print("========== TOTAL NEGATIVES PER LABEL ==========")
    print((df[label_cols] == 0).sum())
    print()

    if (df[label_cols] == -1).any().any():
        print("========== TOTAL UNCERTAIN (-1) PER LABEL ==========")
        print((df[label_cols] == -1).sum())
        print()

    print("========== MULTI-LABEL STATS ==========")
    df["num_labels_positive"] = (df[label_cols] == 1).sum(axis=1)
    print(df["num_labels_positive"].describe())
    print("Images with ≥1 positive label:", (df["num_labels_positive"] >= 1).sum())
    print("Images with 0 positive labels:", (df["num_labels_positive"] == 0).sum())
    print()

    print("========== CO-OCCURRENCE: LABELS PER IMAGE ==========")
    print("Histogram of positive label counts:")
    print(df["num_labels_positive"].value_counts().sort_index())
    print()

    print("========== SAMPLE ROWS ==========")
    print(df.head(5))
    print()

    print("========== DONE ==========")
    
analyze_dataframe(train_df, path_col="Path")

Total samples: 67863
Columns: ['Patient ID', 'Path', 'Label', 'Data Source', 'num_labels_positive']

Missing paths: 0
Unique paths: 67863
Example path: 1e64990d1b40c1758a2aaa9c7f7a85_jumbo.jpeg

Label columns: ['Patient ID', 'Label', 'Data Source', 'num_labels_positive']

       Patient ID         Label Data Source  num_labels_positive
count       67863  67863.000000       67863         67863.000000
unique      35457           NaN           8                  NaN
top       A860070           NaN       bimcv                  NaN
freq          236           NaN       43142                  NaN
mean          NaN      0.842860         NaN             0.842860
std           NaN      0.363936         NaN             0.363936
min           NaN      0.000000         NaN             0.000000
25%           NaN      1.000000         NaN             1.000000
50%           NaN      1.000000         NaN             1.000000
75%           NaN      1.000000         NaN             1.000000
max         

In [7]:
analyze_dataframe(val_df, path_col="Path")

Total samples: 8473
Columns: ['Patient ID', 'Path', 'Label', 'Data Source']

Missing paths: 0
Unique paths: 8473
Example path: CR.1.2.840.113564.1722810170.20200318082923328940.1003000225002.png

Label columns: ['Patient ID', 'Label', 'Data Source']

       Patient ID        Label Data Source
count        8473  8473.000000        8473
unique       5163          NaN           8
top       A958650          NaN        rsna
freq          122          NaN        3869
mean          NaN     0.500531         NaN
std           NaN     0.500029         NaN
min           NaN     0.000000         NaN
25%           NaN     0.000000         NaN
50%           NaN     1.000000         NaN
75%           NaN     1.000000         NaN
max           NaN     1.000000         NaN

Patient ID     0
Label          0
Data Source    0
dtype: int64


--- Patient ID ---
Patient ID
A958650                                 122
A767576                                  69
A793505                                  51
A244

In [8]:
analyze_dataframe(test_df, path_col="Path")

Total samples: 8482
Columns: ['Patient ID', 'Path', 'Label', 'Data Source']

Missing paths: 0
Unique paths: 8482
Example path: MIDRC-RICORD-1C-419639-003251-46647-0.png

Label columns: ['Patient ID', 'Label', 'Data Source']

       Patient ID        Label Data Source
count        8482  8482.000000        8482
unique       4722          NaN           3
top        S09382          NaN       bimcv
freq           38          NaN        8082
mean          NaN     0.500000         NaN
std           NaN     0.500029         NaN
min           NaN     0.000000         NaN
25%           NaN     0.000000         NaN
50%           NaN     0.500000         NaN
75%           NaN     1.000000         NaN
max           NaN     1.000000         NaN

Patient ID     0
Label          0
Data Source    0
dtype: int64


--- Patient ID ---
Patient ID
S09382           38
S09371           35
S09340           30
S09348           29
S09426           28
                 ..
SITE2-000147      1
419639-002601     1
41