In [143]:
import os
import sys
import yaml
import platform
import numpy as np
import pandas as pd
from IPython.display import display
import torch
from matplotlib import pyplot as plt
import seaborn as sns

from PIL import Image
import cv2
from pathlib import Path
from omegaconf import OmegaConf
from sklearn.preprocessing import StandardScaler

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from functools import reduce
from operator import or_

# Set plotting style
plt.style.use("seaborn-v0_8-paper")
sns.set_palette("husl")
plt.rcParams['figure.dpi'] = 300
plt.rcParams['savefig.dpi'] = 300
plt.rcParams['font.size'] = 10
plt.rcParams['axes.labelsize'] = 11
plt.rcParams['axes.titlesize'] = 12
plt.rcParams['xtick.labelsize'] = 9
plt.rcParams['ytick.labelsize'] = 9
plt.rcParams['legend.fontsize'] = 9

# Set random seed for reproducibility
np.random.seed(1)

# Suppress warnings
def warn(*args, **kwargs):
    pass

import warnings
warnings.warn = warn
warnings.filterwarnings('ignore')

spark = ( SparkSession.builder
            .appName("CheXpert-EDA")
            .getOrCreate()
    )

print("Python version: {}".format(platform.python_version()))
print("Torch version: {}".format(torch.__version__))
print("OpenCV version %s" %(cv2.__version__))

Python version: 3.10.19
Torch version: 2.2.1+cu121
OpenCV version 4.13.0


### What is CheXpert?
CheXpert is a large dataset of `chest X-rays` and competition for automated chext x-ray interpretation, which features uncertainty and radiologist-labeled reference standard evaluation sets.

### Why CheXpert?
Chest radiography is the most common imaging examination globally, critical for `screening`, `diagnosis`, and `management of many life threatening diseases`. Automated chest radiograph interpretation at the level of practicing radiologists could provide substantial benefit in many medical settings, from improved workflow prioritization and clinical decision support to large-scale screening and global population health initiatives. For progress in both development and validation of automated algorithms, we realized there was a need for a labeled dataset that (1) was large, (2) had strong reference standards, and (3) provided expert human performance metrics for comparison (`stanfordmlgroup.github.io/competitions/chexpert`)

In [86]:
"""
Load raw data for EDA
"""
class LoadDataAssests:
    """Loads both the CSV and the associated img data"""
    def __init__(self, CONFIG_PATH: str = "../configs/EDA.yml"):

        self.config = self.load_config_file(CONFIG_PATH)
        # Accessing Paths
        self.csv_file_path = self.config.data.raw.CheXpert.csv_train
        self.img_data_path = self.config.data.raw.CheXpert.img_data

    def load_config_file(self, PATH: str):
        """ Load config file."""
        return OmegaConf.load(PATH)
    
    def get_csv_path(self):
        """Get CSV file path"""
        return self.csv_file_path

    def get_img_data_path(self):
        """Get Image data file path"""
        return self.img_data_path

CheXpert_data = LoadDataAssests()

# Load train CSV file
df_train = spark.read.csv(
    CheXpert_data.get_csv_path(),
    header=True,
    inferSchema=True
)

In [90]:
# Show sample data
df_train.show(5, truncate=False)

+---------------------------------------------------------------+------+---+---------------+-----+----------+--------------------------+------------+------------+-----------+-----+-------------+---------+-----------+------------+----------------+-------------+--------+---------------+
|Path                                                           |Sex   |Age|Frontal/Lateral|AP/PA|No Finding|Enlarged Cardiomediastinum|Cardiomegaly|Lung Opacity|Lung Lesion|Edema|Consolidation|Pneumonia|Atelectasis|Pneumothorax|Pleural Effusion|Pleural Other|Fracture|Support Devices|
+---------------------------------------------------------------+------+---+---------------+-----+----------+--------------------------+------------+------------+-----------+-----+-------------+---------+-----------+------------+----------------+-------------+--------+---------------+
|CheXpert-v1.0-small/train/patient00001/study1/view1_frontal.jpg|Female|68 |Frontal        |AP   |1.0       |NULL                      |NULL  

In [89]:
# Summary Statistics
df_train.describe().show(5, truncate=False)

[Stage 21:>                                                         (0 + 6) / 6]

+-------+---------------------------------------------------------------+-------+------------------+---------------+------+----------+--------------------------+-------------------+------------------+------------------+------------------+--------------------+--------------------+---------------------+-------------------+------------------+-------------------+------------------+------------------+
|summary|Path                                                           |Sex    |Age               |Frontal/Lateral|AP/PA |No Finding|Enlarged Cardiomediastinum|Cardiomegaly       |Lung Opacity      |Lung Lesion       |Edema             |Consolidation       |Pneumonia           |Atelectasis          |Pneumothorax       |Pleural Effusion  |Pleural Other      |Fracture          |Support Devices   |
+-------+---------------------------------------------------------------+-------+------------------+---------------+------+----------+--------------------------+-------------------+------------------+

                                                                                

In [80]:
# Train data columns
df_train.printSchema()

root
 |-- Path: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Frontal/Lateral: string (nullable = true)
 |-- AP/PA: string (nullable = true)
 |-- No Finding: double (nullable = true)
 |-- Enlarged Cardiomediastinum: double (nullable = true)
 |-- Cardiomegaly: double (nullable = true)
 |-- Lung Opacity: double (nullable = true)
 |-- Lung Lesion: double (nullable = true)
 |-- Edema: double (nullable = true)
 |-- Consolidation: double (nullable = true)
 |-- Pneumonia: double (nullable = true)
 |-- Atelectasis: double (nullable = true)
 |-- Pneumothorax: double (nullable = true)
 |-- Pleural Effusion: double (nullable = true)
 |-- Pleural Other: double (nullable = true)
 |-- Fracture: double (nullable = true)
 |-- Support Devices: double (nullable = true)



In [92]:
# Count entries in the dataset
df_train.count()

223414

In [None]:
# Null counts per column
null_counts = df_train.select([
    F.count(F.when(F.col(c).isNull(), c)).alias(c)
    for c in df_train.columns
]).show(truncate=False)

+----+---+---+---------------+-----+----------+--------------------------+------------+------------+-----------+------+-------------+---------+-----------+------------+----------------+-------------+--------+---------------+
|Path|Sex|Age|Frontal/Lateral|AP/PA|No Finding|Enlarged Cardiomediastinum|Cardiomegaly|Lung Opacity|Lung Lesion|Edema |Consolidation|Pneumonia|Atelectasis|Pneumothorax|Pleural Effusion|Pleural Other|Fracture|Support Devices|
+----+---+---+---------------+-----+----------+--------------------------+------------+------------+-----------+------+-------------+---------+-----------+------------+----------------+-------------+--------+---------------+
|0   |0  |0  |0              |32387|201033    |178575                    |177211      |105636      |211470     |137458|152792       |195806   |154971     |144480      |90203           |216922       |211220  |100197         |
+----+---+---+---------------+-----+----------+--------------------------+------------+------------+

In [105]:
# Distinct Values for Categorical columns
for c in ["Sex", "Frontal/Lateral", "AP/PA"]:
    df_train.select(c).distinct().show(truncate=False)

+-------+
|Sex    |
+-------+
|Female |
|Male   |
|Unknown|
+-------+

+---------------+
|Frontal/Lateral|
+---------------+
|Lateral        |
|Frontal        |
+---------------+

+-----+
|AP/PA|
+-----+
|LL   |
|PA   |
|AP   |
|RL   |
|NULL |
+-----+



#### Label semantics exploration
CheXpert labels are:
+ 1.0 -> `positive`
+ 0.0 -> `negative`
+ -1.0 -> `uncertain`
+ null -> `not mentioned`

In [108]:
# Frequency distribution per label
## Define label columns

label_cols = [
    "No Finding", "Enlarged Cardiomediastinum", "Cardiomegaly", "Lung Opacity", "Lung Lesion", "Edema", "Consolidation",
    "Pneumonia", "Atelectasis", "Pneumothorax", "Pleural Effusion", "Pleural Other", "Fracture", "Support Devices"
]

for label in label_cols:
    print("\n{}".format(label))
    df_train.groupBy(label).count().orderBy(label).show(truncate=False)


No Finding
+----------+------+
|No Finding|count |
+----------+------+
|NULL      |201033|
|1.0       |22381 |
+----------+------+


Enlarged Cardiomediastinum
+--------------------------+------+
|Enlarged Cardiomediastinum|count |
+--------------------------+------+
|NULL                      |178575|
|-1.0                      |12403 |
|0.0                       |21638 |
|1.0                       |10798 |
+--------------------------+------+


Cardiomegaly
+------------+------+
|Cardiomegaly|count |
+------------+------+
|NULL        |177211|
|-1.0        |8087  |
|0.0         |11116 |
|1.0         |27000 |
+------------+------+


Lung Opacity
+------------+------+
|Lung Opacity|count |
+------------+------+
|NULL        |105636|
|-1.0        |5598  |
|0.0         |6599  |
|1.0         |105581|
+------------+------+


Lung Lesion
+-----------+------+
|Lung Lesion|count |
+-----------+------+
|NULL       |211470|
|-1.0       |1488  |
|0.0        |1270  |
|1.0        |9186  |
+-------

In [110]:
# Positive Prevalence %
total = df_train.count()

for label in label_cols:
    positive = df_train.filter(F.col(label) == 1.0).count()
    print(label, positive / total)

No Finding 0.10017724941140663
Enlarged Cardiomediastinum 0.048331796574968444
Cardiomegaly 0.12085187141360881
Lung Opacity 0.47258005317482343
Lung Lesion 0.041116492252052245
Edema 0.2338528471805706
Consolidation 0.0661686375965696
Pneumonia 0.027030535239510507
Atelectasis 0.14939081704817067
Pneumothorax 0.08704915537969868
Pleural Effusion 0.38577260153795195
Pleural Other 0.01576893122185718
Fracture 0.040462996947371245
Support Devices 0.5192199235500013


In [112]:
# Uncertainty ratio per label
for label in label_cols:
    uncertain = df_train.filter(F.col(label) == -1.0).count()
    print(label, uncertain / total)

No Finding 0.0
Enlarged Cardiomediastinum 0.05551576893122186
Cardiomegaly 0.03619737348599461
Lung Opacity 0.025056621339754895
Lung Lesion 0.006660280913461108
Edema 0.058116322164233215
Consolidation 0.12417305987986428
Pneumonia 0.08401443060864583
Atelectasis 0.15101560331939806
Pneumothorax 0.014077005022066656
Pleural Effusion 0.05204687262212753
Pleural Other 0.011874815365196452
Fracture 0.002873588942501365
Support Devices 0.0048295988613068115


In [114]:
# Multi label density per image
positive_sum_expr = " + ".join([
    f"IF(`{l}`= 1.0, 1, 0)" for l in label_cols 
])

df_train = df_train.withColumn(
    "num_positive", F.expr(positive_sum_expr)
)

df_train.groupBy("num_positive").count()

DataFrame[num_positive: int, count: bigint]

In [118]:
# Co-occurrence (Edema & Cardiomegaly)
df_train.filter(
    (F.col("Edema") == 1.0) & (F.col("Cardiomegaly") == 1.0)
).count()

11659

Demographic Confounders

In [123]:
df_train.groupBy("Age").count().orderBy("Age").show(truncate=False)

+---+-----+
|Age|count|
+---+-----+
|0  |3    |
|18 |766  |
|19 |1167 |
|20 |1287 |
|21 |1279 |
|22 |1370 |
|23 |1229 |
|24 |1388 |
|25 |1326 |
|26 |1338 |
|27 |1535 |
|28 |1344 |
|29 |1312 |
|30 |1637 |
|31 |1493 |
|32 |1327 |
|33 |1400 |
|34 |1415 |
|35 |1534 |
|36 |1549 |
+---+-----+
only showing top 20 rows


In [124]:
df_train.select("Age").describe().show()

+-------+------------------+
|summary|               Age|
+-------+------------------+
|  count|            223414|
|   mean|60.430653405784774|
| stddev|  17.8209246640007|
|    min|                 0|
|    max|                90|
+-------+------------------+



In [129]:
# Disease prevalence by sex
df_train.groupBy("Sex").agg(
    F.sum(F.when(F.col("Pneumothorax") == 1.0, 1).otherwise(0)).alias("positive")
).show(truncate=False)

+-------+--------+
|Sex    |positive|
+-------+--------+
|Female |7503    |
|Male   |11945   |
|Unknown|0       |
+-------+--------+



In [131]:
# Distribution of views
df_train.groupBy("Frontal/Lateral").count().show()
df_train.groupBy("AP/PA").count().show()

+---------------+------+
|Frontal/Lateral| count|
+---------------+------+
|        Lateral| 32387|
|        Frontal|191027|
+---------------+------+

+-----+------+
|AP/PA| count|
+-----+------+
| NULL| 32387|
|   LL|    16|
|   PA| 29420|
|   AP|161590|
|   RL|     1|
+-----+------+



In [132]:
# Disease prevalence by view
df_train.groupBy("AP/PA").agg(
    F.sum(F.when(F.col("Edema") == 1.0, 1).otherwise(0)).alias("edema_pos")
).show()

+-----+---------+
|AP/PA|edema_pos|
+-----+---------+
| NULL|     2571|
|   LL|        1|
|   PA|     1709|
|   AP|    47965|
|   RL|        0|
+-----+---------+



In [145]:
""" No finding `Semantics Check` """

# Create a list of boolean expressions
other_labels = [(F.col(l) == 1.0) for l in label_cols if l != "No Finding"]

df_train.filter(
    (F.col("No Finding") == 1.0) & 
    reduce(or_, other_labels)
).count()

8808

In [150]:
# Path column inspection (patient-level leakage risk)
# Path format: CheXpert-v1.0-small/train/patientXXXXX/studyY/viewZ_*.jpg

df_train = df_train.withColumn("patient_id", F.split(F.col("Path"), "/").getItem(2))

patient_counts = df_train.groupBy("patient_id").count()

print("Unique patients:", patient_counts.count())
patient_counts.orderBy(F.desc("count")).show(20, truncate=False)

+----------+------+
|patient_id| count|
+----------+------+
|     train|223414|
+----------+------+



In [None]:
# Semantic EDA: label summary + plots

LABEL_COLS = [
    "No Finding", "Enlarged Cardiomediastinum", "Cardiomegaly", "Lung Opacity", "Lung Lesion", "Edema",
    "Consolidation", "Pneumonia", "Atelectasis", "Pneumothorax", "Pleural Effusion", "Pleural Other",
    "Fracture", "Support Devices",
]

TOTAL = df_train.count()

# Per-label counts for each semantic state: {1, 0, -1, null}
summary = None
for label in LABEL_COLS:
    c = F.col(label)
    row = df_train.select(
        F.lit(label).alias("label"),
        F.sum(F.when(c == 1.0, 1).otherwise(0)).alias("pos"),
        F.sum(F.when(c == 0.0, 1).otherwise(0)).alias("neg"),
        F.sum(F.when(c == -1.0, 1).otherwise(0)).alias("uncertain"),
        F.sum(F.when(c.isNull(), 1).otherwise(0)).alias("null"),
    )
    summary = row if summary is None else summary.unionByName(row)

summary = (summary
    .withColumn("total", F.lit(TOTAL))
    .withColumn("mention", F.col("total") - F.col("null"))
    .withColumn("mention_rate", F.col("mention") / F.col("total"))
    .withColumn("pos_rate", F.col("pos") / F.col("total"))
    .withColumn("uncertain_rate", F.col("uncertain") / F.col("total"))
)

summary_pd = summary.toPandas().sort_values("pos_rate", ascending=False)
display(summary_pd)

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

sns.barplot(data=summary_pd, y="label", x="pos_rate", ax=axes[0], color=sns.color_palette()[0])
axes[0].set_title("Positive prevalence")
axes[0].set_xlabel("Fraction of all images")
axes[0].set_ylabel("")

sns.barplot(data=summary_pd.sort_values("uncertain_rate", ascending=False), y="label", x="uncertain_rate", ax=axes[1], color=sns.color_palette()[1])
axes[1].set_title("Uncertainty rate (-1)")
axes[1].set_xlabel("Fraction of all images")
axes[1].set_ylabel("")

sns.barplot(data=summary_pd.sort_values("mention_rate", ascending=False), y="label", x="mention_rate", ax=axes[2], color=sns.color_palette()[2])
axes[2].set_title("Mention rate (non-null)")
axes[2].set_xlabel("Fraction of all images")
axes[2].set_ylabel("")

plt.tight_layout()
plt.show()

# Multi-label density (how many positives per image) 
positive_sum_expr = " + ".join([f"IF(`{l}`= 1.0, 1, 0)" for l in LABEL_COLS])

df_card = df_train.withColumn("num_positive", F.expr(positive_sum_expr))
card_counts = df_card.groupBy("num_positive").count().orderBy("num_positive")
card_pd = card_counts.toPandas()

avg_pos_labels = (card_pd["num_positive"] * card_pd["count"]).sum() / TOTAL
print("Average number of positive labels per image:", avg_pos_labels)

display(card_pd)

plt.figure(figsize=(8, 4))
sns.barplot(data=card_pd, x="num_positive", y="count", color=sns.color_palette()[3])
plt.yscale("log")
plt.title("Label cardinality: positives per image (log count)")
plt.xlabel("# positive labels")
plt.ylabel("# images (log scale)")
plt.tight_layout()
plt.show()

# --- 'No Finding' semantic consistency ---
other_positive = [(F.col(l) == 1.0) for l in LABEL_COLS if l != "No Finding"]
contradictions = df_train.filter((F.col("No Finding") == 1.0) & reduce(or_, other_positive)).count()
no_finding_pos = df_train.filter(F.col("No Finding") == 1.0).count()

print("No Finding positives:", no_finding_pos)
print("Contradictions (No Finding == 1 AND another label == 1):", contradictions)
print("Contradiction rate among all images:", contradictions / TOTAL)
print("Contradiction rate among No Finding positives:", (contradictions / no_finding_pos) if no_finding_pos else None)

plt.figure(figsize=(6, 3))
sns.barplot(x=["All images", "No Finding positives"], y=[contradictions / TOTAL, (contradictions / no_finding_pos) if no_finding_pos else 0], color=sns.color_palette()[4])
plt.title("No Finding inconsistency rate")
plt.ylabel("Fraction")
plt.tight_layout()
plt.show()

# Patient-level multiplicity (leakage risk)
try:
    patient_counts
except NameError:
    df_train = df_train.withColumn("patient_id", F.split(F.col("Path"), "/").getItem(2))
    patient_counts = df_train.groupBy("patient_id").count()

img_per_patient_dist = (
    patient_counts
    .groupBy(F.col("count").alias("images_per_patient"))
    .agg(F.count(F.lit(1)).alias("n_patients"))
    .orderBy("images_per_patient")
)

dist_pd = img_per_patient_dist.toPandas()

max_imgs = patient_counts.agg(F.max("count").alias("max")).collect()[0]["max"]
print("Max images per patient:", max_imgs)

plt.figure(figsize=(8, 4))
sns.barplot(data=dist_pd, x="images_per_patient", y="n_patients", color=sns.color_palette()[5])
plt.yscale("log")
plt.title("Images per patient distribution (log #patients)")
plt.xlabel("# images for a patient")
plt.ylabel("# patients (log scale)")
plt.tight_layout()
plt.show()

In [None]:
# --- Label co-occurrence structure (semantic relationships) ---

COOC_LABELS = [l for l in LABEL_COLS if l != "No Finding"]

label_to_safe = {l: l.replace(" ", "_") for l in COOC_LABELS}
safe_to_label = {v: k for k, v in label_to_safe.items()}
safe_cols = [label_to_safe[l] for l in COOC_LABELS]

binary = df_train.select(
    *[
        F.when(F.col(lbl) == 1.0, 1).otherwise(0).cast("int").alias(label_to_safe[lbl])
        for lbl in COOC_LABELS
    ]
)

# Aggregate all pairwise co-occurrences in one pass
exprs = []
for i, a in enumerate(safe_cols):
    for b in safe_cols[i:]:
        exprs.append(F.sum(F.col(a) * F.col(b)).alias(f"{a}__{b}"))

agg = binary.agg(*exprs).toPandas().iloc[0].to_dict()

pos = {a: float(agg[f"{a}__{a}"]) for a in safe_cols}

import numpy as np

n = len(safe_cols)
cond = np.zeros((n, n), dtype=float)
for i, a in enumerate(safe_cols):
    for j, b in enumerate(safe_cols):
        key = f"{a}__{b}" if i <= j else f"{b}__{a}"
        co = float(agg[key])
        cond[i, j] = (co / pos[a]) if pos[a] > 0 else 0.0

cond_pd = pd.DataFrame(cond, index=[safe_to_label[c] for c in safe_cols], columns=[safe_to_label[c] for c in safe_cols])

plt.figure(figsize=(10, 8))
sns.heatmap(cond_pd, cmap="rocket_r", vmin=0, vmax=1)
plt.title("Conditional co-occurrence: P(col=1 | row=1)")
plt.xlabel("Label B")
plt.ylabel("Label A")
plt.tight_layout()
plt.show()

# Top semantic relationships (excluding diagonal)
pairs = []
for i, a in enumerate(safe_cols):
    for j, b in enumerate(safe_cols):
        if i == j:
            continue
        pairs.append((safe_to_label[a], safe_to_label[b], cond[i, j], pos[a]))

pairs_sorted = sorted(pairs, key=lambda x: x[2], reverse=True)
print("Top conditional relationships (A -> B means P(B|A)):")
for A, B, p, support in pairs_sorted[:15]:
    print(f"{A:26s} -> {B:26s}  P(B|A)={p:0.2f}  support(A)={int(support)}")

## Strategy for working with CheXpert (practical recommendations)

### 1) Define the prediction task clearly
- **Task type**: multi-label classification over the 14 CheXpert labels.
- **Metadata**: treat `Sex`, `Age`, and acquisition/view (`Frontal/Lateral`, `AP/PA`) as *potential confounders*.

### 2) Splitting (avoid leakage)
- **Always split by `patient_id`** (group split) to prevent the same patient appearing in train/val/test.
- **Stratification**: because this is multi-label, stratify approximately on a small set of high-impact signals:
  - major labels (e.g., `Edema`, `Pleural Effusion`, `Atelectasis`),
  - and **view mix** (`AP/PA`, `Frontal/Lateral`).
- **Recommended**: build a patient-level table (1 row per patient) with label indicators (any positive in patient) + view proportions, then split patients to balance those distributions.

### 3) Label semantics: `null`, `-1`, and noisy `No Finding`
- **`null` (not mentioned)** is not the same as negative; it is *missing / unobserved*.
  - Safer option: train with a **loss mask** so you compute loss only when a label is **mentioned (non-null)**.
  - Baseline option (CheXpert-style): map `null -> 0` and accept some label noise (often works surprisingly well, but biases prevalence).
- **Uncertainty (`-1`)**: evaluate a few policies and pick based on validation AUPRC/AUROC:
  - **U-ignore**: treat `-1` as missing (masked).
  - **U->0**: treat uncertain as negative (common for some labels).
  - **U->1**: treat uncertain as positive for labels where uncertainty often implies presence.
- **`No Finding` is noisy** (contradictions exist). Prefer one of:
  - **Drop `No Finding` from targets** and derive it post-hoc as “no other finding predicted”.
  - Or **recompute** a clean `NoFinding_derived = 1` when all other findings are not positive (after you define your missing/uncertainty policy).

### 4) Confounders (views + devices)
- Since label prevalence changes strongly by **AP vs PA** (and lateral has different metadata), treat view as a confounder:
  - Train on a consistent subset (e.g., **Frontal only**, optionally **PA only**) for the cleanest baseline.
  - Or include view metadata as model input, and evaluate performance within each view.
- `Support Devices` often acts as a proxy for severity/setting; don’t let it inadvertently become a shortcut:
  - audit co-occurrence, and report metrics with/without device-heavy subsets.

### 5) Class imbalance + evaluation
- Use **per-label class weighting** and/or **focal loss**; consider oversampling rare-label positives at the patient level.
- Report:
  - **AUROC + AUPRC** per label (AUPRC is critical for rare labels),
  - calibration (e.g., reliability plots),
  - and **subgroup metrics** (Sex, Age bins, AP vs PA).

### 6) Minimal robust baseline pipeline
- **Filter**: start with `Frontal` only.
- **Split**: patient-level split with approximate stratification.
- **Labels**: start with `null -> 0`, `-1 -> 0` (baseline), then compare against masked training and/or `-1` policies.
- **Model**: multi-label CNN with sigmoid outputs.
- **Validation**: per-label AUPRC + view-stratified metrics, plus a check that performance doesn’t collapse on PA vs AP.
