# Dx Accuracy

In [None]:
import pandas as pd
from sklearn.metrics import cohen_kappa_score
import statsmodels.api as sm
from scipy.stats import spearmanr


In [None]:
# Absolute path of the Excel workbook read by the analysis below.
# NOTE(review): machine-specific location — adjust before running elsewhere.
data_path = '/Users/jk1/Library/CloudStorage/OneDrive-UniversitédeGenève/icu_research/prehospital/pediatric_trauma/data/Data_PedRegaTrauma_coded_for_analysis_250417.xlsx'

In [None]:
# Analysis switches used by the diagnosis pre-processing cells.

# When True, the cleaned "Other diagnosis 1" codes are appended to the main
# in-hospital diagnosis before the comparison.
add_in_hospital_accessory_diagnoses = True

# When `add_poly_trauma_codes` is True, the polytrauma code (9) is appended to
# every diagnosis list holding at least `polytrauma_threshold` codes.
polytrauma_threshold = 2
add_poly_trauma_codes = True

In [None]:
# Load the 'All centres cleaned' sheet of the workbook into the working frame.
df = pd.read_excel(io=data_path, sheet_name='All centres cleaned')

## Discrepancy between main diagnosis out-of-hospital and in-hospital


In [None]:
# Optionally merge the accessory in-hospital diagnoses ("Other diagnosis 1")
# into the main in-hospital diagnosis as one comma-separated code list.
if add_in_hospital_accessory_diagnoses:
    # Normalise the free-text accessory codes to a clean "a,b,c" string:
    #   1. missing -> '' and everything cast to str
    #   2. "." (used as a separator) -> ","
    #   3. drop every character that is neither a digit nor a comma
    #   4. strip leading/trailing commas, then collapse runs of commas
    # NOTE(review): step 2 would also split a float-formatted value such as
    # "3.0" into "3,0" — confirm the column contains no such values.
    df["other_diagnosis_cleaned"] = (
        df["Other diagnosis 1"]
        .fillna('')
        .astype(str)
        .str.replace('.', ',', regex=False)
        .str.replace('[^0-9,]', '', regex=True)
        .str.strip(',')
        .str.replace(r',+', ',', regex=True)
    )

    # Keep the untouched main diagnosis so both versions can be inspected.
    df['temp'] = df['Main diagnosis in-hospital']

    def _merge_in_hospital_codes(main, accessory):
        """Join main and accessory codes into one comma-separated value.

        Returns `main` unchanged when there is no accessory code, and the
        accessory codes alone when `main` is missing, so that no literal
        "nan" / "<NA>" token ends up inside the code string.
        """
        if pd.isna(accessory) or accessory == '':
            return main
        if pd.isna(main):
            return accessory
        return f"{main},{accessory}"

    # Append other_diagnosis_cleaned to Main diagnosis in-hospital.
    df['Main diagnosis in-hospital'] = [
        _merge_in_hospital_codes(main, accessory)
        for main, accessory in zip(df['temp'], df['other_diagnosis_cleaned'])
    ]


In [None]:
# Inspect the original vs. merged in-hospital diagnoses. `filter(items=...)`
# keeps only the listed columns that exist, so the cell also runs when the
# accessory-diagnosis step was skipped and 'temp' / 'other_diagnosis_cleaned'
# were never created.
df.filter(items=['temp', 'other_diagnosis_cleaned', 'Main diagnosis in-hospital'])

In [None]:
# Discrepancy between main diagnosis out-of-hospital and in-hospital.
# Builds `dx_df`: one row per (patient, diagnosis code) for patients whose
# pre-hospital entry is a code string, one row per patient otherwise.
pre_col = 'Main diagnosis pre-hospital'
in_col = 'Main diagnosis in-hospital'

# --- 1. Recode free-text and sentinel entries -------------------------------
# Pre-hospital free-text entries are replaced by a code (or by missing).
pre_hospital_recodes = [
    ('<NA>', pd.NA),
    ('Vd. a. Asphiktische REA', 10),
    ('1. CO Intoxikation durch Rauchgasvergiftung (Kachelofen)\n   - CO 20%\n   - Schwindel, Unwohlsein, fragliche krampfartigen Äquivalente', 0),
    ('1. CO INtoxikation durch Rauchgasvergiftung (Kachelofen) mit\n   - Krampfäquivalent, Schwindel, Übelkeit\n   - CO 22%', 0),
]
for raw_value, code in pre_hospital_recodes:
    df[pre_col] = df[pre_col].replace(raw_value, code)


def _clean_in_hospital_text(value):
    """Remove known free-text fragments from an in-hospital code string."""
    if not isinstance(value, str):
        return value
    value = value.replace('C2-Intoxikation,', '')
    # "nan," appears when a missing main diagnosis was joined with other codes
    value = value.replace('nan,', '')
    # Obstrukt.Atemversagen -REA
    return value.replace('Obstrukt.Atemversagen -REA', '10')


df[in_col] = df[in_col].apply(_clean_in_hospital_text)

# replace 999 with pd.NA
for col in (pre_col, in_col):
    df[col] = df[col].replace(999, pd.NA)


# --- 2. Helpers --------------------------------------------------------------
def _parse_dx_codes(value):
    """Return the integer codes held in a comma-separated value.

    Missing values and empty / "nan" / "<NA>" tokens yield no code instead of
    raising, so a missing in-hospital diagnosis no longer aborts the run.
    """
    if not isinstance(value, str):
        if pd.isna(value):
            return []
        value = str(value)
    codes = []
    for token in value.split(','):
        token = token.strip()
        if token in ('', 'nan', '<NA>'):
            continue
        codes.append(int(token))
    return codes


def _with_polytrauma(codes):
    """Add the polytrauma code (9) when the list reaches the threshold."""
    if add_poly_trauma_codes and len(codes) >= polytrauma_threshold and 9 not in codes:
        return codes + [9]
    return codes


def _first_dx_code(value):
    """Reduce a code string to its first code; pass non-strings through."""
    if not isinstance(value, str):
        return value
    codes = _parse_dx_codes(value)
    return codes[0] if codes else pd.NA


# --- 3. Expand multi-code patients to one row per possible code --------------
dx_df = df[['Pat ID', pre_col, in_col]].copy()
possible_codes = range(1, 12)

is_expanded = []     # per original row: was it replaced by per-code rows?
expanded_rows = []   # (Pat ID, pre-hospital code or NA, in-hospital code or NA)
for pat_id, pre_value, in_value in zip(dx_df['Pat ID'], dx_df[pre_col], dx_df[in_col]):
    # Only string entries (comma-separated code lists) are expanded; missing
    # or plain numeric entries keep their single row.
    # NOTE(review): for those single rows only the first in-hospital code is
    # kept below and no polytrauma code is added — confirm this asymmetry
    # between single-code and multi-code patients is intended.
    if not isinstance(pre_value, str):
        is_expanded.append(False)
        continue

    pre_codes = _with_polytrauma(_parse_dx_codes(pre_value))
    in_codes = _with_polytrauma(_parse_dx_codes(in_value))
    expanded_rows.extend(
        (
            pat_id,
            code if code in pre_codes else pd.NA,
            code if code in in_codes else pd.NA,
        )
        for code in possible_codes
    )
    is_expanded.append(True)

# Drop the rows that were expanded and append their per-code replacements in
# one step (new rows are labelled from len(dx_df) onwards).
expanded_mask = pd.Series(is_expanded, index=dx_df.index, dtype=bool)
kept_rows = dx_df.loc[~expanded_mask].copy()
if expanded_rows:
    first_new_label = len(dx_df)
    expanded_df = pd.DataFrame(
        expanded_rows,
        columns=dx_df.columns,
        index=range(first_new_label, first_new_label + len(expanded_rows)),
    )
    dx_df = pd.concat([kept_rows, expanded_df])
else:
    dx_df = kept_rows

# --- 4. Final clean-up -------------------------------------------------------
# Remaining comma-separated in-hospital strings are reduced to their first code.
dx_df[in_col] = dx_df[in_col].apply(_first_dx_code)

# remove rows where both diagnoses are pd.NA
dx_df = dx_df[~(dx_df[pre_col].isna() & dx_df[in_col].isna())]


In [None]:
# Cohen's kappa on the row-level codes, with a missing code on either side
# encoded as its own category (-1 = "no diagnosis").
with_missing_dx_df = dx_df.copy()
for rater_col in ('Main diagnosis pre-hospital', 'Main diagnosis in-hospital'):
    with_missing_dx_df[rater_col] = with_missing_dx_df[rater_col].fillna(-1)  # -1 for "no diagnosis"

pre_hospital_codes = with_missing_dx_df['Main diagnosis pre-hospital'].values.astype(int)
in_hospital_codes = with_missing_dx_df['Main diagnosis in-hospital'].values.astype(int)
dx_kappa = cohen_kappa_score(pre_hospital_codes, in_hospital_codes)
dx_kappa

In [None]:
# Diagnosis codes as listed in the coding sheet:
#    1  Traumatic brain injury, cervical spine (HWS)
#    2  Chest trauma
#    3  Abdominal trauma
#    4  Pelvic trauma / lumbar spine (LWS)
#    5  Upper extremity trauma
#    6  Lower extremity trauma
#    7  Spine injury
#    8  Face
#    9  Polytrauma
#   10  Drowning
#   11  Whole-body burns

# Display name per code; the position in the tuple is the code itself.
dx_code_to_name = dict(enumerate((
    'No diagnosis',                              # 0
    'Traumatic brain or cervical spine injury',  # 1
    'Chest trauma',                              # 2
    'Abdominal trauma',                          # 3
    'Pelvic Trauma/ LWS',                        # 4
    'Upper extremity trauma',                    # 5
    'Lower extremity trauma',                    # 6
    'Spine injury',                              # 7
    'Face',                                      # 8
    'Polytrauma',                                # 9
    'Drowning',                                  # 10
    'Burns',                                     # 11
)))

In [None]:
def to_set(series: pd.Series) -> set:
    """Return the distinct non-missing values of *series* as a set of ints."""
    present = series.dropna()
    return set(present.astype(int).tolist())

# One row per patient, holding the set of codes found in each diagnosis column.
dx_sets = dx_df.groupby('Pat ID').agg(
    pre_labels=pd.NamedAgg(column='Main diagnosis pre-hospital', aggfunc=to_set),
    in_labels=pd.NamedAgg(column='Main diagnosis in-hospital', aggfunc=to_set),
)

In [None]:
def jaccard(row):
    """Jaccard index |A ∩ B| / |A ∪ B| of a row's pre- and in-hospital label sets.

    Two empty sets are scored as perfect agreement (1.0); using 0.0 instead
    is an equally valid convention.
    """
    pre, in_hosp = row.pre_labels, row.in_labels
    union = pre | in_hosp
    if not union:
        return 1.0
    return len(pre & in_hosp) / len(union)

# Per-patient Jaccard index, stored on dx_sets, and its mean over patients.
jaccard_per_patient = dx_sets.apply(jaccard, axis=1)
dx_sets['jaccard'] = jaccard_per_patient
mean_jaccard = jaccard_per_patient.mean()

def set_f1(row):
    """Set-based F1 (Dice) 2|A ∩ B| / (|A| + |B|) of a row's two label sets.

    Two empty sets are scored as perfect agreement (1.0).
    """
    pre, in_hosp = row.pre_labels, row.in_labels
    total_size = len(pre) + len(in_hosp)
    if total_size == 0:
        return 1.0
    return 2 * len(pre & in_hosp) / total_size

# Per-patient set F1, stored on dx_sets, and its mean over patients.
set_f1_per_patient = dx_sets.apply(set_f1, axis=1)
dx_sets['set_f1'] = set_f1_per_patient
mean_set_f1 = set_f1_per_patient.mean()

# Show both patient-averaged overlap scores.
mean_jaccard, mean_set_f1

In [None]:
# Every code used by each rater, and the sorted union of both.
all_pre = {code for labels in dx_sets.pre_labels for code in labels}
all_in = {code for labels in dx_sets.in_labels for code in labels}
all_labels = sorted(all_pre.union(all_in))

In [None]:
index = dx_sets.index


def _label_sets_to_indicators(label_sets):
    """Return a patients x labels 0/1 frame: 1 where the label is in the set."""
    return pd.DataFrame(
        [[1 if code in labels else 0 for code in all_labels] for labels in label_sets],
        index=index,
        columns=all_labels,
    )


# r1: pre-hospital labels, r2: in-hospital labels
r1 = _label_sets_to_indicators(dx_sets.pre_labels)
r2 = _label_sets_to_indicators(dx_sets.in_labels)

In [None]:
# Pool every (patient, label) cell of both indicator matrices into one vector
# per rater, then compute overall kappa and raw agreement on those cells.
flat_pre = r1.values.flatten()
flat_in = r2.values.flatten()

kappa_global = cohen_kappa_score(flat_pre, flat_in)
percent_agreement_global = (flat_pre == flat_in).mean()

kappa_global, percent_agreement_global

In [None]:
# 95% percentile-bootstrap confidence interval for kappa_global: patients
# (rows of r1/r2) are resampled with replacement, 1000 times, with a fixed seed.
import numpy as np

n_bootstraps = 1000
rng = np.random.default_rng(seed=42)
n_patients = len(dx_sets)


def _resampled_kappa():
    """Kappa on one bootstrap resample; draws the same rows from r1 and r2."""
    sample_indices = rng.choice(n_patients, size=n_patients, replace=True)
    return cohen_kappa_score(
        r1.iloc[sample_indices].values.flatten(),
        r2.iloc[sample_indices].values.flatten(),
    )


kappa_bootstrap = [_resampled_kappa() for _ in range(n_bootstraps)]
kappa_lower, kappa_upper = np.percentile(kappa_bootstrap, [2.5, 97.5])

print(f"Kappa: {kappa_global:.4f} (95% CI {kappa_lower:.4f} - {kappa_upper:.4f})")

In [None]:
# Kappa and raw agreement for each label column, keyed by diagnosis name.
kappa_per_label = {
    dx_code_to_name[code]: cohen_kappa_score(r1[code].values, r2[code].values)
    for code in all_labels
}
pa_per_label = {
    dx_code_to_name[code]: (r1[code].values == r2[code].values).mean()
    for code in all_labels
}

kappa_per_label, pa_per_label

In [None]:
from sklearn.metrics import precision_recall_fscore_support

# Pooled precision / recall / F1 of the pre-hospital labels (predictions)
# against the in-hospital labels (reference), over all (patient, label) cells.
#
# The indicator matrices are flattened to a single 0/1 vector, so the positive
# class has to be scored with average='binary'. With average='micro' on such a
# binary vector scikit-learn also counts the 0 ("label absent") class, which
# makes precision == recall == F1 == the plain percent agreement.
y_true = r2.values.flatten()
y_pred = r1.values.flatten()

precision, recall, f1, _ = precision_recall_fscore_support(
    y_true, y_pred,
    average='binary',
    pos_label=1,
)

precision, recall, f1

In [None]:
# Subset accuracy: share of patients whose two label sets are identical.
exact_match = dx_sets['pre_labels'].eq(dx_sets['in_labels'])
dx_sets['exact_match'] = exact_match
subset_accuracy = exact_match.mean()

subset_accuracy

In [None]:
import pandas as pd


def _pivot_label_indicators(rater_col):
    """Pivot dx_df into a patients x labels frame for one diagnosis column.

    A cell is 1 if the label was ever used for that patient, otherwise 0.
    Rows with a missing value in `rater_col` are ignored.
    """
    rated = dx_df.dropna(subset=[rater_col]).assign(val=1)
    return rated.pivot_table(
        index='Pat ID',
        columns=rater_col,
        values='val',
        aggfunc='max',
        fill_value=0,
    )


# Rater A (pre-hospital)
rA = _pivot_label_indicators('Main diagnosis pre-hospital')

# Rater B (in-hospital) - treated here as "reference"
rB = _pivot_label_indicators('Main diagnosis in-hospital')

# Align patients first, then label columns, so both frames share one shape;
# cells introduced by the alignment are filled with 0.
rA, rB = rA.align(rB, join='outer', axis=0, fill_value=0)
rA, rB = rA.align(rB, join='outer', axis=1, fill_value=0)

# rA and rB are now [n_patients x n_labels] matrices of 0/1


In [None]:
from sklearn.metrics import accuracy_score, f1_score

# Per-label accuracy and F1 of rater A (predictions) against rater B (reference).
labels = rA.columns
rows = [
    {
        'label': label,
        'accuracy': accuracy_score(rB[label].values, rA[label].values),
        'f1': f1_score(rB[label].values, rA[label].values, zero_division=0),
    }
    for label in labels
]

per_label_metrics = pd.DataFrame(rows).set_index('label')
print(per_label_metrics)


### Accuracy per diagnosis present at the pre-hospital stage

In [None]:
# For every pre-hospital code: how many rows carry it, and the share of those
# rows whose in-hospital code is identical. Codes on fewer than 2 rows are skipped.
accuracy_per_diagnosis = {}
count_per_diagnosis = {}
for dx in dx_df['Main diagnosis pre-hospital'].unique():
    if pd.isna(dx):
        continue
    dx_subset = dx_df[dx_df['Main diagnosis pre-hospital'] == dx]
    n_rows = len(dx_subset)
    if n_rows >= 2:
        count_per_diagnosis[dx] = n_rows
        accuracy_per_diagnosis[dx] = (
            dx_subset['Main diagnosis pre-hospital'] == dx_subset['Main diagnosis in-hospital']
        ).mean()

# Assemble the table directly in its final column order (name, count, accuracy);
# codes are translated to diagnosis names via dx_code_to_name.
reported_codes = pd.Series(list(accuracy_per_diagnosis), dtype=object)
accuracy_per_diagnosis_df = pd.DataFrame({
    'Main diagnosis pre-hospital': reported_codes.map(dx_code_to_name),
    'Count': reported_codes.map(count_per_diagnosis),
    'Accuracy': reported_codes.map(accuracy_per_diagnosis),
})
accuracy_per_diagnosis_df

### Output

In [None]:
# One-column table of per-diagnosis kappa values, with the overall kappa
# appended as a final 'Overall' row.
kappa_df = (
    pd.Series(kappa_per_label, name="Cohen's Kappa")
    .rename_axis('Diagnosis')
    .to_frame()
)
kappa_df.loc['Overall'] = kappa_global
kappa_df

In [None]:
# save to_excel 
# kappa_df.to_excel('/Users/jk1/Library/CloudStorage/OneDrive-unige.ch/icu_research/prehospital/pediatric_trauma/tables/dx_kappa.xlsx')