# Physician experience vs. NACA score

In [None]:
import pandas as pd
import re
import numpy as np

In [None]:
# Absolute, machine-specific locations of the three input Excel workbooks.
# Main study table; read into `data_df` in the loading cell.
data_path = '/Users/jk1/Library/CloudStorage/OneDrive-UniversitédeGenève/icu_research/prehospital/analgesia/data/rega_data/trauma_categories_Rega Pain Study15.09.2025_v2.xlsx'
# Physician roster; read into `medic_df` in the loading cell.
medic_data_path = '/Users/jk1/Library/CloudStorage/OneDrive-UniversitédeGenève/icu_research/prehospital/analgesia/data/rega_data/rega_physician_list_09102025.xlsx'
# Per-physician metadata; read into `meta_medic_df` and joined to the roster on `full_name`.
meta_medic_data_path = '/Users/jk1/Library/CloudStorage/OneDrive-UniversitédeGenève/icu_research/prehospital/analgesia/data/medreg_extraction/joined_final_complete_extractions_20251008_221735.xlsx'

In [None]:
# Cohort switches used by the filtering cells further down:
# restrict_to_trauma  -> keep only rows with 'Einteilung (reduziert)' == 'Unfall'
# restrict_to_primary -> keep only rows with 'Einsatzart' == 'Primär'
restrict_to_trauma = True
restrict_to_primary = True

In [None]:
# Read the three workbooks (in the same order as before) with pandas' default Excel settings.
data_df, medic_df, meta_medic_df = (
    pd.read_excel(workbook_path)
    for workbook_path in (data_path, medic_data_path, meta_medic_data_path)
)

In [None]:
# Build the join key by removing the role suffix from the roster name.
# regex=False is explicit on purpose: the suffix contains parentheses, which are
# regex group syntax. On pandas versions where `str.replace` defaults to
# regex=True the pattern would never match the literal "(...)" and `full_name`
# would silently keep the suffix, so the metadata merge would find no matches.
medic_df['full_name'] = medic_df['Mitglieder mit Einsatzfunktion'].str.replace(' (Flugarzt/Flugärztin)', '', regex=False)
# Keep one roster row per name so the merge into data_df does not fan out on the roster side.
medic_df.drop_duplicates(subset=['Mitglieder mit Einsatzfunktion'], inplace=True)
# NOTE(review): if meta_medic_df holds several rows for the same full_name, this left
# merge re-introduces duplicate roster rows — confirm full_name is unique there.
medic_df = medic_df.merge(meta_medic_df, how='left', on='full_name')
medic_df.rename(columns={'Sex m/w': 'physician_sex'}, inplace=True)
# Attach the physician attributes to every study row; unmatched rows get missing values.
data_df = data_df.merge(medic_df, how='left', left_on='Mitglieder mit Einsatzfunktion', right_on='Mitglieder mit Einsatzfunktion')

In [None]:
# Event numbers that appear again after their first occurrence.
is_repeat = data_df["SNZ Ereignis Nr. "].duplicated()
duplicates = data_df.loc[is_repeat, "SNZ Ereignis Nr. "]
print(f'Duplicates found: {duplicates.nunique()}')
# Keep only the first row of every event number.
data_df = data_df.drop_duplicates(subset=["SNZ Ereignis Nr. "])

In [None]:
# Exclusion step: keep only patients whose on-scene VAS is above 3,
# and report how many rows are removed (overall and split at age 16).
n_vas_under4 = data_df[data_df["VAS_on_scene"] <= 3].shape[0]
print(f'Excluded {n_vas_under4} patients with VAS <= 3')

# adult patients with vas <= 3
n_adult_vas_under4 = data_df[(data_df["VAS_on_scene"] <= 3) & (data_df["Alter "] >= 16)].shape[0]
print(f'Excluded {n_adult_vas_under4} adult patients with VAS <= 3')

# pediatric patients with vas <= 3
n_pediatric_vas_under4 = data_df[(data_df["VAS_on_scene"] <= 3) & (data_df["Alter "] < 16)].shape[0]
print(f'Excluded {n_pediatric_vas_under4} pediatric patients with VAS <= 3')

# The `> 3` filter below also removes rows with a missing VAS (NaN compares as
# False), which none of the counts above include. Report them separately so the
# printed exclusion numbers add up to the rows actually dropped.
n_vas_missing = int(data_df["VAS_on_scene"].isna().sum())
print(f'Excluded {n_vas_missing} patients with missing VAS')

# NOTE(review): other columns in this notebook use 999 as a missing-value code;
# if VAS_on_scene does too, such rows pass this filter — confirm.
data_df = data_df[data_df["VAS_on_scene"] > 3]

In [None]:
# Notebook display only: frequency of each value in the reduced classification column
# (the trauma filter below keeps the value 'Unfall').
data_df['Einteilung (reduziert)'].value_counts()

In [None]:
if restrict_to_trauma:
    # Rows that this step removes: anything not classified as 'Unfall'.
    not_trauma = data_df['Einteilung (reduziert)'] != 'Unfall'

    n_non_trauma = len(data_df[not_trauma])
    print(f'Excluded {n_non_trauma} non-trauma patients')

    # the same exclusions, split at the age cut-off of 16 years
    n_adult_non_trauma = len(data_df[not_trauma & (data_df["Alter "] >= 16)])
    print(f'Excluded {n_adult_non_trauma} adult non-trauma patients')
    n_pediatric_non_trauma = len(data_df[not_trauma & (data_df["Alter "] < 16)])
    print(f'Excluded {n_pediatric_non_trauma} pediatric non-trauma patients')

    # keep the trauma rows only
    data_df = data_df[data_df['Einteilung (reduziert)'] == 'Unfall']

In [None]:
if restrict_to_primary:
    # Rows that this step removes: every mission type other than 'Primär'.
    not_primary = data_df['Einsatzart'] != 'Primär'

    n_secondary = len(data_df[not_primary])
    print(f'Excluded {n_secondary} secondary transport patients')

    # the same exclusions, split at the age cut-off of 16 years
    n_adult_secondary = len(data_df[not_primary & (data_df["Alter "] >= 16)])
    print(f'Excluded {n_adult_secondary} adult secondary transport patients')
    n_pediatric_secondary = len(data_df[not_primary & (data_df["Alter "] < 16)])
    print(f'Excluded {n_pediatric_secondary} pediatric secondary transport patients')

    # keep the primary missions only
    data_df = data_df[data_df['Einsatzart'] == 'Primär']


In [None]:
# Split the cohort at the age cut-off of 16 years (rows with a missing age land in neither frame).
# .copy() gives each subset its own data: later cells add columns to adult_df, and
# assigning onto a boolean-indexed slice of data_df triggers pandas'
# SettingWithCopyWarning and leaves it ambiguous which object is modified.
adult_df = data_df[data_df["Alter "] >= 16].copy()
pediatric_df = data_df[data_df["Alter "] < 16].copy()

In [None]:
def get_multi_label_counts(data_df, multi_label_column):
    """
    Count how often each label occurs in a comma-separated multi-label column.

    Cells equal to 999 and missing cells (None / NaN / pd.NA) are ignored.
    Every other cell is converted to a string, split on ',', and each part is
    stripped of surrounding whitespace before counting.

    The input DataFrame is not modified (the 999 replacement is applied to a
    temporary Series only).

    Note: a later definition with the same name in this file (splitting on
    '; ' and ', ') replaces this one once its cell has run.

    Returns a dict mapping label -> count, ordered by descending count; labels
    with equal counts keep the order in which they were first seen.
    """
    # work on a temporary Series so the caller's column keeps its 999 codes
    values = data_df[multi_label_column].replace(999, pd.NA)
    label_counter = {}
    # skip missing cells up front instead of relying on their string form
    for value in values[values.notna()]:
        for raw_label in str(value).split(','):
            label = raw_label.strip()
            # textual placeholders for missing values inside a cell are not labels
            if label in ('nan', '<NA>'):
                continue
            label_counter[label] = label_counter.get(label, 0) + 1

    # sort the dictionary by value (stable, so ties keep first-seen order)
    return dict(sorted(label_counter.items(), key=lambda item: item[1], reverse=True))

In [None]:
def preprocess_body_region(df):
    """
    Normalise the 'Körperregion' column in place and return the same DataFrame.

    Adds a helper column 'Körperregion2' holding the first comma-separated
    entry of 'Körperregion' (whitespace-stripped). 'Körperregion' itself is
    lower-cased, a fixed set of exact-match spellings is mapped onto canonical
    region names, and empty strings become pd.NA.
    """
    # exact-match (whole cell) spelling -> canonical region name
    canonical_names = {
        'kopf': 'schädel-hirn',
        'kopf/gesicht': 'gesicht',
        'augen': 'gesicht',
        'kopf/hals': 'gesicht',
        'kopf (gehör)': 'gesicht',
        'kopf/gesicht/hals': 'gesicht',
        'rücken': 'wirbelsäule',
        'rump': 'thorax',
        'obere extremiät': 'obere extremität',
        'untere extermität': 'untere extremität',
    }

    # first listed region, without surrounding spaces
    first_entry = df['Körperregion'].str.split(',').str[0]
    df['Körperregion2'] = first_entry.str.strip()

    # NOTE(review): 'Körperregion2' is derived from 'Körperregion', so it is missing
    # wherever 'Körperregion' is missing and this fill cannot add values — confirm intent.
    region = df['Körperregion'].fillna(df['Körperregion2'])
    region = region.str.lower()
    region = region.replace(canonical_names)
    # an empty cell counts as missing
    region = region.replace({'': pd.NA})
    df['Körperregion'] = region

    return df

In [None]:
def get_categorical_str(df, column_name, category, total):
    """
    Format the frequency of one category as 'count (percentage)'.

    The count is the number of rows whose `column_name` equals `category`;
    the percentage is that count divided by `total`, shown with one decimal.
    """
    matching_rows = df.loc[df[column_name] == category]
    count = len(matching_rows)
    share = count / total
    return f'{count} ({share:.1%})'

def get_continuous_str(df, column_name, total):
    """
    Format a numeric column as 'median [Q1 - Q3]' with one decimal each.

    `total` is part of the signature but is not used by the computation.
    """
    values = df[column_name]
    lower_quartile, upper_quartile = (values.quantile(q) for q in (0.25, 0.75))
    return f'{values.median():.1f} [{lower_quartile:.1f} - {upper_quartile:.1f}]'

def get_multi_label_counts(data_df, multi_label_column):
    """
    Count how often each label occurs in a multi-label column.

    Cells equal to 999 and missing cells (None / NaN / pd.NA) are ignored.
    Every other cell is converted to a string and split on '; ' or ', '
    (separator followed by a space); each part is stripped of surrounding
    whitespace before counting.

    The input DataFrame is not modified (the 999 replacement is applied to a
    temporary Series only).

    Returns a dict mapping label -> count, ordered by descending count; labels
    with equal counts keep the order in which they were first seen.
    """
    # work on a temporary Series so the caller's column keeps its 999 codes
    values = data_df[multi_label_column].replace(999, pd.NA)
    label_counter = {}
    # skip missing cells up front instead of relying on their string form
    for value in values[values.notna()]:
        for raw_label in re.split('; |, ', str(value)):
            label = raw_label.strip()
            # textual placeholders for missing values inside a cell are not labels
            if label in ('nan', '<NA>'):
                continue
            label_counter[label] = label_counter.get(label, 0) + 1

    # sort the dictionary by value (stable, so ties keep first-seen order)
    return dict(sorted(label_counter.items(), key=lambda item: item[1], reverse=True))

In [None]:
# Notebook display only: list the distinct values present in the numeric NACA column for adults.
adult_df['NACA (nummerisch)'].unique()

In [None]:
# Year of the mission, taken from the event date (day.month.year).
adult_df['event_year'] = pd.to_datetime(adult_df['Ereignisdatum'], format='%d.%m.%Y').dt.year
adult_df['physician_age'] = adult_df['event_year'] - adult_df['year_of_birth']
# physician year of final exam (from licence_date which can be either d.m.Y or Y)
# Take the 4-digit year directly rather than the text after the last dot: a
# float-typed year such as 2010.0 would otherwise yield '0', and a value rendered
# as '2010-05-01 00:00:00' would be lost. Values without a 4-digit year become
# missing (previously the string 'nan').
adult_df['physician_licence_year'] = adult_df['licence_date'].astype(str).str.extract(r'(\d{4})', expand=False)
# Column name keeps its original spelling because the later cells read it under this name.
adult_df['phyisician_experience_years'] = adult_df['event_year'] - pd.to_numeric(adult_df['physician_licence_year'], errors='coerce')

In [None]:
# plot distribution NACA score by physician experience
import matplotlib.pyplot as plt
import seaborn as sns

# drop NACA 1 from the plot
# NOTE(review): this rebinds `adult_df` itself, so NACA 1 rows are also missing from
# every later cell that uses `adult_df` (e.g. the Spearman correlation) — confirm
# that this is intended and not meant to affect the figure only.
adult_df = adult_df[adult_df['NACA (nummerisch)'] != 1]

fig = plt.figure(figsize=(10, 6))
ax = fig.add_subplot(111)
# hue repeats the x variable only to give each NACA level its own palette colour
sns.violinplot(x='NACA (nummerisch)', y='phyisician_experience_years', data=adult_df, hue='NACA (nummerisch)', split=False,
               palette='Set2', ax=ax)
ax.set_xlabel('NACA')
ax.set_ylabel('Physician Experience (years)')

# remove legend (it would only repeat the x axis). Depending on the seaborn
# version no legend is created when hue duplicates x, so only remove one if present
# instead of failing on a missing legend object.
legend = ax.get_legend()
if legend is not None:
    legend.remove()

#  remove top and right spines
sns.despine()

# add grid lines
ax.yaxis.grid(True)

plt.show()

In [None]:
# fig.savefig('/Users/jk1/Library/CloudStorage/OneDrive-UniversitédeGenève/icu_research/prehospital/analgesia/analysis/adult_trauma/physician_experience_vs_NACA_violinplot.png', dpi=300)

In [None]:
# Spearman’s rank correlation coefficient between NACA score and physician experience
from scipy.stats import spearmanr

# Use pairwise-complete observations only. Physician experience is missing for
# every mission whose physician could not be matched or whose licence year could
# not be parsed, and spearmanr's default nan_policy ('propagate') returns NaN as
# soon as a single missing value is present.
complete_pairs = adult_df[['NACA (nummerisch)', 'phyisician_experience_years']].dropna()
corr, p_value = spearmanr(complete_pairs['NACA (nummerisch)'], complete_pairs['phyisician_experience_years'])
print(f"Spearman's rank correlation coefficient: {corr:.3f}, p-value: {p_value}")