# 文件导入

In [2]:
import pandas as pd
import numpy as np
import re
import argparse
pd.options.display.max_rows=300

import amsterdamumcdb as adb

![image.png](attachment:image.png)

In [226]:
def parse_args():
    parser = argparse.ArgumentParser(
        description='''Compare the number of cases of sepsis at admission for
            for the Sepsis-3 and the previous sepsis definition.''')
    parser.add_argument(
        '--data_file_path',
        default='../../AmsterdamUMCdb-v1/',
        help='''File path to the directory that contains the base
            AmsterdamUMCdb .csv files. These files are not directly available
            from the AmsterdamUMCdb GitHub page, and access must be
            specifically requested from Amsterdam UMC.
            (default: %(default)s)''',
        type=str)
    parser.add_argument(
        '--output_file_path',
        default='./out/',
        help='''File path to the directory that will contain the output of this
            script, which should be the same output file path as in the script
            sepsis3_amsterdamumcdb.py (default: %(default)s)''',
        type=str)
    parser.add_argument(
        '--print_reason_for_admission_table',
        default=False,
        help='''Print the reason admission table giving the number admissions
        in various surgical and medical categories (default: %(default)s)''',
        type=bool)
    args = parser.parse_args(args=[])
    return args

inputs = parse_args()

In [4]:
dictionary = adb.get_dictionary()

list_columns = ['admissionid', 'itemid', 'valueid', 'value']
list_columns += ['measuredat', 'updatedat', 'registeredby']
listitems = pd.read_csv(
    inputs.data_file_path + 'listitems.csv', usecols=list_columns, encoding='latin-1')
listitems['measuredat'] //= (1000*60*60)  #转为小时
listitems['time'] = listitems['measuredat']

drug_columns = ['admissionid', 'itemid', 'item', 'duration', 'rate']
drug_columns += ['rateunit', 'start', 'stop', 'dose', 'doserateperkg']
drug_columns += ['doseunitid', 'doserateunitid', 'ordercategoryid']
drugitems = pd.read_csv(
    inputs.data_file_path + 'drugitems.csv', usecols=drug_columns, encoding='latin-1')
drugitems['start'] //= (1000*60*60)  #转为小时
drugitems['stop'] //= (1000*60*60)  #转为小时

procedureorder_columns = ['admissionid', 'itemid', 'item', 'registeredat']
procedureorderitems = pd.read_csv(
    inputs.data_file_path + 'procedureorderitems.csv',
    usecols=procedureorder_columns, encoding='latin-1')
procedureorderitems['registeredat'] //= (1000*60*60)  #转为小时

freetextitems = pd.read_csv(inputs.data_file_path + 'freetextitems.csv', encoding='latin-1')
freetextitems['measuredat'] //= (1000*60*60)  #转为小时



# sepsis条件1：感染定义

In [227]:
# Antibiotics

antibiotics = pd.DataFrame(columns=['itemid', 'rank'])
antibiotics.loc[0] = [7185, 1]  # Doxycycline (Vibramycine)
antibiotics.loc[1] = [9142, 2]  # Tetracycline
antibiotics.loc[2] = [19764, 4]  # Tigecycline (Tygacil)
antibiotics.loc[3] = [9047, 2]  # Chlooramfenicol
antibiotics.loc[4] = [6847, 1]  # Amoxicilline (Clamoxyl/Flemoxin)
antibiotics.loc[5] = [9128, 3]  # Piperacilline (Pipcil)
antibiotics.loc[6] = [6871, 1]  # Benzylpenicilline (Penicilline)
antibiotics.loc[7] = [9037, 1]  # Feneticilline (Broxil)
antibiotics.loc[8] = [9029, 2]  # Amoxicilline/Clavulaanzuur (Augmentin)
antibiotics.loc[9] = [9152, 1]  # Cefazoline (Kefzol)
antibiotics.loc[10] = [9151, 1]  # Cefuroxim (Zinacef)
antibiotics.loc[11] = [6917, 2]  # Ceftazidim (Fortum)
antibiotics.loc[12] = [9133, 2]  # Ceftriaxon (Rocephin)
antibiotics.loc[13] = [9030, 4]  # Aztreonam (Azactam)
antibiotics.loc[14] = [8127, 4]  # Meropenem (Meronem)
antibiotics.loc[15] = [9070, 2]  # Zilversulfadiazine (Flammazine)
antibiotics.loc[16] = [7208, 2]  # Erythromycine (Erythrocine)
antibiotics.loc[17] = [8546, 2]  # Claritromycine (Klacid)
antibiotics.loc[18] = [13057, 2]  # Azitromycine (Zithromax)
antibiotics.loc[19] = [6958, 2]  # Clindamycine (Dalacin)
antibiotics.loc[20] = [7044, 2]  # Tobramycine (Obracin)
antibiotics.loc[21] = [13094, 2]  # Tobramycine oogzalf (Tobrex)
antibiotics.loc[22] = [7235, 2]  # Gentamicine (Garamycin)
antibiotics.loc[23] = [9109, 2]  # Neomycine sulfaat
antibiotics.loc[24] = [6834, 2]  # Amikacine (Amukin)
antibiotics.loc[25] = [6948, 2]  # Ciprofloxacine (Ciproxin)
antibiotics.loc[26] = [9117, 2]  # Norfloxacine (Noroxin)
antibiotics.loc[27] = [12398, 2]  # Levofloxacine (Tavanic)
antibiotics.loc[28] = [12956, 2]  # Moxifloxacin (Avelox)
antibiotics.loc[29] = [7064, 2]  # Vancomycine
antibiotics.loc[30] = [8549, 4]  # Belcomycine (Colistinesulfaat) 4 x dgs
antibiotics.loc[31] = [10584, 4]  # Belcomycine (Colistinesulfaat) 6 x dgs
antibiotics.loc[32] = [20175, 4]  # Colistine
antibiotics.loc[33] = [20176, 4]  # Colistine Inhalatie
antibiotics.loc[34] = [8942, 1]  # Metronidazol-Flagyl
antibiotics.loc[35] = [7187, 1]  # Metronidazol (Flagyl)
antibiotics.loc[36] = [14236, 2]  # Nitrofurantoïne ( Furadantine)
antibiotics.loc[37] = [19137, 4]  # Linezolid (Zyvoxid)
antibiotics.loc[38] = [19773, 4]  # Daptomycine (Cubicin)
antibiotics.loc[39] = [8394, 1]  # Co-Trimoxazol (Bactrimel)
antibiotics.loc[40] = [9052, 1]  # Co-trimoxazol forte (Bactrimel)
antibiotics.loc[41] = [6919, 2]  # Cefotaxim (Claforan)
# The following are not included (various reasons, documented):
# 6932 Chlooramfenicol (Globenicol): cream/ointment
# 9070 Flucloxacilline (Stafoxil/Floxapen): cream/ointment
# 13102: Dexamethason/gentamicine oogzalf (Dexamytrex): eye drops
# 12997 Ofloxacine (Trafloxal) oogdruppels:eye drops
# 9075 Fusidinezuur (Fucidin): prophylactic after cardiothoracic surgery
# 13045 Fusidinezuur oogdruppels (Fusithalmic): eye drops

# As part of the selective digestive decontamination, patients expected to stay
# at least 24-48hrs in ICU receive 16 (at least between 10 and 20) doses of
# cefotaxime across 4 days. If the clinician suspects an infection, this should
# be switched to  ceftriaxone (which has a similar spectrum). If cefotaxime is
# continued after these initial doses, assume the clinician has suspected an
# infection and kept cefotaxime.
# Similarly, for cardiothoracic surgery patients, antibiotic usage will be
# prophylactic (vancomycin/fusidinezuur annd cefazolin).
# Fusidinezuur is always prophylactic, vancomycin prophylactic in day 1.
# Prophylactic antibiotic usage is picked up again later in the script.
admissions_add = pd.read_csv(inputs.data_file_path + 'admissions.csv', encoding='latin-1')
admissions_add = admissions_add[["admissionid","admittedat"]]
drugitems = pd.merge(drugitems, admissions_add, on='admissionid', how='left')
drugitems['start_time'] = (drugitems['start'] - drugitems['admittedat'])
drugitems['stop_time'] = (drugitems['stop'] - drugitems['admittedat'])
drugitems[['start_time', 'stop_time']] //= (1000*60*60*24)  # Convert to 'day'

drugitems_abx = drugitems.loc[drugitems['itemid'].isin(antibiotics['itemid'])]

drugitems_abx.loc[(
        (drugitems_abx['stop_time'] <= 4) &
        (drugitems_abx['itemid'] == 6919)),
    'itemid'] = np.nan
drugitems_abx.loc[(
        (drugitems_abx['stop_time'] <= 1) &
        (drugitems_abx['itemid'] == 7064)),
    'itemid'] = np.nan
drugitems_abx.dropna(subset=['itemid'], inplace=True)

drugitems_abx = pd.merge(
    drugitems_abx, antibiotics[['itemid', 'rank']], on='itemid', how='left')
drugitems_abx['intravenous'] = (
    drugitems_abx['ordercategoryid'].isin([15, 65, 55]))
# Sepsis-3 (Shah et al.) say antibiotics considered on during 24hr period if
# administration occurred within 24hr period or within 12hrs before 24hr period
drugitems_abx['start_time'] = (
    drugitems_abx['start'] - drugitems_abx['admittedat']) # - 1000*60*60*12
drugitems_abx['start_time'] //= (1000*60*60) #*24
# The same as before
drugitems_abx['stop_time'] = (
    drugitems_abx['stop'] - drugitems_abx['admittedat'])
drugitems_abx['stop_time'] //= (1000*60*60) #*24
# As with the SOFA cardiovascular score, we want to add extra rows to the
# dataframe, for when drug administration happened over consecutive 'days'
# (i.e. make an entry for each 'day' tha the drug administration window
# overlaps with)
n_days = drugitems_abx['stop_time'] - drugitems_abx['start_time'] + 1
drugitems_abx = drugitems_abx.loc[
    drugitems_abx.index.repeat(n_days)].reset_index(drop=True)
drugitems_abx['time'] = np.hstack([np.arange(x) for x in n_days])
drugitems_abx['time'] += drugitems_abx['start_time']

abx_cols = ['admissionid', 'time', 'intravenous', 'rank', 'item', 'itemid']
drugitems_abx = drugitems_abx[abx_cols]
drugitems_abx = drugitems_abx.loc[~drugitems_abx.duplicated()]

max_rank = drugitems_abx.groupby(['admissionid', 'time']).agg(
        max_rank=pd.NamedAgg(column='rank', aggfunc='max')
    ).reset_index()
drugitems_abx = pd.merge(
    drugitems_abx, max_rank, on=['admissionid', 'time'], how='left')
drugitems_abx['rank_diff'] = drugitems_abx['max_rank'] - drugitems_abx['rank']

# Now compute the antibiotic escalation from day to day. Antibiotic escalation
# occurs if a new drug of higher ranking is administered (i.e. the max_rank
# increases) or if the number of drugs at the highest ranking is increased
# (i.e. n_max_rank increases), when this is accompanied by at least one
# antibiotics given intravenously
abx_escalation = drugitems_abx.groupby(['admissionid', 'time']).agg(
        intravenous=pd.NamedAgg(column='intravenous', aggfunc='any'),
        max_rank=pd.NamedAgg(column='rank', aggfunc='max'),
        n_max_rank=pd.NamedAgg(
            column='rank_diff', aggfunc=lambda x: (x == 0).sum())
    ).reset_index()

abx_escalation['max_rank_increase'] = (abx_escalation['max_rank'].diff() > 0)
abx_escalation['n_max_rank_increase'] = (
    abx_escalation['n_max_rank'].diff() > 0)
abx_escalation['time_diff'] = abx_escalation['time'].diff()
# If two consecutive rows correspond to different patients, then we don't want
# antibiotics escalation defined from the first patient to the second!
abx_escalation.loc[(
        ~abx_escalation['admissionid'].duplicated()),
    ['max_rank_increase', 'n_max_rank_increase', 'time_diff']] = np.nan
abx_escalation['antibiotic_escalation'] = False
# Any new entry is assumed to be the first antibiotics given to that patient
abx_escalation.loc[(
        ~abx_escalation['admissionid'].duplicated()),
    'antibiotic_escalation'] = True
abx_escalation.loc[(
        (abx_escalation['max_rank_increase'] > 0) |
        (abx_escalation['n_max_rank_increase'] > 0) |
        (abx_escalation['time_diff'] != 1)),
    'antibiotic_escalation'] = True

abx_escalation.drop(
    columns=['max_rank_increase', 'n_max_rank_increase', 'time_diff'],
    inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [228]:
df_infection = abx_escalation
print(len(df_infection.admissionid.unique()))
df_infection.to_csv("./infection_time.csv",index=False)

16022


In [8]:
import pandas as pd
df_infection = pd.read_csv("./infection_time.csv")
max(df_infection.time.unique())

0

# sepsis条件2：sofa变化大于2

In [None]:
numerics_cols = ['admissionid', 'itemid', 'item', 'value', 'unitid']
numerics_cols += ['measuredat', 'registeredby', 'islabresult', 'fluidout']
numerics_dtypes = ['int64', 'int64', 'str', 'float64', 'int64']
numerics_dtypes += ['int64', 'str', 'bool', 'float64']
numerics_dtypes = dict(zip(numerics_cols, numerics_dtypes))
numerics_csv = dict(
    encoding='latin-1', usecols=numerics_cols, dtype=numerics_dtypes,
    chunksize=10**6)

def numerics_read(itemid_list, start=None, end=None, admissions_df=None):
    numerics_list = []
    ii = 0
    if admissions_df is not None:
        admissions_df = admissions_df[['admissionid', 'admittedat']]
    file_name =  '../AmsterdamUMCdb-v1/numericitems.csv'
    with pd.read_csv(file_name, **numerics_csv) as reader:
        for chunk in reader:
            if ((ii % 100) == 0):
                print(ii)
            chunk = chunk.loc[chunk['itemid'].isin(itemid_list)]
            if admissions_df is not None:
                chunk = pd.merge(
                    chunk, admissions_df,
                    on='admissionid', how='left')
                chunk['diff_measuredat'] = (
                    chunk['measuredat'] - chunk['admittedat'])
            else:
                chunk['diff_measuredat'] = chunk['measuredat']
            if start is not None:
                chunk = chunk.loc[chunk['diff_measuredat'] >= start]
            if end is not None:
                chunk = chunk.loc[chunk['diff_measuredat'] <= end]
            numerics_list.append(chunk)
            ii += 1
    numerics = pd.concat(numerics_list)
    numerics['time'] = numerics['diff_measuredat'] // (1000*60*60) #转换为小时
    numerics.drop(columns=['diff_measuredat'], inplace=True)
    numerics.reset_index(drop=True, inplace=True)
    return numerics

In [None]:
# This list is taken from the AmsterdamUMCdb SOFA scores script.
numerics_sofa_itemid = [8845]  # O2 l/min
numerics_sofa_itemid += [10387]  # Zuurstof toediening (bloed)
numerics_sofa_itemid += [18587]  # Zuurstof toediening
numerics_sofa_itemid += [6699]  # FiO2 %: setting on Evita ventilator
numerics_sofa_itemid += [12279]  # 12279 O2 concentratie Servo-i/Servo-U vent.
numerics_sofa_itemid += [12369]  # SET %O2: used with BiPap Vision ventilator
numerics_sofa_itemid += [16246]  # Zephyros FiO2: Non-invasive ventilation
numerics_sofa_itemid += [8794]  # UrineCAD
numerics_sofa_itemid += [8796]  # UrineSupraPubis
numerics_sofa_itemid += [8798]  # UrineSpontaan
numerics_sofa_itemid += [8800]  # UrineIncontinentie
numerics_sofa_itemid += [8803]  # UrineUP
numerics_sofa_itemid += [10743]  # Nefrodrain li Uit
numerics_sofa_itemid += [10745]  # Nefrodrain re Uit
numerics_sofa_itemid += [19921]  # UrineSplint Li
numerics_sofa_itemid += [19922]  # UrineSplint Re]
numerics_sofa_itemid += [6846]  # PCO2
numerics_sofa_itemid += [9990]  # pCO2 (bloed)
numerics_sofa_itemid += [21213]  # PCO2 (bloed) - kPa
numerics_sofa_itemid += [7433]  # PO2
numerics_sofa_itemid += [9996]  # PO2 (bloed)
numerics_sofa_itemid += [21214]  # PO2 (bloed) - kPa
numerics_sofa_itemid += [9964]  # Thrombo's (bloed)
numerics_sofa_itemid += [6797]  # Thrombocyten
numerics_sofa_itemid += [10409]  # Thrombo's citr. bloed (bloed)
numerics_sofa_itemid += [14252]  # Thrombo CD61 (bloed)
numerics_sofa_itemid += [6813]  # Bili Totaal
numerics_sofa_itemid += [9945]  # Bilirubine (bloed)

numerics_abp_itemid = [6642]  # ABP gemiddeld
numerics_abp_itemid += [6679]  # Niet invasieve bloeddruk gemiddeld
numerics_abp_itemid += [8843]  # ABP gemiddeld II

numerics_creatinine_itemid = [6836]  # 6836: Kreatinine µmol/l ...
# (erroneously documented as µmol)
numerics_creatinine_itemid += [9941]  # Kreatinine (bloed) µmol/l
numerics_creatinine_itemid += [14216]  # KREAT enzym. (bloed) µmol/l


In [None]:
end_time = None # 入ICU后14天的数据
start_time = 0
numerics_sofa = numerics_read(
    numerics_sofa_itemid + numerics_abp_itemid, admissions_df=None,
    end=end_time, start=-1000*60*60*24*1)
# We need a baseline creatinine, so look back further.
numerics_creatinine = numerics_read(
    numerics_creatinine_itemid, admissions_df=None,
    end=end_time, start=-1000*60*60*24*365)

#numerics_creatinine.to_csv("sepsis_for_creatine.csv",index=False)
#numerics_sofa.to_csv("sepsis_for_sofa.csv",index=False)

In [4]:
numerics_creatinine = pd.read_csv("sepsis_for_creatine.csv")
numerics_sofa = pd.read_csv("sepsis_for_sofa.csv")

## Respiration score

In [5]:
# Respiration score

# Get PaO2/FiO2 ratio
oxy_flow_listitems = listitems.loc[
    (listitems['itemid'] == 8189),
    ['admissionid', 'valueid', 'value', 'measuredat', 'time']]
oxy_dev = numerics_sofa.loc[numerics_sofa['itemid'].isin([8845, 10387, 18587])]
oxy_flow = pd.merge(
    oxy_flow_listitems, oxy_dev[['admissionid', 'measuredat', 'value']],
    on=['admissionid', 'measuredat'], how='left')
oxy_flow.rename(columns={'value_x': 'O2_device'}, inplace=True)
oxy_flow.rename(columns={'value_y': 'O2_flow'}, inplace=True)
oxy_flow.head()

# Get PaO2 and FiO2 values
# Simultaneously retrieve PaCO2 and the 'nearest' FiO2 from the ventilator or
# estimated FiO2 based on applied oxygen device. Ideally documentation of
# measurements should be at the same time, but since this is not guaranteed
# allow a window.
# In more recent data PaCO2 and PaO2 were documented in kPa instead of mmHg.
fio2_itemid = [8845, 10387, 18587, 6699, 12279, 12369, 16246]
fio2_table = numerics_sofa.loc[
    (numerics_sofa['value'] > 0) &
    (numerics_sofa['itemid'].isin(fio2_itemid))]
fio2_table = pd.merge(
    fio2_table, oxy_flow_listitems.drop(columns=['time']),
    on=['admissionid', 'measuredat'], how='left')
fio2_table.rename(columns={'value_y': 'O2_device'}, inplace=True)
fio2_table.rename(columns={'value_x': 'value'}, inplace=True)
fio2_table['ventilatory_support'] = False
fio2_table.loc[(
        fio2_table['itemid'].isin([6699, 12279, 12369, 16246])),
    'ventilatory_support'] = True

fio2_table['fio2'] = 0.21
fio2_ind = fio2_table['ventilatory_support'] & (~fio2_table['value'].isnull())
fio2_table.loc[fio2_ind, 'fio2'] = fio2_table.loc[fio2_ind, 'value']

valueid1 = [1]  # Diep Nasaal
valueid1 += [2]  # Nasaal
valueid1 += [3]  # Kapje
valueid1 += [4]  # Kunstneus
valueid1 += [7]  # O2-bril
valueid1 += [8]  # Kinnebak
valueid1 += [9]  # Nebulizer
valueid1 += [18]  # Spreekcanule
valueid1 += [19]  # Spreekklepje
fio2_ind1 = (
    (~fio2_table['ventilatory_support']) &
    (fio2_table['valueid'].isin(valueid1)))
fio2_table.loc[(
        fio2_ind1 & (fio2_table['value'] >= 1) & (fio2_table['value'] < 2)),
    'fio2'] = 0.22
fio2_table.loc[(
        fio2_ind1 & (fio2_table['value'] >= 2) & (fio2_table['value'] < 3)),
    'fio2'] = 0.25
fio2_table.loc[(
        fio2_ind1 & (fio2_table['value'] >= 3) & (fio2_table['value'] < 4)),
    'fio2'] = 0.27
fio2_table.loc[(
        fio2_ind1 & (fio2_table['value'] >= 4) & (fio2_table['value'] < 5)),
    'fio2'] = 0.30
fio2_table.loc[fio2_ind1 & (fio2_table['value'] >= 5), 'fio2'] = 0.35

valueid2 = [1]  # Diep Nasaal
valueid2 += [3]  # Kapje
valueid2 += [4]  # Kunstneus
valueid2 += [8]  # Kinnebak
valueid2 += [9]  # Nebulizer
valueid2 += [18]  # Spreekcanule
valueid2 += [19]  # Spreekklepje
fio2_ind2 = (
    (~fio2_table['ventilatory_support']) &
    (fio2_table['valueid'].isin(valueid2)))
fio2_table.loc[(
        fio2_ind2 & (fio2_table['value'] >= 6) & (fio2_table['value'] < 7)),
    'fio2'] = 0.40
fio2_table.loc[(
        fio2_ind2 & (fio2_table['value'] >= 7) & (fio2_table['value'] < 8)),
    'fio2'] = 0.45
fio2_table.loc[fio2_ind2 & (fio2_table['value'] >= 8), 'fio2'] = 0.50

valueid3 = [10]  # Waterset
valueid3 += [11]  # Trach.stoma
valueid3 += [13]  # Ambu
valueid3 += [14]  # Guedel
valueid3 += [15]  # DL-tube
valueid3 += [16]  # CPAP
valueid3 += [17]  # Non-Rebreathing masker
fio2_ind3 = (
    (~fio2_table['ventilatory_support']) &
    (fio2_table['valueid'].isin(valueid3)))
fio2_table.loc[(
        fio2_ind3 & (fio2_table['value'] >= 6) & (fio2_table['value'] < 7)),
    'fio2'] = 0.60
fio2_table.loc[(
        fio2_ind3 & (fio2_table['value'] >= 7) & (fio2_table['value'] < 8)),
    'fio2'] = 0.70
fio2_table.loc[(
        fio2_ind3 & (fio2_table['value'] >= 8) & (fio2_table['value'] < 9)),
    'fio2'] = 0.80
fio2_table.loc[(
        fio2_ind3 & (fio2_table['value'] >= 9) & (fio2_table['value'] < 10)),
    'fio2'] = 0.85
fio2_table.loc[fio2_ind3 & (fio2_table['value'] >= 10), 'fio2'] = 0.90
fio2_table.rename(columns={'measuredat': 'fio2_measuredat'}, inplace=True)
fio2_cols = ['admissionid', 'fio2_measuredat', 'fio2']
fio2_cols += ['ventilatory_support', 'time']

# This is initially pao2 and then merged with fio2 from above.
po2_itemid = [7433]  # PO2
po2_itemid += [9996]  # PO2 (bloed)
po2_itemid += [21214]  # PO2 (bloed) - kPa
oxygenation_po2 = numerics_sofa.loc[numerics_sofa['itemid'].isin(po2_itemid)]
# Conversion from kPa to mmHg
oxygenation_po2.loc[oxygenation_po2['unitid'] == 152, 'value'] *= 7.50061683
oxygenation_po2.rename(columns={'value': 'pao2'}, inplace=True)
#oxygenation_po2['manual_entry'] = True
#oxygenation_po2.loc[systeem_flag(oxygenation_po2), 'manual_entry'] = False

f = freetextitems.loc[freetextitems['itemid'] == 11646]
f = f[['admissionid', 'measuredat', 'value']]
f.rename(columns={'value': 'specimen_source'}, inplace=True)
oxygenation_po2_f = pd.merge(
    oxygenation_po2, f, on=['admissionid', 'measuredat'], how='left')

oxygenation_po2_f = oxygenation_po2_f.loc[
    oxygenation_po2_f['specimen_source'].isnull() |
    oxygenation_po2_f['specimen_source'].str.contains(
        'art', flags=re.IGNORECASE)]

oxygenation = pd.merge(
    oxygenation_po2_f, fio2_table[fio2_cols],
    on=['admissionid', 'time'], how='left')
oxygenation['FiO2_time_difference'] = (
    oxygenation['fio2_measuredat'] - oxygenation['measuredat'])
# Keep fio2 only if no earlier than 60 minutes before pao2 measurement
oxygenation = oxygenation.loc[(
    oxygenation['FiO2_time_difference'] > -1000*60*60)]
# and no later than 15 minutes after pao2 measuremen
oxygenation = oxygenation.loc[oxygenation['FiO2_time_difference'] < 1000*60*15]
# Convert to days (not discrete)
oxygenation['FiO2_time_difference'] /= (1000*60*60)
oxygenation['abs_measuredat'] = oxygenation['FiO2_time_difference'].abs()

# Sort by the smallest fio2 time difference for each patient and timestamp
oxygenation = oxygenation.sort_values(
    by=['admissionid', 'measuredat', 'abs_measuredat'])
# Discard duplicates of the same patient and timestamp (keeping only the
# smallest fio2 time difference)
oxygenation = oxygenation.loc[
    ~(oxygenation[['admissionid', 'measuredat']].duplicated())]
oxygenation['priority'] = 1

sofa_resp_cols = ['admissionid', 'pao2', 'specimen_source']
sofa_resp_cols += ['time', 'fio2', 'ventilatory_support']
sofa_resp_cols += ['FiO2_time_difference', 'priority']
sofa_respiration = oxygenation[sofa_resp_cols]
sofa_respiration.head()

# Remove extreme outliers (in the AmsterdamUMCdb script, histograms are plotted
# to identify these outliers by eye, here we just copy those values)
sofa_respiration.loc[(sofa_respiration['fio2'] > 100), 'fio2'] = np.nan
# Convert FiO2 in % to fraction
sofa_respiration.loc[(
        (sofa_respiration['fio2'] <= 100) &
        (sofa_respiration['fio2'] >= 20)),
    'fio2'] /= 100
# Remove lower outliers, most likely incorrectly labeled as 'arterial' instead
# of '(mixed/central) venous'
sofa_respiration.loc[sofa_respiration['pao2'] < 50, 'pao2'] = np.nan
sofa_respiration = sofa_respiration.dropna(subset=['pao2'])

# Get the PF ratio
sofa_respiration.loc[:, 'pf_ratio'] = (
    sofa_respiration['pao2'] / sofa_respiration['fio2'])
# Some entries may be 'None', need to make these False instead
sofa_respiration['ventilatory_support'].fillna(False, inplace=True)

# Calculate SOFA respiration score:
sofa_respiration['sofa_respiration_score'] = 0
sofa_respiration.loc[(
        (sofa_respiration['pf_ratio'] < 400) &
        (sofa_respiration['pf_ratio'] >= 300)),
    'sofa_respiration_score'] = 1
sofa_respiration.loc[(
        sofa_respiration['pf_ratio'] < 300),
    'sofa_respiration_score'] = 2
sofa_respiration.loc[(
        (sofa_respiration['pf_ratio'] < 200) &
        (sofa_respiration['pf_ratio'] >= 100) &
        sofa_respiration['ventilatory_support']),
    'sofa_respiration_score'] = 3
sofa_respiration.loc[(
        (sofa_respiration['pf_ratio'] < 100) &
        sofa_respiration['ventilatory_support']),
    'sofa_respiration_score'] = 4

sofa_respiration.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


Unnamed: 0,admissionid,pao2,specimen_source,time,fio2,ventilatory_support,FiO2_time_difference,priority,pf_ratio,sofa_respiration_score
0,0,90.0,,5,0.5,True,0.0,1,180.0,3
1,0,149.0,,7,0.51,True,0.0,1,292.156863,2
2,0,104.0,,9,0.41,True,0.0,1,253.658537,2
4,0,105.0,,12,0.41,True,0.0,1,256.097561,2
5,0,90.0,,15,0.41,True,-0.25,1,219.512195,2


## Coagulation score

In [6]:
# Coagulation score (platelets (thrombocytes))

platelet_itemid = [9964]  # Thrombo's (bloed)
platelet_itemid += [6797]  # Thrombocyten
platelet_itemid += [10409]  # Thrombo's citr. bloed (bloed)
platelet_itemid += [14252]  # Thrombo CD61 (bloed)
sofa_platelets = numerics_sofa.loc[
    numerics_sofa['itemid'].isin(platelet_itemid)]
#sofa_platelets['manual_entry'] = True
#sofa_platelets.loc[systeem_flag(sofa_platelets), 'manual_entry'] = False
sofa_platelets_columns = ['admissionid', 'itemid', 'item', 'value']
sofa_platelets_columns += ['registeredby', 'time'] #, 'manual_entry'
sofa_platelets = sofa_platelets[sofa_platelets_columns]

# Calculate SOFA coagulation score:
sofa_platelets['sofa_coagulation_score'] = 0
sofa_platelets.loc[(
        (sofa_platelets['value'] < 150) & (sofa_platelets['value'] >= 100)),
    'sofa_coagulation_score'] = 1
sofa_platelets.loc[(
        (sofa_platelets['value'] < 100) & (sofa_platelets['value'] >= 50)),
    'sofa_coagulation_score'] = 3
sofa_platelets.loc[(
        (sofa_platelets['value'] < 50) & (sofa_platelets['value'] >= 20)),
    'sofa_coagulation_score'] = 3
sofa_platelets.loc[(
        sofa_platelets['value'] < 20),
    'sofa_coagulation_score'] = 4

sofa_platelets.head()

Unnamed: 0,admissionid,itemid,item,value,registeredby,time,sofa_coagulation_score
94,0,9964,Thrombo's (bloed),297.0,Systeem,-9,0
95,0,9964,Thrombo's (bloed),150.0,Systeem,4,0
96,0,9964,Thrombo's (bloed),171.0,Systeem,6,0
97,0,9964,Thrombo's (bloed),206.0,Systeem,12,0
98,0,9964,Thrombo's (bloed),143.0,Systeem,36,1


##  Liver score

In [7]:
# Liver score (bilirubin)

bilirubin_itemid = [6813]
bilirubin_itemid += [9945]
sofa_bilirubin = numerics_sofa.loc[(
    numerics_sofa['itemid'].isin(bilirubin_itemid))]
#sofa_bilirubin['manual_entry'] = True
#sofa_bilirubin.loc[systeem_flag(sofa_bilirubin), 'manual_entry'] = False
sofa_bilirubin_columns = ['admissionid', 'itemid', 'item', 'value']
sofa_bilirubin_columns += ['registeredby', 'time'] # 'manual_entry',
sofa_bilirubin = sofa_bilirubin[sofa_bilirubin_columns]

# Calculate SOFA liver score:
sofa_bilirubin['sofa_liver_score'] = 0
sofa_bilirubin.loc[(
        (sofa_bilirubin['value'] >= 20) & (sofa_bilirubin['value'] < 33)),
    'sofa_liver_score'] = 1
sofa_bilirubin.loc[(
        (sofa_bilirubin['value'] >= 33) & (sofa_bilirubin['value'] < 102)),
    'sofa_liver_score'] = 2
sofa_bilirubin.loc[(
        (sofa_bilirubin['value'] >= 102) & (sofa_bilirubin['value'] < 204)),
    'sofa_liver_score'] = 3
sofa_bilirubin.loc[(
        sofa_bilirubin['value'] >= 204),
    'sofa_liver_score'] = 4

sofa_bilirubin.head()

Unnamed: 0,admissionid,itemid,item,value,registeredby,time,sofa_liver_score
3131,4,9945,Bilirubine (bloed),8.0,Systeem,1,0
3132,4,9945,Bilirubine (bloed),7.0,Systeem,41,0
5647,5,9945,Bilirubine (bloed),5.0,Systeem,0,0
5648,5,9945,Bilirubine (bloed),3.0,Systeem,38,0
8412,6,9945,Bilirubine (bloed),18.0,Systeem,0,0


## Cardiovascular score

In [26]:
# Cardiovascular score

cv_drug_itemid = [7179]  # Dopamine (Inotropin)
cv_drug_itemid += [7178]  # Dobutamine (Dobutrex)
cv_drug_itemid += [6818]  # Adrenaline (Epinefrine)
cv_drug_itemid += [7229]  # Noradrenaline (Norepinefrine)

sofa_cardiovascular = drugitems.loc[(
    drugitems['itemid'].isin(cv_drug_itemid) &
    (drugitems['rate'] > 0.1) &
    (drugitems['ordercategoryid'] == 65))]

admissions_add = admissions_df[['admissionid', 'admittedat', 'weightgroup']]
sofa_cardiovascular = pd.merge(
    sofa_cardiovascular, admissions_add, on='admissionid', how='left')

weight_group_dict = {
    '59-': 55, '60-69': 65, '70-79': 75, '80-89': 85, '90-99': 95,
    '100-109': 105, '110+': 115, np.nan: 80}
sofa_cardiovascular['patientweight'] = (
    sofa_cardiovascular['weightgroup'].replace(weight_group_dict))

# Want to add extra rows to the dataframe, for when drug administration
# happened over consecutive 'days' (i.e. make an entry for each 'day' that
# the drug administration window overlaps with)
n_hour = sofa_cardiovascular['stop'] - sofa_cardiovascular['start']
n_hour += 1
sofa_cardiovascular = sofa_cardiovascular.loc[
    sofa_cardiovascular.index.repeat(n_hour)
].reset_index(drop=True)
sofa_cardiovascular['time'] = np.hstack([np.arange(x) for x in n_hour])
sofa_cardiovascular['time'] += sofa_cardiovascular['start']
#sofa_cardiovascular = sofa_cardiovascular.loc[(sofa_cardiovascular['time'] <= MAX_TIME)]
sofa_cardiovascular.drop(
    columns=['ordercategoryid', 'weightgroup'], inplace=True) # 'start', 'stop',
sofa_cardiovascular.head()

# Calculate gamma, as per AmsterdamUMCdb script
sofa_cardiovascular['gamma'] = (sofa_cardiovascular['dose'] / 80)
valid_weight_ind = sofa_cardiovascular['patientweight'] > 0
sofa_cardiovascular.loc[valid_weight_ind, 'gamma'] = (
    sofa_cardiovascular.loc[valid_weight_ind, 'dose'] /
    sofa_cardiovascular.loc[valid_weight_ind, 'patientweight'])
sofa_cardiovascular.loc[sofa_cardiovascular['doserateperkg'] == 1, 'gamma'] = (
    sofa_cardiovascular.loc[sofa_cardiovascular['doserateperkg'] == 1, 'dose'])
sofa_cardiovascular.loc[(
        sofa_cardiovascular['doseunitid'] == 10),
    'gamma'] *= 1000
sofa_cardiovascular.loc[(
        sofa_cardiovascular['doserateunitid'] == 5),
    'gamma'] /= 60

# Mean ABP
mean_abp = numerics_sofa.loc[numerics_sofa['itemid'].isin(numerics_abp_itemid)]
#mean_abp['validated'] = True
#mean_abp.loc[mean_abp['registeredby'].isnull(), 'validated'] = False
mean_abp.head()

# Remove extreme outliers, most likely data entry errors or measurement errors
mean_abp.loc[(mean_abp['value'] > 165), 'value'] = np.nan
mean_abp.loc[(mean_abp['value'] <= 30), 'value'] = np.nan
mean_abp_cols = ['admissionid', 'itemid', 'item', 'value', 'time']
mean_abp = mean_abp[mean_abp_cols]
mean_abp = mean_abp.dropna(subset=['value'])
# Use mean_abp 'cleansed' dataframe
cv_groupby_cols = ['admissionid', 'itemid', 'item', 'time']
sofa_cardiovascular_map = mean_abp.groupby(cv_groupby_cols).agg(
        lowest_mean_abp=pd.NamedAgg(column='value', aggfunc='min')
    ).reset_index()

# Calculate SOFA cardiovascular score:
sofa_cardiovascular_map['sofa_cardiovascular_score'] = 0
# MAP < 70
sofa_cardiovascular_map.loc[(
        sofa_cardiovascular_map['lowest_mean_abp'] < 70),
    'sofa_cardiovascular_score'] = 1
sofa_cardiovascular_map.head()

sofa_cardiovascular_meds = sofa_cardiovascular.groupby(cv_groupby_cols).agg(
        total_duration=pd.NamedAgg(column='duration', aggfunc='sum'),
        max_gamma=pd.NamedAgg(column='gamma', aggfunc='max')
    ).reset_index()
sofa_cardiovascular_meds.head()

sofa_cardiovascular_meds['sofa_cardiovascular_score'] = 0
# Dopamine (itemid 7179) <= 5 or dobutamine (itemid 7178) any dose
sofa_cardiovascular_meds.loc[(
        ((sofa_cardiovascular_meds['itemid'] == 7179) &
            (sofa_cardiovascular_meds['max_gamma'] <= 5)) |
        (sofa_cardiovascular_meds['itemid'] == 7178)),
    'sofa_cardiovascular_score'] = 2
# Dopamine (itemid 7179) > 5, epinephrine (itemid 6818) <= 0.1,
# norepinephrine (itemid 7229) <= 0.1
sofa_cardiovascular_meds.loc[(
        ((sofa_cardiovascular_meds['itemid'] == 7179) &
            (sofa_cardiovascular_meds['max_gamma'] > 5) &
            (sofa_cardiovascular_meds['max_gamma'] < 15)) |
        ((sofa_cardiovascular_meds['itemid'] == 6818) &
            (sofa_cardiovascular_meds['max_gamma'] <= 0.1)) |
        ((sofa_cardiovascular_meds['itemid'] == 7229) &
            (sofa_cardiovascular_meds['max_gamma'] <= 0.1))),
    'sofa_cardiovascular_score'] = 3
# Dopamine (itemid 7179) > 15, epinephrine (itemid 6818) > 0.1,
# norepinephrine (itemid 7229) > 0.1
sofa_cardiovascular_meds.loc[(
        ((sofa_cardiovascular_meds['itemid'] == 7179) &
            (sofa_cardiovascular_meds['max_gamma'] > 15)) |
        ((sofa_cardiovascular_meds['itemid'] == 6818) &
            (sofa_cardiovascular_meds['max_gamma'] > 0.1)) |
        ((sofa_cardiovascular_meds['itemid'] == 7229) &
            (sofa_cardiovascular_meds['max_gamma'] > 0.1))),
    'sofa_cardiovascular_score'] = 4
sofa_cardiovascular_meds.head()

# Combine the scores from MAP and cardiovascular medication
sofa_cardiovascular = pd.concat(
        [sofa_cardiovascular_map, sofa_cardiovascular_meds], sort=False,
    ).sort_values(by=['admissionid', 'time']).reset_index(drop=True)

sofa_cardiovascular.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


Unnamed: 0,admissionid,itemid,item,time,lowest_mean_abp,sofa_cardiovascular_score,total_duration,max_gamma
0,0,6642,ABP gemiddeld,5,68.0,1,,
1,0,6642,ABP gemiddeld,6,83.0,0,,
2,0,6642,ABP gemiddeld,7,75.0,0,,
3,0,6642,ABP gemiddeld,8,70.0,0,,
4,0,6642,ABP gemiddeld,9,73.0,0,,


## Glasgow Coma Scale score

In [9]:
# Glasgow Coma Scale score

eyes_itemids = [6732]  # Actief openen van de ogen
eyes_itemids += [13077]  # A_Eye
eyes_itemids += [14470]  # RA_Eye
eyes_itemids += [16628]  # MCA_Eye
eyes_itemids += [19635]  # E_EMV_NICE_24uur
eyes_itemids += [19638]  # E_EMV_NICE_Opname
motor_itemids = [6734]  # Beste motore reactie van de armen
motor_itemids += [13072]  # A_Motoriek
motor_itemids += [14476]  # RA_Motoriek
motor_itemids += [16634]  # MCA_Motoriek
motor_itemids += [19636]  # M_EMV_NICE_24uur
motor_itemids += [19639]  # M_EMV_NICE_Opname
verbal_itemids = [6735]  # Beste verbale reactie
verbal_itemids += [13066]  # A_Verbal
verbal_itemids += [14482]  # RA_Verbal
verbal_itemids += [16640]  # MCA_Verbal
verbal_itemids += [19637]  # V_EMV_NICE_24uur
verbal_itemids += [19640]  # V_EMV_NICE_Opname

# GCS eyes component
gcs_components = listitems.loc[listitems['itemid'].isin(eyes_itemids)]
gcs_components['eyes_score'] = 0
# Actief openen van de ogen
gcs_components.loc[gcs_components['itemid'] == 6732, 'eyes_score'] = (
    5 - gcs_components.loc[gcs_components['itemid'] == 6732, 'valueid'])
# A_Eye
gcs_components.loc[gcs_components['itemid'] == 13077, 'eyes_score'] = (
    gcs_components.loc[gcs_components['itemid'] == 13077, 'valueid'])
# RA_Eye, MCA_Eye, E_EMV_NICE_24uur
gcs_components.loc[
        gcs_components['itemid'].isin([14470, 16628, 19635]), 'eyes_score'] = (
    gcs_components.loc[
        gcs_components['itemid'].isin([14470, 16628, 19635]), 'valueid'] - 4)
# E_EMV_NICE_Opname
gcs_components.loc[gcs_components['itemid'] == 19638, 'eyes_score'] = (
    gcs_components.loc[gcs_components['itemid'] == 19638, 'valueid'] - 8)

# Preference, ranked by discipline
gcs_components['preference'] = 8
gcs_preferences = {
    'ICV_Medisch Staflid': 1,
    'ICV_Medisch': 2,
    'ANES_Anesthesiologie': 3,
    'ICV_Physician assistant': 4,
    'ICH_Neurochirurgie': 5,
    'ICV_IC-Verpleegkundig': 6,
    'ICV_MC-Verpleegkundig': 7}
#gcs_ind = gcs_components['registeredby'].isin(gcs_preferences.keys())
#gcs_components.loc[gcs_ind, 'preference'] = (gcs_components.loc[gcs_ind, 'registeredby'].replace(gcs_preferences))
gcs_components.sort_values(by=['admissionid', 'time', 'eyes_score'], inplace=True) # 'preference',

# Only keep the lowest score for the discipline of smallest rank
# (i.e. ICV_Medisch Staflid supercedes ICV_Medisch etc.)
gcs_components = gcs_components.loc[
    ~gcs_components[['admissionid', 'time']].duplicated()]
gcs_components.drop(
    columns=['itemid', 'valueid', 'value'], #, 'registeredby'
    inplace=True)

gcs_cols = ['admissionid', 'measuredat', 'itemid', 'valueid'] #, 'registeredby'
# Add GCS motor score
gcs_components = pd.merge(
    gcs_components,
    listitems.loc[listitems['itemid'].isin(motor_itemids), gcs_cols],
    on=['admissionid', 'measuredat'],
    how='left')
gcs_components['motor_score'] = 0
# 6734 Beste motore reactie van de armen
gcs_components.loc[gcs_components['itemid'] == 6734, 'motor_score'] = (
    7 - gcs_components.loc[gcs_components['itemid'] == 6734, 'valueid'])
# 13072 A_Motoriek
gcs_components.loc[gcs_components['itemid'] == 13072, 'motor_score'] = (
    gcs_components.loc[gcs_components['itemid'] == 13072, 'valueid'])
m_itemid = [14476]  # RA_Motoriek
m_itemid += [16634]  # MCA_Motoriek
m_itemid += [19636]  # M_EMV_NICE_24uur
gcs_components.loc[gcs_components['itemid'].isin(m_itemid), 'motor_score'] = (
    gcs_components.loc[gcs_components['itemid'].isin(m_itemid), 'valueid'] - 6)
# 19639 M_EMV_NICE_Opname
gcs_components.loc[gcs_components['itemid'] == 19639, 'motor_score'] = (
    gcs_components.loc[gcs_components['itemid'] == 19639, 'valueid'] - 12)

# As above, add in preference by discipline
gcs_components['preference'] = 8
#gcs_ind = gcs_components['registeredby'].isin(gcs_preferences.keys())
#gcs_components.loc[gcs_ind, 'preference'] = (gcs_components.loc[gcs_ind, 'registeredby'].replace(gcs_preferences))
# Give higher preference by discipline (eye score should be the same for each
# admission id+time here)
gcs_components.sort_values(
    by=['admissionid', 'time', 'motor_score'], inplace=True) # 'preference',
# Only keep the highest score for the discipline of smallest rank
gcs_components = gcs_components.loc[(
    ~gcs_components[['admissionid', 'time']].duplicated())]
gcs_components.drop(
    columns=['itemid', 'valueid'], inplace=True) #, 'registeredby'
# Motor score is a float (due to pandas merge, so convert to int)
gcs_components['motor_score'] = gcs_components['motor_score'].astype(int)

# Add GCS verbal score
gcs_components = pd.merge(
    gcs_components,
    listitems.loc[listitems['itemid'].isin(verbal_itemids), gcs_cols],
    on=['admissionid', 'measuredat'],
    how='left')
gcs_components['verbal_score'] = 0
# 6735 Beste verbale reactie
gcs_components.loc[gcs_components['itemid'] == 6735, 'verbal_score'] = (
    6 - gcs_components.loc[gcs_components['itemid'] == 6735, 'valueid'])
# 13066 A_Verbal
gcs_components.loc[gcs_components['itemid'] == 13066, 'verbal_score'] = (
    gcs_components.loc[gcs_components['itemid'] == 13066, 'valueid'])
v_itemid = [14482]  # RA_Verbal
v_itemid += [16640]  # MCA_Verbal
gcs_components.loc[gcs_components['itemid'].isin(v_itemid), 'verbal_score'] = (
    gcs_components.loc[gcs_components['itemid'].isin(v_itemid), 'valueid'] - 5)
# 19637 V_EMV_NICE_24uur
gcs_components.loc[gcs_components['itemid'] == 19637, 'verbal_score'] = (
    gcs_components.loc[gcs_components['itemid'] == 19637, 'valueid'] - 9)
# 19640 V_EMV_NICE_Opname
gcs_components.loc[gcs_components['itemid'] == 19640, 'verbal_score'] = (
    gcs_components.loc[gcs_components['itemid'] == 19640, 'valueid'] - 15)

# As above, add in preference by discipline
gcs_components['preference'] = 8
gcs_ind = gcs_components['registeredby'].isin(gcs_preferences.keys())
gcs_components.loc[gcs_ind, 'preference'] = (
    gcs_components.loc[gcs_ind, 'registeredby'].replace(gcs_preferences))
# Give higher preference by discipline
gcs_components.sort_values(
    by=['admissionid', 'time', 'preference', 'verbal_score'], inplace=True)
# Only keep the highest score for the discipline of smallest rank
gcs_components = gcs_components.loc[(
    ~gcs_components[['admissionid', 'time']].duplicated())]
gcs_components.drop(
    columns=['itemid', 'valueid'], inplace=True) #, 'registeredby'
# Verbal score minimum is 1
gcs_components.loc[gcs_components['verbal_score'] < 1, 'verbal_score'] = 1
gcs_components['verbal_score'] = gcs_components['verbal_score'].astype(int)

# Combine the scores for total GCS score
gcs_components['min_gcs'] = (
    gcs_components['eyes_score'] +
    gcs_components['motor_score'] +
    gcs_components['verbal_score'])
gcs_components.head()
sofa_cns = gcs_components[['admissionid', 'time', 'min_gcs']]

# Calculate SOFA cardiovascular score:
sofa_cns['sofa_cns_score'] = 0
# MAP < 70
sofa_cns.loc[(
        (sofa_cns['min_gcs'] >= 13) & (sofa_cns['min_gcs'] < 15)),
    'sofa_cns_score'] = 1
sofa_cns.loc[(
        (sofa_cns['min_gcs'] >= 10) & (sofa_cns['min_gcs'] < 13)),
    'sofa_cns_score'] = 2
sofa_cns.loc[(
        (sofa_cns['min_gcs'] >= 6) & (sofa_cns['min_gcs'] < 10)),
    'sofa_cns_score'] = 3
sofa_cns.loc[(sofa_cns['min_gcs'] < 6), 'sofa_cns_score'] = 4

sofa_cns.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gcs_components['eyes_score'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gcs_components['preference'] = 8
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the 

Unnamed: 0,admissionid,time,min_gcs,sofa_cns_score
0,0,12,15,0
2,1,22,15,0
3,2,2,15,0
4,2,20,15,0
5,4,1,3,4


## renal sofa

In [19]:
# Renal score
# Get urineoutput
sofa_ruo_itemid = [8794]
sofa_ruo_itemid += [8796]
sofa_ruo_itemid += [8798]
sofa_ruo_itemid += [8800]
sofa_ruo_itemid += [8803]
sofa_ruo_itemid += [10743]
sofa_ruo_itemid += [10745]
sofa_ruo_itemid += [19921]
sofa_ruo_itemid += [19922]
# Dataframe is called sofa_renal_urine_output in amsterdamUMCdb sofa script
sofa_urine_output = numerics_sofa.loc[
    numerics_sofa['itemid'].isin(sofa_ruo_itemid)]
sofa_urine_output.drop(
    columns=['unitid', 'measuredat',  'fluidout'],
    inplace=True)
sofa_urine_output.head()

# Probably decimal error when entering volumes > 2500
sofa_urine_output.loc[(sofa_urine_output['value'] > 2500), 'value'] /= 10
# Remove extreme outliers, most likely data entry error)
sofa_urine_output.loc[(sofa_urine_output['value'] > 4500), 'value'] = np.nan
sofa_urine_output = sofa_urine_output.dropna(subset=['value'])

sofa_hourly_urine_output = \
    sofa_urine_output.groupby(['admissionid', 'time']).agg(
        hourly_urine_output=pd.NamedAgg(column='value', aggfunc='sum')
    ).reset_index()

#添加入院时间
tmp_df = pd.read_csv(inputs.data_file_path + 'admissions.csv', encoding='latin-1')
tmp_df = tmp_df[["admissionid","admittedat"]]
tmp_df["admittedat"] = tmp_df["admittedat"]/(1000*60*60)
sofa_hourly_urine_output = pd.merge(sofa_hourly_urine_output,tmp_df,how="left",on="admissionid")

sofa_hourly_urine_output.head()


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


Unnamed: 0,admissionid,time,hourly_urine_output,admittedat
0,0,5,90.0,0.0
1,0,6,310.0,0.0
2,0,7,360.0,0.0
3,0,8,180.0,0.0
4,0,9,120.0,0.0


In [20]:
# ## -- 填补时间 -- ##
com_lst = []
for i in sofa_hourly_urine_output.admissionid.unique().tolist():
    start_t = int(max(sofa_hourly_urine_output[sofa_hourly_urine_output.admissionid==i].admittedat))
    max_t = max(sofa_hourly_urine_output[sofa_hourly_urine_output.admissionid==i].time)
    for j in range(start_t,max_t+1,1):
        com_lst.append([i,j])
df_complete = pd.DataFrame(com_lst,columns=["admissionid", "time"])            
sofa_hourly_urine_output_add = pd.merge(df_complete,sofa_hourly_urine_output, how="left", on=["admissionid","time"])
sofa_hourly_urine_output_add = sofa_hourly_urine_output_add.fillna(0)
sofa_hourly_urine_output_add = sofa_hourly_urine_output_add.loc[(sofa_hourly_urine_output_add["time"]-sofa_hourly_urine_output_add["admittedat"])<90,:]
sofa_hourly_urine_output_add.head(1)

Unnamed: 0,admissionid,time,hourly_urine_output,admittedat
0,0,0,0.0,0.0


In [21]:
## 获取过去24小时的尿量 ##
tmp_df = pd.DataFrame()
for i,j in sofa_hourly_urine_output_add.groupby("admissionid"):
    j["daily_urine_output"] = j["hourly_urine_output"].rolling(window=24,min_periods=1).sum()
    tmp_df = pd.concat([tmp_df,j],axis=0)
sofa_daily_urine_output = tmp_df

In [22]:
# Calculate SOFA renal score for urine output:
sofa_daily_urine_output['sofa_renal_score'] = 0
# Urine output < 500 ml/day
sofa_daily_urine_output.loc[(
    (sofa_daily_urine_output['time'] > 11) &
        (sofa_daily_urine_output['daily_urine_output'] < 500) &
        (sofa_daily_urine_output['daily_urine_output'] > 200)),
     'sofa_renal_score'] = 3
# Urine output < 200 ml/day
sofa_daily_urine_output.loc[(
    (sofa_daily_urine_output['time'] > 11) &
        (sofa_daily_urine_output['daily_urine_output'] < 200)),
    'sofa_renal_score'] = 4
sofa_daily_urine_output.head()

# Get serum creatinine (baseline from -365 days from admission)
baseline_creatinine = numerics_creatinine.groupby(['admissionid']).agg(
        baseline_creatinine=pd.NamedAgg(column='value', aggfunc='min')
    ).reset_index()
# Max creatinine on each day (but only from MIN_TIME rather than -365 days)
max_creatinine = numerics_creatinine.loc[
    numerics_creatinine['time'] >= 0]
max_creatinine = max_creatinine.groupby(['admissionid', 'time']).agg(
        max_creatinine=pd.NamedAgg(column='value', aggfunc='max')
    ).reset_index()
# Merge baseline on admissionid only and max on both admissionid and time
creatinine = pd.merge(
    numerics_creatinine, baseline_creatinine, on='admissionid', how='left')
creatinine = pd.merge(
    creatinine, max_creatinine, on=['admissionid', 'time'], how='right')

creatinine['acute_renal_failure'] = False
# AKI definition: 3 fold increase
creatinine.loc[(
        (creatinine['baseline_creatinine'] > 0) &
        (creatinine['max_creatinine'] /
            creatinine['baseline_creatinine'] > 3)),
    'acute_renal_failure'] = True
# AKI definition: increase to >= 354 umol/l AND at least 44 umol/l increase
creatinine.loc[(
        (creatinine['max_creatinine'] >= 354) &
        ((creatinine['max_creatinine'] -
            creatinine['baseline_creatinine']) >= 44)),
    'acute_renal_failure'] = True

creatinine.drop(
    columns=['unitid', 'measuredat', 'fluidout'],
    inplace=True)

creatinine.head()

# Looking at the data it's relevatively easy to spot most lab collection errors
# (i.e. single outliers between relatively normal values)
# Remove extreme outliers, most likely data entry errors (manual_entry = True)
creatinine.loc[(
        (creatinine['value'] < 30) ), 'value'] = np.nan
creatinine = creatinine.dropna(subset=['value'])

# Get highest creatinine per 24 hours
# Use creatinine 'cleansed' dataframe from APACHE score
sofa_renal_creatinine = creatinine.groupby(['admissionid', 'time']).agg(
        max_creatinine=pd.NamedAgg(column='value', aggfunc='max')
    ).reset_index()
sofa_renal_creatinine.head()
# Calculate SOFA renal score for creatinine:
sofa_renal_creatinine['sofa_renal_score'] = 0
# Creatinine 110-170 umol/l
sofa_renal_creatinine.loc[(
        (sofa_renal_creatinine['max_creatinine'] >= 110) &
        (sofa_renal_creatinine['max_creatinine'] < 171)),
     'sofa_renal_score'] = 1
# Creatinine 171-299 umol/l
sofa_renal_creatinine.loc[(
        (sofa_renal_creatinine['max_creatinine'] >= 171) &
        (sofa_renal_creatinine['max_creatinine'] < 300)),
    'sofa_renal_score'] = 2
# Creatinine 300-440 umol/l
sofa_renal_creatinine.loc[(
        (sofa_renal_creatinine['max_creatinine'] >= 300) &
        (sofa_renal_creatinine['max_creatinine'] <= 440)),
     'sofa_renal_score'] = 3
# Creatinine >440 umol/l
sofa_renal_creatinine.loc[(
        (sofa_renal_creatinine['max_creatinine'] > 440)),
    'sofa_renal_score'] = 4
sofa_renal_creatinine.head()

# Combine the scores from creatinine and urine output
sofa_renal = pd.concat(
        [sofa_renal_creatinine, sofa_daily_urine_output], sort=False,
    ).sort_values(by=['admissionid', 'time'])

sofa_renal.head()


Unnamed: 0,admissionid,time,max_creatinine,sofa_renal_score,hourly_urine_output,admittedat,daily_urine_output
0,0,0,,0,0.0,0.0,0.0
1,0,1,,0,0.0,0.0,0.0
2,0,2,,0,0.0,0.0,0.0
3,0,3,,0,0.0,0.0,0.0
4,0,4,,0,0.0,0.0,0.0


## Final SOFA scores

In [27]:
def join_sofa(sofa, scores):
    '''
    Function to merge new sub-component scores to the full SOFA dataframe.
    We only want to keep columns where the 'day' (24hr periods post-admission)
    match, but some 'days' in the new SOFA component score may not be in the
    previous SOFA component scores (and we want to keep this)
    So for these rows, set all previous SOFA component scores to nan and then
    only keep matching 'days'
    '''
    scores = scores.sort_values(by=['admissionid', 'time']).reset_index()
    sofa = pd.concat([sofa, scores[['admissionid', 'time']]])
    sofa = sofa.loc[~sofa[['admissionid', 'time']].duplicated()]
    sofa = sofa.sort_values(by=['admissionid', 'time']).reset_index(drop=True)
    sofa = pd.merge(sofa, scores, on=['admissionid', 'time'], how='left')
    return sofa

# Merge the scores
sofa = admissions_df['admissionid']

# Max respiration score (don't need some steps in the function the first time)
scores = sofa_respiration.groupby(['admissionid', 'time']).agg(
    sofa_respiration_score=pd.NamedAgg(
        column='sofa_respiration_score', aggfunc='max'))
scores = scores.sort_values(by=['admissionid', 'time']).reset_index()
sofa = pd.merge(sofa, scores, on='admissionid', how='left')

# Max coagulation score
scores = sofa_platelets.groupby(['admissionid', 'time']).agg(
    sofa_coagulation_score=pd.NamedAgg(
        column='sofa_coagulation_score', aggfunc='max'))
sofa = join_sofa(sofa, scores)

# Max liver score
scores = sofa_bilirubin.groupby(['admissionid', 'time']).agg(
    sofa_liver_score=pd.NamedAgg(column='sofa_liver_score', aggfunc='max'))
sofa = join_sofa(sofa, scores)

# Max cardiovascular score
scores = sofa_cardiovascular.groupby(['admissionid', 'time']).agg(
    sofa_cardiovascular_score=pd.NamedAgg(
        column='sofa_cardiovascular_score', aggfunc='max'))
sofa = join_sofa(sofa, scores)

# Max central nervous system score
scores = sofa_cns.groupby(['admissionid', 'time']).agg(
    sofa_cns_score=pd.NamedAgg(column='sofa_cns_score', aggfunc='max'))
sofa = join_sofa(sofa, scores)

# Max renal score
scores = sofa_renal.groupby(['admissionid', 'time']).agg(
    sofa_renal_score=pd.NamedAgg(column='sofa_renal_score', aggfunc='max'))
sofa = join_sofa(sofa, scores)

# Calculate total score (add al values in columns)
total_scores = sofa.set_index(['admissionid', 'time']).sum(
    axis=1, skipna=True).to_frame('sofa_total_score')
sofa = pd.merge(sofa, total_scores, on=['admissionid', 'time'], how='left')
sofa.head()

sofa.dropna(subset=['time'], inplace=True)

# save as .csv file
sofa.to_csv( './sofa.csv', index=False)

# sepsis筛选：满足以上两个条件

![image.png](attachment:image.png)

In [6]:
sofa = pd.read_csv( './sofa.csv')
sofa["time_diff"] = sofa["time"].diff()
sofa.loc[sofa["time_diff"]<0,"time_diff" ]=np.nan
sofa = sofa.drop(sofa[sofa["time_diff"]>=168].index) #时间差大于3天的记录被删除
sofa = sofa.drop(["time_diff"],axis=1)
sofa = sofa[sofa["time"]>=0]

abx_escalation = pd.read_csv( './infection_time.csv')
combined_diagnoses = pd.read_csv('/public/hanl/jupyter_dir/database/AMUCdb/clean/disease_definition/sepsis_episode/ref_sepsis_definition/out/combined_diagnoses.csv', encoding='latin-1')

In [273]:
sepsis = pd.concat([sofa, abx_escalation[['admissionid', 'time']]])
sepsis = sepsis.loc[~sepsis[['admissionid', 'time']].duplicated()]
sepsis = sepsis.sort_values(by=['admissionid', 'time']).reset_index(drop=True)
sepsis.loc[(sepsis['sofa_total_score'].isna() & (sepsis['time'] < 0)),'sofa_total_score'] = 0
sepsis = sepsis.dropna(subset=['time']).reset_index(drop=True)

sofa_components = ['sofa_respiration_score', 'sofa_coagulation_score']
sofa_components += ['sofa_liver_score', 'sofa_cardiovascular_score']
sofa_components += ['sofa_cns_score', 'sofa_renal_score']
sepsis['n_sofa_scores'] = sepsis[sofa_components].notna().sum(axis=1)

# Shah et al. (DOI: 10.1097/CCM.0000000000005169) in a Sepsis-3 review remove
# any patients with less than 3 SOFA scores in the first 24hrs, we mark these
# patients in case we want to do the same

# sepsis['discard'] = False
# sepsis.loc[((sepsis['n_sofa_scores'] < 3) &(sepsis['time']<24)),'discard'] = True
# discard_ind = sepsis['admissionid'].isin(sepsis.loc[sepsis['discard'], 'admissionid'])
# sepsis.loc[discard_ind, 'discard'] = True

# Merge sofa scores with antibiotic escalation ressults
sepsis = pd.merge(sepsis, abx_escalation, on=['admissionid', 'time'], how='left')
adti_idlst = sepsis[sepsis['antibiotic_escalation']==True].admissionid.unique()
sepsis = sepsis[sepsis.admissionid.isin(adti_idlst)]

In [15]:
sepsis

Unnamed: 0,admissionid,time,sofa_respiration_score,sofa_coagulation_score,sofa_liver_score,sofa_cardiovascular_score,sofa_cns_score,sofa_renal_score,sofa_total_score,n_sofa_scores
0,0,0.0,,,,,,0.0,0.0,1
1,0,1.0,,,,,,0.0,0.0,1
2,0,2.0,,,,,,0.0,0.0,1
3,0,3.0,,,,,,0.0,0.0,1
4,0,4.0,,0.0,,,,0.0,0.0,2
...,...,...,...,...,...,...,...,...,...,...
2494983,23552,14.0,,,,0.0,,0.0,0.0,2
2494984,23552,15.0,,,,0.0,0.0,0.0,0.0,3
2494985,23552,16.0,,,,0.0,,,0.0,1
2494986,23552,17.0,,,,0.0,0.0,,0.0,2


In [276]:
# SOFA increase corresponds to three time periods: previous and current day,
# current and subsequent day, previous and subsequent day (see Shah et al. for
# more details)
sepsis["sofa_previous_day"] = sepsis["sofa_total_score"].rolling(window=24,min_periods=1).min()
#sepsis["sofa_subsequent_day"] = sepsis["sofa_total_score"].rolling(FixedForwardWindowIndexer(window_size=24),min_periods=1).max()

# Change in SOFA between previous and current 24hr periods
sepsis['sofa_diff0'] = sepsis["sofa_total_score"] - sepsis["sofa_previous_day"]
# Change in SOFA between current and subsequent 24hr periods
#sepsis['sofa_diff1'] = sepsis["sofa_subsequent_day"] - sepsis["sofa_total_score"]
# Change in SOFA between previous and subsequent 24hr periods
#sepsis['sofa_diff2'] = sepsis["sofa_subsequent_day"] - sepsis["sofa_previous_day"]

# If this is the first entry for the patient, then there is no 'previous day'
# So the difference in these cases is simply the (total score - 0). We assume a
# SOFA score of 0 on admission)
#diff_02_cols = ['sofa_diff0', 'sofa_diff2']
#sepsis.loc[~sepsis['admissionid'].duplicated(), diff_02_cols] = (sepsis.loc[~sepsis['admissionid'].duplicated(), 'sofa_total_score'])

# On the last day for that patient, sofa_diff1 and sofa_diff2 should not exist
#sepsis.loc[(~sepsis['admissionid'].duplicated(keep='last')),['sofa_diff1', 'sofa_diff2']] = np.nan

# A sepsis episode is defined as antibiotics escalation ('infection')
# accompanied by SOFA increase of 2 or more.
#sepsis['sepsis_episode'] = ((sepsis['antibiotic_escalation']) &((sepsis['sofa_diff0'] >= 2) |(sepsis['sofa_diff1'] >= 2) |(sepsis['sofa_diff2'] >= 2)))

sepsis['sepsis_episode'] =  (sepsis['sofa_diff0'] >= 2)

sepsis_lst = sepsis[sepsis.sepsis_episode==True].admissionid.unique().tolist()
sepsis = sepsis[sepsis.admissionid.isin(sepsis_lst)]
print("疑似脓毒症患者数：",str(len(sepsis_lst)))

疑似脓毒症患者数： 15210


In [279]:
# Next: find patients who had elective surgery, these may have been given
# antibiotics for prophylactic use in the first 24hr after admission
# and are not classified as sepsis even if having a high SOFA score
# (subsequent antibiotic escalation and SOFA increase is marked as sepsis)
surgical_cols = ['admissionid', 'surgical', 'urgency']
sepsis = pd.merge(sepsis, combined_diagnoses[surgical_cols], on='admissionid', how='left')
sepsis['elective_surgery'] = ((sepsis['surgical'] == 1) &(sepsis['urgency'] == 0))
sepsis['emergency_surgery'] = ((sepsis['surgical'] == 1) &(sepsis['urgency'] == 1))

# Assume prophylactic treatment rather than sepsis if elective surgery
# accompanied by high SOFA/antibiotics 
sepsis['prophylaxis'] = (sepsis['sepsis_episode'] & sepsis['elective_surgery'])
sepsis['prophylaxis'].iloc[:-1] |= (
    (sepsis['sepsis_episode'].iloc[:-1] &
        (sepsis.loc[1:, 'elective_surgery'].values)))
# If prophylaxis is assumed, then not a sepsis episode
sepsis['sepsis_episode'] &= ~sepsis['prophylaxis']
#sepsis['infection'] = (sepsis['antibiotic_escalation'] & ~sepsis['prophylaxis'])

sepsis_lst = sepsis[sepsis.sepsis_episode==True].admissionid.unique().tolist()
print("疑似脓毒症患者数：",str(len(sepsis_lst)))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


疑似脓毒症患者数： 8565


In [284]:
sepsis_cols = ['admissionid', 'time', 'sofa_total_score']
#sepsis_cols += ['antibiotic_escalation', 'prophylaxis']
sepsis_cols += ['sepsis_episode']
sepsis_output = sepsis[sepsis_cols]
sepsis_lst = sepsis_output[sepsis_output.sepsis_episode==True].admissionid.unique().tolist()
print("疑似脓毒症患者数：",str(len(sepsis_lst)))

疑似脓毒症患者数： 8565


In [286]:
sepsis_output = sepsis_output.sort_values(["admissionid","time"])
sepsis_output_first = sepsis_output[sepsis_output.sepsis_episode==True]
sepsis_output_first = sepsis_output_first.groupby("admissionid").head(1)[["admissionid","time"]]
sepsis_output_first.columns = ["admissionid","sepsis_onset"]
sepsis_output_first.to_csv("./sepsis_onset.csv",index=False)

In [1]:
pwd

'/public/hanl/jupyter_dir/database/AMUCdb/clean/disease_definition/sepsis_episode'