In [5]:
import pandas as pd
import numpy as np
import os
import sys

sys.path.append('./../src/')
from manuscript import sankey_side_by_side as sankey
from manuscript import clustering, datasets, inout, export, stats

pd.set_option('display.max_columns', None)

In [42]:
user = 'general'     # defines top hierarchy of output folder
outfolder = '30_VAP_flags'    # name of notebook

def dump_table(df, name):
    export.full_frame(
        user, 
        f'{outfolder}/{name}', 
        df, 
        index=True,
        date=True
    )

def dump_figure(name):
    export.image(
        user,
        f'{outfolder}/{name}',
    )

In [25]:
data = pd.read_csv(
    inout.get_material_path('CAG/05_join/05_data_umap_clusters_220901_1211.csv.gz'), 
    index_col=0)

In [26]:
data.groupby(['Episode_category','Episode_etiology']).size()

Episode_category  Episode_etiology
CAP               Bacterial            40
                  Bacterial/viral      17
                  Culture-negative     32
                  Viral                47
HAP               Bacterial            69
                  Bacterial/viral      25
                  Culture-negative     56
                  Indeterminate         2
                  Viral                62
VAP               Bacterial           117
                  Bacterial/viral     126
                  Culture-negative     34
                  Indeterminate         2
                  Viral                49
dtype: int64

In [27]:
# flag patients with adjudication data

has_ep = data.sort_values(by=['Episode_category'], ascending=False)


In [28]:
has_ep = has_ep.drop_duplicates(subset=['Patient_id'], keep='first')


In [29]:
has_ep.Episode_category.notnull().sum()

585

In [30]:
has_ep['adjudicated']=np.where(has_ep.Episode_category.notnull(), 1, 0)

In [31]:
has_ep.adjudicated.value_counts()

1    585
Name: adjudicated, dtype: int64

In [32]:
has_ep2 = has_ep[['Patient_id', 'adjudicated']]

In [33]:
data2 = pd.merge(data, has_ep2, how='left', on='Patient_id')
#join back to main dataframe 

In [34]:
# subset VAP 
vap = data.loc[data['Episode_category']=='VAP',['Patient_id','Episode_etiology','Episode_is_cured']]

# take only non-viral VAPs
non_viral_vap = vap[~(vap.Episode_etiology=='Viral')]

In [35]:
non_viral_vap['had_nonviral_vap']=1
non_viral_vap['vap_NOTcured']=np.where(non_viral_vap['Episode_is_cured']=='Not cured', 1, 0)
non_viral_vap['vap_indeterminate']=np.where(non_viral_vap['Episode_is_cured']=='Indeterminate', 1, 0)
non_viral_vap['vap_indeterminate_notcured']=non_viral_vap['vap_indeterminate']+non_viral_vap['vap_NOTcured']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_viral_vap['had_nonviral_vap']=1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_viral_vap['vap_NOTcured']=np.where(non_viral_vap['Episode_is_cured']=='Not cured', 1, 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_viral_vap['vap_indeterminate']=np.where(non_viral_vap['Episode_is_cured'

In [36]:
#multiple VAPs per admission - take 1 if any vap and 1 if any vap indeterminate/not cured
non_viral_vap = non_viral_vap.sort_values(by=['vap_indeterminate_notcured'], ascending=False)
non_viral_vap_single = non_viral_vap.drop_duplicates(subset=['Patient_id'], keep='first')

In [37]:
#take just columns of interest
non_viral_vap_single_short=non_viral_vap_single[['Patient_id', 'had_nonviral_vap','vap_indeterminate_notcured']]


In [38]:
data3 = pd.merge(data2, non_viral_vap_single_short, how='left', on='Patient_id')
#join back to main dataframe 

In [39]:
data3['had_nonviral_vap'] = data3['had_nonviral_vap'].fillna(0)
data3['vap_indeterminate_notcured'] = data3['vap_indeterminate_notcured'].fillna(0)

In [40]:
data3.adjudicated.value_counts()

1    12495
Name: adjudicated, dtype: int64

In [43]:
dump_table(data3, 'data_vap_flags.csv.gz')