In [3]:
import numpy as np
import pandas as pd
import glob
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
from itertools import combinations
from collections import defaultdict

In [4]:
data_fp = '/Users/JakeCanfield/Documents/Trauma_Surgery_Research/Data/Raw_data/PUF AY %i/CSV/'
years = range(2017, 2023)
thoracotomy_codes = ['0WJB0ZZ', '0WJ90ZZ', '02JA0ZZ', '02JY0ZZ', '0BJL0ZZ', '0BJK0ZZ', '0BJQ0ZZ','0WJ80ZZ', '0WJC0ZZ', '0WJD0ZZ', '02VW0CZ', '02QA0ZZ', '3E080GC']
sternotomy_codes = ['0P800ZZ']

mechanism_code_dict = {1:'Cut/pierce', 2:'Drowning/submersion', 3:'Fall', 4:'Fire/flame', 5:'Hot object/substance', 6:'Firearm', 7:'Machinery', 8:'MVT Occupant', 9:'MVT Motorcyclist', 10:'MVT Pedal cyclist', 11:'MVT Pedestrian', 12:'MVT Unspecified', 13:'MVT Other', 14:'Pedal cyclist, other', 15:'Pedestrian, other', 16:'Transport, other', 17:'Natural/environmental,  Bites and stings', 18:'Natural/environmental,  Other', 19:'Overexertion', 20:'Poisoning', 21:'Struck by, against', 22:'Suffocation', 23:'Other specified and classifiable', 24:'Other specified, not elsewhere classifiable', 25:'Unspecified', 26:'Adverse effects, medical care', 27:'Adverse effects, drugs'} # As noted in PUF dictionary
trauma_type_code_dict = {1:'Blunt', 2:'Penetrating', 3:'Burn', 4:'Other/unspecified', 9:'Activity Code - Not Valid as a Primary E-Code'} # As noted in PUF Dictionary
sex_code_dict = {1:'Male', 2:'Female', 3:'Unknown'}
eddischarge_code_dict = {1: 'Floor bed (general admission, non-specialty unit bed)', 2: 'Observation unit (unit that provides < 24 hour stays)', 3: 'Telemetry/step-down unit (less acuity than ICU)', 4: 'Home with services', 5: 'Deceased/expired', 6: 'Other (jail, institutional care, mental health, etc.)', 7: 'Operating Room', 8: 'Intensive Care Unit (ICU)', 9: 'Home without services', 10: 'Left against medical advice', 11: 'Transferred to another hospital'}
hospdischarge_disposition_code_dict = {1: 'Discharged/Transferred to a short-term general hospital for inpatient care', 2: 'Discharged/Transferred to an Intermediate Care Facility (ICF)', 3: 'Discharged/Transferred to home under care of organized home health service', 4: 'Left against medical advice or discontinued care', 5: 'Deceased/Expired', 6: 'Discharged to home or self-care (routine discharge)', 7: 'Discharged/Transferred to Skilled Nursing Facility (SNF)', 8: 'Discharged/Transferred to hospice care', 10: 'Discharged/Transferred to court/law enforcement', 11: 'Discharged/Transferred to inpatient rehab or designated unit', 12: 'Discharged/Transferred to Long Term Care Hospital (LTCH)', 13: 'Discharged/Transferred to a psychiatric hospital or psychiatric distinct part unit of a hospital', 14: 'Discharged/Transferred to another type of institution not defined elsewhere'}
deathined_code_dict = {1:'Arrived with NO signs of life', 2:'Arrived with signs of life'}
prehospca_code_dict = {1:'Yes', 2:'No'}
transport_mode_code_dict = {1:'Ground Ambulance', 2:'Helicopter Ambulance', 3:'Fixed-wing Ambulance', 4:'Private/Public Vehicle/Walk-in', 5:'Police', 6:'Other'}

cols17 = pd.read_csv(data_fp%(2017)+'PUF_TRAUMA.csv', nrows=1).columns.tolist()
cols19 = pd.read_csv(data_fp%(2017)+'PUF_TRAUMA.csv', nrows=1).columns.tolist()
cols17 = [x.upper() for x in cols17]
cols19 = [x.upper() for x in cols19]

common_cols = list(set(cols17) & set(cols19))

In [53]:
TRAUMA_fp = '/Users/JakeCanfield/Documents/Trauma_Surgery_Research/data/Combined_data/newTRAUMA_df_%i.csv'
ICDPROCEDURE_fp = '/Users/JakeCanfield/Documents/Trauma_Surgery_Research/data/Combined_data/newICDPROCEDURE_df_%i.csv'

years = range(2017, 2023)

# Preload column names
TRAUMA_cols = pd.read_csv(TRAUMA_fp % 2017, nrows=1).columns.tolist()
ICDPROCEDURE_cols = pd.read_csv(ICDPROCEDURE_fp % 2017, nrows=1).columns.tolist()

# Collect DataFrames in lists
trauma_dfs = []
icdprocedure_dfs = []

for year in years:
    trauma_df = pd.read_csv(TRAUMA_fp % year, usecols=TRAUMA_cols)
    trauma_df['Year'] = year
    icdprocedure_df = pd.read_csv(ICDPROCEDURE_fp % year, usecols=ICDPROCEDURE_cols)
    icdprocedure_df['Year'] = year
    trauma_dfs.append(trauma_df)
    icdprocedure_dfs.append(icdprocedure_df)

# Concatenate DataFrames once at the end
TRAUMA_all_df = pd.concat(trauma_dfs, ignore_index=True)
ICDPROCEDURE_all_df = pd.concat(icdprocedure_dfs, ignore_index=True)

TRAUMA_all_df['SEX'] = TRAUMA_all_df['SEX'].replace({'Not Known BIU 2': 3.0, 'Not Known/Not Recorded BIU 2': 3.0, 'Unknown': 3.0})
#TRAUMA_all_df['DEATHINED'] = TRAUMA_all_df['DEATHINED'].replace(deathined_code_dict)
#TRAUMA_all_df['TRANSPORTMODE'] = TRAUMA_all_df['TRANSPORTMODE'].replace(transport_mode_code_dict)
#TRAUMA_all_df['PREHOSPITALCARDIACARREST'] = TRAUMA_all_df['PREHOSPITALCARDIACARREST'].replace(prehospca_code_dict)
TRAUMA_all_df = TRAUMA_all_df.dropna(subset=['HMRRHGCTRLSURGMINS'])
TRAUMA_all_df = TRAUMA_all_df.dropna(subset=['AGEYEARS'])
TRAUMA_all_df['HMRRHGCTRLSURGMINS'] = TRAUMA_all_df['HMRRHGCTRLSURGMINS'].round().astype(int)
TRAUMA_all_df = TRAUMA_all_df.loc[(TRAUMA_all_df['HMRRHGCTRLSURGMINS'] <= 20) & (TRAUMA_all_df['HMRRHGCTRLSURGDAYS'] <= 1.0)]
ICDPROCEDURE_all_df = ICDPROCEDURE_all_df.loc[ICDPROCEDURE_all_df['INC_KEY'].isin(np.unique(TRAUMA_all_df['INC_KEY']))]

# Display results
display(TRAUMA_all_df.tail())
print(np.shape(TRAUMA_all_df))
display(ICDPROCEDURE_all_df.tail())
print(np.shape(ICDPROCEDURE_all_df))

# Free up memory
del trauma_dfs, icdprocedure_dfs

Unnamed: 0,ABUSEINVESTIGATION,ABUSEINVESTIGATION_BIU,ABUSEREPORT,ABUSEREPORT_BIU,ADDITIONALECODEICD10,ADDITIONALECODEICD10_BIU,AGEYEARS,AIRBAG_DEPLOYED_FRNT,AIRBAG_DEPLOYED_NA,AIRBAG_DEPLOYED_OTHER,...,WITHDRAWALLST,WITHDRAWALLSTDAYS,WITHDRAWALLSTMINS,WITHDRAWALLST_BIU,WORKRELATED,WORKRELATED_BIU,YODISCH,MECHANISM,TRAUMATYPE,Year
17585,,,,,,,21.0,0,1,0,...,2.0,,,,2.0,,,Firearm,Penetrating,2022
17587,,,,,,,39.0,0,1,0,...,2.0,,,,2.0,,,Cut/pierce,Penetrating,2022
17588,,,,,,,55.0,0,1,0,...,2.0,,,,2.0,,,Cut/pierce,Penetrating,2022
17589,,,,,,,41.0,0,1,0,...,2.0,,,,2.0,,,Firearm,Penetrating,2022
17590,,,,,,,35.0,0,1,0,...,2.0,,,,2.0,,,Cut/pierce,Penetrating,2022


(8274, 333)


Unnamed: 0,INC_KEY,ICDPROCEDURECODE,PROCEDUREMINS,PROCEDUREDAYS,Year
274140,220072330368,Transfusion of Nonautologous Red Blood Cells i...,7.2,1.0,2022
274141,220072330368,Introduction of Other Therapeutic Substance in...,0.0,1.0,2022
274142,220072330368,"Performance of Cardiac Output, Single, Manual",0.0,1.0,2022
274143,220072330368,"Respiratory Ventilation, Less than 24 Consecut...",3.0,1.0,2022
274144,220072330368,Ultrasonography of Abdomen,0.0,1.0,2022


(103042, 5)


In [54]:
# Step 1: Calculate the threshold for 10% NaN values
threshold = len(TRAUMA_all_df) * 0.1

# Step 2: Drop columns with more than 10% NaN values
TRAUMA_all_df_filt = TRAUMA_all_df.loc[:, TRAUMA_all_df.isna().sum() <= threshold]

# Step 3: Drop rows with any NaN values
TRAUMA_all_df_cleaned = TRAUMA_all_df_filt.dropna()

display(TRAUMA_all_df_cleaned)

Unnamed: 0,AGEYEARS,AIRBAG_DEPLOYED_FRNT,AIRBAG_DEPLOYED_NA,AIRBAG_DEPLOYED_OTHER,AIRBAG_DEPLOYED_SIDE,AIRBAG_DEPLOYED_UK,AIRBAG_NOTDEPLOYED,ALCOHOLSCREEN,AMERICANINDIAN,ANGIOGRAPHY,...,TM_UK,TOTALGCS,TRANSPORTMODE,VTEPROPHYLAXISTYPE,WHITE,WITHDRAWALLST,WORKRELATED,MECHANISM,TRAUMATYPE,Year
4,24.0,0,1,0,0,0,0,2.0,0,3.0,...,1,3.0,1.0,1.0,0,2.0,2.0,Firearm,Penetrating,2017
7,31.0,0,1,0,0,0,0,2.0,0,1.0,...,0,15.0,1.0,6.0,1,2.0,2.0,Cut/pierce,Penetrating,2017
10,37.0,0,1,0,0,0,0,1.0,0,1.0,...,0,15.0,1.0,6.0,0,2.0,2.0,Cut/pierce,Penetrating,2017
34,23.0,0,0,0,0,1,0,2.0,0,1.0,...,0,3.0,2.0,5.0,1,2.0,2.0,MVT Occupant,Blunt,2017
36,29.0,0,1,0,0,0,0,1.0,0,1.0,...,0,3.0,1.0,5.0,1,2.0,2.0,Firearm,Penetrating,2017
39,65.0,0,1,0,0,0,0,2.0,0,1.0,...,0,3.0,1.0,5.0,1,2.0,2.0,Firearm,Penetrating,2017
53,57.0,0,1,0,0,0,0,2.0,0,1.0,...,0,7.0,1.0,5.0,1,2.0,2.0,MVT Pedestrian,Blunt,2017
54,66.0,0,1,0,0,0,0,2.0,0,1.0,...,0,3.0,1.0,5.0,1,2.0,2.0,MVT Pedestrian,Blunt,2017
56,27.0,0,1,0,0,0,0,2.0,0,1.0,...,0,3.0,1.0,5.0,1,2.0,2.0,"Transport, other",Blunt,2017
59,18.0,0,1,0,0,0,0,2.0,0,1.0,...,0,3.0,1.0,5.0,0,2.0,2.0,Firearm,Penetrating,2017


In [55]:
# Step 1: Identify columns with string values
string_columns = TRAUMA_all_df_cleaned.select_dtypes(include='object').columns

# Step 2: Initialize a dictionary to store mapping dictionaries
mapping_dicts = {}

# Step 3: Convert each unique string to an integer and create mapping dictionaries
for col in string_columns:
    unique_strings = TRAUMA_all_df_cleaned[col].unique()
    string_to_int = {string: idx for idx, string in enumerate(unique_strings)}
    int_to_string = {idx: string for string, idx in string_to_int.items()}
    
    # Store the mapping dictionaries
    mapping_dicts[col] = {
        'string_to_int': string_to_int,
        'int_to_string': int_to_string
    }
    
    # Map the strings to integers in the DataFrame
    TRAUMA_all_df_cleaned[col] = TRAUMA_all_df_cleaned[col].map(string_to_int)

# Display the modified DataFrame and the mapping dictionaries
display(TRAUMA_all_df_cleaned)
print(mapping_dicts)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,AGEYEARS,AIRBAG_DEPLOYED_FRNT,AIRBAG_DEPLOYED_NA,AIRBAG_DEPLOYED_OTHER,AIRBAG_DEPLOYED_SIDE,AIRBAG_DEPLOYED_UK,AIRBAG_NOTDEPLOYED,ALCOHOLSCREEN,AMERICANINDIAN,ANGIOGRAPHY,...,TM_UK,TOTALGCS,TRANSPORTMODE,VTEPROPHYLAXISTYPE,WHITE,WITHDRAWALLST,WORKRELATED,MECHANISM,TRAUMATYPE,Year
4,24.0,0,1,0,0,0,0,2.0,0,3.0,...,1,3.0,1.0,1.0,0,2.0,2.0,0,0,2017
7,31.0,0,1,0,0,0,0,2.0,0,1.0,...,0,15.0,1.0,6.0,1,2.0,2.0,1,0,2017
10,37.0,0,1,0,0,0,0,1.0,0,1.0,...,0,15.0,1.0,6.0,0,2.0,2.0,1,0,2017
34,23.0,0,0,0,0,1,0,2.0,0,1.0,...,0,3.0,2.0,5.0,1,2.0,2.0,2,1,2017
36,29.0,0,1,0,0,0,0,1.0,0,1.0,...,0,3.0,1.0,5.0,1,2.0,2.0,0,0,2017
39,65.0,0,1,0,0,0,0,2.0,0,1.0,...,0,3.0,1.0,5.0,1,2.0,2.0,0,0,2017
53,57.0,0,1,0,0,0,0,2.0,0,1.0,...,0,7.0,1.0,5.0,1,2.0,2.0,3,1,2017
54,66.0,0,1,0,0,0,0,2.0,0,1.0,...,0,3.0,1.0,5.0,1,2.0,2.0,3,1,2017
56,27.0,0,1,0,0,0,0,2.0,0,1.0,...,0,3.0,1.0,5.0,1,2.0,2.0,4,1,2017
59,18.0,0,1,0,0,0,0,2.0,0,1.0,...,0,3.0,1.0,5.0,0,2.0,2.0,0,0,2017


{'EDDISCHARGEDISPOSITION': {'string_to_int': {'Operating Room': 0, 'Deceased/expired': 1, 'Intensive Care Unit (ICU)': 2, 'Telemetry/step-down unit (less acuity than ICU)': 3, 'Home without services': 4, 'Floor bed (general admission, non-specialty unit bed)': 5, 'Transferred to another hospital': 6, 'Observation unit (unit that provides < 24 hour stays)': 7}, 'int_to_string': {0: 'Operating Room', 1: 'Deceased/expired', 2: 'Intensive Care Unit (ICU)', 3: 'Telemetry/step-down unit (less acuity than ICU)', 4: 'Home without services', 5: 'Floor bed (general admission, non-specialty unit bed)', 6: 'Transferred to another hospital', 7: 'Observation unit (unit that provides < 24 hour stays)'}}, 'PLACEOFINJURYCODE': {'string_to_int': {'Y92.480': 0, 'Y92.149': 1, 'Y92.414': 2, 'Y92.413': 3, 'Y92.017': 4, 'Y92.830': 5, 'Y92.009': 6, 'Y92.320': 7, 'Y92.039': 8, 'Y92.019': 9, 'Y92.013': 10, 'Y92.410': 11, 'Y92.481': 12, 'Y92.73': 13, 'Y92.9': 14, 'Y92.411': 15, 'Y92.828': 16, 'Y92.018': 17, 'Y92

In [57]:
display(TRAUMA_all_df_cleaned['HOSPDISCHARGEDISPOSITION'])
# Create the 'DECEASED' column
TRAUMA_all_df_cleaned['DECEASED'] = 0  # Initialize with 0 (Survived)

# Update 'DECEASED' to 1 where patient is deceased according to either column
TRAUMA_all_df_cleaned.loc[(TRAUMA_all_df_cleaned['EDDISCHARGEDISPOSITION'] == 5) | 
                  (TRAUMA_all_df_cleaned['HOSPDISCHARGEDISPOSITION'] == 5), 'DECEASED'] = 1

# Drop rows where both columns are 'Not Applicable'
TRAUMA_all_df_cleaned = TRAUMA_all_df_cleaned[~((TRAUMA_all_df['EDDISCHARGEDISPOSITION'] == 'Not Applicable') & 
                                (TRAUMA_all_df_cleaned['HOSPDISCHARGEDISPOSITION'] == 'Not Applicable'))]

# Display the resulting DataFrame
display(TRAUMA_all_df_cleaned)


KeyError: 'HOSPDISCHARGEDISPOSITION'