In [5]:
import zipfile
import os
import numpy as np
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.3f' % x)
import boto3
import pandas as pd
from io import BytesIO
from io import StringIO

In [6]:
# Initialize S3 client
s3 = boto3.client('s3')

# Function to load data 
bucket_name = 'ml-stack-759578648427-us-west-2'

def from_s3(bucket_name, file_name):
    response = s3.get_object(Bucket=bucket_name, Key=file_name)
    ext = os.path.splitext(file_name)[1]
    if ext == '.gz':
        data = pd.read_csv(BytesIO(response['Body'].read()), compression='gzip')
    elif ext == '.csv':
        data = pd.read_csv(BytesIO(response['Body'].read()))
    else:
        data = pd.read_excel(BytesIO(response['Body'].read()), sheet_name='DX_to_CCSR_Mapping', header=1)
    return data

# Diagnosis Mapping

Use ICD 9 to 10 crosswalk from: https://github.com/JeanNikiema/mimicinicd11/tree/main

Use broad category map from: https://hcup-us.ahrq.gov/toolssoftware/ccsr/dxccsr.jsp

In [7]:
file_name = 'MIMIC/d_icd_diagnoses.csv.gz'
icd_diagnoses = from_s3(bucket_name, file_name) 

# Load the ICD-9 to ICD-10 mapping using the GEM
file_name = 'MIMIC/icd9toicd10cmgem.csv'
map_gem = from_s3(bucket_name, file_name) 
map_gem = map_gem[map_gem['icd10cm'] != "NoDx"][['icd9cm', 'icd10cm']]

# Load manually matched codes for "NoDx"
nodx_match = 'MIMIC/nodx_code.csv'
nodx_match = from_s3(bucket_name, file_name) 
nodx_match['Maptype'] = "manual matching"
nodx_match = nodx_match[['icd9cm', 'icd10cm']]
nodx_match['icd10cm'] = nodx_match['icd10cm'].str.replace('.', '')

# seperate versions
icd_d9 = icd_diagnoses[icd_diagnoses.icd_version == 9]
icd_d10 = icd_diagnoses[icd_diagnoses.icd_version == 10]

# map codes
icd9_to10 = pd.merge(icd_d9, map_gem, left_on='icd_code', right_on='icd9cm', how='left')
icd_to10_complete = icd9_to10[~icd9_to10['icd10cm'].isna()]

icd9_to10_left = icd9_to10[icd9_to10['icd10cm'].isna()]
icd9_to10_left.drop(columns=['icd9cm', 'icd10cm'], axis=1, inplace=True)
icd9_to10_left = icd9_to10_left.merge(nodx_match, left_on='icd_code', right_on='icd9cm', how='left')

icd10_to10 = pd.merge(icd_d10, map_gem, left_on='icd_code', right_on='icd10cm', how='left')
icd10_to10['icd10cm'].fillna(icd10_to10['icd_code'], inplace=True)
map_gem = pd.concat([icd_to10_complete, icd9_to10_left, icd10_to10], axis=0)
map_gem.drop_duplicates(inplace=True)

# load in CCSR map
file_name = 'MIMIC/DXCCSR-Reference-File-v2025-1.xlsx'
ccsr_map = from_s3(bucket_name, file_name) 
ccsr_map.columns = ccsr_map.columns.str.lower().str.replace(' ', '_')
ccsr_map = ccsr_map[['icd-10-cm_code', 'ccsr_category', 'ccsr_category_description']]

# final mapping
icd_code_map = map_gem.merge(ccsr_map, left_on='icd10cm', right_on='icd-10-cm_code', how='left').drop(columns=['icd-10-cm_code'], axis=1)
icd_code_map = icd_code_map[~icd_code_map.icd10cm.isna()]
del icd_diagnoses, map_gem, nodx_match, ccsr_map, icd_d9, icd_d10, icd9_to10, icd_to10_complete, icd9_to10_left, icd10_to10
icd_code_map.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  icd9_to10_left.drop(columns=['icd9cm', 'icd10cm'], axis=1, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  icd10_to10['icd10cm'].fillna(icd10_to10['icd_code'], inplace=True)


Unnamed: 0,icd_code,icd_version,long_title,icd9cm,icd10cm,ccsr_category,ccsr_category_description
0,10,9,Cholera due to vibrio cholerae,10,A000,DIG001,Intestinal infection
1,10,9,Cholera due to vibrio cholerae,10,A000,INF003,Bacterial infections
2,11,9,Cholera due to vibrio cholerae el tor,11,A001,DIG001,Intestinal infection
3,11,9,Cholera due to vibrio cholerae el tor,11,A001,INF003,Bacterial infections
4,19,9,"Cholera, unspecified",19,A009,DIG001,Intestinal infection


# Export to S3

In [4]:
s3_key = 'MIMIC/Cleaned/icd_code_map.csv'

# Save the DataFrame to a CSV in memory (using StringIO)
csv_buffer = StringIO()
icd_code_map.to_csv(csv_buffer, index=False)

# Upload the CSV data to S3
s3.put_object(Bucket=bucket_name, Key=s3_key, Body=csv_buffer.getvalue())

print(f"DataFrame successfully exported to s3://{bucket_name}/{s3_key}")

DataFrame successfully exported to s3://ml-stack-759578648427-us-west-2/MIMIC/Cleaned/icd_code_map.csv
