In [4]:
import pandas as pd
import numpy as np
import os
from pandasql import sqldf
from scipy import stats
import time
import simple_icd_10_cm as cm

In [5]:
dataset_loc = '...'

# `admimeth` is matched against string codes ('11', '31', ...) in later cells.
# Pin it to str so that type inference cannot turn all-numeric values (or
# all-numeric chunks of a large file) into integers, which would silently
# never match those string codes.
dx_df = pd.read_csv(dataset_loc, dtype={'admimeth': str})

In [4]:
# Remove maternity/NA admission method codes

In [8]:
# Admission-method codes to drop (maternity / NA), written as strings.
matNAcodes = ['31', '32', '82', '99', '98', '83']
# Keep every row whose admimeth is NOT in the list above.
dx_df = dx_df.loc[~dx_df['admimeth'].isin(matNAcodes), :]

In [10]:
# Create flag for whether IP or ED based on admimeth

In [11]:
# Admission-method codes that are flagged as IP.
IP_codes = ['11', '12', '13', '81']
# 'IP?' is 1 when admimeth is one of IP_codes and 0 otherwise.
dx_df['IP?'] = dx_df['admimeth'].isin(IP_codes).astype(int)

In [None]:
# Manually add ICD chapters

In [14]:
# First letter -> chapter number, for letters that belong to a single chapter.
_CHAPTER_BY_LETTER = {
    'A': 1, 'B': 1,
    'C': 2,
    'E': 4,
    'F': 5,
    'G': 6,
    'I': 9,
    'J': 10,
    'K': 11,
    'L': 12,
    'M': 13,
    'N': 14,
    'O': 15,
    'P': 16,
    'Q': 17,
    'R': 18,
    'S': 19, 'T': 19,
    'V': 20, 'W': 20, 'X': 20, 'Y': 20,
    'Z': 21,
    'U': 22,
}

# Letters shared by two chapters, keyed on the digit that follows the letter:
# (largest digit still in the lower chapter, lower chapter, upper chapter).
_SPLIT_CHAPTER_LETTERS = {
    'D': (4, 2, 3),   # D0x-D4x -> 2, D5x-D9x -> 3
    'H': (5, 7, 8),   # H0x-H5x -> 7, H6x-H9x -> 8
}


def icd_chapter(row):
    """Return the ICD chapter number for the code stored in ``row['icd']``.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by ``'icd'`` (a DataFrame row, a dict, ...).

    Returns
    -------
    int or str
        The chapter number (1-22), or the string ``'blank'`` when the code
        cannot be classified: an unrecognised first letter, a missing or
        non-string value, an empty string, or a 'D'/'H' code whose second
        character is absent or not a digit.  (The last three cases used to
        raise instead of returning ``'blank'``.)
    """
    code = row['icd']
    # Missing values (None / NaN) and empty strings have no first letter.
    if not isinstance(code, str) or not code:
        return 'blank'

    letter = code[0]
    if letter in _SPLIT_CHAPTER_LETTERS:
        try:
            second_digit = int(code[1])
        except (IndexError, ValueError):
            # One-character code, or a non-digit after the letter.
            return 'blank'
        threshold, lower_chapter, upper_chapter = _SPLIT_CHAPTER_LETTERS[letter]
        return lower_chapter if second_digit <= threshold else upper_chapter

    return _CHAPTER_BY_LETTER.get(letter, 'blank')

In [1]:
# icd_chapter only reads the 'icd' field, so map over that single column
# (wrapping each code in a one-key dict) instead of building a full row
# Series per record with DataFrame.apply(axis=1), which is far slower.
dx_df['chapter'] = dx_df['icd'].map(lambda code: icd_chapter({'icd': code}))

In [16]:
# Remove certain ICD chapters from dataset

In [18]:
# Drop rows whose chapter is 16, 20, 21 or 22; every other value is kept.
dx_df = dx_df[~dx_df['chapter'].isin([16, 20, 21, 22])]

In [None]:
# Add ICD section column to dataset

In [20]:
# Hand-assigned sections for codes that cm.get_parent fails on
# (presumably codes absent from the ICD-10-CM data the library ships - confirm).
_SECTION_OVERRIDES = {
    'I84': 'I80-I89',
    'F00': 'F01-F09',
    'I64': 'I60-I69',
    'E14': 'E08-E13',
    'R02': 'R00-R09',
    # Was 'I60-I69', a copy of the I64 entry; a K code belongs in a K block.
    'K07': 'K00-K14',
}


def icd_section(row):
    """Return the parent section of the ICD code stored in ``row['icd']``.

    The parent is looked up with ``cm.get_parent``.  If that lookup fails,
    the code is resolved through ``_SECTION_OVERRIDES``; a code that is not
    listed there is returned unchanged.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by ``'icd'`` (a DataFrame row, a dict, ...).

    Returns
    -------
    The value returned by ``cm.get_parent``, an override string such as
    ``'I80-I89'``, or the original ``row['icd']`` value.
    """
    code = row['icd']
    try:
        return cm.get_parent(code)
    except Exception:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt
        # and SystemExit still propagate, while any lookup failure (including
        # non-string values) keeps the best-effort fallback.
        return _SECTION_OVERRIDES.get(code, code)

In [2]:
# icd_section only reads the 'icd' field, so map over that single column
# (wrapping each code in a one-key dict) instead of building a full row
# Series per record with DataFrame.apply(axis=1), which is far slower.
dx_df['section'] = dx_df['icd'].map(lambda code: icd_section({'icd': code}))

In [None]:
# Convert some textual columns to ints so that search algorithms run faster

In [22]:
# Lookup table: one row per distinct gen_ethnicity value, numbered 0..n-1 in
# the 'index' column.
sequential = pd.Series(dx_df["gen_ethnicity"].unique(), name="gen_ethnicity").reset_index()
# Attach the number to every row (default inner merge on the text column),
# then give the merged-in 'index' column its final name.
dx_df = (
    dx_df
    .merge(sequential, on="gen_ethnicity")
    .rename(columns={"index": "gen_ethnicity_int"})
)

In [23]:
# Lookup table: one row per distinct icd value, numbered 0..n-1 in the
# 'index' column.
sequential = pd.Series(dx_df["icd"].unique(), name="icd").reset_index()
# Attach the number to every row (default inner merge on the text column),
# then give the merged-in 'index' column its final name.
dx_df = (
    dx_df
    .merge(sequential, on="icd")
    .rename(columns={"index": "icd_int"})
)

In [24]:
# Lookup table: one row per distinct section value, numbered 0..n-1 in the
# 'index' column.
sequential = pd.Series(dx_df["section"].unique(), name="section").reset_index()
# Attach the number to every row (default inner merge on the text column),
# then give the merged-in 'index' column its final name.
dx_df = (
    dx_df
    .merge(sequential, on="section")
    .rename(columns={"index": "section_int"})
)

In [26]:
# Save csv
# dx_df.to_csv('dx_df.csv', index=False)

In [32]:
# Create section to section_int lookup
# dx_df.groupby(['section', 'section_int']).size().reset_index().rename(columns={0:'count'}).to_csv('section_to_int_lookup.csv',index=False)

In [34]:
# Row count per section, most frequent first (value_counts defaults spelled out).
# Entries that are a bare code rather than a range (e.g. M73) are values that
# icd_section returned unchanged.
dx_df['section'].value_counts(normalize=False, sort=True, ascending=False, dropna=True)

R50-R69    66246
R10-R19    64867
I30-I52    59562
K55-K64    57327
K20-K31    57109
           ...  
M73            1
R96            1
R95            1
O83            1
G22            1
Name: section, Length: 275, dtype: int64