# Speed tests for DIAG vars

In [143]:
import pandas as pd
import numpy as np
import sys
import json
import collections

def get_outcomes(path='../src/icd_codes.json'):
    """Load the ICD outcome definitions from a JSON file.

    The file holds a JSON array whose first element is itself a JSON-encoded
    string; that inner string is decoded and returned.

    Parameters
    ----------
    path : str, optional
        Location of the JSON file. Defaults to the path this notebook has
        always read from.

    Returns
    -------
    object
        Whatever the inner JSON document decodes to. Other cells index it as
        ``outcomes[name]["icd9"]`` and ``outcomes[name]["icd10"]``.
    """
    # The context manager closes the handle even when json.load raises.
    with open(path) as handle:
        payload = json.load(handle)
    return json.loads(payload[0])

# Names of the ten diagnosis columns: DIAG1 ... DIAG10.
diags = ["DIAG{}".format(position) for position in range(1, 11)]
# Cut-over date used by primary_secondary: discharges before it are matched
# against the "icd9" list, discharges on or after it against "icd10".
icd_date = pd.Timestamp(2015, 10, 1)
outcomes = get_outcomes()

In [127]:
filename = "../data/medpar/medpar_2016.csv"
cols = [
    'QID', 'ADATE', 'DDATE', 'YEAR',
    'DIAG1', 'DIAG2', 'DIAG3', 'DIAG4', 'DIAG5',
    'DIAG6', 'DIAG7', 'DIAG8', 'DIAG9', 'DIAG10',
]
# Only the first 60000 rows are read, to keep the timing cells quick.
df = pd.read_csv(filename, usecols=cols, nrows=60000)
# Parse both date columns with the day / abbreviated-month / 4-digit-year format.
for date_col in ('ADATE', 'DDATE'):
    df[date_col] = pd.to_datetime(df[date_col], format='%d%b%Y')

## Radix tree

In [128]:
def add_radix_tree(tree, split):
    """Insert one code into a nested-dict trie, one character per level.

    Parameters
    ----------
    tree : dict
        Trie node to insert under; mutated in place.
    split : collections.deque
        Remaining characters of the code. It is consumed from the left, so it
        is empty when the call returns.

    Returns
    -------
    dict
        The node reached after the last character (``tree`` itself when
        ``split`` is already empty). The previous version returned the
        ``collections`` module here by mistake.
    """
    node = tree
    while split:
        # setdefault reuses an existing child or creates an empty one.
        node = node.setdefault(split.popleft(), {})
    return node

def build_icd_vocab(codes):
    """Build a character-level trie (nested dicts) from an iterable of codes.

    Each code is broken into its characters and inserted with
    ``add_radix_tree``, so codes that share a prefix share that branch.
    Returns the root dict.
    """
    vocab = {}
    for icd_code in codes:
        add_radix_tree(vocab, collections.deque(icd_code))
    return vocab

def codeOfInterest(codeTrie, code):
    """Walk ``code`` through the trie and report whether the walk survives.

    Returns True when either
      * the walk reaches an empty node (a stored code equals ``code`` or is a
        prefix of it), or
      * ``code`` runs out while still inside a non-empty node (``code`` is a
        prefix of a stored code; this includes the empty string).
    Returns False when some character has no child at the current node.
    """
    node = codeTrie
    for char in code:
        if not node:
            # Either an empty leaf ({}) or a missing child (None): stop walking.
            break
        node = node.get(char)
    # An empty dict is falsy, so it needs the explicit equality check.
    return bool(node or node == {})


# Element-wise version so a whole column of codes can be checked in one call.
vectorCodeOfInterest = np.vectorize(codeOfInterest)

In [129]:
def primary_secondary2(primary, secondary, disease_type, outcomes):
    """Flag rows of the global ``df`` whose DIAG columns hit a code of interest.

    Parameters
    ----------
    primary : bool
        Include the DIAG1 column.
    secondary : bool
        Include the nine columns that follow DIAG1.
    disease_type : str
        Key into ``outcomes``.
    outcomes : dict
        Maps a disease type to a dict holding "icd9" and "icd10" code lists.

    Returns
    -------
    numpy.ndarray
        Boolean array with one entry per row of ``df``. All False when both
        ``primary`` and ``secondary`` are False.

    Notes
    -----
    Columns are picked by position relative to DIAG1, so this relies on the
    DIAG columns being adjacent and in order in ``df`` -- TODO confirm for
    other input files.
    """
    disease_codes = outcomes.get(disease_type)
    codes = disease_codes.get("icd9") + disease_codes.get("icd10")
    # Fixed: this used to call build_ICD_vocab, a name that is never defined.
    codeTrie = build_icd_vocab(codes)

    first = df.columns.get_loc("DIAG1")
    if primary and secondary:
        start, stop = first, first + 10
    elif primary:
        start, stop = first, first + 1
    elif secondary:
        start, stop = first + 1, first + 10
    else:
        # Nothing selected: empty column range, result stays all False.
        start, stop = first, first

    matches = np.zeros(len(df), dtype=bool)
    for col in df.columns[start:stop]:
        # Codes are cast to str so missing values become 'nan' rather than floats.
        hits = vectorCodeOfInterest(codeTrie, df[col].astype('str'))
        matches = matches | np.asarray(hits, dtype=bool)
    return matches

In [130]:
%%time
# Trie-based flag with both primary and secondary enabled, for the "aki" outcome.
df["aki_primary_secondary_trie"] = primary_secondary2(True, True, "aki", outcomes)

CPU times: user 233 ms, sys: 3.96 ms, total: 237 ms
Wall time: 235 ms


In [34]:
# `primary_secondary_aki` is not defined anywhere in this notebook (stale cell);
# check the length of the column produced by the trie cell instead.
len(df["aki_primary_secondary_trie"])

60000

## Dict

In [131]:
def primary_secondary(row, outcome=None):
    """Row-wise check of every column named in ``diags`` for ``outcome``.

    A diagnosis counts as a hit when the row's DDATE is before ``icd_date``
    and the code is in the outcome's "icd9" list, or when DDATE is on or after
    ``icd_date`` and the code is in the "icd10" list. Returns True on the
    first hit, False if no column matches.
    """
    return any(
        (row["DDATE"] < icd_date and row[diag] in outcomes[outcome]["icd9"])
        or (row["DDATE"] >= icd_date and row[diag] in outcomes[outcome]["icd10"])
        for diag in diags
    )

In [132]:
%%time
# Row-wise baseline: primary_secondary is called once per row (axis=1) for "aki".
df["aki_primary_secondary"] = df.apply(
    primary_secondary, axis=1, outcome="aki")

CPU times: user 4.54 s, sys: 29.7 ms, total: 4.57 s
Wall time: 4.58 s


In [136]:
df['aki_primary_secondary'].equals(df['aki_primary_secondary_trie'])

True

## Sets

In [144]:
def primary_secondary_set(outcome=None):
    """Flag rows of the global ``df`` with any ``diags`` column in the outcome's codes.

    Parameters
    ----------
    outcome : str
        Key into the global ``outcomes`` mapping.

    Returns
    -------
    pandas.Series
        Boolean Series carrying ``df``'s own index. The previous version
        OR-ed into a Series with a fresh 0..n-1 index, which misaligns when
        ``df`` has any other index.

    Notes
    -----
    Unlike ``primary_secondary``, the "icd9" and "icd10" lists are pooled and
    applied to every row regardless of DDATE.
    """
    # De-duplicate the pooled codes; isin only needs each value once.
    codes = list(set(outcomes[outcome]["icd10"]) | set(outcomes[outcome]["icd9"]))
    # One vectorised membership test over all DIAG columns, reduced per row.
    return df[diags].isin(codes).any(axis=1)

In [145]:
%%time
# Column-wise isin variant for the "aki" outcome.
df["aki_primary_secondary_set"] = primary_secondary_set(outcome="aki")

CPU times: user 57.6 ms, sys: 2.99 ms, total: 60.6 ms
Wall time: 58.3 ms


In [146]:
df['aki_primary_secondary_set'].equals(df['aki_primary_secondary'])

True

# Compare existing and new DIAG vars

In [158]:
# newest DIAG vars

# Read one column per yearly file (2000-2016) and stack them. A comprehension
# is used so the global `df` (the 2016 sample loaded above) is not overwritten.
yearly_frames = [
    pd.read_parquet(
        "../data/medpar_vars/medpar_" + str(year) + "_sets.parquet",
        columns=['aki_primary_secondary'],
    )
    for year in range(2000, 2017)
]
admissions = pd.concat(yearly_frames, axis=0, ignore_index=True)

In [159]:
admissions['aki_primary_secondary'].value_counts()

False    121622157
True      15888017
Name: aki_primary_secondary, dtype: int64

In [None]:
# old DIAG vars:

In [196]:
df3= pd.read_csv("../data/medpar_vars/medpar_n2015.csv")

In [197]:
filename = "../data/medpar_vars/medpar_2015_sets.parquet"
df1 = pd.read_parquet(filename)

In [168]:
# QID / all_kidney_primary extracts from the two frames being compared.
pom1 = df1[['QID','all_kidney_primary']]
# NOTE(review): df2 is not assigned in any cell shown in this notebook, so this
# line fails on Restart & Run All unless it is defined elsewhere -- confirm its source.
pom2 = df2[['QID','all_kidney_primary']]

In [173]:
len(pom2['QID'].unique()) # 5249472 vs 5830981

5830981

In [198]:
# Every "<outcome>_secondary" column first, then every "<outcome>_primary" column.
of_interest_cols = [
    outcome + suffix
    for suffix in ("_secondary", "_primary")
    for outcome in outcomes
]
# True for rows where at least one of those flag columns is set.
mask = df1[of_interest_cols].any(axis=1)
mask.value_counts()

True    7157199
dtype: int64

## Diag vars are all the same!!

In [181]:
# df1 -- True    8678219
# df2 -- True     8678219
# df2 -- False    1064568
# df3 (csv) -- True 8678219

In [212]:
df1['aki_primary'].value_counts()

False    6945916
True      211283
Name: aki_primary, dtype: int64

In [213]:
df3['aki_primary'].value_counts()

False    6945891
True      211283
Name: aki_primary, dtype: int64

In [202]:
len(df1)

7157199

Therefore, the error is probably in the first_hosp count or in the denominator calculation.

## Compare dask / no_dask vars

In [229]:
df=pd.read_csv("../data/medpar_all/medpar_no_dask.csv",usecols=['aki_primary_first_hosp'])

In [230]:
print(df['aki_primary_first_hosp'].value_counts())

False    135275329
True       2234820
Name: aki_primary_first_hosp, dtype: int64

In [233]:
df=pd.read_csv("../data/medpar_all/medpar.csv",usecols=['aki_secondary_first_hosp'])

In [236]:
print(df['aki_secondary_first_hosp'].value_counts())

False    129012604
True       8497545
Name: aki_secondary_first_hosp, dtype: int64


## medpar_no_dask.csv

aki_primary_first_hosp
```
False    135275329
True       2234820
```

## medpar.csv

aki_primary_first_hosp
```
False    135275329
True       2234820
```

## medpar.csv
aki_secondary_first_hosp
```
False    129012604
True       8497545
```

In [235]:
2234820+8497545

10732365

In [239]:
# Ben's total AKI first hosp (primary-secondary in one) 
#  9,272,274
# 10,732,365

### Diag vars were FINE the whole time, but now they are faster.