In [1]:
import pandas as pd
import numpy as np
import os
from pandasql import sqldf
from scipy import stats
import time
import networkx as nx
import simple_icd_10_cm as cm
import random

In [4]:
num_pats = 120000 # CHOOSE
dataset_loc = '...' # Insert path to preprocessed csv
random_seed = 0 # CHOOSE: any fixed integer; makes the random patient subsample repeatable

# Seed NumPy's global generator before the first random draw so that a fresh
# "Restart & Run All" selects the same patients every time.
np.random.seed(random_seed)

# Additional preprocessing

dx_df2 = pd.read_csv(dataset_loc)
# ICD-10 sections used as network nodes: the 50 most frequent sections in the full dataset
icd_list_text = dx_df2['section'].value_counts()[:50].index
# Choose num_pats distinct patients at random (np.random.choice raises ValueError
# if fewer than num_pats unique patients are available)
pats = np.random.choice(dx_df2['patid'].unique(), num_pats, replace=False)
dx_df = dx_df2[dx_df2['patid'].isin(pats)]
dx_df = dx_df.reset_index(drop=True)

# Create section to section_int lookup (one row per pair present in the sample, with its row count)
lookup = dx_df.groupby(['section', 'section_int']).size().reset_index().rename(columns={0:'count'})

# section name -> integer code, built once instead of filtering `lookup` per section.
# setdefault keeps the first code listed for a name, which is the row the
# per-section filter + [0] would have picked.
section_to_int = {}
for sec_name, sec_code in zip(lookup['section'].tolist(), lookup['section_int'].tolist()):
    section_to_int.setdefault(sec_name, sec_code)

# Convert list of ICD-10 sections to integers
icd1_list = []
for icd in icd_list_text:
    if icd not in section_to_int:
        # The top-50 list comes from the full dataset, but `lookup` only covers the
        # sampled patients; a section with no sampled rows cannot be a node.
        print("Section {} has no rows in the sampled patients; skipping".format(icd))
        continue
    icd1_list.append(section_to_int[icd])
icd2_list = list(icd1_list) # Same codes, separate list object (D1 and D2 candidates coincide)

# Model

result = []
for icd2 in icd2_list: # Outer loop: candidate follow-up section (D2)
    # Self-join the episode table: each episode (a) is paired with any later episode (b)
    # of the same patient that lies in section icd2 and starts fewer than 1825 days
    # after a. The LEFT JOIN keeps unmatched episodes with ep2_icd NULL, so the
    # 'ep2_icd IS NOT NULL' column is 1 where such a follow-up exists and 0 otherwise.
    q = """
    SELECT *, ep2_icd IS NOT NULL FROM (
    SELECT a.*, b.patid AS patid_dup, b.section_int AS ep2_icd, b.epistart as ep2_start
    FROM dx_df AS a
    LEFT JOIN dx_df AS b
    ON a.patid = b.patid
    AND b.epistart > a.epistart
    AND julianday(b.epistart) - julianday(a.epistart) < 1825
    AND b.section_int = {}
    LIMIT 100000000) AS T
    LIMIT 100000000;
    """.format(icd2)
    df = sqldf(q)

    # epistart must be a datetime for the per-patient idxmin below
    df['epistart'] = pd.to_datetime(df['epistart'])

    for icd1 in icd1_list: # Inner loop: candidate index section (D1)
        if icd1 != icd2: # A section is never paired with itself
            # All joined rows whose index episode is in section icd1
            D1_patients = df.loc[df['section_int'] == icd1]

            # Keep one row per patient: the one with the earliest D1 episode
            D1_patients = D1_patients.loc[D1_patients.groupby('patid').epistart.idxmin()]

            # Number of those patients flagged as having a later D2 episode
            num_D1_to_D2 = D1_patients['ep2_icd IS NOT NULL'].sum()

            Total_D1_patients = D1_patients.shape[0]

            # Record: D1 section name, D2 section name, D1 patient count, D1 -> D2 count
            dx_pair_result = [
                lookup.loc[lookup['section_int'] == icd1, 'section'].iloc[0],
                lookup.loc[lookup['section_int'] == icd2, 'section'].iloc[0],
                Total_D1_patients,
                num_D1_to_D2,
            ]
            result.append(dx_pair_result)

MainResult = pd.DataFrame(result)
print("Main model finished")

In [None]:
# Create matched dataset
dx_df['epistart'] = pd.to_datetime(dx_df['epistart'])

# Column labels applied to the matched rows. The positional row[...] lookups in the
# loop assume dx_df's columns are in exactly this order
# (row[0]=patid, row[2]=epistart, row[5]=gender, row[6]=yob, row[9]=IP?,
#  row[12]=gen_ethnicity_int, row[14]=section_int) -- TODO confirm against the csv.
matched_cols = ['patid','icd', 'epistart', 'spno', 'num',
                'gender', 'yob', 'gen_ethnicity', 'admimeth',
                'IP?', 'chapter', 'section', 'gen_ethnicity_int', 'icd_int',
                'section_int']

# Half-width of the episode-date matching window; built once rather than per row
match_window = pd.Timedelta(days = 30)

# Placeholder used when an episode has no eligible match (patid == -1 marks it)
no_match_row = pd.DataFrame(np.array([-1,0,'2000-01-01',0,0,0,0,'NO MATCH',0,0,0,'NO MATCH',
                            0,0,0],ndmin=2), columns = matched_cols)

matched_rows = []
for row in dx_df.itertuples(index=False, name='Pandas'):
    # Candidate matches: episode of a different patient, in a different section, with
    # start date within 30 days, year of birth within 5 years (both exclusive), and
    # the same gender, ethnicity code and IP? flag.
    candidates = dx_df[
        (row[2] - match_window < dx_df['epistart']) &
        (dx_df['epistart'] < row[2] + match_window) &
        (dx_df['gender'] == row[5]) &
        (row[6] - 5 < dx_df['yob']) &
        (dx_df['yob'] < row[6] + 5) &
        (dx_df['gen_ethnicity_int'] == row[12]) &
        (dx_df['section_int'] != row[14]) &
        (dx_df['patid'] != row[0]) &
        (dx_df['IP?'] == row[9])
    ]
    if candidates.empty:
        # Explicit emptiness test instead of a bare `except:` around .sample(1):
        # genuine errors (wrong column, bad dtype) and KeyboardInterrupt now
        # surface instead of being recorded as "NO MATCH".
        matched_rows.append(no_match_row)
    else:
        # Pick one eligible episode at random
        matched_rows.append(candidates.sample(1))

# Convert to dataframe
matched_df = pd.DataFrame(np.array(matched_rows).squeeze(), columns = matched_cols)
# Change column types (values come out of the numpy conversion untyped, and the
# placeholder rows hold strings, so cast the numeric columns back to integers)
matched_df = matched_df.astype({'patid': 'int64', 'spno': 'int64',
             'gender': 'int64', 'yob': 'int64', 'IP?': 'int64', 'num': 'int64',
             'chapter': 'int64', 'gen_ethnicity_int': 'int64', 'section_int': 'int64',
             'icd_int': 'int64'})

# Rename columns with an m_ prefix so they do not clash with dx_df's columns when joined
matched_df = matched_df.rename(columns={"patid": "m_patid", "spno": "m_spno", "epikey": "m_epikey", "epistart": "m_epistart", "epiend": "m_epiend",
                          "icd": "m_icd", "icdx": "m_icdx", "d_order": "m_d_order", "gender": "m_gender", "yob": "m_yob",
                          "gen_ethnicity": "m_gen_ethnicity", "dismeth": "m_dismeth", "admimeth": "m_admimeth", "IP?": "m_IP?",
                          "chapter": "m_chapter", "section": "m_section", "gen_ethnicity_int": "m_gen_ethnicity_int",
                          "section_int": "m_section_int", "icd_int": "m_icd_int", "num": "m_num"})

print("matched df constructed")

In [None]:
# Count number of matched patients who are diagnosed with D2 within 5 years

# Join original and matched dataframes
fulldf = pd.concat([dx_df, matched_df], axis=1, join="inner") # Row i of matched_df is the match drawn for row i of dx_df

# Convert date columns to datetime datatype
fulldf['epistart'] = pd.to_datetime(fulldf['epistart'])
fulldf['m_epistart'] = pd.to_datetime(fulldf['m_epistart'])

result = []
for icd2 in icd2_list: # Cycle through icd2_list
    # The D2 section label does not depend on icd1, so resolve it once per outer iteration
    d2_section = lookup[lookup['section_int']==icd2]['section'].tolist()[0]

    # Join each (episode, matched episode) row to any episode of the MATCHED patient
    # that is in section icd2 and starts after, and fewer than 1825 days after,
    # m_epistart. The LEFT JOIN keeps rows without such an episode (ep2_icd NULL), so
    # 'ep2_icd IS NOT NULL' flags matched patients diagnosed with D2 in that window.
    q = """
    SELECT *, ep2_icd IS NOT NULL FROM (
    SELECT a.*, b.patid AS patid_dup, b.section as ep2_icd, b.section_int AS ep2_icd_int, b.epistart as ep2_start 
    FROM fulldf AS a
    LEFT JOIN dx_df AS b
    ON a.m_patid = b.patid
    AND b.epistart > a.m_epistart
    AND julianday(b.epistart) - julianday(a.m_epistart) < 1825
    AND b.section_int = {}
    LIMIT 100000000) AS T
    LIMIT 100000000;
    """.format(icd2)
    df = sqldf(q)

    # Convert epistart column to datetime datatype (needed for idxmin below)
    df['epistart'] = pd.to_datetime(df['epistart'])

    for icd1 in icd1_list:
        if icd1 == icd2:
            continue # Skip to next iteration if icd1 == icd2

        # Dataframe for all D1 patients with their matches and those leading to D2 indicated
        D1_patients = df.loc[df['section_int'] == icd1]

        # Find earliest episode with D1 per patient
        D1_patients = D1_patients.loc[D1_patients.groupby('patid').epistart.idxmin()]

        # Filter out NO MATCH rows
        D1_patients = D1_patients[D1_patients["m_patid"]!=-1]

        # Filter out rows where matched diagnosis == D2
        D1_V2 = D1_patients[D1_patients['m_section_int']!=icd2]

        # Record: D1 section, D2 section, number of usable matches, matches leading to D2
        result.append([
            lookup[lookup['section_int']==icd1]['section'].tolist()[0],
            d2_section,
            len(D1_V2),
            D1_V2['ep2_icd IS NOT NULL'].sum(),
        ])

MatchResult = pd.DataFrame(result) # Convert results to dataframe
# Name the positional columns (a no-op for MainResult if this cell was already run once)
MainResult = MainResult.rename(columns = {0:'D1', 1:'D2', 2:'Total_D1_patients', 3:'D1toD2'})
MatchResult = MatchResult.rename(columns = {0:'D1', 1:'D2', 2:'matchlen', 3:'matchtoD2'})

# The two tables are combined by row position, so they must list the same
# (D1, D2) pairs in the same order.
assert MainResult[['D1', 'D2']].equals(MatchResult[['D1', 'D2']]), \
    "MainResult and MatchResult rows are not aligned; re-run the main model cell"

# Combine results into one df. Any matchlen/matchtoD2 columns left over from an
# earlier run of this cell are dropped first so re-running does not create
# duplicate columns.
MainResult = pd.concat(
    [MainResult.drop(columns=['matchlen', 'matchtoD2'], errors='ignore'),
     MatchResult[['matchlen', 'matchtoD2']]],
    axis=1)

In [None]:
# Calculate relative risk as well as RR confidence intervals

def RelRisk(row):
    """Relative risk of D2 among D1 patients versus their matched controls.

    Parameters
    ----------
    row : mapping
        Must provide 'D1toD2', 'Total_D1_patients', 'matchtoD2' and 'matchlen'.

    Returns
    -------
    number
        (D1toD2 / Total_D1_patients) / (matchtoD2 / matchlen), or the neutral
        value 1 when the ratio cannot be formed: no matched control reached D2,
        or either group is empty.
    """
    exposed_total = row['Total_D1_patients']
    control_total = row['matchlen']
    control_events = row['matchtoD2']
    # Guard every denominator, not only matchtoD2, so an empty group yields the
    # same neutral fallback instead of a ZeroDivisionError.
    if control_events > 0 and exposed_total > 0 and control_total > 0:
        return (row['D1toD2']/exposed_total)/(control_events/control_total)
    return 1
MainResult['RR'] = MainResult.apply(RelRisk, axis=1) # one relative risk per (D1, D2) row

def LogCI(row): # Calculate SE(log(RR))
    """Standard error of log(RR) for one (D1, D2) row.

    Despite the name, the value returned is a standard error, not an interval.
    Returns infinity when either event count ('D1toD2' or 'matchtoD2') is zero;
    otherwise sqrt(1/D1toD2 - 1/Total_D1_patients + 1/matchtoD2 - 1/matchlen).
    """
    exposed_events = row['D1toD2']
    control_events = row['matchtoD2']
    if control_events != 0 and exposed_events != 0:
        variance = 1/exposed_events-1/row['Total_D1_patients']+1/control_events-1/row['matchlen']
        return np.sqrt(variance)
    return float("inf")
    
MainResult['Log_CI'] = MainResult.apply(LogCI, axis=1) # SE(log(RR)) per row

def CImin(row, z=4.26): # Lower bound for CI
    """Lower confidence bound for the relative risk, computed on the log scale.

    'Log_CI' holds SE(log(RR)), so the bound is exp(log(RR) - z * SE). The
    previous form exp(log(RR - z * SE)) reduced to RR - z * SE, which mixes a
    log-scale standard error with a linear-scale estimate.

    Parameters
    ----------
    row : mapping
        Must provide 'RR' and 'Log_CI'.
    z : float, optional
        z-score for the desired confidence level. Change 4.26 to the
        appropriate (Bonferroni-corrected) z-score.

    Returns
    -------
    float or int
        The lower bound; 0 when RR is not positive (including NaN) or the
        standard error is infinite/NaN, i.e. the interval is unbounded below.
    """
    rr = row['RR']
    se = row['Log_CI']
    # `not (rr > 0)` is also true for NaN, so invalid estimates fall back to 0
    if not (rr > 0) or not np.isfinite(se):
        return 0
    return float(np.exp(np.log(rr) - z * se))
MainResult['CImin'] = MainResult.apply(CImin, axis=1) # lower CI bound per row

def CImax(row, z=4.26): # Upper bound for CI
    """Upper confidence bound for the relative risk, computed on the log scale.

    'Log_CI' holds SE(log(RR)), so the bound is exp(log(RR) + z * SE). The
    previous form exp(log(RR + z * SE)) reduced to RR + z * SE, which mixes a
    log-scale standard error with a linear-scale estimate.

    Parameters
    ----------
    row : mapping
        Must provide 'RR' and 'Log_CI'.
    z : float, optional
        z-score for the desired confidence level. Change 4.26 to the
        appropriate (Bonferroni-corrected) z-score.

    Returns
    -------
    float or int
        The upper bound. Infinity when the standard error is infinite (as
        before); 10 when RR is negative or either input is NaN (the existing
        fallback for invalid input).
    """
    rr = row['RR']
    se = row['Log_CI']
    # `not (rr >= 0)` is also true for NaN; `se != se` is the NaN test for se
    if not (rr >= 0) or se != se:
        return 10
    if se == float("inf"):
        return float("inf")
    if rr == 0:
        # log(0) is -inf, so the bound collapses to 0 for any finite SE
        return 0.0
    return float(np.exp(np.log(rr) + z * se))
MainResult['CImax'] = MainResult.apply(CImax, axis=1) # upper CI bound per row

def CIover1(row): # Indicate whether the CI lies entirely on one side of 1
    """Return 1 when the interval [CImin, CImax] does NOT straddle 1, else 0.

    The interval counts as one-sided when both bounds are <= 1 or both are
    >= 1 (a bound exactly equal to 1 satisfies either test).
    """
    lower = row['CImin']
    upper = row['CImax']
    entirely_below = lower <= 1 and upper <= 1
    entirely_above = lower >= 1 and upper >= 1
    return 1 if (entirely_below or entirely_above) else 0
MainResult['CIover1'] = MainResult.apply(CIover1, axis=1) # 1 where the CI excludes crossing 1

def CIsize(row):
    """Width of the confidence interval (CImax - CImin).

    An infinite upper bound is reported as a fixed width of 10 so that it can
    still be averaged with the finite widths.
    """
    upper = row['CImax']
    if upper != float("inf"):
        return upper - row['CImin']
    return 10
MainResult['CIsize'] = MainResult.apply(CIsize, axis=1) # CI width per row

# Save to csv
# MainResult.to_csv('ICDSec{}Results.csv'.format(num_pats), index=False)

# Final result metrics to report
# Summary figures, in the order they are reported
metrics = [
    len(fulldf), # Total dataset size
    1-np.sum(matched_df['m_patid']==-1)/len(matched_df), # Fraction of rows for which a match was found
    np.mean(MainResult['CIsize']), # Av RR interval size
    np.sum(MainResult['CIover1'])/len(MainResult), # Fraction of CIs that do not cross 1
]
# pd.DataFrame(metrics).to_csv('ICDSec{}Metrics.csv'.format(num_pats), index=False)

In [None]:
# Compare results with another network

# Load the other network and rename its RR column so it does not clash with
# MainResult's. Previously the loaded frame was overwritten before use and
# MainResult['RR'] was compared with itself, so every distance came out as 0.
# Assumes the csv has an 'RR' column, as a saved MainResult would -- TODO confirm.
df1 = pd.read_csv('...').rename(columns={'RR': "RR2"}) # Load another network
# Pair the two RR columns by row index; both networks are assumed to list the
# same (D1, D2) pairs in the same row order -- verify before trusting the distances.
df2 = pd.concat([df1['RR2'], MainResult['RR']], axis=1, join="inner")

# Compare adjacency matrices using various metrics (vectorised over all pairs)
rr_other = df2['RR2']
rr_main = df2['RR']
rr_diff = rr_other - rr_main
dists = [
    np.sqrt((rr_diff ** 2).sum()), # Euclidean
    rr_diff.abs().sum(), # Manhattan
    1 - np.minimum(rr_other, rr_main).sum() / np.maximum(rr_other, rr_main).sum(), # Weighted Jaccard
]
# pd.DataFrame(dists).to_csv('...'.format(num_pats), index=False) # Save distance metric results