In [1]:
import pandas as pd
import numpy as np
import os
from pandasql import sqldf
from scipy import stats
import time
import networkx as nx

In [None]:
# Load the diagnosis dataset and the matched-patient dataset from CSV.

dataset_loc = '...' # Insert path to dataset
matched_dataset_loc = '...' # Insert path to matched dataset

# Diagnosis records; `epistart` is parsed into datetimes right after loading
dx_df = pd.read_csv(dataset_loc).assign(
    epistart=lambda frame: pd.to_datetime(frame['epistart'])
)
# Matched records are kept exactly as read from disk (no date parsing here)
match = pd.read_csv(matched_dataset_loc)

In [9]:
# Lookup table: dateindex -> matchindex, built row by row from dx_df
# (if a dateindex occurs more than once, the last row's matchindex is kept)
indexes = {
    date_idx: match_idx
    for date_idx, match_idx in zip(dx_df['dateindex'].to_list(), dx_df['matchindex'].to_list())
}

In [2]:
# 2-disease model
#
# For every [D1, D2, D3] entry in `icdlist`:
#   1. select patients with a D1 and a D2 record less than 365 days apart,
#   2. look up the corresponding rows in `match` through `indexes`,
#   3. count, in both groups, who has a later D3 record within 1825 days,
#   4. compute the relative risk of D3 for the D1/D2 group vs. the matched group.
#
# Each row appended to `result` is:
#   [D1, D2, D3, n_D1D2_patients, n_D1D2_with_D3, n_matched, n_matched_with_D3, RR]

icdlist = [] # Insert list of lists where each inner list consists of 2 diagnoses to use as 1 node (first 2 in list)
# and a diagnosis to use as target node (last element in list) 
result = []

for icd1, icd2, icd3 in icdlist:

    dx_pair_result = [icd1, icd2, icd3]

    # Get dataset for patients with D1/D2 within a year of each other
    # (one row per patient: earliest D1 dateindex / epistart)
    q = """ 
    SELECT MIN(dateindex) dateindex, patid, MIN(epistart) epistart FROM (
    SELECT a.dateindex, a.matchindex, a.patid, a.epistart, a.icd, a.gender, a.yob, a.gen_ethnicity, a.IP, a.chapter, a.section, a.gen_ethnicity_int, a.section_int,
    b.epistart epistart2, b.icd icd2, b.IP IP2, b.chapter chapter2, b.section section2
    FROM dx_df AS a
    JOIN dx_df as b
    ON a.patid = b.patid
    AND ABS(julianday(b.epistart) - julianday(a.epistart))<365
    AND a.section_int = {}
    AND b.section_int = {}) AS T
    GROUP BY patid
    """.format(icd1, icd2)
    # NOTE: pandasql resolves `dx_df`, `df` and `match_df` by variable name,
    # so these names must not be changed without editing the queries.
    df = sqldf(q)

    # Get corresponding matched patients to the selected patients
    match_df = match[match['matchindex'].isin([indexes.get(key) for key in df['dateindex']])]

    # Count how many of the D1/D2 patients go on to be diagnosed with D3 within 5 years
    q = """
    SELECT *, ep2_icd IS NOT NULL FROM (
    SELECT a.*, b.patid AS patid_dup, b.section as ep2_icd, b.section_int AS ep2_icd_int, b.epistart as ep2_start
    FROM df AS a
    LEFT JOIN dx_df AS b
    ON a.patid = b.patid
    AND b.epistart > a.epistart
    AND julianday(b.epistart) - julianday(a.epistart) < 1825
    AND b.section_int = {}
    LIMIT 100000000) AS T
    LIMIT 100000000;
    """.format(icd3)
    df2 = sqldf(q)
    df2['epistart'] = pd.to_datetime(df2['epistart'])
    df2['ep2_start'] = pd.to_datetime(df2['ep2_start'])

    # Just select 1 row per patient (the LEFT JOIN yields one row per D3 record)
    df3 = df2.loc[df2.groupby('patid').epistart.idxmin()]
    # Count total number of D1/D2 patients who go on to be diagnosed with D3 within 5 years
    num_D1_to_D2 = df3['ep2_icd IS NOT NULL'].sum()
    # Count total number of D1/D2 patients
    Total_D1_patients = len(df3)

    dx_pair_result.append(Total_D1_patients)
    dx_pair_result.append(num_D1_to_D2)

    # Count how many of the matched patients go on to be diagnosed with D3 within 5 years
    # NOTE(review): `match` is loaded without date parsing, while dx_df.epistart is
    # converted to datetime; confirm both are stored in the same format so that
    # `b.epistart > a.epistart` orders them chronologically.
    q = """
    SELECT *, ep2_icd IS NOT NULL FROM (
    SELECT a.*, b.patid AS patid_dup, b.section as ep2_icd, b.section_int AS ep2_icd_int, b.epistart as ep2_start
    FROM match_df AS a
    LEFT JOIN dx_df AS b
    ON a.patid = b.patid
    AND b.epistart > a.epistart
    AND julianday(b.epistart) - julianday(a.epistart) < 1825
    AND b.section_int = {}
    LIMIT 100000000) AS T
    LIMIT 100000000;
    """.format(icd3)
    df2 = sqldf(q)
    df2['epistart'] = pd.to_datetime(df2['epistart'])
    df2['ep2_start'] = pd.to_datetime(df2['ep2_start'])

    # Just select 1 row per patient
    df3_match = df2.loc[df2.groupby('patid').epistart.idxmin()]
    # Filter out rows where no match was found
    df3_match = df3_match[df3_match["patid"]!=-1]
    # Count total number of matched patients who go on to be diagnosed with D3 within 5 years
    num_D1_to_D2_match = df3_match['ep2_icd IS NOT NULL'].sum()
    # Count total number of matched patients
    Total_matched_patients = len(df3_match)

    # Calculate relative risk:
    #   RR = (num_D1_to_D2 / Total_D1_patients) / (num_D1_to_D2_match / Total_matched_patients)
    # The denominator uses Total_D1_patients (the original referenced an
    # undefined name, `D1len`, which raised NameError on a fresh kernel).
    rr_numerator = num_D1_to_D2 * Total_matched_patients
    rr_denominator = num_D1_to_D2_match * Total_D1_patients
    if rr_denominator:
        RR = rr_numerator / rr_denominator
    else:
        # Zero denominator: unbounded (x/0) or undefined (0/0) ratio
        RR = float('inf') if rr_numerator else float('nan')

    dx_pair_result.append(Total_matched_patients)
    dx_pair_result.append(num_D1_to_D2_match)
    dx_pair_result.append(RR)
    print(dx_pair_result)

    result.append(dx_pair_result)

# Save csv
# pd.DataFrame(result).to_csv('ICD2nodeResults.csv', index=False)

In [None]:
# 3-disease model
#
# For every [D1, D2, D3, D4] entry in `icdlist`:
#   1. select patients with D2 and D3 records each less than 365 days from a D1 record,
#   2. look up the corresponding rows in `match` through `indexes`,
#   3. count, in both groups, who has a later D4 record within 1825 days,
#   4. compute the relative risk of D4 for the D1/D2/D3 group vs. the matched group.
#
# Each row appended to `result` is:
#   [D1, D2, D3, D4, n_D1D2D3_patients, n_D1D2D3_with_D4, n_matched, n_matched_with_D4, RR]

icdlist = [] # Insert list of lists where each inner list consists of 3 diagnoses to use as 1 node (first 3 in list)
# and a diagnosis to use as target node (last element in list) 
result = []

for icd1, icd2, icd3, icd4 in icdlist:

    # Record the three source diagnoses AND the target diagnosis, so every
    # result row identifies the full combination (as the 2-disease model does).
    dx_pair_result = [icd1, icd2, icd3, icd4]

    # Get dataset for patients with D1/D2/D3 within a year of each other
    # (one row per patient: earliest D1 dateindex / epistart)
    q = """ 
    SELECT MIN(dateindex) dateindex, patid, MIN(epistart) epistart FROM (
    SELECT a.dateindex, a.matchindex, a.patid, a.epistart, a.icd, a.gender, a.yob, a.gen_ethnicity, a.IP, a.chapter, a.section, a.gen_ethnicity_int, a.section_int,
    b.epistart epistart2, b.icd icd2, b.IP IP2, b.chapter chapter2, b.section section2,
    c.epistart epistart3, c.icd icd3, c.IP IP3, c.chapter chapter3, c.section section3
    FROM dx_df AS a
    JOIN dx_df as b
    JOIN dx_df as c
    ON a.patid = b.patid
    AND a.patid = c.patid
    AND ABS(julianday(b.epistart) - julianday(a.epistart))<365
    AND ABS(julianday(c.epistart) - julianday(a.epistart))<365
    AND a.section_int = {}
    AND b.section_int = {}
    AND c.section_int = {}) AS T
    GROUP BY patid
    """.format(icd1, icd2, icd3)
    # NOTE: pandasql resolves `dx_df`, `df` and `match_df` by variable name,
    # so these names must not be changed without editing the queries.
    df = sqldf(q)

    # Get corresponding matched patients to the selected patients
    match_df = match[match['matchindex'].isin([indexes.get(key) for key in df['dateindex']])]

    # Count how many of the D1/D2/D3 patients go on to be diagnosed with D4 within 5 years
    q = """
    SELECT *, ep2_icd IS NOT NULL FROM (
    SELECT a.*, b.patid AS patid_dup, b.section as ep2_icd, b.section_int AS ep2_icd_int, b.epistart as ep2_start
    FROM df AS a
    LEFT JOIN dx_df AS b
    ON a.patid = b.patid
    AND b.epistart > a.epistart
    AND julianday(b.epistart) - julianday(a.epistart) < 1825
    AND b.section_int = {}
    LIMIT 100000000) AS T
    LIMIT 100000000;
    """.format(icd4)
    df2 = sqldf(q)
    df2['epistart'] = pd.to_datetime(df2['epistart'])
    df2['ep2_start'] = pd.to_datetime(df2['ep2_start'])

    # Just select 1 row per patient (the LEFT JOIN yields one row per D4 record)
    df3 = df2.loc[df2.groupby('patid').epistart.idxmin()]
    # Count total number of D1/D2/D3 patients who go on to be diagnosed with D4 within 5 years
    num_D1_to_D2 = df3['ep2_icd IS NOT NULL'].sum()
    # Count total number of D1/D2/D3 patients
    Total_D1_patients = len(df3)

    dx_pair_result.append(Total_D1_patients)
    dx_pair_result.append(num_D1_to_D2)

    # Count how many of the matched patients go on to be diagnosed with D4 within 5 years
    # NOTE(review): `match` is loaded without date parsing, while dx_df.epistart is
    # converted to datetime; confirm both are stored in the same format so that
    # `b.epistart > a.epistart` orders them chronologically.
    q = """
    SELECT *, ep2_icd IS NOT NULL FROM (
    SELECT a.*, b.patid AS patid_dup, b.section as ep2_icd, b.section_int AS ep2_icd_int, b.epistart as ep2_start
    FROM match_df AS a
    LEFT JOIN dx_df AS b
    ON a.patid = b.patid
    AND b.epistart > a.epistart
    AND julianday(b.epistart) - julianday(a.epistart) < 1825
    AND b.section_int = {}
    LIMIT 100000000) AS T
    LIMIT 100000000;
    """.format(icd4)
    df2 = sqldf(q)
    df2['epistart'] = pd.to_datetime(df2['epistart'])
    df2['ep2_start'] = pd.to_datetime(df2['ep2_start'])

    # Just select 1 row per patient
    df3_match = df2.loc[df2.groupby('patid').epistart.idxmin()]
    # Filter out rows where no match was found
    df3_match = df3_match[df3_match["patid"]!=-1]
    # Count total number of matched patients who go on to be diagnosed with D4 within 5 years
    num_D1_to_D2_match = df3_match['ep2_icd IS NOT NULL'].sum()
    # Count total number of matched patients
    Total_matched_patients = len(df3_match)

    # Calculate relative risk:
    #   RR = (num_D1_to_D2 / Total_D1_patients) / (num_D1_to_D2_match / Total_matched_patients)
    # The denominator uses Total_D1_patients (the original referenced an
    # undefined name, `D1len`, which raised NameError on a fresh kernel).
    rr_numerator = num_D1_to_D2 * Total_matched_patients
    rr_denominator = num_D1_to_D2_match * Total_D1_patients
    if rr_denominator:
        RR = rr_numerator / rr_denominator
    else:
        # Zero denominator: unbounded (x/0) or undefined (0/0) ratio
        RR = float('inf') if rr_numerator else float('nan')

    dx_pair_result.append(Total_matched_patients)
    dx_pair_result.append(num_D1_to_D2_match)
    dx_pair_result.append(RR)
    print(dx_pair_result)

    result.append(dx_pair_result)

# Save csv
# pd.DataFrame(result).to_csv('ICD3nodeResults.csv', index=False)

In [None]:
# 4-disease model
#
# For every [D1, D2, D3, D4, D5] entry in `icdlist`:
#   1. select patients with D2, D3 and D4 records each less than 365 days from a D1 record,
#   2. look up the corresponding rows in `match` through `indexes`,
#   3. count, in both groups, who has a later D5 record within 1825 days,
#   4. compute the relative risk of D5 for the D1/D2/D3/D4 group vs. the matched group.
#
# Each row appended to `result` is:
#   [D1, D2, D3, D4, D5, n_source_patients, n_source_with_D5, n_matched, n_matched_with_D5, RR]

icdlist = [] # Insert list of lists where each inner list consists of 4 diagnoses to use as 1 node (first 4 in list)
# and a diagnosis to use as target node (last element in list) 
result = []

for icd1, icd2, icd3, icd4, icd5 in icdlist:

    # Record all four source diagnoses AND the target diagnosis, so every
    # result row identifies the full combination (as the 2-disease model does).
    dx_pair_result = [icd1, icd2, icd3, icd4, icd5]

    # Get dataset for patients with D1/D2/D3/D4 within a year of each other
    # (one row per patient: earliest D1 dateindex / epistart)
    q = """ 
    SELECT MIN(dateindex) dateindex, patid, MIN(epistart) epistart FROM (
    SELECT a.dateindex, a.matchindex, a.patid, a.epistart, a.icd, a.gender, a.yob, a.gen_ethnicity, a.IP, a.chapter, a.section, a.gen_ethnicity_int, a.section_int,
    b.epistart epistart2, b.icd icd2, b.IP IP2, b.chapter chapter2, b.section section2,
    c.epistart epistart3, c.icd icd3, c.IP IP3, c.chapter chapter3, c.section section3,
    d.epistart epistart4, d.icd icd4, d.IP IP4, d.chapter chapter4, d.section section4
    FROM dx_df AS a
    JOIN dx_df as b
    JOIN dx_df as c
    JOIN dx_df as d
    ON a.patid = b.patid
    AND a.patid = c.patid
    AND a.patid = d.patid
    AND ABS(julianday(b.epistart) - julianday(a.epistart))<365
    AND ABS(julianday(c.epistart) - julianday(a.epistart))<365
    AND ABS(julianday(d.epistart) - julianday(a.epistart))<365
    AND a.section_int = {}
    AND b.section_int = {}
    AND c.section_int = {}
    AND d.section_int = {}) AS T
    GROUP BY patid
    """.format(icd1, icd2, icd3, icd4)
    # NOTE: pandasql resolves `dx_df`, `df` and `match_df` by variable name,
    # so these names must not be changed without editing the queries.
    df = sqldf(q)

    # Get corresponding matched patients to the selected patients
    match_df = match[match['matchindex'].isin([indexes.get(key) for key in df['dateindex']])]

    # Count how many of the D1/D2/D3/D4 patients go on to be diagnosed with D5 within 5 years
    q = """
    SELECT *, ep2_icd IS NOT NULL FROM (
    SELECT a.*, b.patid AS patid_dup, b.section as ep2_icd, b.section_int AS ep2_icd_int, b.epistart as ep2_start
    FROM df AS a
    LEFT JOIN dx_df AS b
    ON a.patid = b.patid
    AND b.epistart > a.epistart
    AND julianday(b.epistart) - julianday(a.epistart) < 1825
    AND b.section_int = {}
    LIMIT 100000000) AS T
    LIMIT 100000000;
    """.format(icd5)
    df2 = sqldf(q)
    df2['epistart'] = pd.to_datetime(df2['epistart'])
    df2['ep2_start'] = pd.to_datetime(df2['ep2_start'])

    # Just select 1 row per patient (the LEFT JOIN yields one row per D5 record)
    df3 = df2.loc[df2.groupby('patid').epistart.idxmin()]
    # Count total number of D1/D2/D3/D4 patients who go on to be diagnosed with D5 within 5 years
    num_D1_to_D2 = df3['ep2_icd IS NOT NULL'].sum()
    # Count total number of D1/D2/D3/D4 patients
    Total_D1_patients = len(df3)

    dx_pair_result.append(Total_D1_patients)
    dx_pair_result.append(num_D1_to_D2)

    # Count how many of the matched patients go on to be diagnosed with D5 within 5 years
    # NOTE(review): `match` is loaded without date parsing, while dx_df.epistart is
    # converted to datetime; confirm both are stored in the same format so that
    # `b.epistart > a.epistart` orders them chronologically.
    q = """
    SELECT *, ep2_icd IS NOT NULL FROM (
    SELECT a.*, b.patid AS patid_dup, b.section as ep2_icd, b.section_int AS ep2_icd_int, b.epistart as ep2_start
    FROM match_df AS a
    LEFT JOIN dx_df AS b
    ON a.patid = b.patid
    AND b.epistart > a.epistart
    AND julianday(b.epistart) - julianday(a.epistart) < 1825
    AND b.section_int = {}
    LIMIT 100000000) AS T
    LIMIT 100000000;
    """.format(icd5)
    df2 = sqldf(q)
    df2['epistart'] = pd.to_datetime(df2['epistart'])
    df2['ep2_start'] = pd.to_datetime(df2['ep2_start'])

    # Just select 1 row per patient
    df3_match = df2.loc[df2.groupby('patid').epistart.idxmin()]
    # Filter out rows where no match was found
    df3_match = df3_match[df3_match["patid"]!=-1]
    # Count total number of matched patients who go on to be diagnosed with D5 within 5 years
    num_D1_to_D2_match = df3_match['ep2_icd IS NOT NULL'].sum()
    # Count total number of matched patients
    Total_matched_patients = len(df3_match)

    # Calculate relative risk:
    #   RR = (num_D1_to_D2 / Total_D1_patients) / (num_D1_to_D2_match / Total_matched_patients)
    # The denominator uses Total_D1_patients (the original referenced an
    # undefined name, `D1len`, which raised NameError on a fresh kernel).
    rr_numerator = num_D1_to_D2 * Total_matched_patients
    rr_denominator = num_D1_to_D2_match * Total_D1_patients
    if rr_denominator:
        RR = rr_numerator / rr_denominator
    else:
        # Zero denominator: unbounded (x/0) or undefined (0/0) ratio
        RR = float('inf') if rr_numerator else float('nan')

    dx_pair_result.append(Total_matched_patients)
    dx_pair_result.append(num_D1_to_D2_match)
    dx_pair_result.append(RR)
    print(dx_pair_result)

    result.append(dx_pair_result)

# Save csv
# pd.DataFrame(result).to_csv('ICD4nodeResults.csv', index=False)