In [2]:
import pandas as pd
import numpy as np
import os
from pandasql import sqldf
from scipy import stats
import time
import networkx as nx

In [15]:
# Location of the preprocessed dataset (CSV). Replace the placeholder before running.
dataset_loc = '...' # Insert path to preprocessed dataset as csv

# Read the table and convert the 'epistart' column to datetimes in a single chained step.
dx_df = pd.read_csv(dataset_loc).assign(
    epistart=lambda frame: pd.to_datetime(frame['epistart'])
)

In [12]:
# Helper fn - Count number of possible matching rows in df for a given row
def num_cond_rows(df, col1, col2, col3, col4, col5, col6, col7, val1, val2, val3, val4, val5, val6, val7,
                  date_window_days=30, num_window=5):
    """Count the patients in ``df`` that match a reference row, and how many of them carry the D2 flag.

    A row of ``df`` matches when all of the following hold:

    * ``df[col1]`` lies strictly within ``date_window_days`` days of ``val1``
    * ``df[col2] == val2``
    * ``df[col3]`` lies strictly within ``num_window`` of ``val3``
    * ``df[col4] == val4``
    * ``df[col5] != val5`` and ``df[col6] != val6``
    * ``df[col7] == val7``

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``col1`` .. ``col7`` plus the columns ``'patid'`` and
        ``'ep2_icd IS NOT NULL'`` (both names are hard-coded below).
    col1 .. col7 : str
        Column names used in the conditions listed above.
    val1 .. val7
        Reference values. ``val1`` must support ``+``/``-`` with a
        ``pandas.Timedelta`` and ``val3`` with a number.
    date_window_days : int, default 30
        Half-width, in days, of the open window around ``val1``.
    num_window : int, default 5
        Half-width of the open window around ``val3``.

    Returns
    -------
    list
        ``[n_patients, n_flagged]``: the number of distinct ``patid`` values among
        the matching rows, and the sum of ``'ep2_icd IS NOT NULL'`` over one
        randomly chosen matching row per patient.
    """
    date_tol = pd.Timedelta(days=date_window_days)

    is_match = (
        (df[col1] > val1 - date_tol) & (df[col1] < val1 + date_tol)
        & (df[col2] == val2)
        & (df[col3] > val3 - num_window) & (df[col3] < val3 + num_window)
        & (df[col4] == val4)
        & (df[col5] != val5)
        & (df[col6] != val6)
        & (df[col7] == val7)
    )
    # Rows without a patid cannot be attributed to a patient (groupby/nunique ignore them too).
    matches = df[is_match].dropna(subset=['patid'])

    if matches.empty:
        return [0, 0]

    # Limit to one diagnosis per patient (choose randomly): shuffle the rows, then keep
    # the first row seen for each patient. This picks one real row per patient instead
    # of sampling every column independently, and avoids a Python-level call per group.
    one_per_patient = matches.sample(frac=1).drop_duplicates('patid')

    return [
        len(one_per_patient),                          # Total number of possible matching patients
        one_per_patient['ep2_icd IS NOT NULL'].sum(),  # ...of which flagged as diagnosed with D2
    ]

In [3]:
# For every ordered pair (D1, D2) of the 50 most common ICD-10 sections, test whether
# patients whose first D1 episode is followed by a D2 episode are more frequent than
# expected from matched comparison patients (one-sided binomial test).

N_COMPARISON_ROWS = 50     # C: number of D1 encounters whose matched groups estimate the background rate
MIN_MATCHED_PATIENTS = 10  # a matched group is used only if it holds more than this many patients

top_sections = dx_df['section_int'].value_counts()[:50].index # Top 50 most common ICD-10 sections
icd1_list = top_sections
icd2_list = top_sections

# NOTE(review): the original referenced an undefined frame `hes_dx` here; `dx_df` is assumed
# to be the same table. The lookup also assumes `section_int` is the position of the section
# name in order of first appearance - confirm against the preprocessing step.
section_names = dx_df['section'].unique()

# scipy.stats.binom_test was removed in SciPy 1.12; binomtest is its replacement.
use_binomtest = hasattr(stats, 'binomtest')

result = []

for icd2 in icd2_list: # Cycle through icd2_list
    # Dataframe for all patients and include flag if they are diagnosed with D2 within 5 yrs.
    # The flag is read below under the column name 'ep2_icd IS NOT NULL' (the unaliased
    # expression text), so the SELECT list must not be changed without updating those reads.
    q = """
    SELECT *, ep2_icd IS NOT NULL FROM (
    SELECT a.*, b.patid AS patid_dup, b.section_int AS ep2_icd, b.epistart as ep2_start
    FROM dx_df AS a
    LEFT JOIN dx_df AS b
    ON a.patid = b.patid
    AND b.epistart > a.epistart
    AND julianday(b.epistart) - julianday(a.epistart) < 1825
    AND b.section_int = {}
    LIMIT 100000000) AS T
    LIMIT 100000000;
    """.format(icd2)
    df = sqldf(q)

    # Convert epistart to datetime datatype (the SQL round trip does not keep it as datetime)
    df['epistart'] = pd.to_datetime(df['epistart'])

    for icd1 in icd1_list:
        if icd1 == icd2:
            continue # Skip to next iteration if icd1 == icd2

        # Dataframe for all D1 patients, with those leading to D2 indicated
        D1_patients = df.loc[df['section_int'] == icd1]

        # Find earliest episode with D1 per patient
        D1_patients = D1_patients.loc[D1_patients.groupby('patid').epistart.idxmin()]

        # Total D1 patients, and how many of them lead to D2
        Total_D1_patients = len(D1_patients)
        num_D1_to_D2 = D1_patients['ep2_icd IS NOT NULL'].sum()

        # Just choose necessary columns and convert to array to iterate over
        D1_rows = D1_patients[["patid", "epistart", "gender", "yob", "gen_ethnicity_int", "section_int", "IP?", "ep2_icd IS NOT NULL"]].to_numpy()

        # Shuffle the rows so that the chosen C rows are random
        np.random.shuffle(D1_rows)

        # For each D1 encounter: [number of matched patients, number of those leading to D2]
        n_match = []
        for patid, epistart, gender, yob, ethnicity, section, inpatient, _flag in D1_rows:
            to_add = num_cond_rows(df, 'epistart', 'gender', 'yob', 'gen_ethnicity_int', 'section_int', 'patid', 'IP?',
                                   epistart, gender, yob, ethnicity, section, patid, inpatient)

            # Only consider these comparison patients if there are more than 10 similar to the exposure group encounter
            if to_add[0] > MIN_MATCHED_PATIENTS:
                n_match.append(to_add)

            # Stop once exactly C groups are collected (the original `> 50` collected 51)
            if len(n_match) >= N_COMPARISON_ROWS:
                break

        # Calc p-value
        if n_match:
            # Average the calculated proportions; every kept group has more than
            # MIN_MATCHED_PATIENTS patients, so the denominator is never zero.
            p_est = np.mean([n_d2 / n_patients for n_patients, n_d2 in n_match])
            if use_binomtest:
                pval = stats.binomtest(int(num_D1_to_D2), n=Total_D1_patients, p=p_est, alternative='greater').pvalue
            else:
                pval = stats.binom_test(num_D1_to_D2, n=Total_D1_patients, p=p_est, alternative='greater')
        else:
            # No usable comparison group (or no D1 patients): the background rate is
            # undefined, so record the pair with NaN rather than crash in the test.
            p_est = pval = np.nan

        # Save results
        dx_pair_result = [
            section_names[icd1], # Save name of ICD-10 section
            section_names[icd2],
            Total_D1_patients,
            num_D1_to_D2,
            p_est,
            pval,
        ]
        print(dx_pair_result)

        result.append(dx_pair_result)

# pd.DataFrame(result).to_csv('ICDSectionsAltResults.csv', index=False)