# In[1]:
import pandas as pd
import numpy as np
import os
from pandasql import sqldf
from scipy import stats
import time
import networkx as nx

# In[3]:
# Location of the diagnosis table (CSV). The query run for each pair reads the
# columns patid, epistart and chapter from it.
dataset_loc = '...'
dx_df = pd.read_csv(dataset_loc)

# Insert the diagnosis pairs to test for directionality, one [icd1, icd2] entry
# per pair, e.g. the pairs with RR > 1 in either/both directions.
# Left empty by default: an empty *pair* (``[[]]``) cannot be unpacked into
# (icd1, icd2) and would make the loop over this list fail with a ValueError.
icd_list = []

# Collects one row per tested pair:
# [icd1, icd2, D1first, D2first, total, D1pval, D2pval]
dir_result = []

# For each pair, find the patients who are diagnosed with each diagnosis in the
# pair at least once, flag which diagnosis appears first, and run a one-sided
# binomial test (p = 0.5) in each direction.
def _greater_pvalue(successes, trials):
    """Return the one-sided ('greater') binomial p-value against p = 0.5.

    successes -- number of patients for whom the diagnosis came first
    trials    -- number of patients who have both diagnoses
    """
    if trials == 0:
        # No patient has both diagnoses: P(X >= 0) is trivially 1, and
        # stats.binomtest refuses n = 0, so answer directly.
        return 1.0
    if hasattr(stats, 'binomtest'):
        # stats.binom_test was deprecated and later removed from SciPy;
        # binomtest is its replacement and returns a result object.
        return stats.binomtest(successes, n=trials, p=0.5, alternative='greater').pvalue
    # Older SciPy releases only ship the legacy function.
    return stats.binom_test(successes, n=trials, p=0.5, alternative='greater')


for icd1, icd2 in icd_list:
    # Self-join on patid keeps only patients that have both chapters; GROUP BY
    # yields one row per patient with the earliest epistart of each diagnosis.
    # The strict comparisons mean a patient whose first episodes coincide is
    # counted in neither D1first nor D2first. LIMIT caps the result at
    # 10,000,000 rows.
    q = """
    SELECT MIN(a.patid), MIN(a.epistart), MAX(a.chapter), MAX(b.patid), MIN(b.epistart), MAX(b.chapter),
    MIN(a.epistart)<MIN(b.epistart) as D1first, MIN(a.epistart)>MIN(b.epistart) as D2first
    FROM dx_df AS a
    JOIN dx_df AS b
    ON a.patid = b.patid
    WHERE a.chapter = {}
    AND b.chapter = {}
    GROUP BY a.patid
    LIMIT 10000000;
    """.format(icd1,icd2)
    pairdf = sqldf(q)
    # Cast to plain int: binomtest only accepts integer counts, while the
    # column sum may come back as a float (e.g. when the column has NULLs).
    D1first = int(pairdf['D1first'].sum()) # Number of patients with D1 diagnosed first
    D2first = int(pairdf['D2first'].sum()) # Number of patients with D2 diagnosed first
    total = len(pairdf) # Total number of patients with both diagnoses (ties included)
    D1pval = _greater_pvalue(D1first, total) # p-value from binomial test for D1
    D2pval = _greater_pvalue(D2first, total) # p-value from binomial test for D2
    # Record results
    pair_result = [icd1, icd2, D1first, D2first, total, D1pval, D2pval]
    dir_result.append(pair_result)
    print(pair_result)

# pd.DataFrame(dir_result).to_csv('ICDChaptersDirResults.csv', index=False)   