In [2]:
import pandas as pd
import numpy as np
import os
from pandasql import sqldf
from scipy import stats
import time
import networkx as nx

In [15]:
dataset_loc = '...' # Insert path to preprocessed dataset as csv

# Read the preprocessed table, then parse the event dates into pandas
# timestamps so later cells can do date arithmetic on them.
bnf_df = pd.read_csv(dataset_loc).assign(
    eventdate=lambda frame: pd.to_datetime(frame['eventdate'])
)

In [12]:
# Helper fn - Count number of possible matching rows in df for a given row
def num_cond_rows(df, col1, col2, col3, col4, col5, col6, col7, val1, val2, val3, val4, val5, val6, val7,
                  id_col='patid', flag_col='event2_bnf IS NOT NULL'):
    """Count comparison patients matching a reference row, and how many are flagged.

    A row of ``df`` is a match when all of the following hold:

    * ``df[col1]`` lies strictly within 30 days either side of ``val1``
    * ``df[col2] == val2``
    * ``df[col3]`` lies strictly within 5 either side of ``val3``
    * ``df[col4] == val4``
    * ``df[col5] != val5``
    * ``df[col6] != val6``
    * ``df[col7] == val7``

    Parameters
    ----------
    df : pandas.DataFrame
        Table to search; must contain ``col1``..``col7``, ``id_col`` and ``flag_col``.
    col1, ..., col7 : str
        Column names used by the filters above.
    val1, ..., val7
        Reference values; ``val1`` must support ``+/- pd.Timedelta`` and
        ``val3`` must support ``+/- 5``.
    id_col : str, default 'patid'
        Column identifying a patient.
    flag_col : str, default 'event2_bnf IS NOT NULL'
        0/1 indicator column that is summed over one randomly chosen row per
        patient.

    Returns
    -------
    list
        ``[n_patients, n_flagged]``: the number of distinct matching patients
        and the number of those whose randomly chosen row has the flag set.
        ``[0, 0]`` when nothing matches.
    """
    window = pd.Timedelta(days=30)
    mask = (
        (df[col1] > val1 - window) &
        (df[col1] < val1 + window) &
        (df[col2] == val2) &
        (df[col3] > val3 - 5) &
        (df[col3] < val3 + 5) &
        (df[col4] == val4) &
        (df[col5] != val5) &
        (df[col6] != val6) &
        (df[col7] == val7)
    )

    # Only the patient id and the flag are needed from here on.
    # Rows without a patient id cannot be attributed to anyone, so drop them
    # to keep the patient count and the flag sum consistent.
    matches = df.loc[mask, [id_col, flag_col]].dropna(subset=[id_col])
    if matches.empty:
        return [0, 0]

    # Limit to one prescription per patient (choose randomly): shuffle the
    # rows, then keep the first row seen for each patient.
    one_per_patient = matches.sample(frac=1).drop_duplicates(subset=id_col)

    n_patients = int(matches[id_col].nunique())       # Total number of possible matching patients
    n_flagged = int(one_per_patient[flag_col].sum())  # Of these, how many have the flag set
    return [n_patients, n_flagged]

In [3]:
bnf1_list = bnf_df['section_int'].value_counts()[:50].index # Top 50 most common BNF sections
bnf2_list = bnf_df['section_int'].value_counts()[:50].index
result = []

# Column name given to the unaliased "event2_bnf IS NOT NULL" expression in
# the query below (1 when a later D2 prescription was joined, else 0).
flag_col = 'event2_bnf IS NOT NULL'

# Number of comparison rows (C) used to estimate the baseline proportion
n_comparisons = 50

# For every prescription row (a), left-join any later prescription (b) of the
# same patient in section D2 that occurs within 1825 days (~5 years).
# NOTE(review): the query filters on `bnfsection`, while the section lists
# above and the D1 filter below use `section_int` - confirm both columns
# encode the section identically.
query_template = """
    SELECT *, event2_bnf IS NOT NULL FROM (
    SELECT a.*, b.bnfsection AS event2_bnf, b.eventdate as event2_date
    FROM bnf_df AS a
    LEFT JOIN bnf_df AS b
    ON a.patid = b.patid
    AND b.eventdate > a.eventdate
    AND julianday(b.eventdate) - julianday(a.eventdate) < 1825
    AND b.bnfsection = {}
    LIMIT 100000000) AS T
    LIMIT 100000000;
    """


def _binom_pvalue(k, n, p):
    """One-sided ('greater') binomial test p-value, working across SciPy versions."""
    if hasattr(stats, 'binomtest'):  # binom_test was removed from newer SciPy
        return stats.binomtest(k, n=n, p=p, alternative='greater').pvalue
    return stats.binom_test(k, n=n, p=p, alternative='greater')


for bnf2 in bnf2_list: # Cycle through bnf2_list
    # Dataframe for all patients and include flag if they are prescribed with D2 within 5 yrs
    q = query_template.format(bnf2)
    df = sqldf(q)

    # SQLite returns dates as text; convert eventdate back to datetime datatype
    df['eventdate'] = pd.to_datetime(df['eventdate'])

    for bnf1 in bnf1_list:
        if bnf1 == bnf2:
            continue # Skip to next iteration if bnf1 == bnf2

        # Dataframe for all D1 patients, with those leading to D2 indicated
        D1_patients = df.loc[df['section_int'] == bnf1]

        # Find earliest episode with D1 per patient
        if not D1_patients.empty:
            D1_patients = D1_patients.loc[D1_patients.groupby('patid').eventdate.idxmin()]

        # Total D1 patients which lead to D2
        num_D1_to_D2 = int(D1_patients[flag_col].sum())

        # Just choose necessary columns and convert to array to iterate over
        D1 = D1_patients[["patid", "eventdate", "gender", "yob", "gen_ethnicity", "bnfsection", flag_col]].to_numpy()

        # Shuffle the rows so that the chosen C rows are random
        np.random.shuffle(D1)

        # Create list of number of matching patients per row in df
        # And list of how many of these lead to D2
        n_match = []
        for i in D1:
            # num_cond_rows takes seven (column, value) pairs but only six
            # matching criteria are used here, so the gender equality is
            # repeated as a harmless seventh filter.
            to_add = num_cond_rows(df, 'eventdate', 'gender', 'yob', 'gen_ethnicity', 'bnfsection', 'patid', 'gender',
                                   i[1], i[2], i[3], i[4], i[5], i[0], i[2])

            # Only consider these comparison patients if there are more than 10 similar to the exposure group encounter
            if to_add[0] > 10:
                n_match.append(to_add)

            if len(n_match) >= n_comparisons: # Stop once C comparison rows are collected
                break

        Total_D1_patients = len(D1)

        # Calc p-value
        if n_match:
            # Average the calculated proportions (every x[0] is > 10, so no division by zero)
            p_est = np.mean([x[1] / x[0] for x in n_match])
            pval = _binom_pvalue(num_D1_to_D2, Total_D1_patients, p_est)
        else:
            # No usable comparison group for this pair: record it as undefined
            p_est = float('nan')
            pval = float('nan')

        # Save results
        bnf_pair_result = [bnf1, bnf2, Total_D1_patients, num_D1_to_D2, p_est, pval]
        print(bnf_pair_result)

        result.append(bnf_pair_result)

# pd.DataFrame(result).to_csv('BNFSectionsAltResults.csv', index=False)        