## To-Do:
- consider using Bayesian probabilities instead of odds ratio

In [92]:
import pandas as pd
pd.set_option('display.max_colwidth', None)
import time
from datetime import datetime
import re
import pprint
import numpy as np
import multiprocessing
from multiprocessing import  Pool

In [3]:
# Runing this cell will ask you to provide the database password. If you need the password, email Tim McLerran at tmclerran@gmail.com to provide evidence 
# that you have been granted access to MIMIC-III data by physionet. If you are unsure how to proceed, just ask the #nlp channel on Slack
import getpass
password = getpass.getpass("\nPlease enter the Neo4j database password to continue \n")

# Create a connection to the working group's Neo4j database of MIMIC-III data
from neo4j import GraphDatabase
driver=GraphDatabase.driver(uri="bolt://76.251.77.235:7687", auth=('neo4j',password))
session=driver.session()


Please enter the Neo4j database password to continue 
 ············


# Input known information to develop problem list
---

## Input problems, output associated problems

### Based on problem name

In [16]:
def comorbidities_of(prob_list):
    
    query = '''
    MATCH (prob1:Problem)<-[:HAS_PROBLEM]-(pt:Patients)
    WHERE prob1.description in {prob_list}
    WITH distinct(pt) AS patients
    MATCH (patients)-[:HAS_PROBLEM]->(prob2:Problem)
    RETURN prob2.description as Description, prob2.cui AS CUI, count(prob2.description) as Number
    ORDER BY Number DESC
    '''.format(prob_list=prob_list)
    comorbidities = session.run(query)
    comorbidities = pd.DataFrame([dict(record) for record in comorbidities])
    
    query = '''
    MATCH (excluded:Problem)
    WHERE excluded.description in {prob_list}
    WITH collect(excluded) as excluded
    MATCH (pt:Patients)-[:HAS_PROBLEM]->(prob:Problem)
    WITH excluded, pt, collect(prob) as problems
    WHERE NONE (prob in problems where prob in excluded)
    MATCH (pt)-[:HAS_PROBLEM]-(prob2:Problem)
    RETURN prob2.description as Description, prob2.cui AS CUI, count(prob2.description) as Number
    ORDER BY Number DESC
    '''.format(prob_list=prob_list)
    gen_problems = session.run(query)
    gen_problems = pd.DataFrame([dict(record) for record in gen_problems])
    
    gen_pop_total = sum(gen_problems['Number'])
    gen_problems['Gen_pop_proportion'] = gen_problems['Number']/gen_pop_total
    
    gen_problems = gen_problems[gen_problems['Number'] > 50]
    
    comorb_total = sum(comorbidities['Number'])
    comorbidities['Comorbidities_proportion'] = comorbidities['Number']/comorb_total
    
    comorbidities = comorbidities[comorbidities['Number'] > 200/len(prob_list)]
    
    # Merge the "Gen_pop_proportion" column from gen_problems into comorbidities
    comorbidities = pd.merge(comorbidities, gen_problems, on=['CUI', 'Description'])
    
    comorbidities['Odds_Ratio'] = (comorbidities['Comorbidities_proportion']/comorbidities['Gen_pop_proportion'])
    
  
    comorbidities.sort_values(by='Odds_Ratio', ascending=False, inplace=True)
    
    return comorbidities.head(20)
    
  

In [19]:
start_time = time.time()

prob_list = ['Diabetes Mellitus', 'Peripheral Vascular Diseases']
watch_out_for_these = comorbidities_of(prob_list)

print("Total runtime:", time.time() - start_time, "seconds")
# watch_out_for_these.loc[:,['Description', 'CUI', 'Odds_Ratio']]
watch_out_for_these

Total runtime: 2.185105085372925 seconds


Unnamed: 0,Description,CUI,Number_x,Comorbidities_proportion,Number_y,Gen_pop_proportion,Odds_Ratio
150,Restless Legs Syndrome,C0035258,130,0.000989,70,0.000115,8.637903
178,Dyspnea on exertion,C0231807,111,0.000845,79,0.000129,6.5352
118,Postherpetic neuralgia,C0032768,181,0.001377,140,0.000229,6.01331
183,Superinfection,C0038826,109,0.000829,87,0.000142,5.827339
95,Parkinson Disease,C0030567,249,0.001895,203,0.000332,5.70514
145,Heart Diseases,C0018799,136,0.001035,126,0.000206,5.02032
106,Osteomyelitis,C0029443,210,0.001598,205,0.000335,4.764622
92,Neuropathy,C0442874,254,0.001933,266,0.000435,4.441351
188,Vascular Diseases,C0042373,106,0.000807,133,0.000218,3.706954
25,Diabetes,C0011847,1030,0.007838,1343,0.002197,3.567174


### Based on problem CUI

#### Find CUIs for the problem(s) of interest

In [6]:
# Define a function that conducts a fuzzy fulltext search on an index of UMLS strings

def string_umls_search_problems(target):
    query = '''
    CALL db.index.fulltext.queryNodes("Pt_Problems", 'description:{target}') YIELD node, score
    RETURN node.description as Description, node.cui AS cui, count(node) as freq, score
    LIMIT 10'''.format(target=target)
    data = session.run(query)
    return pd.DataFrame([dict(record) for record in data])

In [12]:
# Let user type in free text the name of the problem
start_time = time.time()
result = string_umls_search_problems('Diabetes Mellitus')
print("Total runtime:", time.time() - start_time, "seconds")

# Return a list of terms that may match the problem the user has identified
result

Total runtime: 0.04702615737915039 seconds


Unnamed: 0,Description,cui,freq,score
0,Diabetes Mellitus,C0011849,4865,3.926992
1,"Diabetes Mellitus, Insulin-Dependent",C0011854,1412,2.766761
2,"Diabetes Mellitus, Non-Insulin-Dependent",C0011860,1717,2.410647
3,Gastroparesis due to diabetes mellitus,C0267176,13,2.410647
4,Diabetes,C0011847,2373,2.408763
5,Diabetes Insipidus,C0011848,78,1.90371
6,Brittle diabetes,C0342302,4,1.90371
7,Central Diabetes Insipidus,C0687720,2,1.573739
8,Diabetes with coma (disorder),C1263960,1,1.341258


#### Find likely comorbidities of the problem

In [45]:
def comorbidities_of_CUI(cui_prob_list):
    
    query = '''
    MATCH (prob1:Problem)<-[:HAS_PROBLEM]-(pt:Patients)
    WHERE prob1.cui in {cui_prob_list}
    WITH distinct(pt) AS patients
    MATCH (patients)-[:HAS_PROBLEM]->(prob2:Problem)
    RETURN prob2.description as Description, prob2.cui AS CUI, count(prob2.description) as Number
    ORDER BY Number DESC
    '''.format(cui_prob_list=cui_prob_list)
    comorbidities = session.run(query)
    comorbidities = pd.DataFrame([dict(record) for record in comorbidities])
    
    query = '''
    MATCH (excluded:Problem)
    WHERE excluded.cui in {cui_prob_list}
    WITH collect(excluded) as excluded
    MATCH (pt:Patients)-[:HAS_PROBLEM]->(prob:Problem)
    WITH excluded, pt, collect(prob) as problems
    WHERE NONE (prob in problems where prob in excluded)
    MATCH (pt)-[:HAS_PROBLEM]-(prob2:Problem)
    RETURN prob2.description as Description, prob2.cui AS CUI, count(prob2.description) as Number
    ORDER BY Number DESC
    '''.format(cui_prob_list=cui_prob_list)
    gen_problems = session.run(query)
    gen_problems = pd.DataFrame([dict(record) for record in gen_problems])
    
    gen_pop_total = sum(gen_problems['Number'])
    gen_problems['Gen_pop_proportion'] = gen_problems['Number']/gen_pop_total
    
    gen_problems = gen_problems[gen_problems['Number'] > 50]
    
    comorb_total = sum(comorbidities['Number'])
    comorbidities['Comorbidities_proportion'] = comorbidities['Number']/comorb_total
    
    comorbidities = comorbidities[comorbidities['Number'] > 200/len(cui_prob_list)]
    
    # Merge the "Gen_pop_proportion" column from gen_problems into comorbidities
    comorbidities = pd.merge(comorbidities, gen_problems, on=['CUI', 'Description'])
    
    comorbidities['Odds_Ratio'] = (comorbidities['Comorbidities_proportion']/comorbidities['Gen_pop_proportion'])
    comorbidities.sort_values(by='Odds_Ratio', ascending=False, inplace=True)
    
    return comorbidities.loc[:,['Description', 'Odds_Ratio']].head(20)

In [46]:
cui_prob_list = ['C0018802']
comorbidities_of_CUI(cui_prob_list)

Unnamed: 0,Description,Odds_Ratio
69,Pulmonary Hypertension,4.757166
103,"Heart Failure, Diastolic",4.475324
12,Heart failure,4.147978
67,Aortic Valve Stenosis,3.972401
75,Obesity,3.316616
50,Cardiomyopathies,3.278133
90,Glaucoma,2.723645
68,Coronary Artery Disease,2.711165
79,Normocytic anemia,2.621233
41,Chronic anemia,2.478086


## Input prescribed medications, output likely problems

In [31]:
# # Create a fulltext index for the string property of the nodes labeled String_UMLS
# command = '''CALL db.index.fulltext.createNodeIndex("Drug_Names", ["Prescriptions"], ["DRUG"])'''
# session.run(command)

<neo4j.work.result.Result at 0x7f793d80d220>

In [11]:
# Define a function that conducts a fuzzy fulltext search on an index of drug names

def search_drug_names(target):
    query = '''
    CALL db.index.fulltext.queryNodes("Drug_Names", 'DRUG:{target}') YIELD node, score
    RETURN node.DRUG as Name, count(node.DRUG) as freq, score
    ORDER BY freq DESC
    LIMIT 10'''.format(target=target)
    data = session.run(query)
    return pd.DataFrame([dict(record) for record in data])

In [12]:
start_time = time.time()
result = search_drug_names('Levetiracetam~')
print("Total runtime:", time.time() - start_time, "seconds")
result

Total runtime: 0.23864102363586426 seconds


Unnamed: 0,Name,freq,score
0,LeVETiracetam,6746,3.40955
1,Levetiracetam,2031,3.40955
2,LeVETiracetam Oral Solution,234,2.196988
3,LeVETiracetam Solution,96,2.672145
4,*NF* Levetiracetam,10,2.672145
5,Levetiracetam Solution,3,2.672145
6,levetiracetam,1,3.40955


In [39]:
def prob_assoc_Rx(Rx_list):
    
    query = '''
    MATCH (rx1:Prescriptions)-[:CHILD_OF]->(pt:Patients)
    WHERE rx1.DRUG in {Rx_list}
    WITH distinct(pt) AS patients
    MATCH (patients)-[:HAS_PROBLEM]->(prob2:Problem)
    RETURN prob2.description as Description, prob2.cui AS CUI, count(prob2.description) as Number
    ORDER BY Number DESC
    '''.format(Rx_list=Rx_list)
    comorbidities = session.run(query)
    comorbidities = pd.DataFrame([dict(record) for record in comorbidities])
    
    query = '''
    MATCH (excluded:Prescriptions)
    WHERE excluded.DRUG in {Rx_list}
    WITH collect(excluded) as excluded
    MATCH (pt:Patients)<-[:CHILD_OF]-(rx:Prescriptions)
    WITH excluded, pt, collect(rx) as prescriptions
    WHERE NONE (rx in prescriptions where rx in excluded)
    MATCH (pt)-[:HAS_PROBLEM]->(prob2:Problem)
    RETURN prob2.description as Description, prob2.cui AS CUI, count(prob2.description) as Number
    ORDER BY Number DESC
    '''.format(Rx_list=Rx_list)
    gen_problems = session.run(query)
    gen_problems = pd.DataFrame([dict(record) for record in gen_problems])
    
    gen_pop_total = sum(gen_problems['Number'])
    gen_problems['Gen_pop_proportion'] = gen_problems['Number']/gen_pop_total
    
    gen_problems = gen_problems[gen_problems['Number'] > 50]
    
    comorb_total = sum(comorbidities['Number'])
    comorbidities['Comorbidities_proportion'] = comorbidities['Number']/comorb_total
    
    comorbidities = comorbidities[comorbidities['Number'] > 50]
    
    # Merge the "Gen_pop_proportion" column from gen_problems into comorbidities
    comorbidities = pd.merge(comorbidities, gen_problems, on=['CUI', 'Description'])
    
    comorbidities['Odds_Ratio'] = (comorbidities['Comorbidities_proportion']/comorbidities['Gen_pop_proportion'])
    comorbidities.sort_values(by='Odds_Ratio', ascending=False, inplace=True)
       
    return comorbidities.loc[:,['Description', 'Odds_Ratio']].head(20)

In [40]:
Rx_list = ['LeVETiracetam']
prob_assoc_Rx(Rx_list)

Unnamed: 0,Description,Odds_Ratio
90,Status Epilepticus,23.713763
49,Anoxic Encephalopathy,16.514395
135,Acute hepatitis,10.533267
3,Seizures,10.065789
25,Epilepsy,9.750796
69,"Encephalitis, St. Louis",9.142128
28,"Hematoma, Subdural",8.543922
137,Cerebral Edema,7.54344
152,"Purpura, Thrombotic Thrombocytopenic",7.368499
186,Thrombotic Microangiopathies,6.810465


## Input labs, output likely problems  

To-do:  
- Merge the LOINC codes and ITEMID numbers from the D_Labitems table with CUIs from UMLS concepts, using LOINC as primary key and LNC as foreign key
- 

In [55]:
# # Create a fulltext index for the string property of the nodes labeled String_UMLS
# command = '''CALL db.index.fulltext.createNodeIndex("Labs", ["D_Labitems"], ["LABEL"])'''
# session.run(command)

<neo4j.work.result.Result at 0x7f0639cdb1c0>

In [67]:
# Define a function that conducts a fuzzy fulltext search on an index of lab tests that have reference ranges

def string_umls_search_labs(target):
    query = '''
    CALL db.index.fulltext.queryNodes("Labs", 'LABEL:{target}') YIELD node, score
    WHERE EXISTS(node.ref_range_lower)
    RETURN node.LABEL as Label, node.FLUID AS Fluid, node.ITEMID as ITEMID, count(node) as Freq, score
    ORDER BY Freq DESC'''.format(target=target)
    data = session.run(query)
    return pd.DataFrame([dict(record) for record in data])

In [76]:
start_time = time.time()
result = string_umls_search_labs('Platelet')
print("Total runtime:", time.time() - start_time, "seconds")
result

Total runtime: 0.016495704650878906 seconds


Unnamed: 0,Label,Fluid,ITEMID,Freq,score
0,Platelet Count,Blood,51265,1,2.385857


In [None]:
abnl_labs_list = ['51265']

In [None]:
def prob_assoc_abnl_lab(abnl_labs_list):
    
    query = '''
    MATCH (lab:Labevents {{FLAG:'abnormal'}})-[:CHILD_OF]->(pt:Patients)
    WHERE lab.ITEMID in {abnl_labs_list}
    WITH distinct(pt) AS patients
    MATCH (patients)-[:HAS_PROBLEM]->(prob2:Problem)
    RETURN prob2.description as Description, prob2.cui AS CUI, count(prob2.description) as Number
    ORDER BY Number DESC
    '''.format(abnl_labs_list=abnl_labs_list)
    comorbidities = session.run(query)
    comorbidities = pd.DataFrame([dict(record) for record in comorbidities])
    
    query = '''
    MATCH (excluded:Prescriptions)
    WHERE excluded.DRUG in {Rx_list}
    WITH collect(excluded) as excluded
    MATCH (pt:Patients)<-[:CHILD_OF]-(rx:Prescriptions)
    WITH excluded, pt, collect(rx) as prescriptions
    WHERE NONE (rx in prescriptions where rx in excluded)
    MATCH (pt)-[:HAS_PROBLEM]->(prob2:Problem)
    RETURN prob2.description as Description, prob2.cui AS CUI, count(prob2.description) as Number
    ORDER BY Number DESC
    '''.format(Rx_list=Rx_list)
    gen_problems = session.run(query)
    gen_problems = pd.DataFrame([dict(record) for record in gen_problems])
    
    gen_pop_total = sum(gen_problems['Number'])
    gen_problems['Gen_pop_proportion'] = gen_problems['Number']/gen_pop_total
    
    gen_problems = gen_problems[gen_problems['Number'] > 50]
    
    comorb_total = sum(comorbidities['Number'])
    comorbidities['Comorbidities_proportion'] = comorbidities['Number']/comorb_total
    
    comorbidities = comorbidities[comorbidities['Number'] > 50]
    
    # Merge the "Gen_pop_proportion" column from gen_problems into comorbidities
    comorbidities = pd.merge(comorbidities, gen_problems, on=['CUI', 'Description'])
    
    comorbidities['Odds_Ratio'] = (comorbidities['Comorbidities_proportion']/comorbidities['Gen_pop_proportion'])
    comorbidities.sort_values(by='Odds_Ratio', ascending=False, inplace=True)
       
    return comorbidities.loc[:,['Description', 'Odds_Ratio']].head(20)

---
---  
# Input problem list to develop plans for each problem
---

## Input problem, output labs likely to be abnormal  

In [3]:
def prob_assoc_abnl_lab(cui_prob_list):
    
    query = '''
    MATCH (prob1:Problem)<-[:HAS_PROBLEM]-(pt:Patients)
    WHERE prob1.cui in {cui_prob_list}
    WITH distinct(pt) AS patients
    MATCH (d:D_Labitems)<-[:CHILD_OF]-(n:Labevents)-[:CHILD_OF]->(patients)
    RETURN d.ITEMID AS ITEMID, d.LABEL as `Abnormal Lab`, d.FLUID as `Source`, COUNT(n.FLAG = 'abnormal') AS abnormal, COUNT(n) as total
    ORDER BY total DESC
    '''.format(cui_prob_list=cui_prob_list)
    with_prob_labs = session.run(query)
    with_prob_labs = pd.DataFrame([dict(record) for record in with_prob_labs])
    
    query = '''
    MATCH (excluded:Problem)
    WHERE excluded.cui in {cui_prob_list}
    WITH collect(excluded) as excluded
    MATCH (pt:Patients)-[:HAS_PROBLEM]->(prob:Problem)
    WITH excluded, pt, collect(prob) as problems
    WHERE NONE (prob in problems where prob in excluded)
    MATCH (d:D_Labitems)<-[:CHILD_OF]-(n:Labevents)-[:CHILD_OF]->(pt)
    RETURN d.ITEMID AS ITEMID, COUNT(n.FLAG = 'abnormal') AS abnormal, COUNT(n) as total
    ORDER BY total DESC
    '''.format(cui_prob_list=cui_prob_list)
    without_prob_labs = session.run(query)
    without_prob_labs = pd.DataFrame([dict(record) for record in without_prob_labs])
    
    without_prob_labs = without_prob_labs[without_prob_labs['abnormal'] > 10]
    without_prob_labs['without_prob_proportion_abnl'] = without_prob_labs['abnormal']/without_prob_labs['total']
    
    with_prob_labs = with_prob_labs[with_prob_labs['abnormal'] > 10]
    with_prob_labs['with_prob_proportion_abnl'] = with_prob_labs['abnormal']/with_prob_labs['total']
        
    # Merge the "Gen_pop_proportion" column from gen_problems into comorbidities
    with_prob_labs = pd.merge(with_prob_labs, without_prob_labs, on=['ITEMID'])
    
    with_prob_labs['Odds_Ratio'] = (with_prob_labs['with_prob_proportion_abnl']/with_prob_labs['without_prob_proportion_abnl'])
    with_prob_labs.sort_values(by='Odds_Ratio', ascending=False, inplace=True)
    
    return with_prob_labs.loc[:,['Abnormal Lab', 'Source', 'Odds_Ratio']].head(10)

In [6]:
start_time = time.time()

prob_CUI = ['C0023891']
likely_abnormal_labs = prob_assoc_abnl_lab(prob_CUI)

print("Total runtime:", time.time() - start_time, "seconds")
likely_abnormal_labs

Total runtime: 7.1864094734191895 seconds


Unnamed: 0,Abnormal Lab,Source,Odds_Ratio
12,Magnesium,Blood,3.240132
21,"Bilirubin, Total",Blood,2.982233
13,MCH,Blood,2.597554
10,Platelet Count,Blood,2.577995
0,Sodium,Blood,2.40863
11,RDW,Blood,1.973703
26,Calculated Total CO2,Blood,1.965494
20,Asparate Aminotransferase (AST),Blood,1.816216
18,PTT,Blood,1.806141
14,MCHC,Blood,1.782065


## Input problem, output likely prescriptions  
  
To-Do:
- allow user to specify whether the patient has renal or hepatic impairment

In [84]:
def Rx_assoc_with_prob_CUI(prob_CUI):
    
    # Format the input problem CUI as a list
    prob_CUI = [prob_CUI]
    
    query = '''
    MATCH (prob1:Problem)<-[:HAS_PROBLEM]-(pt:Patients)
    WHERE prob1.cui in {prob_CUI}
    WITH distinct(pt) AS patients
    MATCH (patients)<-[:CHILD_OF]-(rx:Prescriptions)
    RETURN rx.DRUG AS Drug, count(rx.DRUG) as Number
    ORDER BY Number DESC
    '''.format(prob_CUI=prob_CUI)
    with_prob_Rx = session.run(query)
    with_prob_Rx = pd.DataFrame([dict(record) for record in with_prob_Rx])
    
    query = '''
    MATCH (prob1:Problem)<-[:HAS_PROBLEM]-(pt:Patients)
    WHERE NOT prob1.cui in {prob_CUI}
    WITH distinct(pt) AS patients
    MATCH (patients)<-[:CHILD_OF]-(rx:Prescriptions)
    RETURN rx.DRUG AS Drug, count(rx.DRUG) as Number
    ORDER BY Number DESC
    '''.format(prob_CUI=prob_CUI)
    without_prob_Rx = session.run(query)
    without_prob_Rx = pd.DataFrame([dict(record) for record in without_prob_Rx])
       
    without_prob_total = sum(without_prob_Rx['Number'])
    without_prob_Rx['without_prob_proportion'] = without_prob_Rx['Number']/without_prob_total
    
    without_prob_Rx = without_prob_Rx[without_prob_Rx['Number'] > 30]
        
    with_prob_total = sum(with_prob_Rx['Number'])
    with_prob_Rx['with_prob_proportion'] = with_prob_Rx['Number']/with_prob_total
        
    with_prob_Rx = with_prob_Rx[with_prob_Rx['Number'] > 20]
    
    # Merge the "Gen_pop_proportion" column from gen_problems into comorbidities
    with_prob_Rx = pd.merge(with_prob_Rx, without_prob_Rx, on=['Drug'])
    
    with_prob_Rx['Odds_Ratio'] = (with_prob_Rx['with_prob_proportion']/with_prob_Rx['without_prob_proportion'])
    with_prob_Rx.sort_values(by='Odds_Ratio', ascending=False, inplace=True)
    
    return with_prob_Rx.loc[:,['Drug','Odds_Ratio']].head(10)
#     return with_prob_Rx.head(10)

In [85]:
start_time = time.time()

prob_CUI = 'C0348801'
likely_Rx = Rx_assoc_with_prob_CUI(prob_CUI)

print("Total runtime:", time.time() - start_time, "seconds")
likely_Rx

Total runtime: 6.605773448944092 seconds


Unnamed: 0,Drug,Odds_Ratio
9,Tacrolimus,5.272193
15,Lactulose,4.04634
13,0.9% Sodium Chloride (Mini Bag Plus),3.276101
24,MethylPREDNISolone Sodium Succ,3.169828
18,1/2 NS,3.122881
22,Pantoprazole Sodium,2.0107
19,Vial,1.59633
1,5% Dextrose,1.574014
5,D5W,1.423272
16,Heparin Sodium,1.370536


<a id='conditional_probability'></a>
### Using Conditional Probability instead of Odds Ratio

In [82]:
# Get the probability of each Problem in the general population
query = '''
MATCH (ptTotal:Patients)
WITH count(ptTotal) AS ptTotal
MATCH (probTotal:Patients)-[:HAD_PROBLEM]-(b:Problem)
WITH b.cui AS Problem_CUI, count(distinct(probTotal)) AS probTotal, ptTotal
WITH Problem_CUI, toFloat(probTotal)/ptTotal AS problem_gen_pop_probability
WHERE probTotal > 20
RETURN Problem_CUI, problem_gen_pop_probability
'''
data = session.run(query)
problem_gen_pop_probability = pd.DataFrame([dict(record) for record in data])

In [83]:
# Get the probability of each prescription in the general population
query = '''
MATCH (ptTotal:Patients)
WITH count(ptTotal) AS ptTotal
MATCH (RxTotal:Patients)-[:HAD]-(rx:Prescriptions)-[:INSTANCE_OF]->(b:Concept)
WITH b.cui AS Rx_CUI, count(distinct(RxTotal)) AS RxTotal, ptTotal
RETURN Rx_CUI, toFloat(RxTotal)/ptTotal AS RxProbability
ORDER BY RxProbability DESC
'''
data = session.run(query)
Rx_gen_pop_probability = pd.DataFrame([dict(record) for record in data])

In [84]:
# Get the probability of a specific pair of prescription and problem
query = '''
MATCH (ptTotal:Patients)
WITH count(ptTotal) AS ptTotal
MATCH (p:Problem)<-[:HAD_PROBLEM]-(RxProbTotal:Patients)-[:HAD]-(rx:Prescriptions)-[:INSTANCE_OF]->(c:Concept)
WITH p.cui AS Problem_CUI, c.cui AS Rx_CUI, count(distinct(RxProbTotal)) AS RxProbTotal, ptTotal
WHERE RxProbTotal > 10
RETURN Problem_CUI, Rx_CUI, toFloat(RxProbTotal)/ptTotal AS Rx_Problem_Probability
ORDER BY Rx_Problem_Probability DESC
'''
data = session.run(query)
Rx_Problem_Probability = pd.DataFrame([dict(record) for record in data])

In [85]:
problem_gen_pop_probability

Unnamed: 0,Problem_CUI,problem_gen_pop_probability
0,C1321898,0.000881
1,C0497131,0.000494
2,C0013404,0.002580
3,C0018932,0.001913
4,C0002962,0.000623
...,...,...
460,C0026771,0.001032
461,C1136033,0.000623
462,C0033036,0.000559
463,C0476474,0.000559


In [86]:
Rx_gen_pop_probability

Unnamed: 0,Rx_CUI,RxProbability
0,C4282051,0.451763
1,C4082777,0.432545
2,C4282061,0.384093
3,C4282050,0.376182
4,C4281944,0.349205
...,...,...
1552,C0715607,0.000021
1553,C2954915,0.000021
1554,C1814207,0.000021
1555,C0715226,0.000021


In [87]:
Rx_Problem_Probability

Unnamed: 0,Problem_CUI,Rx_CUI,Rx_Problem_Probability
0,C0020517,C0977439,0.003805
1,C0011860,C0977439,0.003783
2,C0039239,C0977439,0.003697
3,C0009676,C0977439,0.003525
4,C0020517,C4282061,0.003482
...,...,...,...
29068,C0037088,C2955100,0.000236
29069,C0037088,C0939651,0.000236
29070,C0009676,C0690082,0.000236
29071,C0034186,C2343627,0.000236


In [88]:
Rx_Problem_Probability_merged = pd.merge(Rx_Problem_Probability, problem_gen_pop_probability, on=['Problem_CUI'])
Rx_Problem_Probability_merged = pd.merge(Rx_Problem_Probability_merged, Rx_gen_pop_probability, on=['Rx_CUI'])
Rx_Problem_Probability_merged['co_occurrance_probability'] = Rx_Problem_Probability_merged.Rx_Problem_Probability / (Rx_Problem_Probability_merged.problem_gen_pop_probability + Rx_Problem_Probability_merged.RxProbability)
Rx_Problem_Probability_merged

Unnamed: 0,Problem_CUI,Rx_CUI,Rx_Problem_Probability,problem_gen_pop_probability,RxProbability,co_occurrance_probability
0,C0020517,C0977439,0.003805,0.004364,0.314230,0.011943
1,C0011860,C0977439,0.003783,0.004256,0.314230,0.011879
2,C0039239,C0977439,0.003697,0.003934,0.314230,0.011621
3,C0009676,C0977439,0.003525,0.004084,0.314230,0.011075
4,C0600688,C0977439,0.003418,0.003719,0.314230,0.010750
...,...,...,...,...,...,...
26866,C0746883,C3887762,0.000258,0.000924,0.005288,0.041522
26867,C0746883,C0708786,0.000258,0.000924,0.003805,0.054545
26868,C0267963,C3887542,0.000258,0.000817,0.025107,0.009950
26869,C0019557,C2241233,0.000301,0.000817,0.016294,0.017588


In [89]:
Rx_Problem_Probability_merged.sort_values(by='co_occurrance_probability', ascending=False, inplace=True)
Rx_Problem_Probability_merged

Unnamed: 0,Problem_CUI,Rx_CUI,Rx_Problem_Probability,problem_gen_pop_probability,RxProbability,co_occurrance_probability
26667,C0746883,C0687786,0.000430,0.000924,0.002042,0.144928
25509,C0021400,C0875805,0.001311,0.002816,0.006621,0.138952
26535,C0085605,C1584819,0.000946,0.002085,0.009652,0.080586
25447,C0022661,C1951501,0.000795,0.003740,0.008856,0.063140
26805,C0085605,C0690704,0.000451,0.002085,0.005267,0.061404
...,...,...,...,...,...,...
3693,C0232493,C4082777,0.000236,0.000516,0.432545,0.000546
3641,C0001418,C4082777,0.000236,0.000537,0.432545,0.000546
3594,C0085281,C4082777,0.000236,0.000559,0.432545,0.000546
3639,C0038663,C4082777,0.000236,0.000580,0.432545,0.000546


In [90]:
# Write out to CSV
Rx_Problem_Probability_merged.loc[:,['Problem_CUI', 'Rx_CUI', 'co_occurrance_probability']].to_csv('Rx_Problem_co_occurrance_probability_2021_12_06.csv', index=False)

Move the CSV into the database's Import folder

In [97]:
# Import the co-occurance probabilities into the database
timestamp = datetime.now().strftime("%Y-%m-%dT%H-%M-%S")

command = '''
USING PERIODIC COMMIT 100000 LOAD CSV WITH HEADERS FROM "file:///Rx_Problem_co_occurrance_probability_2021_12_06.csv" AS COLUMN
MATCH (prob:Concept)
WHERE prob.cui = COLUMN.Problem_CUI AND prob.cui_pref_term IS NOT NULL
MATCH (rx:Concept)
WHERE rx.cui = COLUMN.Rx_CUI AND prob.cui_pref_term IS NOT NULL
CREATE (prob)<-[:OCCURS_WITH {{co_occurrance_probability:COLUMN.co_occurrance_probability, source:'MIMIC-III v1.4', updated:'{timestamp}'}}]-(rx)
'''.format(timestamp=timestamp)

session.run(command)

<neo4j.work.result.Result at 0x7f0c33e0ab80>

## (insufficient data) Input problem, output consults likely to be needed

In [23]:
# def consults_assoc_with_prob_CUI(prob_CUI):
    
#     # Format the input problem CUI as a list
#     prob_CUI = [prob_CUI]
    
#     query = '''
#     MATCH (prob1:Problem)<-[:HAS_PROBLEM]-(pt:Patients)
#     WHERE prob1.cui in {prob_CUI}
#     WITH distinct(pt) AS patients
#     MATCH (patients)<-[:CHILD_OF]-(c:Cptevents)
#     WHERE c.SECTIONHEADER IN ["Evaluation and management","Medicine"]
#     RETURN c.SUBSECTIONHEADER as Consult, COUNT(c) as Number
#     ORDER BY Number DESC
#     '''.format(prob_CUI=prob_CUI)
#     with_prob_consults = session.run(query)
#     with_prob_consults = pd.DataFrame([dict(record) for record in with_prob_consults])
    
#     query = '''
#     MATCH (prob1:Problem)<-[:HAS_PROBLEM]-(pt:Patients)
#     WHERE NOT prob1.cui in {prob_CUI}
#     WITH distinct(pt) AS patients
#     MATCH (patients)<-[:CHILD_OF]-(c:Cptevents)
#     WHERE c.SECTIONHEADER IN ["Evaluation and management","Medicine"]
#     RETURN c.SUBSECTIONHEADER as Consult, COUNT(c) as Number
#     ORDER BY Number DESC
#     '''.format(prob_CUI=prob_CUI)
#     without_prob_consults = session.run(query)
#     without_prob_consults = pd.DataFrame([dict(record) for record in without_prob_consults])
       
#     without_prob_total = sum(without_prob_consults['Number'])
#     without_prob_consults['without_prob_proportion'] = without_prob_consults['Number']/without_prob_total
    
#     without_prob_consults = without_prob_consults[without_prob_consults['Number'] > 5]
        
#     with_prob_total = sum(with_prob_consults['Number'])
#     with_prob_consults['with_prob_proportion'] = with_prob_consults['Number']/with_prob_total
        
#     with_prob_consults = with_prob_consults[with_prob_consults['Number'] > 5]
    
#     # Merge the "Gen_pop_proportion" column from gen_problems into comorbidities
#     with_prob_consults = pd.merge(with_prob_consults, without_prob_consults, on=['Consult'])
    
#     with_prob_consults['Odds_Ratio'] = (with_prob_consults['with_prob_proportion']/with_prob_consults['without_prob_proportion'])
#     with_prob_consults.sort_values(by='Odds_Ratio', ascending=False, inplace=True)
    
#     return with_prob_consults.loc[:,['Consult','Odds_Ratio']].head(10)

In [25]:
# start_time = time.time()

# prob_CUI = 'C0036341'
# likely_consults = consults_assoc_with_prob_CUI(prob_CUI)

# print("Total runtime:", time.time() - start_time, "seconds")
# likely_consults

Total runtime: 0.56453537940979 seconds


Unnamed: 0,Consult,Number
0,Pulmonary,204
1,Critical care services,82
2,Hospital inpatient services,55
3,Consultations,8


## (insufficient data) Input problem, output likely imaging/procedures

In [55]:
# def Imaging_assoc_with_prob_CUI(prob_CUI):
    
#     # Format the input problem CUI as a list
#     prob_CUI = [prob_CUI]
    
#     query = '''
#     MATCH (prob1:Problem)<-[:HAS_PROBLEM]-(pt:Patients)
#     WHERE prob1.cui in {prob_CUI}
#     WITH distinct(pt) AS patients
#     MATCH (n:Noteevents {{CATEGORY:'Radiology'}})-[:CHILD_OF]->(patients)
#     RETURN n.DESCRIPTION as `Imaging study`, count(n) AS Number
#     ORDER BY Number DESC
#     '''.format(prob_CUI=prob_CUI)
#     with_prob_imaging = session.run(query)
#     with_prob_imaging = pd.DataFrame([dict(record) for record in with_prob_imaging])
    
#     query = '''
#     MATCH (prob1:Problem)<-[:HAS_PROBLEM]-(pt:Patients)
#     WHERE NOT prob1.cui in {prob_CUI}
#     WITH distinct(pt) AS patients
#     MATCH (n:Noteevents {{CATEGORY:'Radiology'}})-[:CHILD_OF]->(patients)
#     RETURN n.DESCRIPTION as `Imaging study`, count(n) AS Number
#     ORDER BY Number DESC
#     '''.format(prob_CUI=prob_CUI)
#     without_prob_imaging = session.run(query)
#     without_prob_imaging = pd.DataFrame([dict(record) for record in without_prob_imaging])
       
#     without_prob_imaging = without_prob_imaging[without_prob_imaging['Number'] > 10]
    
#     without_prob_total = sum(without_prob_imaging['Number'])
#     without_prob_imaging['without_prob_proportion'] = without_prob_imaging['Number']/without_prob_total
    
#     with_prob_imaging = with_prob_imaging[with_prob_imaging['Number'] > 20] 
    
#     with_prob_total = sum(with_prob_imaging['Number'])
#     with_prob_imaging['with_prob_proportion'] = with_prob_imaging['Number']/with_prob_total
    
#     # Merge the "Gen_pop_proportion" column from gen_problems into comorbidities
#     with_prob_imaging = pd.merge(with_prob_imaging, without_prob_imaging, on=['Imaging study'])
    
#     with_prob_imaging['Odds_Ratio'] = (with_prob_imaging['with_prob_proportion']/with_prob_imaging['without_prob_proportion'])
#     with_prob_imaging = with_prob_imaging[with_prob_imaging['Odds_Ratio'] > 2] 
#     with_prob_imaging.sort_values(by='Odds_Ratio', ascending=False, inplace=True)
    
#     return with_prob_imaging.loc[:,['Imaging study', 'Odds_Ratio']].head(10)

In [56]:
# start_time = time.time()

# prob_CUI = 'C0024050'
# likely_imaging = Imaging_assoc_with_prob_CUI(prob_CUI)

# print("Total runtime:", time.time() - start_time, "seconds")
# likely_imaging

Total runtime: 0.5992767810821533 seconds


Unnamed: 0,Imaging study,Number
0,CHEST (PORTABLE AP),65
1,CHEST (PA & LAT),22
2,CT HEAD W/O CONTRAST,10
3,CHEST PORT. LINE PLACEMENT,10
4,PARACENTESIS DIAG. OR THERAPEUTIC,9
...,...,...
67,P BILAT LOWER EXT VEINS PORT,1
68,INTRO CATH SVC/IVC,1
69,FEE ADJUSTED IN SPECIFIC SITUATION,1
70,O CHEST (PORTABLE AP) IN O.R.,1


---
---
# Draft the Assessment and Plan
---

In [57]:
# print('Problem list summary\n\n', watch_out_for_these['Description'].head().to_string(index=False), '\n\n\n')

for problem in watch_out_for_these.head().iterrows():
    description = problem[1][0]
    CUI = problem[1][1]
    print(description, '\n')
    print(Rx_assoc_with_prob_CUI(CUI).to_string(index=False), '\n\n')

Spinal Fractures 

             Drug  Odds_Ratio
        Methadone    8.326655
Enoxaparin Sodium    5.389038
        Diltiazem    3.334854
      Haloperidol    3.163094
       PredniSONE    2.961187 


Acute toxic hepatitis 

                Drug  Odds_Ratio
0.9% Sodium Chloride    2.489616
                  LR    2.035663
         5% Dextrose    1.828281
Iso-Osmotic Dextrose    1.280549
             Insulin    1.125812 


Contusions 

           Drug  Odds_Ratio
   Thiamine HCl   26.719770
       Diazepam   18.963217
Multivitamin IV    8.474071
       Thiamine    6.154144
     FoLIC Acid    4.959846 


Acute hepatitis 

                Drug  Odds_Ratio
         5% Dextrose    1.946517
0.9% Sodium Chloride    1.932745
           Lorazepam    1.612829
   Calcium Gluconate    1.528417
                  LR    1.316507 


Liver Cirrhosis, Alcoholic 

                      Drug  Odds_Ratio
            Spironolactone   11.951182
                Tacrolimus    9.796584
                 Lactulo

In [None]:
## Return likely problem list based on prescribed medications and known problems on the problem list  
### Strategy 1: Use problem and prescription parameters in a combined query to define the patient population

In [4]:
def probs_likely_with(prob_list, Rx_list):
    
    query = '''
    MATCH (prob1:Problem)<-[:HAS_PROBLEM]-(pt:Patients)
    WHERE prob1.description in {prob_list}
    WITH distinct(pt) AS patients
    MATCH (rx:Prescriptions)-[:CHILD_OF]->(patients)
    WHERE rx.DRUG in {Rx_list}
    WITH distinct(patients) AS patients
    MATCH (patients)-[:HAS_PROBLEM]->(prob2:Problem)
    RETURN prob2.description as Description, prob2.cui AS CUI, count(prob2.description) as Number
    ORDER BY Number DESC
    '''.format(prob_list=prob_list, Rx_list=Rx_list)
    comorbidities = session.run(query)
    comorbidities = pd.DataFrame([dict(record) for record in comorbidities])

    query = '''
    MATCH (excluded_rx:Prescriptions)
    WHERE excluded_rx.DRUG in ['Digoxin']
    WITH collect(excluded_rx) as excluded_rx

    MATCH (excluded_prob:Problem)
    WHERE excluded_prob.description in ['Heart failure']
    WITH collect(excluded_prob) as excluded_prob, excluded_rx

    MATCH (prob:Problem)<-[:HAS_PROBLEM]-(pt:Patients)
    WITH excluded_rx, excluded_prob, pt, collect(prob) as problems
    WHERE NONE (prob in problems where prob in excluded_prob)

    MATCH (pt)<-[:CHILD_OF]-(rx:Prescriptions)
    WITH excluded_rx, pt, collect(rx) as prescriptions
    WHERE NONE (rx in prescriptions where rx in excluded_rx)

    MATCH (pt)-[:HAS_PROBLEM]-(prob2:Problem)
    RETURN prob2.description as Description, prob2.cui AS CUI, count(prob2.description) as Number
    ORDER BY Number DESC
    '''.format(prob_list=prob_list, Rx_list=Rx_list)
    gen_problems = session.run(query)
    gen_problems = pd.DataFrame([dict(record) for record in gen_problems])
       
    gen_pop_total = sum(gen_problems['Number'])
    gen_problems['Gen_pop_proportion'] = gen_problems['Number']/gen_pop_total
    
    comorb_total = sum(comorbidities['Number'])
    comorbidities['Comorbidities_proportion'] = comorbidities['Number']/comorb_total
    
    input_parameter_count = len(prob_list)+len(Rx_list)
    comorbidities = comorbidities[comorbidities['Number'] > 50/input_parameter_count]
    
    # Merge the "Gen_pop_proportion" column from gen_problems into comorbidities
    comorbidities = pd.merge(comorbidities, gen_problems, on=['CUI', 'Description'])
    
    comorbidities['Odds_Ratio'] = (comorbidities['Comorbidities_proportion']/comorbidities['Gen_pop_proportion'])
    comorbidities.sort_values(by='Odds_Ratio', ascending=False, inplace=True)
    
#     return comorbidities.loc[:,['Description', 'Odds_Ratio']].head(20)
    return comorbidities.head(20)

In [15]:
prob_list = ['Diabetes Mellitus']
Rx_list = ['']

start_time = time.time()

probs_predicted_strategy_1 = probs_likely_with(prob_list, Rx_list)

print("Total runtime:", time.time() - start_time, "seconds")

probs_predicted_strategy_1

KeyError: 'Number'

### Strategy 2: Generate problem lists separately for comorbidities and medication-associated problems, then combine the lists

In [68]:
start_time = time.time()

problems_predicted_strategy_2 = pd.concat([comorbidities_of(prob_list), prob_assoc_Rx(Rx_list)])

print("Total runtime:", time.time() - start_time, "seconds")
problems_predicted_strategy_2.sort_values('Odds_Ratio', ascending=False).head(20)

Total runtime: 12.874289989471436 seconds


Unnamed: 0,Description,Odds_Ratio
65,Acute toxic hepatitis,66.102276
61,Spinal Fractures,51.071877
244,Abdominal Compartment Syndrome,46.37307
251,Fungemia,44.698952
64,Acute hepatitis,22.25893
71,"Liver Cirrhosis, Alcoholic",21.708253
35,"Alcoholic Intoxication, Chronic",19.289289
9,Alcohol withdrawal syndrome,13.774875
66,Pancreatitis Necrotizing,12.536638
300,Staphylococcal Infections,11.272395
