## OMOP Outpatient Recommender

In [None]:
## Setting up the Google SDK environment.
## Both variables are written before the BigQuery client is created below.
import os 
# Path to the application-default credentials file (user-specific absolute path).
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/home/wui/.config/gcloud/application_default_credentials.json' 
# Project name exported through the GCLOUD_PROJECT environment variable.
os.environ['GCLOUD_PROJECT'] = 'som-nero-phi-jonc101' 

## Setting up the BigQuery API.
from google.cloud import bigquery
# Module-level client; Recommender.readBQFile runs its queries through this object.
client = bigquery.Client()
# Module-level project/dataset names. The Recommender class keeps its own copies
# in self.project_id / self.dataset_id and uses those when building queries.
project_id = 'som-nero-phi-jonc101'
dataset_id = 'wui_omop_peds'

In [1]:
import datetime
import sys
import getopt
import operator
import math
import json
import random
import pdb
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
from statistics import mean
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.model_selection import train_test_split
from dateutil.relativedelta import relativedelta

from IPython.display import clear_output;

 

%matplotlib inline

class Recommender:
    """
    Recommender works with OMOP-CDM data to predict outpatient specialty orders (lab, procedure, drugs)
    based on features from the referring primary care visit (condition, drug, labs, phrases extracted from notes)
    
    ***Inputs***
    
    1) Patient Cohort
    self.Cohort -> read a patient cohort table from Bigquery that contains the following columns:
    - person_id
    - primary care_visit ID
    - primary care visit DATETIME
    - specialty care visit ID
    - specialty care visit DATETIME
    
    2) Features
    self.features contains tuples of ('feature category','feature table')
    Feature tables contain features (condition, drug, measurement, procedure)
    associated with either primary care or specialty care visits in the patient cohort
    
    Pipeline Overview:
    1) preProcessing:
       - treat each feature occurrence (a row in the Bigquery table) as an instance 
       - each unique feature is a clinical item
       - self.items is a dictionary to store all instances associated with a clinical item
       - self.feature_dict is a dictionary to store the OMOP concept ID of a clinical item by its category 
       a) label clinical items to filter such as vital signs, vaccine that was on a filter list (bq table)
       b) label rare items if instances are too few
       c) categorize labs as normal or abnormal based on reference range values
       d) add demographic data as items (for age: three categories -> infant age 0-2, child 2-12, teen 12-18)
    
    2) build patient items timeline (buildPatientItemPrePost):
       - two ways to place clinical items 
       a) associated with visit ID (primary care visit as Pre -> specialty care visit as Post)
       b) use primary care visit DATETIME as index time, look back a window period as Pre, 
          look forward a window period as Post
       c) filter previously labelled items (e.g. rare or on filter list)
       
    3) training:
        a) import training patients 
        b) for each patient, going through items based on the above pre-post timeline
        c) build co-occurrence matrix based on item-item association 
           - could count distinct item per patient or repeat items
        
    4) testing:
        a) import testing patients
        b) use query items to generate metrics from the above co-occurrence table
           - PPV
        c) rank candidate items based on metric score
        
    5) evaluation:
        a) get precision and recall at k recommendation 
        b) get AUROC 
        
    """    
    def __init__(self):
        
        self.k = 5
        self.nlpRR = 1
        self.nlpNeglogP = 2
        self.itemRR = 1
        self.itemNeglogP = 2
        self.inspectPredictedItems = False
        self.isFilterNLP = False
        self.isPlotRankedScore = False
        
        self.inputFeatures = {'condition','drug','measurement','procedure'}
        self.outputFeatures = {'measurement','procedure'}
        
        self.project_id = 'som-nero-phi-jonc101'
        self.dataset_id = 'wui_omop_peds'
        self.prefix = "V2_test_"
        self.cohort_file = self.prefix + "cohort"
        self.demographic_file = self.prefix + "demographic"
    
        self.feature_dict = {"age":{},
                            "gender":{},
                            "race":{},
                            "condition":{},
                            "drug":{},
                            "measurement":{},
                            "derived_measurement":{},
                            "procedure":{},
                            "nlp":{}
                             }
        
        with open('sameItemMap.json', 'r') as fp:
            self.sameItemMap = json.load(fp)
        
        # load baseline prevalence metrics for items phrases
        with open('item_prevalence_map.json', 'r') as fp:
            self.item_prevalence_map = json.load(fp)
            
        
        self.nlp_prevalence_map = {}
        self.nlpPC_notfound = 0
        self.nlpSC_notfound = 0
        self.nlpPC_found = 0
        self.nlpSC_found = 0
        
        prevMap = {'PrimaryCare':'PC',
                   'SpecialtyCare':'SC',
                   'Cohort_PrimaryCare':'CohortPC',
                   'Cohort_SpecialtyCare':'CohortSC',
                   'All':'All'}
        
        for p in prevMap:
            with open('nlp_prevMap_' + prevMap[p] +'.json','r') as fp:
                self.nlp_prevalence_map[p] = json.load(fp)

        
        
        categoryList = ['condition','drug','measurement','procedure']
        suffixList = ['CohortPC','CohortSC']
        self.rrMap = {}
        for c in categoryList:
            for suffix in suffixList:
                with open(c + '_rrMap_' + suffix +'.json','r') as fp:
                    self.rrMap[(c,suffix)] = json.load(fp)
        
        
     
        self.patients = {}
        self.trainPatients = []
        self.items = {}
        self.trainingItems = {}
        self.trainingInputItems = set()
        self.candidateItems = set()
        self.coMatrix = []
        self.coMatrix_RepeatItems = []
        self.instanceDF = pd.DataFrame()
        self.instance_count = 0
        self.item_count = 0
        self.invalid_AUC_count = 0
        self.valid_AUC_count = 0        

    def readBQFile(self, tableName):
        """Read one whole BigQuery table and return it as a DataFrame.

        The table is addressed as ``self.project_id.self.dataset_id.tableName``
        and the query is run through the module-level ``client``.

        Parameters
        ----------
        tableName : table id inside ``self.dataset_id``.

        Returns
        -------
        The result of ``to_dataframe()`` on the finished query job.
        """
        # The table id is interpolated into the statement text, so tableName
        # must come from trusted code (here: names built from self.prefix).
        selectAll = """ 
            SELECT * FROM 
                `{project_id}.{dataset_id}.{table_id}`
            """.format(project_id=self.project_id,
                       dataset_id=self.dataset_id,
                       table_id=tableName)
        return client.query(selectAll).to_dataframe()
    
    def preProcessing(self, count_cutoff):
        """Load the cohort and feature tables and build the item/instance inventory.

        Steps, in order:
        1. demographic table -> ``self.patients`` (gender / race items)
        2. cohort table      -> index visits per patient
        3. age group item per patient
        4. every row of every ``self.prefix + category`` table becomes one
           instance; rows for which ``assignItemValue`` returns a value also
           register a ``derived_<category>`` item
        5. instances are collected into ``self.instanceDF``
        6. items are labelled rare / filtered and get a cohort prevalence

        Parameters
        ----------
        count_cutoff : passed to ``labelRareItems``; items seen in fewer
            patients than this are labelled rare.
        """
        print("preprocessing...")

        # Patients are created from the demographic table first; the cohort
        # and age steps below index into self.patients.
        self.demographic = self.readBQFile(self.demographic_file)
        self.addDemographicToPatient(self.demographic)

        self.cohort = self.readBQFile(self.cohort_file)
        self.addIndexTimeToPatient(self.cohort)

        self.addAgeToPatient()

        instanceRows = []
        for feature_category in self.inputFeatures:
            featureDF = self.readBQFile(self.prefix + feature_category)
            datetimeColumn = feature_category + "_DATETIME"

            for _, row in featureDF.iterrows():
                person_id = row["person_id"]
                visit_id = row["visit_id"]
                item_datetime = row[datetimeColumn]

                item_concept_id, item_name = self.redefineConceptID(row, feature_category)

                # "normal" / "high" / "low" for measurements with a usable
                # reference range, otherwise None
                item_value = self.assignItemValue(row, feature_category)

                # addItem stamps the item with the *current* instance_count,
                # so the counter is only advanced after both calls.
                item_code = self.addItem(feature_category, item_concept_id, item_name, person_id)

                item_code_derived = item_code
                if item_value is not None:
                    item_code_derived = self.addItem(
                        "derived_" + feature_category,
                        item_value + "_" + item_concept_id,
                        item_value + " " + item_name,
                        person_id)

                instance_id = self.instance_count
                self.patients[person_id]["instances"].append(instance_id)
                instanceRows.append([instance_id, item_code, item_code_derived,
                                     person_id, visit_id, item_datetime, item_value])
                self.instance_count = instance_id + 1

        # Row position in instanceDF equals the instance id.
        instanceColumns = ["instance", "item", "item_derived", "person_id",
                           "visit_id", "item_datetime", "item_value"]
        self.instanceDF = pd.DataFrame.from_records(instanceRows, columns = instanceColumns)

        self.labelRareItems(count_cutoff)
        self.labelFilterItems()
        self.getItemPrevalence()
    
        
    def assignItemValue(self, featureRow, featureCategory):
        
        def isValid(ref_h, ref_l, v):
            if ref_h and ref_l and v and not (math.isnan(ref_h) or math.isnan(ref_l) or math.isnan(v)):
                return True
            else:
                return False
        
        if featureCategory == "measurement":
            refRange_high = featureRow["range_high"]
            refRange_low = featureRow["range_low"]
            value = featureRow["value_as_number"]
            if isValid(refRange_high, refRange_low, value):
                
#                 print("concept ID: {}".format(featureRow[featureCategory + "_concept_id"]))
#                 print("item name: {}".format(featureRow["concept_name"]))
#                 print("value: {}".format(value))
#                 print("ref high: {}".format(refRange_high))
#                 print("ref low: {}".format(refRange_low))
#                 print("---------------------------------------")
                
                if value >= refRange_low and value <= refRange_high:
                    return "normal"
                elif value > refRange_high:
                    return "high"
                elif value < refRange_low:
                    return "low"
            else:
                return None
        else:
            return None
    
    def redefineConceptID(self, featureRow, featureCategory):
        """for items with no matching concept, use source concept id as concept id
        if no source concept id available, use source_value"""
        
        item_concept_id = str(featureRow[featureCategory + "_concept_id"])
        item_name = featureRow["concept_name"]
        
#         if item_concept_id in self.sameItemMap.keys():
#             item_name = self.sameItemMap[item_concept_id][1]
#             item_concept_id = self.sameItemMap[item_concept_id][0]
           
        return item_concept_id, item_name
        
    def addItem(self, category, ID, item_name, person_id):
        """ 
            self.items serve as an inventory of clinical items
            self.items also store the instances that happened in each item 
            self.feature_dict is like a feature catolog storing the clinical item name
            and links to self.items by item_code 
            ID is OMOP concept ID or ID from function redefineConceptID
        """    
        if isinstance(ID, int):
            ID = str(ID)
            
        # check if a clinical item exists:
        # else establish a new clinical item in self.items and self.feature_dict    
        
        if ID in self.feature_dict[category]:            
            item_code = self.feature_dict[category][ID][0]
            self.items[item_code]["instances"].append(self.instance_count)
            self.items[item_code]["patients"].add(person_id)
        else:
            item_code = self.item_count
            self.feature_dict[category][ID] = (item_code, item_name)
            self.items[item_code] = {"category": category,
                                      "ID": ID,
                                     "patients":set([person_id]),
                                     "instances":[self.instance_count],
                                     "rare": False,
                                     "filter": False}
            
            if category not in ['gender','race','age','nlp','derived_measurement'] and ID != '0':
                self.items[item_code]["baselinePrevalence"] = self.item_prevalence_map[ID]
            elif category == 'nlp': 
                self.items[item_code]["baselinePrevalence"] = self.nlp_prevalence_map['All'][ID]
                
            self.item_count += 1
        
        if category in ['gender','race','age']:
            self.items[item_code]["instances"] = []
        
        return item_code
    
       
    def addIndexTimeToPatient(self, cohortDF):
        for index, row in cohortDF.iterrows():
            person_id = row["person_id"]
            if  "IndexTime_Exist" not in self.patients[person_id].keys():
                self.patients[person_id]["primaryVisitID"] = row["PrimaryCare_visit_id"]
                self.patients[person_id]["specialtyVisitID"] = row["Specialty_visit_id"]
                self.patients[person_id]["primaryVisitTime"] = row["PrimaryCare_DATETIME"]
                self.patients[person_id]["specialtyVisitTime"] = row["Specialty_DATETIME"]
                self.patients[person_id]["IndexTime_Exist"] = True
                self.patients[person_id]["instances"] = []
    
    def addDemographicToPatient(self, demographicDF):
        for index, row in demographicDF.iterrows():
            person_id = row["person_id"]
            if person_id not in self.patients:
                gender_item_code = self.addItem('gender', row["gender_concept_id"], row["gender"],person_id)
                race_item_code = self.addItem('race', row["race_concept_id"], row["race"],person_id)
                self.patients[person_id] = {"birthdate": row["birth_DATETIME"],
                                            "gender": row["gender"],
                                            "gender_item_code": gender_item_code,
                                            "race": row["race"],
                                            "race_item_code": race_item_code,
                                            }
    
    def addAgeToPatient(self):
        """Store each patient's age at the primary care visit and its age-group item.

        Age is the whole-year component of
        ``relativedelta(primaryVisitTime, birthdate)``. The age group is
        ``'infant'`` for age < 2, ``'child'`` for 2 <= age < 12 and ``'teen'``
        for everything else; it is registered through ``addItem`` under the
        ``'age'`` category and its item code is saved as ``age_item_code``.
        """
        for person_id, record in self.patients.items():
            birthdate = record["birthdate"]
            visitDate = record["primaryVisitTime"]
            age = relativedelta(visitDate, birthdate).years
            record["age"] = age

            if age < 2:
                ageGroup = 'infant'
            elif age < 12:
                ageGroup = 'child'
            else:
                ageGroup = 'teen'

            record['age_item_code'] = self.addItem('age', ageGroup, ageGroup, person_id)
    
    def labelRareItems(self, count_cutoff):

        for i in self.items:
            if len(self.items[i]["patients"]) < count_cutoff:
                   self.items[i]["rare"] = True
            else:
                   self.items[i]["rare"] = False
            if self.items[i]["category"] in ["gender","race","age"]:
                self.items[i]["rare"] = False
    
    def labelFilterItems(self):
        """Set the ``"filter"`` flag on items listed in the ``concepts_tofilter`` table.

        The filter list is the table's ``concept_id`` column (as strings) plus
        ``'0'``. Derived item ids are compared after removing the ``high_``,
        ``low_`` and ``normal_`` markers.

        - non-demographic items: flag is set to True/False by membership;
          the stripped id must be all digits (asserted)
        - gender / race / age items: flag is only touched (set to True) when
          the id is ``'0'``
        """
        conceptsToFilter = {str(concept) for concept in self.readBQFile("concepts_tofilter")["concept_id"]}
        conceptsToFilter.add('0')

        demographicCategories = ("gender", "race", "age")
        valueMarkers = ("high_", "low_", "normal_")

        for entry in self.items.values():
            bareConceptID = entry["ID"]
            for marker in valueMarkers:
                bareConceptID = bareConceptID.replace(marker, "")

            if entry["category"] in demographicCategories:
                if bareConceptID == '0':
                    entry["filter"] = True
                continue

            assert bareConceptID.isdigit(), 'concept ID %r is not all digit' % bareConceptID
            entry["filter"] = bareConceptID in conceptsToFilter
    
    def getItemPrevalence(self):
        N = len(self.patients)
        for i in self.items:
            self.items[i]["prevalence"] = len(self.items[i]["patients"])*100/N        
 
    
    def buildPatientItemPrePost(self, person_id, useVisitID):
        
        def setPrePostThreshold(person_id):
            """default is to use datetime to set a threshold (Index time) to determine item-item co-occurrence  
            otherwise we can use primary care/specialty care visit ID to determine item-item co-occurrence 
            """
            if useVisitID:
                preThreshold = self.patients[person_id]["primaryVisitID"]
                postThreshold = self.patients[person_id]["specialtyVisitID"]
            else: 
                preThreshold = self.patients[person_id]["primaryVisitTime"]
                postThreshold = self.patients[person_id]["specialtyVisitID"]
                #postThreshold = self.patients[person_id]["specialtyVisitTime"]
            return preThreshold, postThreshold
        
        def groupItems(i):
            # group same items using sameItemMap, i is the item ID from instance 
            item_concept_id = self.items[i]['ID']
            item_category = self.items[i]['category']
            if (item_concept_id in self.sameItemMap.keys()) & (item_category not in {'nlp'}):
                new_item_concept_id = self.sameItemMap[item_concept_id][0]
                itemID = self.feature_dict[item_category][new_item_concept_id][0]
            else:
                itemID = i
            return itemID
        
        def addItemPre(instance):
            if instance["item_value"] is not None:
                itemsPre.append(instance["item_derived"])
            else:
                itemsPre.append(instance["item"])
            self.patients[person_id]["instancePre"].append(instance['instance'])
            
        def addItemPost(instance):
            i = instance["item"]
            itemID = groupItems(i)
            itemsPost.append(itemID)
            self.patients[person_id]["instancePost"].append(instance['instance'])
            
        def addDemographicItem(person_id):
            itemsPre.append(self.patients[person_id]["gender_item_code"])
            itemsPre.append(self.patients[person_id]["race_item_code"])
            itemsPre.append(self.patients[person_id]["age_item_code"])
        
        def sendItemPrePost(instance, preThreshold, postThreshold):
            if useVisitID == False:
                # collect items in pre-post threshold lists
                lookbackWindow = datetime.timedelta(days = 180)
                lookforwardWindow = datetime.timedelta(days = 1)
                if (instance.item_datetime >= preThreshold - lookbackWindow) & (instance.item_datetime < preThreshold + lookforwardWindow):
                        addItemPre(instance)
                elif instance.visit_id == postThreshold:
                    addItemPost(instance)
#                 elif (instance.item_datetime >= postThreshold - window) & (instance.item_datetime < postThreshold + window):
#                         addItemPost(instance)
            else:
                if instance.visit_id == preThreshold:
                    addItemPre(instance)
                elif instance.visit_id == postThreshold:
                    addItemPost(instance)
                       
        "main part of buildPatientItemPrePost"            
        preThreshold, postThreshold = setPrePostThreshold(person_id)
        
        itemsPre = []
        itemsPost = []
        
        addDemographicItem(person_id)
        self.patients[person_id]["instancePre"] = []
        self.patients[person_id]["instancePost"] = []
        
        for i in self.patients[person_id]["instances"]:
   
            # instance is a row of the instance dataFrame
            instance = self.instanceDF.iloc[i]
            
            
            # check if instance should be removed based on the items
            sendItemPrePost(instance, preThreshold, postThreshold)
        
        return itemsPre, itemsPost

    
    """functions use for training """
    def processItems(self, items, mode):
        
        def filterItems(items):
            newItems = []
            for i in items:
                isRareItem = self.items[i]["rare"]
                isFilterItem = self.items[i]["filter"]
                if not (isRareItem | isFilterItem):
                     newItems.append(i)
            return newItems 
        
        def excludeOutputCategory(items):        
        
            exclude_category = self.inputFeatures.difference(self.outputFeatures)
            items_after_exclusion = []
            for i in items:
                if self.items[i]["category"] not in exclude_category:
                    items_after_exclusion.append(i)
            return items_after_exclusion
                   
        
        def filterNLP(items, mode):
            items_after_filter = []
            for i in items:
                if self.items[i]['category'] == 'nlp' and mode == 'input':
                    neglogP, RR = self.getRR_Item(i, mode)                     
                    if RR >= self.nlpRR and neglogP >= self.nlpNeglogP:
                        items_after_filter.append(i)
                else:
                    items_after_filter.append(i)
            return items_after_filter
        
        '''main part of processItems'''
        ItemsFiltered = filterItems(items)
        if self.isFilterNLP:
            ItemsProcessed = filterNLP(ItemsFiltered, mode)
        else:
            ItemsProcessed = ItemsFiltered
        
        if mode == "output":
            if ItemsProcessed:
                ItemsProcessed = excludeOutputCategory(ItemsProcessed)
        
        return ItemsProcessed 
    
    def getRR_Item(self, item, mode):
        if mode == 'input':
            tag = 'CohortPC'
        elif mode == 'output':
            tag = 'CohortSC'
        ID = self.items[item]['ID']
        ID = re.compile(r'\d+').search(ID).group(0)
        category = self.items[item]["category"]
        category = str.replace(category,'derived_','')
        
        if ID in self.rrMap[(category, tag)]:
            neglogP, RR = self.rrMap[(category, tag)][ID]
            return neglogP, RR
        else:
            print(ID, ' not found in rrMap ', category, 'with tag: ', tag)
            return 0,0
     
    def buildCoMatrix(self, itemsPre, itemsPost):        
        # allow repeat item counts
        if itemsPre and itemsPost:
            for i in itemsPre:
                for j in itemsPost:
                    self.coMatrix_RepeatItems[i][j] += 1
                
            itemsPre = set(itemsPre)
            itemsPost = set(itemsPost)
         
            # each item only count once 
            for i in itemsPre:
                for j in itemsPost:
                    self.coMatrix[i][j] += 1 

    
    def buildTrainingItems(self, itemsPre, itemsPost, person_id):
        if itemsPre or itemsPost:
            for i in itemsPre + itemsPost:
                if i in self.trainingItems:
                    self.trainingItems[i]["instance_count"] += 1
                    self.trainingItems[i]["patients"].add(person_id)
                else: 
                    self.trainingItems[i] = {"instance_count": 1,
                                             "patients":{person_id}}
            # record the training input items
            self.trainingInputItems = self.trainingInputItems.union(itemsPre)
            self.candidateItems = self.candidateItems.union(itemsPost)
        
    
    def buildMetricsTrainingItems(self, trainPatients):
        N = len(trainPatients)
        for i in self.trainingItems.keys():
            self.trainingItems[i]["baselinefreq"] = self.trainingItems[i]["instance_count"]/N
            self.trainingItems[i]["prevalence"] = len(self.trainingItems[i]["patients"])/N
            
            
    def training(self, trainPatients, useVisitID = False):
        print("building co-occurrence matrix...")
        self.trainPatients = trainPatients
        self.coMatrix = np.zeros((self.item_count,self.item_count))
        self.coMatrix_RepeatItems = np.zeros((self.item_count,self.item_count))
        self.candidateItems = set()
        
        for person_id in trainPatients:
            itemsPre, itemsPost = self.buildPatientItemPrePost(person_id, useVisitID)
            itemsPre = self.processItems(itemsPre, mode = "input")
            itemsPost = self.processItems(itemsPost, mode = "output")
            self.buildCoMatrix(itemsPre, itemsPost)
            self.buildTrainingItems(itemsPre, itemsPost, person_id)
            self.patients[person_id]["itemsPre"] = itemsPre
            self.patients[person_id]["itemsPost"] = itemsPost
        self.buildMetricsTrainingItems(trainPatients)
        
    def testing(self, testPatients, method, k, useVisitID = False, returnMean = True):
        
        def printItems():
            print(person_id)
            print("------------------------------------------------------")
            print("Input Items: ", list(map(self.findItemName, inputItems)))
            print("------------------------------------------------------")
            print("Predicted Items: ", list(map(self.findItemName, rankedItems[:k])))
            print("------------------------------------------------------")
            print("Actual Items: ", list(map(self.findItemName, outputItems)))
            print("------------------------------------------------------") 
        
        precision_list = []
        recall_list = []
        auc_list = []
        valid_test = 0
        print("testing method: {}".format(method))
        
        for person_id in testPatients:
            itemsPre, itemsPost = self.buildPatientItemPrePost(person_id, useVisitID)
            itemsPre = self.processItems(itemsPre, mode = "input")
            itemsPost = self.processItems(itemsPost, mode = "output")
            
            if itemsPre and itemsPost:
                inputItems = list(set(itemsPre))
                outputItems = list(set(itemsPost))
     
                self.patients[person_id]["itemsPre"] = itemsPre
                self.patients[person_id]["itemsPost"] = itemsPost
            
                if len(outputItems) != 0 and len(inputItems)!=0:
                    rankedItems, rankedScore = self.rankingItems(inputItems, method)

                    precision, recall = self.evaluate(rankedItems, inputItems, outputItems, k)
                    if self.inspectPredictedItems:
                        if precision >= 0.5 and recall >= 0.5:
                            printItems()
                    auc = self.getAUC(rankedItems, rankedScore, outputItems)
                    
                    if self.isPlotRankedScore:
                        self.plotRankedScore(rankedScore, outputItems, precision, recall)

                    precision_list.append(precision)
                    recall_list.append(recall)
                    auc_list.append(auc)
                    valid_test += 1
        if valid_test > 0:
            if returnMean:
                return mean(precision_list), mean(recall_list), mean(auc_list)
            else:
                return precision_list, recall_list, auc_list
        else:
            pass

    def rankingItems(self, inputItems, method):
        # possible candidate Items, not including the query inputItems 
        #candidateItems = np.array(list(self.candidateItems.difference(inputItems)))
        candidateItems = np.array(list(self.candidateItems))
        queryItems = np.array(list(self.trainingInputItems.intersection(inputItems)))

        score = self.aggStats(candidateItems, queryItems, method)
        # sort based on rankedScore
        sortedIndex = score.argsort()[::-1]
        rankedItems = candidateItems[sortedIndex].tolist()
        rankedScore = score[sortedIndex].tolist()
        return rankedItems, rankedScore
    
    def plotRankedScore(self, rankedScore, outputItems, precision, recall):
        """Plot the ranked scores against rank position and show the figure.

        A vertical line is drawn at x = number of actual output items, and the
        title reports precision and recall (rounded to 2 decimals) together
        with that number.
        """
        rankPositions = np.arange(0, len(rankedScore))
        outputCount = len(outputItems)

        plt.plot(rankPositions, rankedScore, linewidth = 2, color = 'red')
        plt.axvline(x = outputCount)
        plt.title('precision = {p} ; recall = {r}; output = {l}'.format(
            p = round(precision, 2), r = round(recall, 2), l = outputCount))
        plt.show()
    
    def aggStats(self, candidateItems, queryItems, method):
        
        # this apply statMetrics over queryItems
        getStats = np.vectorize(self.statMetrics) 
                
        def PPV(item):
            sumStats = sum(getStats(item, queryItems, 'ppv'))
            sumN = sum(getStats(item, queryItems, 'numA_pts'))
            return sumStats/sumN
        
        def PPV_mod(item):
            sumStats = sum(getStats(item, queryItems, 'ppv_mod'))
            sumN = sum(getStats(item, queryItems, 'numA_pts'))
            return sumStats/sumN
        
        def PPV_mod2(item):
            sumStats = sum(getStats(item, queryItems, 'ppv_mod2'))
            sumN = sum(getStats(item, queryItems, 'numA_pts'))
            return sumStats/sumN
        
        def PPV_wt(item):
            sumStats = sum(getStats(item, queryItems, 'ppv_wt'))
            sumN = sum(getStats(item, queryItems, 'numA_pts'))
            return sumStats/sumN
        
        def PPV_mod_wt(item):
            sumStats = sum(getStats(item, queryItems, 'ppv_mod_wt'))
            sumN = sum(getStats(item, queryItems, 'numA_pts'))
            return sumStats/sumN
        
        def PPV_mod2_wt(item):
            sumStats = sum(getStats(item, queryItems, 'ppv_mod2_wt'))
            sumN = sum(getStats(item, queryItems, 'numA_pts'))
            return sumStats/sumN
        
        def RR(item):
            prodStats = np.prod(getStats(item, queryItems, 'rr'))
            return prodStats
        
        def Fisher(item):
            sumStats = sum(getStats(item, queryItems, 'fisher_neglog'))
            return sumStats 
        
        def Prevalence(item):
            #return self.trainingItems[item]["prevalence"]
            return self.items[item]["prevalence"]
        
        def BaselinePrevalence(item):
            return self.items[item]["baselinePrevalence"]
        
        def Random(item):
            return np.random.random_sample()
                      
       
        switcher = {                   
                    'PPV':PPV,
                    'PPV_WT':PPV_wt,
                    'PPV_MOD':PPV_mod,
                    'PPV_MOD_WT':PPV_mod_wt,
                    'PPV_MOD2':PPV_mod2,
                    'PPV_MOD2_WT':PPV_mod2_wt,
                    'RR':RR,
                    'FISHER': Fisher,
                    'PREVALENCE':Prevalence,
                    'BASELINEPREVALENCE':BaselinePrevalence,
                    'RANDOM':Random
                    }
    
        applyMethod = switcher.get(method.upper())          
        score = np.vectorize(applyMethod)(candidateItems)
        return score       
        
    def statMetrics(self, itemB, itemA, metric):
        """Return one co-occurrence statistic for a (query item, candidate item) pair.

        Counts are read from the training data:
        - AB_pts: ``self.coMatrix[itemA][itemB]``
        - A_pts / B_pts: number of entries in ``self.trainingItems[item]["patients"]``
        - N: ``len(self.trainPatients)``

        Parameters:
            itemB: the candidate item.
            itemA: the query item.
            metric: name of the statistic to compute:
                'numA_pts'     -> A_pts
                'prevalence'   -> B_pts / N
                'ppv'          -> AB_pts (raw count)
                'ppv_wt'       -> AB_pts / A_pts
                'ppv_mod'      -> modifier * AB_pts
                'ppv_mod_wt'   -> modifier * AB_pts / A_pts
                'ppv_mod2'     -> modifier**2 * AB_pts
                'ppv_mod2_wt'  -> modifier**2 * AB_pts / A_pts
                'rr'           -> relative risk of B given A vs. B given not-A
                'fisher_neglog'-> signed -log10 of the Fisher exact p-value

        Returns:
            The requested statistic, or the string 'invalid' when ``metric`` is
            not one of the names above (kept for backward compatibility).
        """
        # item B is the candidate item
        # item A is the query item
        AB_pts = self.coMatrix[itemA][itemB]
        A_pts = len(self.trainingItems[itemA]["patients"])
        B_pts = len(self.trainingItems[itemB]["patients"])
        N = len(self.trainPatients)

        def queryModifier():
            # Weight of the query item: the rr value returned by getRR_Item when
            # the item's category (ignoring any 'derived_' marker) is one of the
            # input features, otherwise 1. Evaluated only by the *_mod metrics so
            # the other metrics do not pay for the getRR_Item call.
            category = self.items[itemA]['category'].replace('derived_', '')
            if category in self.inputFeatures:
                _neglogP, rr = self.getRR_Item(itemA, mode = 'input')
                return rr
            return 1

        def numA_pts():
            return A_pts

        def prevalence():
            return B_pts / N

        def PPV():
            return AB_pts

        def PPV_wt():
            return AB_pts / A_pts

        def PPV_mod():
            return queryModifier() * AB_pts

        def PPV_mod_wt():
            return (queryModifier() * AB_pts) / A_pts

        def PPV_mod2():
            return (queryModifier() ** 2) * AB_pts

        def PPV_mod2_wt():
            return ((queryModifier() ** 2) * AB_pts) / A_pts

        def RR():
            # When every B patient also has A, add 1 to both sides of the
            # comparison ratio so the denominator cannot become zero.
            adj = 1 if B_pts == AB_pts else 0
            return (AB_pts / A_pts) / ((B_pts - AB_pts + adj) / (N - A_pts + adj))

        def Fisher_NegLog():
            contingencyTable = [[AB_pts, A_pts - AB_pts],
                                [B_pts - AB_pts, N - A_pts - B_pts + AB_pts]]
            try:
                oddsRatio, fisherP = stats.fisher_exact(contingencyTable)
            except ValueError:
                # Likely from negative table values.  Return default / uncertain value
                return 0.0
            # A p-value of exactly 0 has no logarithm; use the most negative float.
            logP = math.log(fisherP, 10) if fisherP > 0.0 else -sys.float_info.max
            # Positive result for positive association, negative otherwise.
            return -logP if oddsRatio > 1.0 else logP

        switcher = {
            'numA_pts': numA_pts,
            'prevalence': prevalence,
            'ppv': PPV,
            'ppv_wt': PPV_wt,
            'ppv_mod': PPV_mod,
            'ppv_mod_wt': PPV_mod_wt,
            'ppv_mod2': PPV_mod2,
            'ppv_mod2_wt': PPV_mod2_wt,
            'rr': RR,
            'fisher_neglog': Fisher_NegLog,
        }

        func = switcher.get(metric, lambda: 'invalid')
        return func()

    def evaluate(self, rankedItems, inputItems, outputItems, k):
        """Compute precision and recall of the top-k ranked items.

        Parameters:
            rankedItems: items ordered from most to least recommended.
            inputItems: not used by this method; kept so existing callers
                keep working.
            outputItems: the items that count as correct predictions.
            k: number of top-ranked items treated as predictions.

        Returns:
            (precision, recall) where precision = hits / k and
            recall = hits / len(outputItems). A ratio whose denominator is 0
            (k == 0 or no output items) is reported as 0.0 instead of raising
            ZeroDivisionError.
        """
        truePos = sum(1 for item in rankedItems[:k] if item in outputItems)
        classPos = len(outputItems)

        # precision is relative to k even if fewer than k items were ranked
        precision = truePos / k if k else 0.0
        recall = truePos / classPos if classPos else 0.0

        return precision, recall
        
    def getAUC(self, rankedItems, rankedScore, outputItems):
        """Return the ROC AUC of the ranking scores against the true output items.

        Parameters:
            rankedItems: the ranked candidate items.
            rankedScore: the score of each item in ``rankedItems`` (same order).
            outputItems: items that count as positives.

        Returns:
            The ROC AUC when the labels contain both positives and negatives.
            Otherwise 0.5: the AUC is undefined with a single class, including
            the case where every ranked item is a positive.

        Side effects:
            Increments ``self.valid_AUC_count`` when an AUC was computed and
            ``self.invalid_AUC_count`` when the 0.5 fallback was used.
        """
        itemLabel = [1 if item in outputItems else 0 for item in rankedItems]
        positives = sum(itemLabel)

        # roc_auc_score needs at least one positive AND one negative label
        if 0 < positives < len(itemLabel):
            score = roc_auc_score(itemLabel, rankedScore)
            self.valid_AUC_count += 1
        else:
            score = 0.5
            self.invalid_AUC_count += 1
        return score
    
    def findItemName(self, itemID, nameOnly = True):
        """Look up the display name of a clinical item.

        The item's category and concept ID are read from ``self.items`` and
        used to index ``self.feature_dict``, whose entries are (code, name) pairs.

        Returns the name alone when ``nameOnly`` is true, otherwise the tuple
        (conceptID, name).
        """
        record = self.items[itemID]
        category = record["category"]
        conceptID = record["ID"]
        # the stored code is expected to equal the concept ID, so it is not returned
        _, itemName = self.feature_dict[category][conceptID]
        return itemName if nameOnly else (conceptID, itemName)
        

# def main():

# if __name__== "__main__":
#     main()

In [None]:
"""show patient instances at primary care or specialty visits """
def showInstancePerPatient(testR, person_id, instanceType = 'Pre'):
    """Tabulate one patient's item instances, one row per distinct item.

    ``instanceType`` 'Pre' selects the patient's 'instancePre' instances; any
    other value selects 'instancePost'. Rows are taken from ``testR.instanceDF``,
    reduced to the row with the smallest ``item_value`` per item, and items
    flagged as 'filter' or 'rare' in ``testR.items`` are dropped.

    Returns a DataFrame with columns date, name, item_value, category, ordered by item.
    """
    instanceKey = 'instancePre' if instanceType == 'Pre' else 'instancePost'
    rowLabels = testR.patients[person_id][instanceKey]
    wanted = ['item','item_derived','item_datetime','visit_id','item_value']
    frame = testR.instanceDF.loc[rowLabels, wanted]

    # decorate every instance with the metadata of its item
    itemCol = frame['item']
    frame['name'] = itemCol.apply(testR.findItemName)
    frame['category'] = itemCol.apply(lambda x: testR.items[x]['category'])
    frame['filter'] = itemCol.apply(lambda x: testR.items[x]['filter'])
    frame['rare'] = itemCol.apply(lambda x: testR.items[x]['rare'])
    frame['date'] = frame['item_datetime'].apply(datetime.datetime.date)

    # one row per item: after sorting, the first row has the lowest item_value
    frame = (frame.sort_values(by=['item','item_value'])
                  .drop_duplicates(subset=['item'], keep='first'))

    keep = frame['filter'].eq(False) & frame['rare'].eq(False)
    frame = frame[keep].sort_values(by=['item'])
    return frame[['date','name','item_value','category']]

# DF = showInstancePerPatient(testR2, 31552010, 'Pre')
# DF[DF['category'] == 'measurement']

In [None]:
# get a list of top RR in training input items:


def showItemRR(testR):
    """Tabulate the relative risk of every training input item.

    Only items whose category is in ``testR.inputFeatures`` are included; for
    each, ``testR.getRR_Item(item, mode='input')`` supplies (neglogP, RR).

    Returns:
        DataFrame with columns item, name, RR, neglogP sorted by RR in
        descending order. When no item qualifies the frame is empty but still
        carries those four columns.
    """
    rows = []
    for i in testR.trainingInputItems:
        if testR.items[i]['category'] in testR.inputFeatures:
            neglogP, RR = testR.getRR_Item(i, mode = 'input')
            rows.append([i, testR.findItemName(i), RR, neglogP])
    # naming the columns in the constructor also works for an empty row list
    df = pd.DataFrame(rows, columns=["item", 'name', 'RR', 'neglogP'])
    return df.sort_values(by = ['RR'], ascending = False)

# Build the RR table for the recommender `testR2` and display it as the cell output.
# NOTE(review): `testR2` is not defined in this part of the notebook; presumably
# it is created in an earlier cell - confirm before running this cell on its own.
df = showItemRR(testR2)
df

In [None]:
"""main functions"""
def trainSplit(patient_dict, train_pct = 0.8, seed = None):
    """Randomly split the patient IDs into a training list and a test list.

    Parameters:
        patient_dict: dictionary keyed by person ID.
        train_pct: fraction of patients (0 to 1) assigned to training; the
            training size is ``round(len(patient_dict) * train_pct)``.
        seed: optional seed for a reproducible shuffle. With the default
            ``None`` the module-level ``random`` generator is used.

    Returns:
        (trainPatients, testPatients) - two disjoint lists covering all IDs.

    Raises:
        ValueError: if ``train_pct`` is outside [0, 1]; such a value would
            otherwise slice the shuffled list in a misleading way.
    """
    if not 0 <= train_pct <= 1:
        raise ValueError("train_pct must be between 0 and 1, got {}".format(train_pct))

    personIDs = list(patient_dict.keys())
    trainSize = round(len(personIDs) * train_pct)
    rng = random if seed is None else random.Random(seed)
    # sampling the full population is a shuffle that leaves personIDs untouched
    shuffleIDs = rng.sample(personIDs, len(personIDs))
    return shuffleIDs[:trainSize], shuffleIDs[trainSize:]

def trainSplitbyYear(patient_dict, cutoff_year = 2019):
    """Split patients into training and test lists by primary visit year.

    Parameters:
        patient_dict: dictionary mapping person ID to a record whose
            'primaryVisitTime' value exposes a ``year`` attribute.
        cutoff_year: patients whose primary visit year is at or after this
            year go to the test list; earlier ones go to training.
            Defaults to 2019.

    Returns:
        (trainPatients, testPatients), each in the iteration order of
        ``patient_dict``.
    """
    trainPatients = []
    testPatients = []
    for personID, value in patient_dict.items():
        if value['primaryVisitTime'].year >= cutoff_year:
            testPatients.append(personID)
        else:
            trainPatients.append(personID)
    return trainPatients, testPatients
    
    
def comparePrecisionRecall(recommenderInstance, testPatients, steps):
    """Plot precision against recall for five ranking methods.

    For every method, ``recommenderInstance.testing`` is evaluated on
    ``testPatients`` at k = 1, 1 + steps, ... (below 120) and the resulting
    (recall, precision) points are drawn as one dotted line per method.
    """
    # (ranking method, line colour), drawn in this order so that the legend
    # labels below line up with the curves
    curves = [('PPV_mod2_wt', 'orange'),
              ('PPV', 'lightgreen'),
              ('Prevalence', 'skyblue'),
              ('BaselinePrevalence', 'pink'),
              ('Random', 'lightgrey')]
    legendLabels = ['PPV_weighted','PPV','Endocrine_Prevalence','Outpatient_Prevalence','Random']

    for methodName, lineColor in curves:
        recalls = []
        precisions = []
        for k in range(1, 120, steps):
            print('k = ',k)
            precision, recall, _auc = recommenderInstance.testing(testPatients, methodName, k)
            precisions.append(precision)
            recalls.append(recall)
        plt.plot(recalls, precisions, color = lineColor, linestyle = ':', linewidth = 4)

    plt.legend(legendLabels)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve at various k')
    plt.xlim([0.15, 1.0])
    plt.ylim([0.0, 0.8])
    plt.show()

def topOrdersPrevalence(recommenderInstance):
    """List the training items of a recommender ordered by prevalence.

    Parameters:
        recommenderInstance: object exposing ``trainingItems`` (item ID ->
            record with 'instance_count', 'patients', 'prevalence' and
            'baselinefreq') and ``findItemName``.

    Returns:
        DataFrame sorted by descending prevalence with columns name,
        instance_count, patients (number of patients), prevalence, baselinefreq.
    """
    RI = recommenderInstance
    DF = pd.DataFrame.from_dict(RI.trainingItems, orient = 'index')
    DF = DF.sort_values(by = ["prevalence"], ascending=False)
    DF["patients"] = DF["patients"].map(len)
    # resolve names through the instance that was passed in, not a global recommender
    DF["name"] = DF.index.map(RI.findItemName)
    DF = DF.reset_index()
    return DF[["name","instance_count","patients","prevalence","baselinefreq"]]
    

def showDescriptiveStats(recommenderInstance):
    """Print size statistics of a recommender and plot its output-item histogram.

    Reports patient / instance / item totals, the training set sizes, and the
    mean and median number of distinct query ('itemsPre') and output
    ('itemsPost') items per patient, then shows a histogram of the per-patient
    output-item counts.
    """
    ri = recommenderInstance
    divider = "---------------------------------------"

    print("------------Data overview -------------")
    print("Patients: ", len(ri.patients))
    print("Instances : ", ri.instance_count)
    print("Items: ", ri.item_count) 
    print(divider)
    print("Training patients: ", len(ri.trainPatients))
    print("Training items: ", len(ri.trainingItems))
    print("Candidate items: ", len(ri.candidateItems))
    print(divider)

    # distinct item counts per patient; patients lacking a key are skipped for that list
    queryCounts = []
    outputCounts = []
    for record in ri.patients.values():
        if "itemsPre" in record:
            queryCounts.append(len(set(record["itemsPre"])))
        if "itemsPost" in record:
            outputCounts.append(len(set(record["itemsPost"])))

    print("Avg number of query items: ", np.mean(queryCounts))
    print("Median number of query items: ", np.median(queryCounts))
    print("Avg number of output items: ", np.mean(outputCounts))
    print("Median number of output items: ", np.median(outputCounts))
    plt.hist(outputCounts, bins= 'auto')
    plt.show()
    
def topItemsGivenQuery(recommenderInstance, queryItems, n=20, method = 'PPV_mod2_wt'):
    """Show the top ranked items for a query together with their statistics.

    The query item names are printed, the candidates are ranked with
    ``recommenderInstance.rankingItems`` using ``method``, and the first
    k ranked items are kept, where k is n + 10 if that many candidate items
    exist and n otherwise. For each kept item the PPV, RR, Prevalence,
    Baseline_Prevalence and Fisher columns are filled from
    ``recommenderInstance.aggStats`` and then rounded / formatted.

    Returns the resulting DataFrame (columns Item, Name and the five statistics).
    """
    RI = recommenderInstance
    print("clinical items for query: {}".format([RI.findItemName(q) for q in queryItems]))

    rankedItems, rankScore = RI.rankingItems(set(queryItems), method)
    k = n + 10 if n + 10 <= len(RI.candidateItems) else n

    df = pd.DataFrame(rankedItems[:k], columns=["Item"])
    df["Name"] = df["Item"].map(RI.findItemName)

    def asPercent(x):
        return round(x*100,1)

    def oneDecimal(x):
        return round(x,1)

    def asPValue(x):
        # x is a -log10 value; turn it back into a p-value in scientific notation
        return format(10**(-x),'.2e')

    # (output column, statistic name passed to aggStats, display formatter)
    columnSpec = [("PPV", 'ppv', asPercent),
                  ("RR", 'RR', oneDecimal),
                  ("Prevalence", 'prevalence', oneDecimal),
                  ("Baseline_Prevalence", 'baselineprevalence', oneDecimal),
                  ("Fisher", 'fisher', asPValue)]

    # first compute every raw statistic ...
    for column, statName, _ in columnSpec:
        df[column] = df["Item"].apply(RI.aggStats, args = (queryItems, statName))

    # ... then apply the rounding / formatting
    for column, _, formatter in columnSpec:
        df[column] = df[column].apply(formatter)

    return df
    

In [None]:
# Bar plots comparing the different recommender ranking methods
def CompareRecommender(recommenderInstanceList, recommenderInstanceName, testPatients, 
                        plotlist = ['auc','precision','recall'],
                        #methodChoice = ['PPV_mod2_wt','PPV','Prevalence','BaselinePrevalence','Random'],
                        #labels = ['PPV_weighted','PPV','Endocrine_Prevalence','Outpatient_Prevalence','Random']):
                        methodChoice = ['PPV_mod2_wt','PPV_mod2','PPV_mod_wt','PPV_mod','PPV_wt','PPV','Prevalence','BaselinePrevalence','Random'],
                        labels = ['PPV_mod2_wt','PPV_mod2','PPV_mod_wt','PPV_mod','PPV_wt','PPV','Endocrine_Prevalence','Outpatient_Prevalence','Random']):
    """Evaluate several ranking methods and draw one horizontal bar chart per metric.

    Parameters:
        recommenderInstanceList: recommender objects; each must expose ``k`` and
            ``testing(testPatients, method, k, returnMean=False)``.
        recommenderInstanceName: one column name per recommender, in the same
            order as ``recommenderInstanceList``.
        testPatients: patients handed to ``testing``.
        plotlist: metrics to plot; keys of the result dictionary below
            ('auc', 'precision', 'recall').
        methodChoice: ranking method names passed to ``testing``.
        labels: y-axis tick labels, one per entry of ``methodChoice``.

    Returns:
        None; the charts are shown with ``plt.show()``.

    NOTE(review): the default arguments are lists shared between calls. They
    are only read here, never mutated.
    NOTE(review): the plotting loop uses ``name`` and ``k`` as left by the last
    iteration of the evaluation loop, so only the last recommender's column is
    plotted (and an empty ``recommenderInstanceList`` leaves both undefined) -
    confirm this is intended.
    """
    
    # Set the colors
    colors = ['red','orange','blue','lightblue','green','lightgreen','skyblue','pink','lightgrey']
    
    def autolabel(bars, margin):
        # attach some text labels
        # `ax` is not a parameter: it is read from the enclosing function and is
        # assigned in the plotting loop before autolabel is called
        for bar in bars:
            width = bar.get_width()
            ax.text(width + margin, bar.get_y() + bar.get_height()/2,
                    '{:.2f}'.format(width),
                    ha='right', va='center')
    
    # construct DF to store result of different methods as rows, different recommender instance as columns)
    precisionDF = pd.DataFrame(columns=recommenderInstanceName, index = methodChoice)
    recallDF = pd.DataFrame(columns=recommenderInstanceName, index = methodChoice)
    aucDF = pd.DataFrame(columns=recommenderInstanceName, index = methodChoice)
    
    for i, ri in enumerate(recommenderInstanceList):        
        k = ri.k
        name = recommenderInstanceName[i]
        
        for method in methodChoice:
            # with returnMean = False the three results are presumably per-patient
            # sequences (they are mapped over / averaged below) - TODO confirm
            precision, recall, auc = ri.testing(testPatients, method, k, returnMean = False)
            # precision and recall are stored as percentages, auc is stored unchanged
            f = lambda x:x*100            
            precisionDF.loc[method, name] = list(map(f,precision))
            recallDF.loc[method, name] = list(map(f,recall))
            aucDF.loc[method, name] = auc
            
#         """paired t-test"""
#         for m, method in enumerate(methodChoice):
#             if m == 0:
#                 ref_precision = precisionDF.loc[method, name]
#                 ref_recall = recallDF.loc[method, name]
#                 ref_auc = aucDF.loc[method, name]
#             else:
#                 print("paired t-test for ",methodChoice[0], " and ", method, " :")
#                 t_precision, pval_precision = stats.ttest_rel(ref_precision, precisionDF.loc[method, name])
#                 t_recall, pval_recall = stats.ttest_rel(ref_recall, recallDF.loc[method, name])
#                 t_auc, pval_auc = stats.ttest_rel(ref_auc, aucDF.loc[method, name])
#                 print("precision: ", t_precision, pval_precision)
#                 print("recall: ", t_recall,  pval_recall)
#                 print("auc: ", t_auc, pval_auc)
#                 print("-----------------------------------------------")
    

    # collapse every cell (a sequence of values) to its mean
    resultDict = {'auc':aucDF.applymap(mean),
                  'precision': precisionDF.applymap(mean),
                  'recall': recallDF.applymap(mean)}
    
    # fixed x-axis range per metric
    xlimDict = {'auc':[0.5, 1.0],
                'precision':[0,50],
                'recall':[0,50]}
                          
    # NOTE(review): formatDict is defined but not used anywhere in this function
    formatDict = {'auc':'{:.2f}',
                  'precision':'{:.2f}%',
                  'recall':'{:.2f}%'}    
    
    for pl in plotlist:
        resultDF = resultDict[pl]
        ind = np.arange(resultDF.shape[0])

        # make the plots
        fig, ax = plt.subplots()
        # `name` is whatever the evaluation loop above assigned last
        bars = ax.barh(ind, resultDF[name], color = colors) # plot a vals
        ax.set_yticks(ind)  # position axis ticks
        ax.set_yticklabels(labels)  # set them to the names

        # margin is the horizontal offset of the value label, in axis units
        if pl == 'auc':
            plt.title('Different Recommender Ranking Methods')
            margin = 0.04
        else:
            plt.title('Different Recommender Ranking Methods (k = {})'.format(k))
            margin = 4
        plt.xlabel(pl)
        plt.xlim(xlimDict[pl])  
        autolabel(bars, margin)
        plt.show()

In [None]:
def CrossVal(recommenderInstance, Patients, methodChoice, fold, k = 5, random_state = None):
    """Cross-validate ranking methods with shuffled K-fold splits of the patients.

    For every fold the recommender is re-trained on the training part
    (``recommenderInstance.training``) and every method is scored on the
    held-out part (``recommenderInstance.testing`` with ``returnMean = True``).

    Parameters:
        recommenderInstance: recommender exposing ``training`` and ``testing``.
        Patients: sequence of patient IDs to split.
        methodChoice: ranking method names to evaluate.
        fold: number of folds.
        k: number of top-ranked items passed to ``testing``. Defaults to 5,
            the value that was previously hard-coded.
        random_state: forwarded to ``KFold`` to make the shuffle reproducible.
            Defaults to None (a different shuffle on every call).

    Returns:
        DataFrame indexed by 'precision', 'recall', 'auc' with one column per
        method, holding the mean over folds. Precision and recall are
        multiplied by 100; auc is left unchanged.
    """
    from sklearn.model_selection import KFold

    Patients = np.asarray(Patients)
    ri = recommenderInstance
    kf = KFold(n_splits=fold, random_state=random_state, shuffle=True)

    metrics = ['precision', 'recall', 'auc']
    # per-fold scores: metric -> method -> list with one value per fold
    scores = {metric: {method: [] for method in methodChoice} for metric in metrics}

    for train_index, test_index in kf.split(Patients):
        ri.training(list(Patients[train_index]))
        for method in methodChoice:
            precision, recall, auc = ri.testing(list(Patients[test_index]), method, k=k, returnMean = True)
            scores['precision'][method].append(precision*100)
            scores['recall'][method].append(recall*100)
            scores['auc'][method].append(auc)

    foldMeans = {method: [mean(scores[metric][method]) for metric in metrics]
                 for method in methodChoice}
    return pd.DataFrame(foldMeans, index = metrics, columns = methodChoice)

# Build a recommender limited to four input feature categories, split the
# patients by primary visit year, and cross-validate every ranking method
# on the training patients only.
testR = Recommender()
testR.inputFeatures = {'measurement','procedure','condition','drug'}
testR.preProcessing(count_cutoff = 10)
testR.inspectPredictedItems = False
# testPatients is produced by the split but not used by the cross-validation below
trainPatients, testPatients = trainSplitbyYear(testR.patients)
methodChoice = ['PPV_mod2_wt','PPV_mod2','PPV_mod_wt','PPV_mod','PPV_wt','PPV','Prevalence','BaselinePrevalence','Random']
resultDF = CrossVal(testR, trainPatients, methodChoice, fold = 5)
# Relabel the columns for display; the labels follow the order of methodChoice.
# NOTE(review): the labels are wrapped in an outer list (a list containing one
# list); pandas presumably turns that into a one-level MultiIndex - confirm a
# flat list was not intended.
resultDF.columns = [['PPV (RR^2/N)','PPV (RR^2)','PPV (RR/N)','PPV (RR)',
                     'PPV (1/N)','PPV','Endocrine_Prevalence','Outpatient_Prevalence','Random']]
resultDF.round(2)

In [None]:
# Re-apply the display labels to the cross-validation result and show it
# rounded to three decimals.
# NOTE(review): the labels are wrapped in an outer list (a list containing one
# list); pandas presumably turns that into a one-level MultiIndex - confirm a
# flat list was not intended.
resultDF.columns = [['PPV (RR^2/N)','PPV (RR^2)','PPV (RR/N)','PPV (RR)',
                     'PPV (1/N)','PPV','Endocrine_Prevalence','Outpatient_Prevalence','Random']]
resultDF.round(3)