# Backend

This notebook loads the original and the indexed data as well as the w2vec models. It then launches a server backend that clients query by connecting via TCP and sending a series of commands, each of which the server answers.

### Initialization and Imports

In [1]:
%run includes/imports.py
%load_ext autoreload
%autoreload 2
import os
from includes.stringop import StringOp
from includes.w2vec import W2VecModel
from includes.query_expander import QueryExpander

In [2]:
#one model wrapper per file; judging by the file names these are a word-level and a character-level model (presumably - confirm in includes/w2vec.py)
W2VEC = W2VecModel()
CHARGRAM = W2VecModel()
CHARGRAM.load_model('index/model_char.w2v')
W2VEC.load_model('index/model_word.w2v')
#Server.query uses this object to expand query attributes into similar words
Expander = QueryExpander(W2VEC, CHARGRAM) 

In [3]:
#only the TEXT column is loaded; Server.handle_request looks reports up in this frame by row position
data = pd.read_csv('data/NOTEEVENTS.csv', dtype={'TEXT': str}, usecols = ['TEXT'])

### Server definition

In [4]:
class Server():
    '''
    Backend server: answers report look-ups and condition queries against a previously built index, either over TCP or by calling the query methods directly.
    '''
    wlist = [] #groups of condition synonyms (list of lists), filled in __init__
    neglist = [] #lower-cased negative words, filled in __init__
    dicts = [] #parsed index entries, one per line read by read_index / read_indices
    verbosity = False #handed to the query methods as `verbose` by handle_request
    
    def __init__(self, verbosity):
        '''
        Initializes the server.
        verbosity: determines if the server should perform extensive printing of intermediate results for debugging purposes.
        '''
        self.verbosity = verbosity
        with open('words/negative.txt') as f: #load list of negative words
            self.neglist = f.readlines()
        self.neglist = [str.lower(x.strip()) for x in self.neglist]
        with open('words/conditions.txt') as f: #load list of supported conditions
            wlist = f.readlines()
            wlist = [str.lower(x.strip()) for x in wlist]

        wordlist = []
        for w in wlist:
            wordlist.append(w.split('/'))
        self.wlist = wordlist
    
    #-----------------------
    #QUERY PROCESSING PART
    #-----------------------
    
    def read_index(self, filepath):
        '''
        Reads a single index file. Use when single-threaded indexing was performed previously.
        filepath: path to the index.json file.
        '''
        self.dicts = []
        with open(filepath) as f:
            for line in f:
                self.dicts.append(json.loads(line))
                
    def read_indices(self, dirpath):
        '''
        Reads multiple index files. Use when multi-threaded indexing was performed previously.
        dirpath: path to index directory, containing index{i}.json files.
        '''
        self.dicts = []
        files = list(sorted(glob.glob(os.path.join(dirpath, 'pindex*.json')))) 
        for file in files: 
            print('reading', file)
            f = open(file)
            for line in f:
                self.dicts.append(json.loads(line))
            f.close()
                
    def get_condition_attributes(self, query):
        '''
        Splits a full-text query into the condition it names and the remaining attribute words.
        query: the query to extract condition and attributes from.
        returns: (condition, attributes). condition is '' and attributes is empty if StringOp.find_condition reports nothing.
        '''
        condition = ''
        attributes = []
        found = StringOp.find_condition(self.wlist, query)
        for name, condition_starts in found.items():
            condition = name
            #walk the space-separated tokens while tracking the character offset each one starts at;
            #a token whose offset is listed in condition_starts is skipped, every other non-empty token is an attribute
            offset = 0
            for token in query.split(' '):
                is_attribute = offset not in condition_starts
                if is_attribute and token:
                    attributes.append(token)
                offset += len(token) + 1
        return condition, attributes
               
    def query(self, query, polarity=True, dosort=True, expand=True, verbose=False, damping=1):
        '''
        Executes a given query and returns results.
        query: the query to execute.
        polarity: the polarity of the query, either True if condition should be present, or False if not.
        expand: determines if the query should be expanded by using synonyms. if expanded, multiple queries will be run internally, and the aggregated result will be returned.
        verbose: determines level of verbosity.
        damping: determines the weight deduction applied to the extended queries. if <1, extended queries will experience extra downweighting. only applied if expand is set to True.
        '''
        if not expand: #simply call single query function
            return self.single_query(query, polarity, dosort, verbose=verbose)
        else: #expand attributes
            condition, attributes = self.get_condition_attributes(query)
            words_interest_condition = set(self.get_condition_synonyms(condition))
            words_interest_attributes = set(attributes)
            alternatives = []
            simscore = {}
            for att in attributes: #expand each attribute using the QueryExpander
                expansion = Expander.expand_word(att, syn_threshold=0.2, var_thres_sim=20, var_thres_dis=-1) 
                alternatives.append(expansion)
                for k,v in expansion.items():
                    if not k in simscore: simscore[k] = v
                    words_interest_attributes.update([k])
            combine = itertools.product(*alternatives) #creates all possible combinations of synonym queries
            if verbose:
                print('words of interest condition', words_interest_condition)
                print('words of interest attributes', words_interest_attributes)
            
            overall_result = np.full(len(self.dicts), float('-inf'))
            for c in combine:
                newquery = condition + ' ' + " ".join(c)
                query_weight = 1 #compute query weight based on the similarity score given by the QueryExpander
                for element in c:
                    query_weight*=simscore[element]
                query_weight = query_weight*damping #apply damping factor for the non-original queries
                if verbose:
                    print('alternative query:', newquery, ', weight:', query_weight)
                #run individual query
                qresult, wic, wia = self.single_query(newquery, polarity, dosort=False, verbose=verbose)
                vals = np.array(list(qresult.values()))*query_weight
                overall_result = np.maximum(overall_result, vals) #merging results by taking the max of all, element-wise
            ct=0
            for k,v in qresult.items(): #recycle last dict here
                qresult[k] = overall_result[ct]
                ct+=1
            if dosort:
                qresult = sorted(qresult.items(), key=operator.itemgetter(1),reverse=True)
            return qresult, words_interest_condition, words_interest_attributes
        
    def single_query(self, query, polarity=True, dosort=True, verbose=False):
        '''
        Executes a single query without performing any expansion, and returns the result.
        query: the query to execute.
        polarity: the polarity of the query, either True if condition should be present, or False if not.
        dosort: determines if search results should be sorted by the relevancy assigned, or left in their original order.
        verbose: determines level of verbosity.
        '''
        condition, attributes = self.get_condition_attributes(query)
        words_interest_condition = set(self.get_condition_synonyms(condition))
        words_interest_attributes = set(attributes)
        verbose=True
        if verbose:
            print('running query [' + ('positive' if polarity else 'negative') + ']')
            print('condition', condition)
            print('attributes', attributes)
            print('words of interest condition', words_interest_condition)
            print('words of interest attributes', words_interest_attributes)            
         
        results = {}
        index = 0
        for d in self.dicts:
            #cover trivial cases:
            if polarity and not condition in d:
                results[index] = -1 #irrelevant, condition not present at all
            elif polarity and condition in d and len(set(d[condition]) & set(self.neglist))>0: #looking for positive, classified as negative
                results[index] = -0.5
            elif not polarity and not condition in d: #condition not mentioned. it could be a negative, we can't tell.
                results[index] = 0
            elif not polarity and condition in d and not len(set(d[condition]) & set(self.neglist))>0: #looking for negative, but we classified as positive
                results[index] = -0.5
                
            #in all other cases, perform precise matching by comparing attributes
            elif condition in d: 
                matches_condition = d[condition]['-total-'] 
                matches_attributes = 0
                perc_matched = 1
                for a in attributes:
                    if a in d[condition]:
                        matches_attributes+=d[condition][a] 
                        perc_matched+=1
                perc_matched/=(len(attributes)+1)        
                results[index] = perc_matched        
            
            index+=1
        if dosort: results = sorted(results.items(), key=operator.itemgetter(1),reverse=True)
        return results, words_interest_condition, words_interest_attributes
    
    def query_complex(self, query, expand, verbose=False):
        '''
        Executes a complex query, i.e. a joint query with multiple criteria, and returns the result.
        query: the query to execute.
        expand: determines if the query should be expanded by using synonyms. if expanded, multiple queries will be run internally, and the aggregated result will be returned.
        verbose: determines level of verbosity.
        '''
    
        if query.startswith('('): query="+" + query
        words_interest_condition = set()
        words_interest_attributes = set()
        
        #parse complex query in lists of positive and negative queries
        indices = [m.start() for m in re.finditer('\(', query)]
        posqueries = []
        negqueries = []
        for ind in indices:
            polarity = query[ind-1]
            if polarity=='+':
                posqueries.append(query[ind+1:].split(')')[0])
            elif polarity=='-':
                negqueries.append(query[ind+1:].split(')')[0])
        
        if verbose:
            print('positive queries:', posqueries)
            print('negative queries:', negqueries)

        im_result = None
        overall_res = np.ones(len(self.dicts))
        
        for j in range(2):
            q_run = posqueries if j==0 else negqueries
            polar = True if j==0 else False
            
        
            for i in range(len(q_run)): #run all individual queries, also expanding all of them if requested
                condition, attributes = self.get_condition_attributes(q_run[i])
                words_interest_condition.update(self.get_condition_synonyms(condition))

                im_result, wic, wia = self.query(q_run[i], polarity=polar, dosort=False, expand=expand, verbose=verbose)
                words_interest_condition.update(wic)
                words_interest_attributes.update(wia)
                vals = np.array(list(im_result.values()))
                overall_res = overall_res+vals
        
        ct=0
        for k,v in im_result.items(): #just recycle last dict
            im_result[k] = overall_res[ct]
            ct+=1
        im_result = sorted(im_result.items(), key=operator.itemgetter(1), reverse=True)
        return im_result, words_interest_condition, words_interest_attributes
    
    def get_condition_synonyms(self, condition):
        '''
        Returns all synonyms known for a given condition.
        condition: the condition to find synonyms of.
        '''
        for lst in self.wlist:
            if condition in lst:
                return lst
        return None #if no condition identified.
    
    def find_summary(self, fulltext, query, wia, wic):
        '''
        Tries to find a summary sentence (i.e. most relevant part) of a full-text document, given the user query.
        fulltext: the whole medical report.
        query: the user query that was executed.
        wia: all words of interest for the condition.
        wic: all words of interest for the attributes.
        '''
        all_indices = set()
        for w in wic:
            indices = [m.start() for m in re.finditer(w.lower(), fulltext.lower())]
            all_indices.update(indices)
        #within all neighborhoods around those indices, find the ones that maximize the number of attributes around them
        nsize = 90
        highest_total = -1
        highest_ind = 0
        for ind in all_indices:
            subtext = fulltext[max(0,ind-nsize):min(len(fulltext), ind+nsize)]
            totalmatches = 0
            for w in wia:
                matches = len([m.start() for m in re.finditer(w, subtext)])
                if matches>0: totalmatches+=1
            if totalmatches>highest_total:
                highest_total = totalmatches
                highest_ind = ind
        return '...' + fulltext[int(max(0,highest_ind-nsize)):int(min(len(fulltext), highest_ind+nsize))].replace('@','').replace('|','').replace('\n', ' ').replace('-', '').replace('?', '') + '...'
    
    #-----------------------
    #NETWORK PART
    #-----------------------
    
    def send(self, s, conn):
        '''
        Sends the given string to a connected TCP-client.
        s: the string to send.
        conn: the connection to send to.
        '''
        buffer = str.encode(s)
        conn.send(str.encode(str(len(buffer))))
        conn.send(buffer)
        
    def handle_request(self, req, conn):
        '''
        Handles a specific request made by a client.
        req: the request received as a string.
        conn: the connection it was sent from.
        '''
        req = str(req, 'utf-8')
        print('handling ' + str(req) + '...')
        command = req.split(':')[0]
        args = req.split(':')[1]
        if command=='serve': #serve a specific report based on its index
            print('sending report ', args)
            self.send(data.iloc[int(args)]['TEXT'], conn)
        elif command.startswith('query'): #execute a query.
            args = args.replace(',', ' ')
            expa = 'expand' in command
            cplx = 'complex' in command
            print('executing query', args, "expand:", expa, "complex:", cplx)
            
            try:
                ranking, wic, wia = self.query(args, expand=expa, verbose=self.verbosity) if not cplx else self.query_complex(args, expand=expa, verbose=self.verbosity)
                
                if len(ranking)==0:
                    self.send('err_no_match', conn)
                else:
                    print('query completed, returning results...')
                    self.send('|'.join(wic.union(wia)) + '|', conn) #send words that are to highlight by the client
                    s = ''
                    for item in ranking[0:50]: #formulate result of top 50 results
                        s+=str(item[0]) + '@' + str(item[1]) + '@' + self.find_summary(data.iloc[item[0]]['TEXT'], args, wia, wic) + '|'
                    self.send(s[0:len(s)-1], conn) #send back results
            except:
                self.send('err_invalid', conn)
        else: #unknown command
            self.send('err_invalid', conn)

    def run(self, TCP_IP, TCP_PORT, BUFFER_SIZE=1024):
        '''
        Runs the server in blocking mode.
        TCP_IP: the IP to listen from.
        TCP_PORT: the TCP port to listen from.
        BUFFER_SIZE: the buffer size used, i.e. the maximum number of bytes read at once; every chunk read is treated as one request.

        Clients are served one after another. The method does not return on its own; it ends when an exception such as KeyboardInterrupt reaches it, and the listening socket is closed in that case.
        '''
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.bind((TCP_IP, TCP_PORT))
            s.listen(1)

            while True:
                print('waiting for incoming connections...')
                conn, addr = s.accept()
                print('Connection address:', addr)
                try:
                    while True:
                        req = conn.recv(BUFFER_SIZE)
                        if not req: break #client disconnected
                        if len(req)<=1: continue #ignore tests if alive
                        self.handle_request(req, conn)
                except Exception as err: #a failing client must not stop the server; KeyboardInterrupt still gets through
                    print('connection error:', repr(err))
                finally:
                    #close in every case, also after a regular disconnect
                    conn.close()
                    print('connection closed.')

### Executing the server

In [None]:
sv = Server(False) #False: no verbose output while answering queries
sv.read_indices('index/') #loads every pindex*.json found in index/
sv.run('127.0.0.1', 5008) #blocks: run() loops forever, accepting one client after another

### Sanity check: simple query

These can be executed from here without having to connect a client.

In [None]:
#simple query: condition present (polarity=True), sorted results, with synonym expansion and verbose output
sv.query('hemorrhage acute', polarity=True, dosort=True, expand=True, verbose=True)

### Sanity check: complex query

Complex queries follow this form: `(query 1)+(query 2)-(query 3)+(query 4)...`
Here, each query contains exactly one condition and an arbitrary number of attributes related to it. A "+" in front of a query indicates that the given condition with its attributes has to be present, a "-" that it has to be absent; a leading query without a sign counts as "+". Always wrap queries in parentheses.

In [None]:
#complex query: the first sub-query has no sign and therefore counts as positive, the second one is negative; no expansion
sv.query_complex('(hemorrhage acute brain)-(hypertension)', expand=False, verbose=False)