In [1]:
# This script contains functions for query expansion

In [11]:
import json
import os
import re
import xml.etree.ElementTree as ET

import pandas as pd
import requests

In [12]:
# Value of the HTTP Authorization header sent to the Lexigram API.
# The token is read from the LEXIGRAM_API_KEY environment variable so that no
# credential is stored in the notebook. When the variable is unset the token
# part is empty and authenticated requests will presumably be rejected.
# NOTE(review): any token that was ever committed to this file should be revoked.
apiKey = "Bearer " + os.environ.get("LEXIGRAM_API_KEY", "")

In [13]:
def find_lexigram_id(disease):
    '''
    Look up a disease name in the Lexigram knowledge base.

    Args:
        disease: the disease as free text; it is sent as the ``q`` query
            parameter of the Lexigram search endpoint.
    Returns:
        The concept ID of the first search hit, or 0 when the search
        reports no hits.
    Raises:
        requests.HTTPError: if the API answers with a 4xx/5xx status.
        requests.Timeout: if the API does not answer within 30 seconds.
    '''

    # Let requests build the query string so that characters such as
    # '&', '#' or '+' in the disease name are encoded instead of silently
    # changing the meaning of the URL.
    r = requests.get(
        "https://api.lexigram.io/v1/lexigraph/search",
        params={"q": disease},
        headers={'Authorization': apiKey},
        timeout=30,
    )
    # Fail with a clear HTTP error (e.g. bad credentials) rather than with a
    # confusing KeyError on the payload below.
    r.raise_for_status()
    response = r.json()

    if response["totalHitsCount"] != 0:
        # use the first result (presumably the one with the highest score)
        return response['conceptSearchHits'][0]['concept']['id']

    # cannot find corresponding Lexigram ID
    return 0

In [14]:
def cleanText(disease):

    '''
    Normalise an expansion term (cleaning ideas taken from team HPI-DHC).

    The term is lowercased, parenthesised and square-bracketed groups are
    dropped, everything from the first comma on is discarded, a fixed list of
    noise phrases is deleted, punctuation is stripped, runs of spaces are
    collapsed and the result is trimmed.

    Args:
        disease: the raw term as a string
    Returns:
        The cleaned term (possibly an empty string)
    '''

    noise_phrases = ("classification", "international", "no oncology", "subtype", "morphology", " - category", "ca - ")

    text = disease.lower()

    # drop "(...)" groups first, then "[...]" groups
    for bracket_pattern in (r'\(.*?\)', r'\[.*?\]'):
        text = re.sub(bracket_pattern, "", text)

    # keep only what precedes the first comma
    text = text.partition(",")[0]

    # delete the noise phrases, in list order
    for phrase in noise_phrases:
        text = text.replace(phrase, "")

    # strip punctuation, then squeeze repeated spaces into one
    text = re.sub(r"[^\w\s]", "", text)
    text = re.sub(r' +', ' ', text)

    return text.strip()

In [15]:
def find_expansion_terms(disease, conceptGraphId):

    '''
    Collect query-expansion terms for a disease from its Lexigram concept.

    Args:
        disease: the original disease term; it never appears in the result
        conceptGraphId: the Lexigram concept ID of the disease (a string),
            or 0 when no concept was found
    Returns:
        A list with the cleaned preferred label and synonyms of the concept,
        in first-seen order, without duplicates, without empty strings and
        without the original term. Returns [] when conceptGraphId is 0.
    Raises:
        requests.HTTPError: if the API answers with a 4xx/5xx status.
        requests.Timeout: if the API does not answer within 30 seconds.
    '''

    if conceptGraphId == 0:
        return []

    url = "https://api.lexigram.io/v1/lexigraph/concepts/" + conceptGraphId
    r = requests.get(url, headers={'Authorization': apiKey}, timeout=30)
    # Fail with a clear HTTP error rather than a KeyError on the payload.
    r.raise_for_status()
    response = r.json()

    # preferred term first, then the synonyms
    raw_terms = [response['label']] + list(response['synonyms'])
    cleaned_terms = [cleanText(term) for term in raw_terms]

    # De-duplicate BEFORE excluding the original term: cleaning often maps
    # several synonyms onto the same string, and removing only one occurrence
    # would let the original term back into the result.
    unique_terms = dict.fromkeys(cleaned_terms)

    # Terms that cleaning reduced to nothing carry no information either.
    return [term for term in unique_terms if term and term != disease]

In [16]:
def read_acronyms(year, topics_dir="../../data/topics"):

    '''
    Read the acronyms stored in ``topics<year>_acronyms.xml``.

    Args:
        year: the year used to build the file name
        topics_dir: directory that contains the file; the default is the
            relative location this notebook has always read from
    Returns:
        A dict keyed by the 1-based position of each child of the XML root.
        Each value is the list of whitespace-separated tokens found in the
        text of that child's first sub-element, or [] when it has no text.
    '''

    tree = ET.parse("{}/topics{}_acronyms.xml".format(topics_dir, year))

    results = {}
    for count, child in enumerate(tree.getroot(), start=1):
        acronym = child[0].text
        # split() without an argument ignores repeated, leading and trailing
        # whitespace, so no empty-string "acronyms" are produced
        results[count] = acronym.split() if acronym is not None else []
    return results

In [17]:
def load_gene_kb(path):

    '''
    Load a gene knowledge base from a tab-separated file.

    The file must have the columns "Symbol", "Synonyms" and "description";
    any other columns are ignored. Synonyms are expected to be separated
    by "|" inside the "Synonyms" cell.

    Args:
        path: location of the tab-separated file
    Returns:
        {gene: {"Synonyms": [...], "description": ...}} keyed by the "Symbol"
        column. A gene whose Synonyms cell is missing gets an empty list.
        When a symbol occurs more than once the last row wins.
    '''

    gene_df = pd.read_csv(path, sep="\t", index_col=False)
    gene_df = gene_df[["Symbol", "Synonyms", "description"]]

    gene_dic = {}
    # Walk the three columns in lockstep instead of doing one .loc lookup per
    # cell, which is far slower on large tables.
    for symbol, synonyms, description in zip(
        gene_df["Symbol"], gene_df["Synonyms"], gene_df["description"]
    ):
        gene_dic[symbol] = {
            # pandas reads an empty cell as NaN, which has no .split()
            "Synonyms": [] if pd.isna(synonyms) else str(synonyms).split("|"),
            "description": description,
        }

    return gene_dic

In [18]:
def expand_gene(gene, gene_dic):

    '''
    Look up the synonyms of a gene for query expansion.

    Args:
        gene: the gene symbol to expand
        gene_dic: mapping of gene symbol to a dict with a 'Synonyms' entry
    Returns:
        The 'Synonyms' entry stored for the gene, or [] for an unknown gene
    '''

    # unknown gene: nothing to expand with
    if gene not in gene_dic:
        return []

    gene_entry = gene_dic[gene]
    return gene_entry['Synonyms']

In [19]:
def retrieve_gene_full_name(gene, gene_dic):

    '''
    Look up the full name (description) of a gene; may not be used.

    Args:
        gene: the gene symbol to look up
        gene_dic: mapping of gene symbol to a dict with a 'description' entry
    Returns:
        A one-element list holding the gene's 'description' entry, or [] for
        an unknown gene
    '''

    # unknown gene: no full name available
    if gene not in gene_dic:
        return []

    full_name = gene_dic[gene]['description']
    return [full_name]