In [1]:
import pandas as pd
import json
from rapidfuzz import fuzz, process
import urllib.request

In [2]:
def get_json(url):
    """Fetch ``url`` with the BioPortal API key and return the decoded JSON body.

    The module-level ``apikey`` is looked up when the function is called, so
    the cell that loads the key must have been run before the first request.

    Parameters
    ----------
    url : str
        Fully formed request URL (query string already encoded).

    Returns
    -------
    The object produced by ``json.loads`` on the response body.

    Raises
    ------
    urllib.error.URLError / urllib.error.HTTPError
        Propagated unchanged from ``opener.open`` on transport or HTTP errors.
    """
    opener = urllib.request.build_opener()
    # Strip the key defensively: a stray newline read from the key file would
    # otherwise end up inside the header value.
    opener.addheaders = [('Authorization', 'apikey token=' + apikey.strip())]
    # Close the response deterministically instead of leaving it to the GC.
    with opener.open(url) as response:
        return json.loads(response.read())

In [3]:
def get_concept(term):
    """Search BioPortal for ``term`` and rank the returned labels by similarity.

    Relies on names defined in other cells: ``get_json`` (HTTP helper),
    ``parameters`` (extra query-string suffix) and ``target`` (vocabulary
    acronym used as the column prefix).

    Parameters
    ----------
    term : str
        Free-text source term to look up.

    Returns
    -------
    pandas.DataFrame
        Columns ``source_term``, ``<target>_concept_name``,
        ``<target>_concept_code`` and ``<target>_match_score``. Holds up to
        ten candidates, or a single row when the best score exceeds 99.
        The frame is empty (same columns) when the search returns no labels.
    """
    # Local import keeps this cell self-contained without touching the imports cell.
    from urllib.parse import quote

    print(term)
    # NOTE(review): the API key travels over plain HTTP here; confirm whether
    # the service should be called via https instead.
    REST_URL = "http://data.bioontology.org"

    prefix = '{}_'.format(target)
    score_col = prefix + 'match_score'
    columns = ['source_term', prefix + 'concept_name', prefix + 'concept_code', score_col]

    # Search NCBO BioPortal. Percent-encode the whole term so characters such
    # as '&', '#', '+' or non-ASCII letters cannot alter the query string.
    query = quote(term, safe='')
    res = get_json(REST_URL + "/search?q=" + query + parameters)["collection"]

    # Map every label (preferred label first, then synonyms) to its concept code.
    syn = {}
    for r in res:
        concept_id = r.get('@id')
        if not concept_id:
            continue  # nothing to derive a concept code from
        code = concept_id.split('/')[-1]  # last path segment is the concept code
        synonyms = r.get('synonym') or []
        if isinstance(synonyms, str):
            synonyms = [synonyms]  # avoid iterating a lone string character by character
        for label in [r.get('prefLabel'), *synonyms]:
            if label:  # skip missing/empty labels instead of inserting a None key
                syn[label] = code

    if not syn:
        # No candidates: return an empty frame with the usual columns rather
        # than failing later on missing columns.
        return pd.DataFrame(columns=columns)

    # Checks similarity of each returned concept against the user-entered term
    matches = process.extract(term, list(syn), scorer=fuzz.token_sort_ratio, limit=10)

    # Each match starts with (label, score, ...); build the result rows directly.
    records = [(term, m[0], syn[m[0]], m[1]) for m in matches]
    df = pd.DataFrame(records, columns=columns)

    if df[score_col].max() > 99:
        # Returns one result if there is a complete match
        df = df.nlargest(1, score_col)

    return df

In [4]:
def get_matches(term):
    """Print a progress message, then return the ``get_concept`` result for ``term``."""
    print('Finding matches...')
    return get_concept(term)

In [5]:
# Load the BioPortal API key used for the Authorization header.
# Strip surrounding whitespace: a trailing newline in the key file would
# otherwise become part of the header value.
with open('bioportal_apikey.txt', 'r') as f:
    apikey = f.read().strip()

In [6]:
# Vocabulary acronym to search, e.g. "snomedct". Normalised to upper case, and
# stripped so stray whitespace typed at the prompt cannot end up inside the
# query string or the result column names.
target = input('Enter target vocabulary: ').strip().upper()
# Query-string suffix appended to every search: restrict the search to the
# chosen ontology and request only the preferred label and synonyms.
parameters = "&ontologies={}".format(target) + "&include=prefLabel,synonym"

Enter target vocabulary: snomedct


In [7]:
# Example lookup; in the recorded output below this collapsed to a single row with a match score of 100.
get_matches('hypertension')

Finding matches...
hypertension


Unnamed: 0,source_term,SNOMEDCT_concept_name,SNOMEDCT_concept_code,SNOMEDCT_match_score
0,hypertension,Hypertension,38341003,100.0


In [8]:
# Example lookup without a complete match; the recorded output below lists ten ranked candidates.
get_matches('long qt interval')

Finding matches...
long qt interval


Unnamed: 0,source_term,SNOMEDCT_concept_name,SNOMEDCT_concept_code,SNOMEDCT_match_score
0,long qt interval,Prolonged QT interval,111975006,86.486486
1,long qt interval,QT interval,81435004,81.481481
2,long qt interval,Prolonged TU interval,251214008,81.081081
3,long qt interval,Sloping PR interval,251255006,74.285714
4,long qt interval,ST interval,40602005,74.074074
5,long qt interval,TU interval,251213002,74.074074
6,long qt interval,Prolonged P-R interval,164947007,73.684211
7,long qt interval,QRS interval,39632005,71.428571
8,long qt interval,Prolonged QT interval (finding),111975006,71.111111
9,long qt interval,Shortened ST interval,69318006,70.27027
