# Read entity file

In [71]:
import json
from pprint import pprint

# Location of the JSON file that holds every unique entity
file_path = '../resource/all_unique_entities.json'

# Parse the JSON file into a dictionary keyed by entity name
with open(file_path, encoding='utf-8', mode='r') as f:
    all_entities = json.load(f)

# Report how many entities were loaded
print(f"Total number of entities: {len(all_entities)}")

# Show the first two entries, numbered from 1, as a quick sanity check
print("\nFirst 2 entities:")
first_two = list(all_entities.items())[:2]
for i, (key, value) in enumerate(first_two, start=1):
    print(f"{i}.")
    pprint({key: value}, indent=2, width=100)

Total number of entities: 1250

First 2 entities:
1.
{ 'Lungs': { 'label': 'ANAT-DP',
             'normalization': None,
             'reports': [ {'p18/p18004941/s58821758.txt': {'end_ix': 36, 'start_ix': 36}},
                          {'p18/p18003081/s53302126.txt': {'end_ix': 52, 'start_ix': 52}},
                          {'p18/p18001922/s52638004.txt': {'end_ix': 46, 'start_ix': 46}},
                          {'p18/p18001922/s52288833.txt': {'end_ix': 52, 'start_ix': 52}},
                          {'p15/p15165563/s51659523.txt': {'end_ix': 110, 'start_ix': 110}},
                          {'p15/p15004061/s56845046.txt': {'end_ix': 42, 'start_ix': 42}},
                          {'p15/p15078112/s51228277.txt': {'end_ix': 75, 'start_ix': 75}},
                          {'p15/p15078112/s58703686.txt': {'end_ix': 66, 'start_ix': 66}},
                          {'p18/p18026902/s53920289.txt': {'end_ix': 63, 'start_ix': 63}},
                          {'p18/p18026902/s51741672.txt':

## Expected result

```json
            "normalization":{
                                "UMLS" : {
                                    "ui": "C0000005",
                                    "name": "Blood",
                                    "definition": {
                                        "definition": "The fluid that circulates in the vascular system of a living organism.",
                                        "source": "MSH"
                                        }
                                    }
                            }
```

## Normalizing the entity file

### Auxiliary functions

In [68]:
import requests
from pprint import pprint

# Set the API key and base URL.
# The UMLS API key is a credential, so it is read from the UMLS_API_KEY
# environment variable instead of being committed with the notebook.
# An unset variable yields an empty key; requests made with it are expected
# to be rejected by the server rather than crash at import time.
import os

API_KEY = os.environ.get("UMLS_API_KEY", "")
BASE_URL = "https://uts-ws.nlm.nih.gov/rest"


# return json
def search_umls_api(term, version="current", timeout=30):
    """Query the UMLS search endpoint for ``term`` and return the parsed JSON body.

    Args:
        term: Free-text string to look up.
        version: UMLS release segment of the URL; ``"current"`` by default.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang a long batch run.

    Returns:
        The parsed JSON body on HTTP 200. ``None`` when the request fails,
        the status code is not 200, or the body is not valid JSON; a message
        is printed in each of those cases.
    """
    url = f"{BASE_URL}/search/{version}"
    params = {
        "string": term,
        "apiKey": API_KEY
    }

    try:
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported like any other failed
        # query instead of aborting the caller's loop.
        print(f"查询失败: {term}, 错误: {exc}")
        return None

    if response.status_code != 200:
        print(f"查询失败: {term}, 状态码: {response.status_code}")
        return None

    try:
        return response.json()
    except ValueError as exc:
        # A 200 response whose body is not JSON is treated as a failed query.
        print(f"查询失败: {term}, 错误: {exc}")
        return None

import Levenshtein

# TODO for further development
def best_match(entity, results_list, threshold=3):
    """Pick the first search result whose name is close to ``entity``.

    Results are scanned in the order given; the first one whose Levenshtein
    distance to ``entity`` is strictly below ``threshold`` wins.

    Args:
        entity: The entity string that was searched for.
        results_list: Iterable of result dicts, each with 'name', 'ui' and
            'uri' keys. May be empty or ``None``.
        threshold: Exclusive upper bound on the accepted edit distance.

    Returns:
        ``(ui, uri)`` of the accepted result, or
        ``('No results', 'No results')`` when nothing is close enough.
    """
    # `or []` lets an empty or missing result list fall through to the
    # sentinel return instead of raising.
    for result in results_list or []:
        distance = Levenshtein.distance(entity, result['name'])
        if distance < threshold:
            return result['ui'], result['uri']

    return 'No results', 'No results'


# Turn a search response into an (entity, ui, uri) triple
def process_entity_response(entity, response):
    """Reduce a UMLS search response to ``(entity, ui, uri)``.

    The entity string is always echoed back as the first element. The other
    two elements are both 'Query failed' when the response is empty or lacks
    the ``result -> results`` structure, both 'No results' when the result
    list is empty, and otherwise whatever ``best_match`` selects.
    """
    # Guard: the response must exist and carry result -> results
    if not (response and 'result' in response and 'results' in response['result']):
        return entity, 'Query failed', 'Query failed'

    candidates = response['result']['results']

    # Guard: a well-formed response with nothing in it
    if not candidates:
        return entity, 'No results', 'No results'

    ui, uri = best_match(entity, candidates)
    return entity, ui, uri

# Example usage:
# API_KEY and BASE_URL are already defined at the top of this cell; they are
# not repeated here so the credential lives in a single place.

entity = 'Lungs'

response = search_umls_api(entity)
# pprint(response)

term, ui, uri = process_entity_response(entity, response)
print(term, ui, uri)


# 'https://uts-ws.nlm.nih.gov/rest/content/2024AA/CUI/C3825187'
# return json
def umls_api(url, timeout=30):
    """Fetch a UMLS REST resource by URL and return its parsed JSON body.

    Args:
        url: Full resource URL, or one of the placeholder strings
            'No results', 'NONE', 'Query failed' that stand in for a URL
            when there is nothing to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        'No results' when ``url`` is empty or a placeholder; the parsed JSON
        body on HTTP 200; ``None`` when the request fails, the status is not
        200, or the body is not valid JSON (a message is printed).
    """
    # Placeholders are not URLs; handing them to requests would raise.
    if not url or url in ('No results', 'NONE', 'Query failed'):
        return 'No results'

    params = {
        "apiKey": API_KEY
    }

    try:
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        print(f"查询失败: {url}, 错误: {exc}")
        return None

    if response.status_code != 200:
        # Report the URL that was requested; this function has no `term`.
        print(f"查询失败: {url}, 状态码: {response.status_code}")
        return None

    try:
        return response.json()
    except ValueError as exc:
        print(f"查询失败: {url}, 错误: {exc}")
        return None


# function to process response of umls_api() to return (semanticTypes, definition, rootSource)
def search_term_def(response):
    """Extract the semantic type and first definition from a concept response.

    Args:
        response: Return value of ``umls_api`` for a concept URL: a dict on
            success, ``None`` on a failed request, or a placeholder string
            such as 'No results'.

    Returns:
        ``(semantic_type_name, definition_text, root_source)``. All three are
        'No results' when the response carries no usable 'result' payload or
        no definition can be retrieved. If definitions exist but the concept
        lists no semantic types, only the first element is 'No results'.
    """
    no_results = ('No results', 'No results', 'No results')

    # None (failed request) and placeholder strings are not dicts; a dict
    # without 'result' has nothing to extract either.
    if not isinstance(response, dict) or 'result' not in response:
        return no_results

    concept = response['result']
    if not isinstance(concept, dict):
        return no_results

    # Use the first listed semantic type; tolerate a missing or empty list.
    semantic_type_list = concept.get('semanticTypes') or []
    semanticTypes = semantic_type_list[0]['name'] if semantic_type_list else 'No results'

    definitions_url = concept.get('definitions')
    if not definitions_url:
        return no_results

    # The 'definitions' field is fetched as a second resource.
    data = umls_api(definitions_url)

    # Covers the 'No results' placeholder as well as None from a failed request.
    if not isinstance(data, dict):
        return no_results

    definitions = data.get('result')
    if not definitions:
        return no_results

    # choose the first definition
    first_definition = definitions[0]
    return semanticTypes, first_definition['value'], first_definition['rootSource']
# Example usage: fetch the concept found above and show its details

concept_response = umls_api(uri)
pprint(search_term_def(concept_response))


Lungs C0024109 https://uts-ws.nlm.nih.gov/rest/content/2024AA/CUI/C0024109
('Body Part, Organ, or Organ Component',
 'Lobular organ the parenchyma of which consists of air-filled alveoli which '
 'communicate with the tracheobronchial tree. Examples: There are only two '
 'instances, right lung and left lung.',
 'UWDA')


# Out-of-the-box Functions

In [69]:
# return dic of result
def nomralize_entity_with_umls(entity):
    """Look up ``entity`` in UMLS and return its normalization dictionary.

    The function only builds and returns the dictionary; it does not modify
    any entity record. Callers decide whether to store the result, typically
    by checking ``result['UMLS']['ui']``.

    Args:
        entity: Entity string to search for.

    Returns:
        ``{'UMLS': {'ui', 'name', 'semanticTypes', 'definition': {'definition',
        'source'}}}``. 'name' is the queried entity string as echoed back by
        ``process_entity_response``. 'ui' is 'No results' or 'Query failed'
        when the search found nothing usable; the remaining fields are then
        'No results'.
    """
    # Search UMLS with entity
    term, ui, uri = process_entity_response(entity, search_umls_api(entity))

    if ui in ('No results', 'Query failed'):
        # There is no concept URL to follow, so skip the second lookup.
        semanticTypes = definition = source = 'No results'
    else:
        # Process term definition
        semanticTypes, definition, source = search_term_def(umls_api(uri))

    # Create normalization dictionary
    normalization = {
        "UMLS": {
            'ui': ui,
            'name': term,
            'semanticTypes': semanticTypes,
            'definition': {
                'definition': definition,
                'source': source
            }
        }
    }

    return normalization

# Demo: normalize a single entity name and display the resulting dictionary
result = nomralize_entity_with_umls(entity="Lungs")
pprint(result)

{'UMLS': {'definition': {'definition': 'Lobular organ the parenchyma of which '
                                       'consists of air-filled alveoli which '
                                       'communicate with the tracheobronchial '
                                       'tree. Examples: There are only two '
                                       'instances, right lung and left lung.',
                         'source': 'UWDA'},
          'name': 'Lungs',
          'semanticTypes': 'Body Part, Organ, or Organ Component',
          'ui': 'C0024109'}}


# Main Program

In [70]:
import json
import pprint

# Path of the entity file to normalize
file_path = '../resource/all_unique_entities.json'

# Load the entity dictionary from the JSON file
with open(file_path, encoding='utf-8', mode='r') as f:
    all_entities = json.load(f)

import json
import pprint
import time

# 'ui' values that mean the lookup produced no usable UMLS concept
FAILED_LOOKUP_UIS = ('No results', 'Query failed')
# Pause between entities, in seconds (presumably to go easy on the API)
REQUEST_DELAY_SECONDS = 0.1
# Destination of the normalized entity dictionary
output_path = '../resource/all_unique_entities_normalized.json'

# Iterate over all items in the top-level dictionary
for entity, entity_value in all_entities.items():
    # Keep a copy so a failed lookup leaves the entry exactly as loaded,
    # whatever happened to it during the lookup call.
    snapshot = dict(entity_value)

    normalization = nomralize_entity_with_umls(entity)

    if normalization['UMLS']['ui'] in FAILED_LOOKUP_UIS:
        # Nothing found or the query failed: restore the original entry.
        entity_value.clear()
        entity_value.update(snapshot)
    else:
        # set value['normalization']
        entity_value['normalization'] = normalization

    time.sleep(REQUEST_DELAY_SECONDS)

# Save the normalized data
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(all_entities, f, indent=2)

print("数据已保存到 '../resource/all_unique_entities_normalized.json 文件中。")

数据已保存到 '../resource/all_unique_entities_normalized.json 文件中。
