# Read file all_unique_entities_normalized.json

In [2]:
import json
from pprint import pprint

# Location of the normalized entity dump, relative to this notebook
file_path = '../all_unique_entities_normalized.json'

# Load the whole JSON document into memory
with open(file_path, 'r', encoding='utf-8') as f:
    all_entities = json.load(f)

# Report how many entities were loaded
print(f"Number of entities: {len(all_entities)}")

# Pretty-print the first 5 entities, one single-key dict per entity
print("First 5 items:")
for key, value in list(all_entities.items())[:5]:
    pprint({key: value}, width=100)



Number of entities: 1250
First 5 items:
{'Lungs': {'label': 'ANAT-DP',
           'normalization': {'UMLS': {'definition': {'definition': 'Lobular organ the parenchyma '
                                                                   'of which consists of '
                                                                   'air-filled alveoli which '
                                                                   'communicate with the '
                                                                   'tracheobronchial tree. '
                                                                   'Examples: There are only two '
                                                                   'instances, right lung and left '
                                                                   'lung.',
                                                     'source': 'UWDA'},
                                      'name': 'Lungs',
                                      'semanticTypes': 'B

# Simple Analysis

In [25]:
from collections import Counter

# Count how many entities carry each label
label_counts = Counter(info['label'] for info in all_entities.values())

# Total number of entities
total_entities = len(all_entities)

# Entity that is referenced by the largest number of reports
most_common_entity = max(all_entities.items(), key=lambda item: len(item[1]['reports']))

# Mean number of reports per entity (computed but not printed by this cell)
avg_reports = sum(map(lambda info: len(info['reports']), all_entities.values())) / total_entities

print(f"总entity数量: {total_entities}")
print("\n每个label的entity数量:")
for label, count in label_counts.items():
    print(f"{label}: {count}")

print(f"\n出现次数最多的entity: '{most_common_entity[0]}' (出现在 {len(most_common_entity[1]['reports'])} 个报告中)")

# Split the entities into those with and those without a normalization
with_norm = sum(info['normalization'] is not None for info in all_entities.values())
without_norm = total_entities - with_norm

print(f"\n有normalization的entity数量: {with_norm}")
print(f"没有normalization的entity数量: {without_norm}")


总entity数量: 1250

每个label的entity数量:
ANAT-DP: 363
OBS-DP: 712
OBS-DA: 93
OBS-U: 82

出现次数最多的entity: 'pleural' (出现在 424 个报告中)

有normalization的entity数量: 0
没有normalization的entity数量: 1250


# Read file all_unique_entities.json

In [27]:
import json
from pprint import pprint

# Path to the normalized entity dump
file_path = '../resource/all_unique_entities_normalized.json'

# Load the JSON document
with open(file_path, 'r', encoding='utf-8') as f:
    all_entities = json.load(f)

# Report how many entities were loaded
print(f"Number of entities: {len(all_entities)}")


from collections import Counter

# Count how many entities carry each label
label_counts = Counter(info['label'] for info in all_entities.values())

# Total number of entities
total_entities = len(all_entities)

# Entity that is referenced by the largest number of reports (not printed here)
most_common_entity = max(all_entities.items(), key=lambda item: len(item[1]['reports']))

# Mean number of reports per entity (not printed here)
avg_reports = sum(map(lambda info: len(info['reports']), all_entities.values())) / total_entities

print(f"总entity数量: {total_entities}")
print("\n每个label的entity数量:")
for label, count in label_counts.items():
    print(f"{label}: {count}")


# Overall split: entities with vs. without a normalization
with_norm = sum(info['normalization'] is not None for info in all_entities.values())
without_norm = total_entities - with_norm

print(f"\n有normalization的entity数量: {with_norm}")
print(f"没有normalization的entity数量: {without_norm}")
# Share of entities that have a normalization
print(f"\n有normalization的entity占比: {with_norm / total_entities:.2%}")


# Same split, broken down per label; each entity lands in exactly one of the two counters
label_counts_with_norm = Counter()
label_counts_without_norm = Counter()
for info in all_entities.values():
    if info['normalization'] is None:
        target = label_counts_without_norm
    else:
        target = label_counts_with_norm
    target[info['label']] += 1

print("\n每个label的有normalization的entity数量:")
for label, count in label_counts_with_norm.items():
    print(f"{label}: {count}")
    # Share of this label's entities that have a normalization
    print(f"{label}的有normalization的entity占比: {count / label_counts[label]:.2%}")


print("\n每个label的没有normalization的entity数量:")
for label, count in label_counts_without_norm.items():
    print(f"{label}: {count}")
    


Number of entities: 1250
总entity数量: 1250

每个label的entity数量:
ANAT-DP: 363
OBS-DP: 712
OBS-DA: 93
OBS-U: 82

有normalization的entity数量: 498
没有normalization的entity数量: 752

有normalization的entity占比: 39.84%

每个label的有normalization的entity数量:
ANAT-DP: 152
ANAT-DP的有normalization的entity占比: 41.87%
OBS-DP: 268
OBS-DP的有normalization的entity占比: 37.64%
OBS-DA: 41
OBS-DA的有normalization的entity占比: 44.09%
OBS-U: 37
OBS-U的有normalization的entity占比: 45.12%

每个label的没有normalization的entity数量:
ANAT-DP: 211
OBS-DP: 444
OBS-DA: 52
OBS-U: 45


In [None]:
import json
from pprint import pprint

# Location of the un-normalized entity dump
file_path = '../resource/all_unique_entities.json'

# Load the whole JSON document into memory
with open(file_path, 'r', encoding='utf-8') as f:
    all_entities = json.load(f)

# Report how many entities were loaded
print(f"Number of entities: {len(all_entities)}")

# Pretty-print the first 5 entities, one single-key dict per entity
print("First 5 items:")
for key, value in list(all_entities.items())[:5]:
    pprint({key: value}, width=100)



# First normalization

In [20]:
import json
from pprint import pprint

file_path = '../resource/all_unique_entities_normalized.json'

# Read the JSON file
with open(file_path, 'r', encoding='utf-8') as f:
    all_entities = json.load(f)

# Print total number of entities
print(f"Total number of entities: {len(all_entities)}")


# Count the entities whose normalization is present (not JSON null)
count_with_normalization = sum(1 for data in all_entities.values() if data['normalization'] is not None)
print(f"Number of items with normalization: {count_with_normalization}")

# Count the entities whose normalization is JSON null
count_null_normalization = sum(1 for data in all_entities.values() if data['normalization'] is None)
print(f"Number of items with normalization: null: {count_null_normalization}")


def _umls_definition_text(entity_data):
    """Return normalization -> UMLS -> definition -> definition, or None.

    Every level may be absent or JSON null; `or {}` lets the lookup fall
    through to None instead of raising on a missing/None intermediate value.
    """
    normalization = entity_data.get('normalization') or {}
    umls = normalization.get('UMLS') or {}
    definition = umls.get('definition') or {}
    return definition.get('definition')


# Count the entities whose UMLS definition text is the literal placeholder "No results".
# The label below names the field that is actually compared (the definition text, not `ui`).
count_no_results = sum(1 for data in all_entities.values() if _umls_definition_text(data) == "No results")
print(f"\nNumber of items with normalization.UMLS.definition.definition == 'No results': {count_no_results}")
# NOTE(review): the check above only looks at the definition text; whether that implies
# "no candidate in UMLS" is not established by this cell — confirm against the data.
print('This means there is even no candidate in UMLS for this entity')


Total number of entities: 1250
Number of items with normalization: 498
Number of items with normalization: null: 752

Number of items with normalization.umls.ui == 'No results': 120
This means there is even no candidate in UMLS for this entity


## Semantic Types Analysis

In [20]:
# Distinct `semanticTypes` values among the entities that have a UMLS normalization
semantic_types = {
    info['normalization']['UMLS']['semanticTypes']
    for info in all_entities.values()
    if info.get('normalization') and info['normalization'].get('UMLS')
}

num_semantic_types = len(semantic_types)

print(f"Number of different unique kinds of items with normalization.umls.semanticTypes: {num_semantic_types}")

print("Unique kinds of items with normalization.umls.semanticTypes:")
print(semantic_types)

Number of different unique kinds of items with normalization.umls.semanticTypes: 38
Unique kinds of items with normalization.umls.semanticTypes:
{'Therapeutic or Preventive Procedure', 'Qualitative Concept', 'Chemical Viewed Functionally', 'Functional Concept', 'Body Substance', 'Finding', 'Pathologic Function', 'Cell Component', 'Anatomical Abnormality', 'Acquired Abnormality', 'Injury or Poisoning', 'Tissue', 'Body Space or Junction', 'Body Location or Region', 'Activity', 'Idea or Concept', 'Pharmacologic Substance', 'Temporal Concept', 'Intellectual Product', 'Substance', 'Body Part, Organ, or Organ Component', 'Quantitative Concept', 'Natural Phenomenon or Process', 'Manufactured Object', 'Medical Device', 'Disease or Syndrome', 'Phenomenon or Process', 'Patient or Disabled Group', 'Spatial Concept', 'Mental or Behavioral Dysfunction', 'Hazardous or Poisonous Substance', 'Social Behavior', 'Conceptual Entity', 'No results', 'Research Activity', 'Chemical Viewed Structurally', 'Ana

# Entity with no normalizations

In [6]:
import json
from pprint import pprint

file_path = '../resource/all_unique_entities_normalized.json'

# Load the normalized entity dump
with open(file_path, 'r', encoding='utf-8') as f:
    all_entities = json.load(f)

# Report how many entities were loaded
print(f"Total number of entities: {len(all_entities)}")

# Names of all entities whose normalization is JSON null, in file order
entities_null_normalization = []
for entity_name, entity_info in all_entities.items():
    if entity_info['normalization'] is None:
        entities_null_normalization.append(entity_name)

# Report how many such entities there are
print(f"Number of entities with normalization: null: {len(entities_null_normalization)}")

Total number of entities: 1250
Number of entities with normalization: null: 752


In [7]:
# Pretty-print the full list of entity names collected in `entities_null_normalization`
pprint(entities_null_normalization)

['cardiomediastinal',
 'silhouettes',
 'surfaces',
 'approximately a 4.6 cm',
 'carina',
 'distal',
 'advanced by at least 11 cm',
 'contours',
 'grossly',
 'unremarkable',
 'Pulmonary',
 'apex',
 'slightly',
 "patient 's chin",
 'displaced',
 'osseous',
 'Standard',
 'positioning',
 'mid',
 'portion',
 'body',
 'The side - port',
 'junction',
 'Otherwise',
 'change',
 'stable',
 'ET',
 '4.3 cm',
 'NG',
 'malpositioned',
 'position',
 'enteric',
 'approximately 1.8 cm',
 'barely',
 'side ports',
 'approximately 12 cm',
 'approximately 2 cm',
 'overt',
 'CHF',
 'silhouette',
 'Limited',
 'This',
 'cardiac',
 'mildly',
 'unchanged',
 'bronchovascular',
 'pulmonary',
 'patchy',
 'bibasilar',
 'opacities',
 'abnormalities',
 'Patchy',
 'parenchymal',
 'opacity',
 'frontal',
 'radiograph',
 'scoliosis',
 'asymmetry',
 'ribcage',
 'Mildly',
 'underinflated',
 'largely',
 'Osseous',
 'The lateral view is limited',
 "patient 's arms",
 'lower',
 'They',
 'conspicuous',
 'recent exam from ___',

# CSV

In [23]:
import csv

# Destination of the flattened entity table
csv_file_path = '../output.csv'

# One row per entity; the UMLS columns stay None when the entity has no UMLS normalization
data = []

for key, value in all_entities.items():
    # `or {}` covers both a missing key and an explicit JSON null, so the
    # lookups below never call `.get` on None.
    umls = (value.get('normalization') or {}).get('UMLS') or {}

    data.append({
        'name': key,
        'ui': umls.get('ui'),
        'normalized_name': umls.get('name'),
        'semanticTypes': umls.get('semanticTypes'),
        # `definition` is a nested object; a null value here previously raised AttributeError
        'definition': (umls.get('definition') or {}).get('definition'),
    })


# Write the rows to the CSV file (newline='' as required by the csv module)
with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['name', 'ui', 'normalized_name', 'semanticTypes', 'definition']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(data)

print(f"Data has been written to {csv_file_path}")

Data has been written to ../output.csv


In [5]:
# import csv

# # Specify the file path
# csv_file_path = '../gpt_normalization.csv'



# import json
# from pprint import pprint

# file_path = '../resource/all_unique_entities_normalized.json'

# # Read the JSON file
# with open(file_path, 'r', encoding='utf-8') as f:
#     all_entities = json.load(f)

# # get list of all entities with normalization: null
# entities_null_normalization = [entity for entity, data in all_entities.items() if data['normalization'] is None]


# data = []

# import sys
# import os
# from pprint import pprint

# # Add the parent directory to the Python path
# parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
# sys.path.append(parent_dir)

# from utils.umls_api import normalize_entity 
# from utils.umls_api import use_umls_api_term

# for entity in entities_null_normalization:
    
#     response = use_umls_api_term(entity)
#     result_list = response['result']['results'][:5]

#     ui, name = normalize_entity(entity, result_list)

#     data.append({
#         'name': entity,
#         'ui': ui,
#         'normalized_name': name,
#     })



# # Write the data to the CSV file
# with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
#     fieldnames = ['name', 'ui', 'normalized_name', 'semanticTypes', 'definition']
#     writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
#     writer.writeheader()
#     writer.writerows(data)

# print(f"Data has been written to {csv_file_path}")

Data has been written to ../gpt_normalization.csv


In [None]:
import csv

# CSV holding the GPT-assisted normalization results
csv_file_path = '../gpt_normalization.csv'

# Load every row as a dict keyed by the CSV header
data = []
with open(csv_file_path, 'r', encoding='utf-8') as csvfile:
    data.extend(csv.DictReader(csvfile))


# Rows whose `ui` is anything other than the literal 'unnormalizable'
normalizable = list(filter(lambda entry: entry['ui'] != 'unnormalizable', data))

count_normalizable = len(normalizable)
print(f"Number of entries with 'normalizable' ui: {count_normalizable}")


for entry in normalizable:
    print(entry)



# Analysis after using the normalization with GPT

In [22]:
import json
from pprint import pprint

file_path = '../resource/all_unique_entities_normalized.json'

# Load the normalized entity dump
with open(file_path, 'r', encoding='utf-8') as f:
    all_entities = json.load(f)

# Report how many entities were loaded
print(f"Total number of entities: {len(all_entities)}")
# Names of all entities whose normalization is JSON null
entities_null_normalization = []
for entity_name, entity_info in all_entities.items():
    if entity_info['normalization'] is None:
        entities_null_normalization.append(entity_name)
print(f"Number of entities with normalization: null: {len(entities_null_normalization)}")


import csv

# CSV holding the GPT-assisted normalization results
csv_file_path = '../gpt_normalization.csv'

# Load every row as a dict keyed by the CSV header
data = []
with open(csv_file_path, 'r', encoding='utf-8') as csvfile:
    data.extend(csv.DictReader(csvfile))


# Number of rows whose `ui` is anything other than the literal 'unnormalizable'
count_normalizable = len([entry for entry in data if entry['ui'] != 'unnormalizable'])
print(f"Number of entries with 'normalizable' ui: {count_normalizable}")


# Entities that remain un-normalized: null-normalization entities minus the normalizable rows
print(f"we still have {len(entities_null_normalization) - count_normalizable} entities that are not normalizable")

Total number of entities: 1250
Number of entities with normalization: null: 752
Number of entries with 'normalizable' ui: 390
we still have 362 entities that are not normalizable


In [33]:
import time

# Imported here so the cell also runs on a fresh kernel (it uses csv below)
import csv

# Input: GPT-assisted normalization results
csv_file_path = '../resource/gpt_normalization.csv'

# Load every row as a dict keyed by the CSV header
data = []
with open(csv_file_path, 'r', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        data.append(row)

# NOTE(review): `utils` is a project package; presumably the project root must be on
# sys.path for these imports to resolve — confirm how the notebook is launched.
from utils.umls_api import umls_id2semantic
from utils.umls_api import umls_id2definition

# Keep only rows that have a usable identifier and enrich them in place
result_list = []
for entry in data:
    if entry['ui'] == 'unnormalizable':
        # No lookup is made for these rows, so there is nothing to throttle
        continue

    entry['semanticTypes'] = umls_id2semantic(entry['ui'])
    # The second returned value (the definition's source) is not written to the CSV
    entry['definition'], _source = umls_id2definition(entry['ui'])
    result_list.append(entry)

    # Pause 0.1 s between entries that triggered lookups
    time.sleep(0.1)

csv_file_path = '../resource/output_2.csv'

# Write the enriched rows to the CSV file
with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['name', 'ui', 'normalized_name', 'semanticTypes', 'definition']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(result_list)

print(f"Data has been written to {csv_file_path}")

Query failed: C0277909, Status code: 404
Query failed: C0750482, Status code: 404
Query failed: C0012727, Status code: 404
Query failed: C1268086, Status code: 404
Query failed: C4319952, Status code: 404
Query failed: C1548447, Status code: 404
Query failed: C2326513, Status code: 404
Query failed: C2315107, Status code: 404
Query failed: C1265876, Status code: 404
Query failed: C1265876, Status code: 404
Query failed: C1320706, Status code: 404
Query failed: C4740697, Status code: 404
Query failed: C2607943, Status code: 404
Query failed: C0442808, Status code: 404
Query failed: C0205277, Status code: 404
Query failed: C2215609, Status code: 404
Query failed: C0442816, Status code: 404
Query failed: C0232118, Status code: 404
Query failed: C1283169, Status code: 404
Query failed: C2711450, Status code: 404
Query failed: C1269845, Status code: 404
Query failed: C1186983, Status code: 404
Query failed: C1527361, Status code: 404
Query failed: C0549186, Status code: 404
Query failed: C0

In [None]:
# Imported here so the cell also runs on a fresh kernel (it uses csv below)
import csv

csv_file_path = '../resource/output_2.csv'

# Read the data from the CSV file
data = []
with open(csv_file_path, 'r', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        data.append(row)


for entry in data:
    raw_semantic_types = entry['semanticTypes']
    print(raw_semantic_types)

    # Strip any single/double quote characters from both ends of the value
    # (only quotes are removed; brackets, if any, are left untouched)
    entry['semanticTypes'] = raw_semantic_types.strip("'\"")

    print(entry['semanticTypes'])


csv_file_path2 = '../resource/output_4.csv'

# Write the cleaned rows to the separate output file. The original cell opened
# `csv_file_path` here, which overwrote the input and left `csv_file_path2` unused.
with open(csv_file_path2, 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['name', 'ui', 'normalized_name', 'semanticTypes', 'definition']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(data)

print(f"Data has been written to {csv_file_path2}")


In [62]:
import csv
csv_file_path = '../resource/output_2.csv'

def clean_csv(input_file, output_file):
    """Copy a CSV file, replacing a last-column value of "None, None)" with "None".

    The header row is copied unchanged. For every non-empty data row the last
    column is printed before and after the check, then the row is written out.
    Blank lines (which csv.reader yields as empty lists) are copied through
    as-is, and an input without any rows produces an empty output file.

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path of the CSV file to write (overwritten if it exists).
    """
    with open(input_file, 'r', newline='', encoding='utf-8') as infile, \
         open(output_file, 'w', newline='', encoding='utf-8') as outfile:
        reader = csv.reader(infile)
        writer = csv.writer(outfile)

        # Copy the header; a default avoids StopIteration on an empty input file
        header = next(reader, None)
        if header is None:
            return
        writer.writerow(header)

        # Process each row
        for row in reader:
            if not row:
                # Blank line: there is no last column to inspect
                writer.writerow(row)
                continue

            print(row[-1])
            # Collapse the exact artifact string into a plain 'None'
            if row[-1] == "None, None)":
                row[-1] = 'None'
            print(row[-1])

            writer.writerow(row)

# Run the cleaner on `csv_file_path` and store the cleaned copy as output_4.csv
csv_file_path2 = '../resource/output_4.csv'
clean_csv(input_file=csv_file_path, output_file=csv_file_path2)
print("CSV file has been processed and saved.")

A ridge or ridge-like structure. In humans it usually refers to the trachea. The carina of trachea is part of the lowest tracheal cartilage which is placed between the orifices of the two bronchi.
A ridge or ridge-like structure. In humans it usually refers to the trachea. The carina of trachea is part of the lowest tracheal cartilage which is placed between the orifices of the two bronchi.
Localized away from the central point of the body. [https://orcid.org/0000-0002-0736-9199]
Localized away from the central point of the body. [https://orcid.org/0000-0002-0736-9199]
The outline of a part; the surface configuration.
The outline of a part; the surface configuration.
In pathology, a term that is used to describe a tissue specimen that has a normal appearance.
In pathology, a term that is used to describe a tissue specimen that has a normal appearance.
Lobular organ the parenchyma of which consists of air-filled alveoli which communicate with the tracheobronchial tree. Examples: There a

In [2]:
import csv
csv_file_path = '../resource/output_4.csv'

# Rows whose values should overwrite matching rows of output.csv
data = []
with open(csv_file_path, 'r', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        data.append(row)


csv_file_path = '../resource/output.csv'
data0 = []
# Rows of the table that is updated and rewritten in place
with open(csv_file_path, 'r', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        data0.append(row)

# Index the update rows by name so each lookup is O(1) instead of a full scan.
# If a name occurs more than once in `data`, the last occurrence wins — the same
# final values the previous nested loop produced.
updates_by_name = {update['name']: update for update in data}

# For every row of data0 that has a same-named row in data, take over its fields
for entity in data0:
    update = updates_by_name.get(entity['name'])
    if update is None:
        continue
    entity['ui'] = update['ui']
    entity['normalized_name'] = update['normalized_name']
    entity['semanticTypes'] = update['semanticTypes']
    entity['definition'] = update['definition']
    # Show the row after replacement
    print(entity)

# Write data0 back to the CSV file it was read from
with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['name', 'ui', 'normalized_name', 'semanticTypes', 'definition']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(data0)


{'name': 'Lungs', 'ui': 'C0024109', 'normalized_name': 'Lungs', 'semanticTypes': 'Body Part, Organ, or Organ Component', 'definition': 'Lobular organ the parenchyma of which consists of air-filled alveoli which communicate with the tracheobronchial tree. Examples: There are only two instances, right lung and left lung.'}
{'name': 'clear', 'ui': 'C2963144', 'normalized_name': 'clear', 'semanticTypes': 'No results', 'definition': 'No results'}
{'name': 'Normal', 'ui': 'C0205307', 'normalized_name': 'Normal', 'semanticTypes': 'Qualitative Concept', 'definition': 'In pathology, a term that is used to describe a tissue specimen that has a normal appearance.'}
{'name': 'cardiomediastinal', 'ui': '', 'normalized_name': '', 'semanticTypes': '', 'definition': ''}
{'name': 'hilar', 'ui': 'C0205150', 'normalized_name': 'hilar', 'semanticTypes': 'Spatial Concept', 'definition': 'Refers to the area associated with the hilum.'}
{'name': 'silhouettes', 'ui': '', 'normalized_name': '', 'semanticTypes'