# Read file all_unique_entities_normalized.json

In [2]:
import json
from pprint import pprint

# Location of the entity dictionary to inspect.
file_path = '../resource/all_unique_entities_normalized.json'

# Load the whole JSON document into memory.
with open(file_path, 'r', encoding='utf-8') as f:
    all_entities = json.load(f)

# Report how many top-level entries were loaded.
print(f"Number of entities: {len(all_entities)}")

# Pretty-print the first 5 entries (in file order) as a quick preview.
print("First 5 items:")
for key in list(all_entities)[:5]:
    pprint({key: all_entities[key]}, width=100)


Number of entities: 1250
First 5 items:
{'Lungs': {'label': 'ANAT-DP',
           'normalization': {'UMLS': {'definition': {'definition': 'Lobular organ the parenchyma '
                                                                   'of which consists of '
                                                                   'air-filled alveoli which '
                                                                   'communicate with the '
                                                                   'tracheobronchial tree. '
                                                                   'Examples: There are only two '
                                                                   'instances, right lung and left '
                                                                   'lung.',
                                                     'source': 'UWDA'},
                                      'name': 'Lungs',
                                      'semanticTypes': 'B

# Simple Analysis

In [25]:
from collections import Counter

# Number of entities per label.
label_counts = Counter(data['label'] for data in all_entities.values())

# Total number of entities.
total_entities = len(all_entities)

# Entity that is referenced by the largest number of reports.
most_common_entity = max(all_entities.items(), key=lambda item: len(item[1]['reports']))
most_common_name, most_common_data = most_common_entity

# Average number of reports per entity (computed here, not printed by this cell).
report_total = sum(len(data['reports']) for data in all_entities.values())
avg_reports = report_total / total_entities

print(f"总entity数量: {total_entities}")
print("\n每个label的entity数量:")
for label, count in label_counts.items():
    print(f"{label}: {count}")

print(f"\n出现次数最多的entity: '{most_common_name}' (出现在 {len(most_common_data['reports'])} 个报告中)")

# Entities with vs. without a normalization entry (null means "not normalized").
with_norm = sum(data['normalization'] is not None for data in all_entities.values())
without_norm = total_entities - with_norm

print(f"\n有normalization的entity数量: {with_norm}")
print(f"没有normalization的entity数量: {without_norm}")


总entity数量: 1250

每个label的entity数量:
ANAT-DP: 363
OBS-DP: 712
OBS-DA: 93
OBS-U: 82

出现次数最多的entity: 'pleural' (出现在 424 个报告中)

有normalization的entity数量: 0
没有normalization的entity数量: 1250


# Read file all_unique_entities_normalized.json

In [1]:
import json
from collections import Counter
from pprint import pprint

# Location of the normalized entity dictionary.
file_path = '../resource/all_unique_entities_normalized.json'

# Load the JSON document.
with open(file_path, 'r', encoding='utf-8') as f:
    all_entities = json.load(f)

# Report how many entities were loaded.
print(f"Number of entities: {len(all_entities)}")

# Number of entities per label.
label_counts = Counter(data['label'] for data in all_entities.values())

# Total number of entities.
total_entities = len(all_entities)

# Entity that is referenced by the largest number of reports.
most_common_entity = max(all_entities.items(), key=lambda item: len(item[1]['reports']))

# Average number of reports per entity (computed here, not printed by this cell).
report_total = sum(len(data['reports']) for data in all_entities.values())
avg_reports = report_total / total_entities

print("\n每个label的entity数量:")
for label, count in label_counts.items():
    print(f"{label}: {count}")

# Overall split: entities with vs. without a normalization entry.
with_norm = sum(data['normalization'] is not None for data in all_entities.values())
without_norm = total_entities - with_norm

print(f"\n有normalization的entity数量: {with_norm}")
print(f"没有normalization的entity数量: {without_norm}")
# Share of entities that have a normalization.
print(f"\n有normalization的entity占比: {with_norm / total_entities:.2%}")

# Same split, broken down by label.
label_counts_with_norm = Counter(
    data['label'] for data in all_entities.values() if data['normalization'] is not None
)
label_counts_without_norm = Counter(
    data['label'] for data in all_entities.values() if data['normalization'] is None
)

print("\n每个label的有normalization的entity数量:")
for label, count in label_counts_with_norm.items():
    print(f"{label}: {count}")
    # Share of this label's entities that have a normalization.
    print(f"{label}的有normalization的entity占比: {count / label_counts[label]:.2%}")


print("\n每个label的没有normalization的entity数量:")
for label, count in label_counts_without_norm.items():
    print(f"{label}: {count}")
    


Number of entities: 1250
总entity数量: 1250

每个label的entity数量:
ANAT-DP: 363
OBS-DP: 712
OBS-DA: 93
OBS-U: 82

有normalization的entity数量: 498
没有normalization的entity数量: 752

有normalization的entity占比: 39.84%

每个label的有normalization的entity数量:
ANAT-DP: 152
ANAT-DP的有normalization的entity占比: 41.87%
OBS-DP: 268
OBS-DP的有normalization的entity占比: 37.64%
OBS-DA: 41
OBS-DA的有normalization的entity占比: 44.09%
OBS-U: 37
OBS-U的有normalization的entity占比: 45.12%

每个label的没有normalization的entity数量:
ANAT-DP: 211
OBS-DP: 444
OBS-DA: 52
OBS-U: 45


# First normalization
result in json

In [11]:
import json
from pprint import pprint

file_path = '../resource/all_unique_entities_normalized.json'

# Read the JSON file
with open(file_path, 'r', encoding='utf-8') as f:
    all_entities = json.load(f)

# Print total number of entities
print(f"Total number of entities: {len(all_entities)}")


def _umls_definition_text(data):
    """Return normalization -> UMLS -> definition -> definition for one entity.

    Returns None when any level of that path is missing, null, or not a dict,
    so entities without a UMLS definition are skipped instead of raising.
    """
    node = data
    for key in ('normalization', 'UMLS', 'definition', 'definition'):
        if not isinstance(node, dict):
            return None
        node = node.get(key)
    return node


# Count the number of items with normalization
count_with_normalization = sum(1 for data in all_entities.values() if data['normalization'] is not None)
print(f"Number of items with normalization: {count_with_normalization}")

# Count the number of items with normalization: null
count_null_normalization = sum(1 for data in all_entities.values() if data['normalization'] is None)
print(f"Number of items with normalization: null: {count_null_normalization}")


# Count entities whose UMLS definition text is the literal placeholder "No results".
# The label names the field that is actually tested (it previously said `umls.ui`).
count_no_results = sum(1 for data in all_entities.values() if _umls_definition_text(data) == "No results")
print(f"\nNumber of items with normalization.UMLS.definition.definition == 'No results': {count_no_results}")
# NOTE(review): the check is on the definition text, not on the candidate list;
# confirm that "No results" there really implies UMLS returned no candidate.
print('This means there is even no candidate in UMLS for this entity')


Total number of entities: 1250
Number of items with normalization: 498
Number of items with normalization: null: 752

Number of items with normalization.umls.ui == 'No results': 120
This means there is even no candidate in UMLS for this entity


## Semantic Types Analysis

In [15]:
from collections import Counter

# Tally the UMLS semantic type of every entity that has a UMLS normalization in a
# single pass (previously the whole dictionary was rescanned once per semantic type).
semantic_type_counts = dict(Counter(
    data['normalization']['UMLS']['semanticTypes']
    for data in all_entities.values()
    if data.get('normalization') and data['normalization'].get('UMLS')
))

# Distinct semantic types are exactly the keys of the tally.
semantic_types = set(semantic_type_counts)
num_semantic_types = len(semantic_types)

print(f"Number of different unique kinds of items with normalization.umls.semanticTypes: {num_semantic_types}")

print("Unique kinds of items with normalization.umls.semanticTypes:")
print(semantic_types)

# Print the counts in descending order, numbered from 1. sorted() is stable, so
# semantic types with equal counts keep first-seen order and the ranking is
# reproducible between runs.
print("\nSemantic type counts:")
ranked_counts = sorted(semantic_type_counts.items(), key=lambda x: x[1], reverse=True)
for i, (semantic_type, count) in enumerate(ranked_counts, 1):
    print(f"{i}. {semantic_type}: {count}")

# Sum of the counts (equals the number of entities with a UMLS normalization).
total_counts = sum(semantic_type_counts.values())
print(f"\nTotal counts: {total_counts}")



Number of different unique kinds of items with normalization.umls.semanticTypes: 46
Unique kinds of items with normalization.umls.semanticTypes:
{'Immunologic Factor', 'Acquired Abnormality', 'Activity', 'Medical Device', 'Intellectual Product', 'Event', 'Pharmacologic Substance', 'Injury or Poisoning', 'Finding', 'Mental or Behavioral Dysfunction', 'Functional Concept', 'Idea or Concept', 'Organic Chemical', 'Substance', 'Diagnostic Procedure', 'Anatomical Abnormality', 'Natural Phenomenon or Process', 'Patient or Disabled Group', 'Organism Function', 'Body Substance', 'Research Activity', 'Phenomenon or Process', 'Chemical Viewed Functionally', 'Pathologic Function', 'Clinical Attribute', 'Molecular Function', 'Congenital Abnormality', 'Anatomical Structure', 'Hazardous or Poisonous Substance', 'Disease or Syndrome', 'Quantitative Concept', 'Conceptual Entity', 'Body Part, Organ, or Organ Component', 'Laboratory or Test Result', 'Qualitative Concept', 'Chemical Viewed Structurally', 

# Entities with no normalization

In [6]:
import json
from pprint import pprint

file_path = '../resource/all_unique_entities_normalized.json'

# Load the entity dictionary.
with open(file_path, 'r', encoding='utf-8') as f:
    all_entities = json.load(f)

# Report how many entities were loaded.
print(f"Total number of entities: {len(all_entities)}")

# Names of all entities whose normalization is null, in file order.
entities_null_normalization = [
    name for name in all_entities if all_entities[name]['normalization'] is None
]

# Report how many entities have no normalization.
print(f"Number of entities with normalization: null: {len(entities_null_normalization)}")

Total number of entities: 1250
Number of entities with normalization: null: 752


In [7]:
# Dump the full list of entity names whose normalization is null (built in an earlier cell).
pprint(entities_null_normalization)

['cardiomediastinal',
 'silhouettes',
 'surfaces',
 'approximately a 4.6 cm',
 'carina',
 'distal',
 'advanced by at least 11 cm',
 'contours',
 'grossly',
 'unremarkable',
 'Pulmonary',
 'apex',
 'slightly',
 "patient 's chin",
 'displaced',
 'osseous',
 'Standard',
 'positioning',
 'mid',
 'portion',
 'body',
 'The side - port',
 'junction',
 'Otherwise',
 'change',
 'stable',
 'ET',
 '4.3 cm',
 'NG',
 'malpositioned',
 'position',
 'enteric',
 'approximately 1.8 cm',
 'barely',
 'side ports',
 'approximately 12 cm',
 'approximately 2 cm',
 'overt',
 'CHF',
 'silhouette',
 'Limited',
 'This',
 'cardiac',
 'mildly',
 'unchanged',
 'bronchovascular',
 'pulmonary',
 'patchy',
 'bibasilar',
 'opacities',
 'abnormalities',
 'Patchy',
 'parenchymal',
 'opacity',
 'frontal',
 'radiograph',
 'scoliosis',
 'asymmetry',
 'ribcage',
 'Mildly',
 'underinflated',
 'largely',
 'Osseous',
 'The lateral view is limited',
 "patient 's arms",
 'lower',
 'They',
 'conspicuous',
 'recent exam from ___',

# CSV

In [23]:
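# NOTE: every statement in this cell is commented out; the output shown below it is from an earlier run.
# import csv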

# # Specify the file path
# csv_file_path = '../output.csv'

# # Extract the required fields from all_entities
# data = []

# for key, value in all_entities.items():
#     name = key
#     ui = None
#     definition = None
#     semanticTypes = None
#     normalized_name = None

#     if value.get('normalization') and value['normalization'].get('UMLS'):
#         umls = value['normalization']['UMLS']
#         ui = umls.get('ui')
#         normalized_name = umls.get('name')
#         semanticTypes = umls.get('semanticTypes')
#         definition = umls.get('definition', {}).get('definition')

#     data.append({
#         'name': name,
#         'ui': ui,
#         'normalized_name': normalized_name,
#         'semanticTypes': semanticTypes,
#         'definition': definition
#     })


# # Write the data to the CSV file
# with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
#     fieldnames = ['name', 'ui', 'normalized_name', 'semanticTypes', 'definition']
#     writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
#     writer.writeheader()
#     writer.writerows(data)

# print(f"Data has been written to {csv_file_path}")

Data has been written to ../output.csv


In [5]:
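# NOTE: every statement in this cell is commented out; the output shown below it is from an earlier run.
# import csv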

# # Specify the file path
# csv_file_path = '../gpt_normalization.csv'



# import json
# from pprint import pprint

# file_path = '../resource/all_unique_entities_normalized.json'

# # Read the JSON file
# with open(file_path, 'r', encoding='utf-8') as f:
#     all_entities = json.load(f)

# # get list of all entities with normalization: null
# entities_null_normalization = [entity for entity, data in all_entities.items() if data['normalization'] is None]


# data = []

# import sys
# import os
# from pprint import pprint

# # Add the parent directory to the Python path
# parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
# sys.path.append(parent_dir)

# from utils.umls_api import normalize_entity 
# from utils.umls_api import use_umls_api_term

# for entity in entities_null_normalization:
    
#     response = use_umls_api_term(entity)
#     result_list = response['result']['results'][:5]

#     ui, name = normalize_entity(entity, result_list)

#     data.append({
#         'name': entity,
#         'ui': ui,
#         'normalized_name': name,
#     })



# # Write the data to the CSV file
# with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
#     fieldnames = ['name', 'ui', 'normalized_name', 'semanticTypes', 'definition']
#     writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
#     writer.writeheader()
#     writer.writerows(data)

# print(f"Data has been written to {csv_file_path}")

Data has been written to ../gpt_normalization.csv


In [None]:
import csv

# CSV with the GPT-assisted normalization results.
csv_file_path = '../gpt_normalization.csv'

# Load every row as a dict keyed by the CSV header.
data = []
with open(csv_file_path, 'r', encoding='utf-8') as csvfile:
    data.extend(csv.DictReader(csvfile))

# Rows whose ui is anything other than the 'unnormalizable' marker.
normalizable = [row for row in data if row['ui'] != 'unnormalizable']
count_normalizable = len(normalizable)
print(f"Number of entries with 'normalizable' ui: {count_normalizable}")

# List each of those rows.
for entry in normalizable:
    print(entry)



# Analysis after using the normalization with GPT

In [2]:
import csv
import json
from pprint import pprint

file_path = '../resource/all_unique_entities_normalized.json'

# Load the entity dictionary.
with open(file_path, 'r', encoding='utf-8') as f:
    all_entities = json.load(f)

print(f"Total number of entities: {len(all_entities)}")

# Names of all entities whose normalization is null.
entities_null_normalization = [
    name for name in all_entities if all_entities[name]['normalization'] is None
]
print(f"Number of entities with normalization: null: {len(entities_null_normalization)}")

# CSV with the GPT-assisted normalization results.
csv_file_path = '../resource/gpt_normalization.csv'

# Load every row as a dict keyed by the CSV header.
data = []
with open(csv_file_path, 'r', encoding='utf-8') as csvfile:
    data.extend(csv.DictReader(csvfile))

# Rows whose ui is anything other than the 'unnormalizable' marker.
count_normalizable = len([row for row in data if row['ui'] != 'unnormalizable'])
print(f"Number of entries with 'normalizable' ui: {count_normalizable}")

# Entities still without a normalization after the GPT pass (752 - 390 = 362 in the recorded run).
still_unnormalized = len(entities_null_normalization) - count_normalizable
print(f"we still have {still_unnormalized} entities that are not normalizable")

Total number of entities: 1250
Number of entities with normalization: null: 752
Number of entries with 'normalizable' ui: 390
we still have 362 entities that are not normalizable


# Second Normalization

In [14]:
import csv

file_path = '../resource/output.csv'

# Load every row of the CSV as a dict keyed by the header.
data = []
with open(file_path, 'r', encoding='utf-8') as csvfile:
    data.extend(csv.DictReader(csvfile))

# Rows whose ui column is empty, i.e. no UMLS identifier was assigned.
# (Print the rows of no_ui individually when debugging.)
no_ui = [row for row in data if row['ui'] == '']

print(f"Number of entries with 'None' ui: {len(no_ui)}")

# Rows that do carry a ui: total minus the empty ones
# (1250 - 362 = 888 in the recorded run).
normalized_total = len(data) - len(no_ui)
print(f"we have {normalized_total} entities that are normalizable")

Number of entries with 'None' ui: 362
we have 888 entities that are normalizable


In [None]:
file_path = '../resource/not_normalized.csv'

# Column order of the exported CSV.
fieldnames = ['name', 'ui', 'normalized_name', 'semanticTypes', 'definition']

# Export the rows without a ui (no_ui, built in an earlier cell), header first.
with open(file_path, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(no_ui)

print(f"Data has been written to {file_path}")

In [None]:
import csv
import json

file_path = '../resource/output.csv'

# Read the data from the CSV file
data = []
with open(file_path, 'r', encoding='utf-8') as csvfile:
    data.extend(csv.DictReader(csvfile))

# Load the entity dictionary once, before the loop (it was previously re-read
# and re-parsed for every CSV row).
with open('../resource/all_unique_entities.json', 'r', encoding='utf-8') as f:
    all_entities = json.load(f)

# Add a new column to the data: report.
for entry in data:
    entry['report'] = ''

    # all_entities[entry['name']]['reports'][0] is a dict; take its first key
    # (None if that dict is empty) rather than the dict_keys view itself.
    report_file = next(iter(all_entities[entry['name']]['reports'][0]), None)

    # TODO: look up the report text for report_file and store it in entry['report'];
    # this cell currently leaves the column empty.



    

In [31]:
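# NOTE: every statement in this cell is commented out; the output shown below it is from an earlier run.
# import csv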
# import json
# from pprint import pprint

# file_path = '../resource/output.csv'

# # Read the data from the CSV file
# data = []
# with open(file_path, 'r', encoding='utf-8') as csvfile:
#     reader = csv.DictReader(csvfile)
#     for row in reader:
#         data.append(row)

# path = '/DATA1/llm-research/RadGraph/physionet.org/files/radgraph/1.0.0/train.json'

# with open(path, 'r') as f:
#     train_data = json.load(f)

# # ../../resource/all_unique_entities.json
# # Read the JSON file
# with open('../../resource/all_unique_entities.json', 'r', encoding='utf-8') as f:
#     all_entities = json.load(f)


# # add a new column to the data, report.
# # entry = data[0]
# for entry in data:
#     entry['report'] = ''

#     # all_entities[entry['name']]['reports'][0] is a dict, get the key
#     report_file = list(all_entities[entry['name']]['reports'][0].keys())[0]

#     entry['report'] = train_data[report_file]['text']

# file_path = '../resource/output_with_report.csv'


# # write the data back to the CSV file
# with open(file_path, 'w', newline='', encoding='utf-8') as csvfile:
#     fieldnames = ['name', 'ui', 'normalized_name', 'semanticTypes', 'definition', 'report']
#     writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
#     writer.writeheader()
#     writer.writerows(data)

# print(f"Data has been written to {file_path}")
    

Data has been written to ../resource/output_with_report.csv


In [5]:
import csv
import json
from pprint import pprint

file_path = '../resource/output.csv'

# Load every row of the CSV as a dict keyed by the header.
data = []
with open(file_path, 'r', encoding='utf-8') as csvfile:
    data.extend(csv.DictReader(csvfile))

# Number of rows whose ui is the literal marker 'unnormalizable'.
count_no_norm = [row['ui'] for row in data].count('unnormalizable')
print(f"Number of entries with 'unnormalizable' ui: {count_no_norm}")

Number of entries with 'unnormalizable' ui: 0


# Excel

In [1]:
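# NOTE: every statement in this cell is commented out; the output shown below it is from an earlier run.
# import pandas as pd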

# # Read the data from the CSV file
# data = pd.read_csv('../resource/output.csv')

# # Write the data to an Excel file
# data.to_excel('../resource/output.xlsx', index=False)

# print("Data has been written to output.xlsx")

Data has been written to output.xlsx


## Subset of entities (e.g. the first 200, or rows 201-400)

In [4]:
import pandas as pd

# Load the full table (entities plus their report text).
data = pd.read_csv('../resource/output_with_report.csv')

# Export rows 201-400 (positions 200..399) to Excel.
# For the first 200 rows, use data.head(200) and '../resource/output_200.xlsx' instead.
rows_201_400 = data.iloc[200:400]
rows_201_400.to_excel('../resource/output_201_400.xlsx', index=False)

print("Data has been written to output_201_400.xlsx")


Data has been written to output_201_400.xlsx


## Analysis of the first results from the radiology expert
105 of the 200 reviewed entities are confirmed for sure (verified 'Yes' with no comment).


In [8]:
import pandas as pd

# Load the sheet returned by the radiology expert.
data = pd.read_excel('../resource/output_200-verified_hb.xlsx')

verify_col = data['Radiology Expert verify']
comment_col = data['Comment']

# Row masks, computed once and reused for every count below.
is_yes = verify_col == 'Yes'
is_no = verify_col == 'No'
is_unreviewed = verify_col.isnull()

n_yes = len(data[is_yes])
n_no = len(data[is_no])
n_unreviewed = len(data[is_unreviewed])

# Verdict breakdown: 'Yes' / 'No' / left blank.
print(f"Number of entities with 'Radiology Expert verify' == 'Yes': {n_yes}")
print(f"Number of entities with 'Radiology Expert verify' == 'No': {n_no}")
print(f"Number of entities with 'Radiology Expert verify' == NAN: {n_unreviewed}\n")

# Rows verified 'Yes' with no comment attached.
n_confident = len(data[is_yes & comment_col.isnull()])
print(f"Number of entities with 'Radiology Expert verify' == 'Yes' and 'Comment' == NAN: {n_confident}\n")
print("This is very confident data")

# Rows that carry a comment.
n_commented = len(data[comment_col.notnull()])
print(f"Number of entities with 'Comment' != NAN: {n_commented}")

# The 'No' and blank counts are printed a second time, as in the original summary.
print(f"Number of entities with 'Radiology Expert verify' == 'No': {n_no}")

# subset_data ends up holding the rows the expert left blank.
subset_data = data[is_unreviewed]
print(f"Number of entities with 'Radiology Expert verify' == NAN: {len(subset_data)}")

Number of entities with 'Radiology Expert verify' == 'Yes': 122
Number of entities with 'Radiology Expert verify' == 'No': 1
Number of entities with 'Radiology Expert verify' == NAN: 76

Number of entities with 'Radiology Expert verify' == 'Yes' and 'Comment' == NAN: 105

This is very confident data
Number of entities with 'Comment' != NAN: 48
Number of entities with 'Radiology Expert verify' == 'No': 1
Number of entities with 'Radiology Expert verify' == NAN: 76


In [15]:
# Rows that need a second look: not verified 'Yes', or carrying a comment.
needs_review = (data['Radiology Expert verify'] != 'Yes') | (data['Comment'].notnull())
subset_data = data[needs_review]
print(f"Number of entities with 'Radiology Expert verify' != 'Yes' or 'Comment' != NAN: {len(subset_data)}")

# Write those rows to a new Excel file.
subset_data.to_excel('../resource/output_200_review.xlsx', index=False)

Number of entities with 'Radiology Expert verify' != 'Yes' or 'Comment' != NAN: 95
