# Read file all_unique_entities.json

In [24]:
import json
from pprint import pprint

# Location of the entity dump, relative to this notebook
file_path = '../resource/all_unique_entities.json'

# Parse the whole JSON document into memory (text mode is open()'s default)
with open(file_path, encoding='utf-8') as f:
    all_entities = json.load(f)

# Report how many top-level entries were loaded
print(f"Number of entities: {len(all_entities)}")

# Pretty-print the first 5 entities, each as its own single-entry dict
print("First 5 items:")
for key, value in list(all_entities.items())[:5]:
    pprint({key: value}, width=100)




Number of entities: 1250
First 5 items:
{'Lungs': {'label': 'ANAT-DP',
           'normalization': None,
           'reports': [{'p18/p18004941/s58821758.txt': {'end_ix': 36, 'start_ix': 36}},
                       {'p18/p18003081/s53302126.txt': {'end_ix': 52, 'start_ix': 52}},
                       {'p18/p18001922/s52638004.txt': {'end_ix': 46, 'start_ix': 46}},
                       {'p18/p18001922/s52288833.txt': {'end_ix': 52, 'start_ix': 52}},
                       {'p15/p15165563/s51659523.txt': {'end_ix': 110, 'start_ix': 110}},
                       {'p15/p15004061/s56845046.txt': {'end_ix': 42, 'start_ix': 42}},
                       {'p15/p15078112/s51228277.txt': {'end_ix': 75, 'start_ix': 75}},
                       {'p15/p15078112/s58703686.txt': {'end_ix': 66, 'start_ix': 66}},
                       {'p18/p18026902/s53920289.txt': {'end_ix': 63, 'start_ix': 63}},
                       {'p18/p18026902/s51741672.txt': {'end_ix': 64, 'start_ix': 64}},
             

# Simple Analysis

In [25]:
from collections import Counter

# Number of entities per label; Counter keeps labels in first-seen order
label_counts = Counter(data['label'] for data in all_entities.values())

# Total number of entities
total_entities = len(all_entities)

# (name, data) pair of the entity with the longest 'reports' list
most_common_entity = max(
    all_entities.items(),
    key=lambda item: len(item[1]['reports']),
)

# Mean length of the 'reports' list across all entities
avg_reports = (
    sum(len(data['reports']) for data in all_entities.values())
    / total_entities
)

print(f"总entity数量: {total_entities}")
print("\n每个label的entity数量:")
for label, count in label_counts.items():
    print(f"{label}: {count}")

print(f"\n出现次数最多的entity: '{most_common_entity[0]}' (出现在 {len(most_common_entity[1]['reports'])} 个报告中)")

# Entities with / without a normalization value (summing booleans counts the True ones)
with_norm = sum(data['normalization'] is not None for data in all_entities.values())
without_norm = total_entities - with_norm

print(f"\n有normalization的entity数量: {with_norm}")
print(f"没有normalization的entity数量: {without_norm}")


总entity数量: 1250

每个label的entity数量:
ANAT-DP: 363
OBS-DP: 712
OBS-DA: 93
OBS-U: 82

出现次数最多的entity: 'pleural' (出现在 424 个报告中)

有normalization的entity数量: 0
没有normalization的entity数量: 1250


# Read file all_unique_entities_normalized.json

In [27]:
import json
from pprint import pprint

from collections import Counter

# Path to the normalized entity dump, relative to this notebook
file_path = '../resource/all_unique_entities_normalized.json'

# Load the JSON mapping of entity name -> entity data
with open(file_path, 'r', encoding='utf-8') as f:
    all_entities = json.load(f)

# Report how many entities were loaded
print(f"Number of entities: {len(all_entities)}")

# Fail early with a readable message: with no entities, max() and the
# percentage divisions below would otherwise raise far less helpful errors.
if not all_entities:
    raise ValueError(f"No entities found in {file_path}")

# Number of entities per label; Counter keeps labels in first-seen order
label_counts = Counter(data['label'] for data in all_entities.values())

# Total number of entities
total_entities = len(all_entities)

# (name, data) pair of the entity with the longest 'reports' list.
# Not printed in this cell; kept so the name stays defined for other cells.
most_common_entity = max(all_entities.items(), key=lambda x: len(x[1]['reports']))

# Mean length of the 'reports' list across all entities (also kept, not printed here)
avg_reports = sum(len(data['reports']) for data in all_entities.values()) / total_entities

print(f"总entity数量: {total_entities}")
print("\n每个label的entity数量:")
for label, count in label_counts.items():
    print(f"{label}: {count}")

# Per-label counts of entities with / without a normalization value
label_counts_with_norm = Counter()
label_counts_without_norm = Counter()
for data in all_entities.values():
    if data['normalization'] is not None:
        label_counts_with_norm[data['label']] += 1
    else:
        label_counts_without_norm[data['label']] += 1

# Overall totals follow from the per-label counters
with_norm = sum(label_counts_with_norm.values())
without_norm = total_entities - with_norm

print(f"\n有normalization的entity数量: {with_norm}")
print(f"没有normalization的entity数量: {without_norm}")
# Percentage of entities with normalization
print(f"\n有normalization的entity占比: {with_norm / total_entities:.2%}")

# Iterate over label_counts (every label) rather than over the per-category
# counters: a label with zero entities in a category is then still listed
# with 0 (Counter returns 0 for a missing key), and all three sections use
# the same label order.
print("\n每个label的有normalization的entity数量:")
for label, label_total in label_counts.items():
    count = label_counts_with_norm[label]
    print(f"{label}: {count}")
    # Percentage of this label's entities that have a normalization
    print(f"{label}的有normalization的entity占比: {count / label_total:.2%}")


print("\n每个label的没有normalization的entity数量:")
for label in label_counts:
    print(f"{label}: {label_counts_without_norm[label]}")
    


Number of entities: 1250
总entity数量: 1250

每个label的entity数量:
ANAT-DP: 363
OBS-DP: 712
OBS-DA: 93
OBS-U: 82

有normalization的entity数量: 498
没有normalization的entity数量: 752

有normalization的entity占比: 39.84%

每个label的有normalization的entity数量:
ANAT-DP: 152
ANAT-DP的有normalization的entity占比: 41.87%
OBS-DP: 268
OBS-DP的有normalization的entity占比: 37.64%
OBS-DA: 41
OBS-DA的有normalization的entity占比: 44.09%
OBS-U: 37
OBS-U的有normalization的entity占比: 45.12%

每个label的没有normalization的entity数量:
ANAT-DP: 211
OBS-DP: 444
OBS-DA: 52
OBS-U: 45
