Skip to content

Commit

Permalink
Merge pull request #62 from pchoisel/add_dicom_fields_scrapping_script
Browse files Browse the repository at this point in the history
ENH: Add dicom fields scrapping script
  • Loading branch information
pchoisel committed Feb 26, 2024
2 parents 75a4aba + e4a5429 commit 3ffe350
Show file tree
Hide file tree
Showing 4 changed files with 128 additions and 1 deletion.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@ env
__pycache__
.vscode
build
*.egg-info
*.egg-info
.python-version
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ dependencies = [
dev = [
"pytest",
"setuptools", # Needed to load pydicom's test files
"bs4",
"fire",
"requests"
]

[project.scripts]
Expand Down
12 changes: 12 additions & 0 deletions scripts/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Script folder

This folder contains utility scripts for the maintenance of the package.

## scrap_DICOM_fields.py

This script downloads a web page and tries to scrape the DICOM fields and their anonymization command from it.

1. Clone the repository: `git clone https://github.com/KitwareMedical/dicom-anonymizer.git`
1. Go into the repository: `cd dicom-anonymizer`
1. Install the dependencies: `pip install -e '.[dev]'`
1. Run the script: `python scripts/scrap_DICOM_fields.py` (Run it with `-h` to get a list of arguments)
111 changes: 111 additions & 0 deletions scripts/scrap_DICOM_fields.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
"""
Download a web page and try to scrape the DICOM fields and their anonymization command from it.
Written by Mohammad Khawar Zia
"""

import fire
import requests

from collections import defaultdict
from bs4 import BeautifulSoup


# Comment banner written at the very top of the generated module.
dicom_fields_header = (
    "# Tags anonymized in DICOM standard\n"
    "# Documentation for groups meaning can be found in default associated actions.\n"
    "# https://dicom.nema.org/medical/dicom/current/output/chtml/part15/chapter_e.html\n"
)

# Trailer of the generated module: one ALL_TAGS.extend(...) line per tag list,
# in the order the lists are listed here.
dicom_fields_footer = "".join(
    [
        "# Contains all previous tags into one array\n",
        "ALL_TAGS = []\n",
    ]
    + [
        f"ALL_TAGS.extend({tag_list})\n"
        for tag_list in (
            "D_TAGS",
            "Z_TAGS",
            "X_TAGS",
            "U_TAGS",
            "Z_D_TAGS",
            "X_Z_TAGS",
            "X_D_TAGS",
            "X_Z_D_TAGS",
            "X_Z_U_STAR_TAGS",
        )
    ]
)


def scrap_profiles(url, timeout=30):
    """
    Download a page and group the rows of its table 'table_E.1-1' by basic profile.

    For every table row, the 'Tag', 'Attribute Name' and 'Basic Prof.' columns
    are read. Each row is rendered as the text of one Python list item,
    ``(0xGGGG, 0xEEEE), # Attribute Name``, and stored under the text of its
    'Basic Prof.' cell. Within a profile, entries are ordered by attribute name.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``defaultdict(list)`` mapping each 'Basic Prof.' value found in the
        table to the list of rendered entries for that profile.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the table or one of the three columns cannot be found.
    """
    wanted = ('Tag', 'Attribute Name', 'Basic Prof.')

    page = requests.get(url, timeout=timeout)
    # Fail on HTTP errors instead of parsing an error page that has no table.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")

    # The element carrying the id is not used directly: the table is searched
    # from its parent. Look it up once and reuse it for the head and the body.
    anchor = soup.find(attrs={'id': 'table_E.1-1'})
    table = anchor.parent.find('table') if anchor is not None else None
    if table is None:
        raise ValueError(f"Could not find table 'table_E.1-1' in the page at {url}")

    headers = [th.text for th in table.find('thead').find_all('strong')]
    missing = [key for key in wanted if key not in headers]
    if missing:
        raise ValueError(f"Table 'table_E.1-1' has no column(s) named {missing}")

    data = []
    for tr in table.find('tbody').find_all('tr'):
        cells = {
            key: td.text.strip()
            for key, td in zip(headers, tr.find_all('td'))
            if key in wanted
        }
        # A row with fewer cells than expected cannot be rendered, and a missing
        # value would make the sort below compare None with str.
        if len(cells) < len(wanted):
            continue
        data.append(tuple(cells[key] for key in wanted))

    data.sort(key=lambda ele: (ele[2], ele[1]))

    fields_to_skip = {
        'Private Attributes',
    }
    # Attributes whose entry is written as a fixed 4-tuple instead of being
    # derived from the tag cell.
    # NOTE(review): presumably (group, element, group mask, element mask) for
    # repeating groups; confirm against the code consuming the generated file.
    special_tags = {
        'Curve Data': '(0x5000, 0x0000, 0xFF00, 0x0000)',
        'Overlay Comments': '(0x6000, 0x4000, 0xFF00, 0xFFFF)',
        'Overlay Data': '(0x6000, 0x3000, 0xFF00, 0xFFFF)',
    }

    profiles = defaultdict(list)
    for tag, name, profile in data:
        # Clean the name before comparing it, so that zero-width spaces or line
        # breaks inside the cell cannot hide a skipped or special attribute.
        name = name.replace('\u200b', '').replace('\n', '')
        if name in fields_to_skip:
            continue

        new_tag = special_tags.get(name)
        if new_tag is None:
            # '(GGGG,EEEE)' -> '(0xGGGG, 0xEEEE)'
            new_tag = f'{tag[:1]}0x{tag[1:6]} 0x{tag[6:]}'

        profiles[profile].append(f'{new_tag}, # {name}')

    return profiles


def create_DICOM_fields(profiles):
    """
    Render the scraped profiles as the source text of the DICOM fields module.

    The result is the module header, then one commented list definition per
    known profile (``D_TAGS``, ``Z_TAGS``, ...), then the module footer.

    Args:
        profiles: Mapping from a profile code ('D', 'Z', 'X', 'U', 'Z/D',
            'X/Z', 'X/D', 'X/Z/D', 'X/Z/U*') to the list of already rendered
            entries of that profile. Codes missing from the mapping produce an
            empty list; codes not listed here are ignored.

    Returns:
        The complete text of the generated module.
    """
    sections = (
        ('D', 'D_TAGS', '# Replaced tags'),
        ('Z', 'Z_TAGS', "# Replaced with empty values (0, '', ...)"),
        ('X', 'X_TAGS', '# Deleted tags'),
        ('U', 'U_TAGS', '# Replace UID'),

        ('Z/D', 'Z_D_TAGS', '# Replace element according to the VR'),
        ('X/Z', 'X_Z_TAGS', '# Set the value to empty according to the VR'),
        ('X/D', 'X_D_TAGS', "# Replace element according to the VR"),

        ('X/Z/D', 'X_Z_D_TAGS', '# Replace element according to the VR'),
        ('X/Z/U*', 'X_Z_U_STAR_TAGS',
         '# Replace element with UI as VR, else replace according to VR with empty values'),
    )

    parts = [dicom_fields_header]
    for tag, tag_list, comment in sections:
        parts.append(f'{comment}\n{tag_list} = [\n')
        # .get() returns None for an absent key, even on a defaultdict. Default
        # to an empty sequence so the list is still defined: the footer
        # references every one of these lists.
        parts.extend(f' {profile}\n' for profile in profiles.get(tag, ()))
        parts.append(']\n\n')
    parts.append(dicom_fields_footer)

    return ''.join(parts)


def main(
        url="https://dicom.nema.org/medical/dicom/current/output/chtml/part15/chapter_e.html",
        output_path='dicomanonymizer/dicomfields.py'):
    """
    Scrape the DICOM fields from a web page and write them as a Python module.

    Args:
        url: Address of the page to scrape.
        output_path: Path of the file to write. An existing file is overwritten.
    """
    profiles = scrap_profiles(url)
    file_content = create_DICOM_fields(profiles=profiles)
    # Explicit encoding: the content comes from a web page, so the result must
    # not depend on the platform's default encoding.
    with open(output_path, 'w', encoding='utf-8') as file:
        file.write(file_content)

# Run main() through fire only when the file is executed as a script,
# not when it is imported.
if __name__ == '__main__':
    fire.Fire(main)

0 comments on commit 3ffe350

Please sign in to comment.