Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Add dicom fields scraping script #62

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@ env
__pycache__
.vscode
build
*.egg-info
*.egg-info
.python-version
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ dependencies = [
dev = [
"pytest",
"setuptools", # Needed to load pydicom's test files
"bs4",
"fire",
"requests"
]

[project.scripts]
Expand Down
12 changes: 12 additions & 0 deletions scripts/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Script folder

This folder contains utility scripts for the maintenance of the package.

## scrap_DICOM_fields.py

This script downloads a web page and tries to scrape the DICOM fields and their anonymization actions from it.

1. Clone the repository: `git clone https://github.com/KitwareMedical/dicom-anonymizer.git`
1. Go into the repository: `cd dicom-anonymizer`
1. Install the dependencies: `pip install -e '.[dev]'`
1. Run the script: `python scripts/scrap_DICOM_fields.py` (Run it with `-h` to get a list of arguments)
111 changes: 111 additions & 0 deletions scripts/scrap_DICOM_fields.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
"""
Download a web page and try to scrape the DICOM fields and their anonymization command from it.

Written by Mohammad Khawar Zia
"""

import fire
import requests

from collections import defaultdict
from bs4 import BeautifulSoup


# Text written at the very top of the generated module, before any tag list.
# It is emitted verbatim, so the lines below become comments in the output file.
dicom_fields_header = """# Tags anonymized in DICOM standard
# Documentation for groups meaning can be found in default associated actions.
# https://dicom.nema.org/medical/dicom/current/output/chtml/part15/chapter_e.html

"""

# Text written at the very end of the generated module, after every tag list.
# The list names referenced here must stay in sync with the names emitted by
# create_DICOM_fields(), otherwise the generated module raises NameError on import.
dicom_fields_footer = """# Contains all previous tags into one array
ALL_TAGS = []
ALL_TAGS.extend(D_TAGS)
ALL_TAGS.extend(Z_TAGS)
ALL_TAGS.extend(X_TAGS)
ALL_TAGS.extend(U_TAGS)
ALL_TAGS.extend(Z_D_TAGS)
ALL_TAGS.extend(X_Z_TAGS)
ALL_TAGS.extend(X_D_TAGS)
ALL_TAGS.extend(X_Z_D_TAGS)
ALL_TAGS.extend(X_Z_U_STAR_TAGS)
"""


# Attributes whose tag is a wildcard in the standard (e.g. ``50xx``) and that
# are therefore written as a hard-coded 4-tuple instead of a plain tag.
_WILDCARD_TAGS = {
    'Curve Data': '(0x5000, 0x0000, 0xFF00, 0x0000)',
    'Overlay Comments': '(0x6000, 0x4000, 0xFF00, 0xFFFF)',
    'Overlay Data': '(0x6000, 0x3000, 0xFF00, 0xFFFF)',
}

# Rows of the table that do not describe a single, addressable attribute.
_FIELDS_TO_SKIP = {
    'Private Attributes',
}


def _format_tag(tag):
    """Render a scraped ``(GGGG,EEEE)`` tag as ``(0xGGGG, 0xEEEE)`` source text."""
    # Fixed offsets: 1 opening parenthesis + 4 group digits + 1 comma = 6.
    return f'{tag[:1]}0x{tag[1:6]} 0x{tag[6:]}'


def scrap_profiles(url):
    """Scrape table E.1-1 of the given page and group its tags by basic profile.

    Args:
        url: Address of the page to download.

    Returns:
        A ``defaultdict(list)`` mapping each value found in the 'Basic Prof.'
        column (e.g. ``'X'``, ``'Z/D'``) to a list of ready-to-write source
        lines of the form ``'<tag>, # <attribute name>'``. Within each list
        the entries are ordered by attribute name.

    Raises:
        requests.RequestException: If the download fails, times out or the
            server answers with an HTTP error status.
        ValueError: If the table cannot be located or one of its rows lacks
            the 'Tag', 'Attribute Name' or 'Basic Prof.' cell.
    """
    # A timeout keeps the script from hanging forever on a stalled connection,
    # and the status check stops an error page from being parsed as the table.
    page = requests.get(url, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")

    # The id is looked up first; the table is then searched for in that
    # element's parent.
    anchor = soup.find(attrs={'id': 'table_E.1-1'})
    table = anchor.parent.find('table') if anchor is not None else None
    if table is None:
        raise ValueError(f'Could not find table E.1-1 in {url}')

    headers = [th.text for th in table.find('thead').find_all('strong')]

    data = []
    for tr in table.find('tbody').find_all('tr'):
        cells = {
            header: td.text.strip()
            for header, td in zip(headers, tr.find_all('td'))
        }
        row = (
            cells.get('Tag'),
            cells.get('Attribute Name'),
            cells.get('Basic Prof.'),
        )
        if None in row:
            # Dropping the row silently would leave an attribute out of the
            # generated tag lists, so fail loudly instead.
            raise ValueError(f'Unexpected row layout in table E.1-1: {row!r}')
        data.append(row)

    # Order by profile first, then by attribute name.
    data.sort(key=lambda row: (row[2], row[1]))

    profiles = defaultdict(list)
    for tag, name, profile in data:
        if name in _FIELDS_TO_SKIP:
            continue

        new_tag = _WILDCARD_TAGS.get(name)
        if new_tag is None:
            new_tag = _format_tag(tag)

        # Names may contain zero-width spaces and line breaks; neither belongs
        # in a one-line trailing comment.
        name = name.replace('\u200b', '').replace('\n', '')
        profiles[profile].append(f'{new_tag}, # {name}')

    return profiles


def create_DICOM_fields(profiles):
    """Build the source text of the generated DICOM fields module.

    Args:
        profiles: Mapping from a basic-profile action code (``'D'``, ``'Z'``,
            ``'X'``, ``'U'``, ``'Z/D'``, ``'X/Z'``, ``'X/D'``, ``'X/Z/D'``,
            ``'X/Z/U*'``) to a list of already formatted list entries. Keys
            other than these nine are ignored.

    Returns:
        The complete file content as a string: the module header, one list
        definition per action code (always emitted, empty when the mapping
        has no entry for that code) and the footer that builds ``ALL_TAGS``.
    """
    sections = (
        ('D', 'D_TAGS', '# Replaced tags'),
        ('Z', 'Z_TAGS', "# Replaced with empty values (0, '', ...)"),
        ('X', 'X_TAGS', '# Deleted tags'),
        ('U', 'U_TAGS', '# Replace UID'),

        ('Z/D', 'Z_D_TAGS', '# Replace element according to the VR'),
        ('X/Z', 'X_Z_TAGS', '# Set the value to empty according to the VR'),
        ('X/D', 'X_D_TAGS', "# Replace element according to the VR"),

        ('X/Z/D', 'X_Z_D_TAGS', '# Replace element according to the VR'),
        ('X/Z/U*', 'X_Z_U_STAR_TAGS',
         '# Replace element with UI as VR, else replace according to VR with empty values'),
    )

    parts = []
    for tag, tag_list, comment in sections:
        # Default to an empty sequence: every list must still be defined
        # because the footer references all of them by name.
        entries = ''.join(
            f' {profile}\n' for profile in profiles.get(tag, ())
        )
        parts.append(f'{comment}\n{tag_list} = [\n{entries}]\n\n')

    return dicom_fields_header + ''.join(parts) + dicom_fields_footer


def main(
        url="https://dicom.nema.org/medical/dicom/current/output/chtml/part15/chapter_e.html",
        output_path='dicomanonymizer/dicomfields.py'):
    """Scrape the DICOM tag table and write the generated Python module.

    Args:
        url: Page to download and scrape the tag table from.
        output_path: File to write the generated source to. An existing file
            is overwritten.
    """
    profiles = scrap_profiles(url)
    file_content = create_DICOM_fields(profiles=profiles)
    # The output is Python source, which the interpreter reads as UTF-8 by
    # default, so do not depend on the platform's locale encoding here.
    with open(output_path, 'w', encoding='utf-8') as file:
        file.write(file_content)

# Run main() through Fire only when executed as a script, so that importing
# this module has no side effects. Fire builds the command line from main()'s
# parameters (presumably exposing them as --url / --output_path; see `-h`).
if __name__ == '__main__':
    fire.Fire(main)