## 1. Import Required Libraries

In [2]:
pip install pydicom tqdm

Collecting pydicom
  Using cached pydicom-3.0.1-py3-none-any.whl (2.4 MB)
Installing collected packages: pydicom
Successfully installed pydicom-3.0.1
Note: you may need to restart the kernel to use updated packages.


In [4]:
import pydicom
from pydicom.data import get_testdata_files
import os
from pathlib import Path
import shutil
from datetime import datetime
import pandas as pd
import numpy as np
from typing import List, Dict, Tuple
import hashlib
import json
from tqdm import tqdm

# Display warnings
import warnings
warnings.filterwarnings('ignore')

print("✓ Libraries imported successfully!")

✓ Libraries imported successfully!


## 2. Load DICOM Files from Directory

In [None]:
def find_dicom_files(directory: str) -> List[str]:
    """
    Recursively find all DICOM files in a directory.
    
    Args:
        directory: Root directory to search for DICOM files
        
    Returns:
        List of absolute paths to DICOM files
    """
    dicom_files = []
    directory_path = Path(directory)
    
    if not directory_path.exists():
        print(f"❌ Directory not found: {directory}")
        return dicom_files
    
    # Search for .dcm files recursively
    for dcm_file in directory_path.rglob('*.dcm'):
        dicom_files.append(str(dcm_file))
    
    print(f"✓ Found {len(dicom_files)} DICOM files in {directory}")
    return sorted(dicom_files)


def load_dicom(file_path: str):
    """
    Load a single DICOM file.
    
    Args:
        file_path: Path to DICOM file
        
    Returns:
        pydicom Dataset object or None if failed
    """
    try:
        dicom_file = pydicom.dcmread(file_path)
        return dicom_file
    except Exception as e:
        print(f"[+] Error loading {file_path}: {str(e)}")
        return None


# Test loading
data_dir = r"c:\Users\THINKPAD\Desktop\DICOM-anonymizer\raw\train"
dicom_files = find_dicom_files(data_dir)

if dicom_files:
    print(f"\nFirst 5 DICOM files found:")
    for f in dicom_files[:5]:
        print(f"  - {f}")

✓ Found 250 DICOM files in c:\Users\THINKPAD\Desktop\DICOM-anonymizer\raw\train

First 5 DICOM files found:
  - c:\Users\THINKPAD\Desktop\DICOM-anonymizer\raw\train\no_pneumothorax\000000.dcm
  - c:\Users\THINKPAD\Desktop\DICOM-anonymizer\raw\train\no_pneumothorax\000002.dcm
  - c:\Users\THINKPAD\Desktop\DICOM-anonymizer\raw\train\no_pneumothorax\000005.dcm
  - c:\Users\THINKPAD\Desktop\DICOM-anonymizer\raw\train\no_pneumothorax\000006.dcm
  - c:\Users\THINKPAD\Desktop\DICOM-anonymizer\raw\train\no_pneumothorax\000007.dcm


## 3. Extract DICOM Metadata

In [6]:
def extract_metadata(dicom_file) -> Dict:
    """
    Extract all metadata from a DICOM file.
    
    Args:
        dicom_file: pydicom Dataset object
        
    Returns:
        Dictionary of metadata tags and values
    """
    metadata = {}
    
    try:
        for tag, value in dicom_file.items():
            tag_name = tag.name if hasattr(tag, 'name') else str(tag)
            try:
                # Try to convert value to string
                if hasattr(value, 'value'):
                    metadata[tag_name] = str(value.value)
                else:
                    metadata[tag_name] = str(value)[:100]  # Limit length
            except:
                metadata[tag_name] = "<Unable to parse>"
    except Exception as e:
        print(f"Error extracting metadata: {str(e)}")
    
    return metadata


def display_metadata(dicom_file, limit: int = 20):
    """
    Display DICOM metadata in a readable format.
    
    Args:
        dicom_file: pydicom Dataset object
        limit: Number of tags to display
    """
    print("\n" + "="*80)
    print("DICOM METADATA TAGS")
    print("="*80)
    
    for i, (tag, value) in enumerate(dicom_file.items()):
        if i >= limit:
            print(f"... and {len(dicom_file) - limit} more tags")
            break
        
        tag_name = dicom_file[tag].name if hasattr(dicom_file[tag], 'name') else str(tag)
        tag_value = str(value.value)[:80] if hasattr(value, 'value') else str(value)[:80]
        print(f"{tag_name:40} | {tag_value}")


# Display metadata from first DICOM file
if dicom_files:
    first_dicom = load_dicom(dicom_files[0])
    if first_dicom:
        display_metadata(first_dicom)
        print(f"\n✓ Total tags in this DICOM: {len(first_dicom)}")


DICOM METADATA TAGS
Specific Character Set                   | ISO_IR 100
SOP Class UID                            | b'1.2.840.10008.5.1.4.1.1.7\x00'
SOP Instance UID                         | b'1.2.276.0.7230010.3.1.4.8323329.6904.1517875201.850819'
Study Date                               | b'19010101'
Study Time                               | b'000000.00 '
Accession Number                         | b''
Modality                                 | b'CR'
Conversion Type                          | b'WSD '
Referring Physician's Name               | b''
Series Description                       | b'view: PA'
Patient's Name                           | b'16d7f894-55d7-4d95-8957-d18987f0e981'
Patient ID                               | b'16d7f894-55d7-4d95-8957-d18987f0e981'
Patient's Birth Date                     | b''
Patient's Sex                            | b'M '
Patient's Age                            | b'62'
Body Part Examined                       | b'CHEST '
View Position          

## 4. Define Anonymization Rules

In [7]:
# Define comprehensive anonymization rules
ANONYMIZATION_RULES = {
    # Remove completely - Patient Information
    'remove': [
        'PatientName',
        'PatientID',
        'PatientBirthDate',
        'PatientAge',
        'PatientSex',
        'PatientAddress',
        'PatientTelephoneNumbers',
        'ReferencedPatientSequence',
        'OtherPatientNames',
        'OtherPatientIDSequence',
        
        # Remove - Study/Series Information
        'StudyInstanceUID',
        'SeriesInstanceUID',
        'StudyDate',
        'SeriesDate',
        'ContentDate',
        'StudyTime',
        'SeriesTime',
        'ContentTime',
        'AcquisitionDateTime',
        'StudyDescription',
        'SeriesDescription',
        'SeriesNumber',
        'StudyComments',
        'SeriesComments',
        
        # Remove - Physician/Operator Information
        'ReferringPhysicianName',
        'PhysicianOfRecord',
        'PerformingPhysicianName',
        'OperatorsName',
        'InstitutionName',
        'InstitutionAddress',
        'InstitutionalDepartmentName',
        'Manufacturer',
        'ManufacturerModelName',
        'DeviceSerialNumber',
        'StationName',
        
        # Remove - Other identifiers
        'Accession Number',
        'RequestingPhysician',
        'CommentsOnRadiationDose',
        'ReferencedStudySequence',
        'ReferencedImageSequence',
    ],
    
    # Replace with placeholder
    'replace': {
        'PatientName': 'ANONYMIZED',
        'PatientID': 'ANON-ID-001',
        'ReferringPhysicianName': 'ANONYMIZED',
        'PerformingPhysicianName': 'ANONYMIZED',
        'OperatorsName': 'ANONYMIZED',
        'InstitutionName': 'ANONYMIZED',
    },
    
    # Modify dates to year only
    'date_shift': {
        'StudyDate': True,
        'SeriesDate': True,
        'PatientBirthDate': True,
        'ContentDate': True,
        'AcquisitionDate': True,
    }
}

print("✓ Anonymization rules defined:")
print(f"  - Tags to remove: {len(ANONYMIZATION_RULES['remove'])}")
print(f"  - Tags to replace: {len(ANONYMIZATION_RULES['replace'])}")
print(f"  - Dates to shift: {len(ANONYMIZATION_RULES['date_shift'])}")

✓ Anonymization rules defined:
  - Tags to remove: 40
  - Tags to replace: 6
  - Dates to shift: 5


## 5. Anonymize Single DICOM Image

In [8]:
def anonymize_dicom(dicom_file, rules: Dict = None) -> Tuple[bool, str]:
    """
    Anonymize a DICOM file according to defined rules.
    
    Args:
        dicom_file: pydicom Dataset object
        rules: Anonymization rules dictionary
        
    Returns:
        Tuple of (success: bool, message: str)
    """
    if rules is None:
        rules = ANONYMIZATION_RULES
    
    try:
        anonymized_count = 0
        
        # 1. Remove tags completely
        for tag in rules['remove']:
            if tag in dicom_file:
                del dicom_file[tag]
                anonymized_count += 1
        
        # 2. Replace sensitive tags with placeholders
        for tag, replacement_value in rules['replace'].items():
            if tag in dicom_file:
                dicom_file[tag].value = replacement_value
                anonymized_count += 1
        
        # 3. Shift dates (keep year only)
        for tag in rules['date_shift']:
            if tag in dicom_file:
                try:
                    current_value = str(dicom_file[tag].value)
                    if len(current_value) >= 8:  # YYYYMMDD format
                        # Keep only year
                        year = current_value[:4]
                        dicom_file[tag].value = year + '0101'  # Set to Jan 1 of that year
                        anonymized_count += 1
                except:
                    pass
        
        # 4. Remove private tags (manufacturer-specific)
        tags_to_remove = [tag for tag in dicom_file.keys() if tag.is_private]
        for tag in tags_to_remove:
            del dicom_file[tag]
            anonymized_count += 1
        
        # 5. Generate new UIDs to prevent identification
        if 'SOPInstanceUID' in dicom_file:
            dicom_file.SOPInstanceUID = pydicom.uid.generate_uid()
        if 'StudyInstanceUID' in dicom_file:
            dicom_file.StudyInstanceUID = pydicom.uid.generate_uid()
        if 'SeriesInstanceUID' in dicom_file:
            dicom_file.SeriesInstanceUID = pydicom.uid.generate_uid()
        
        return True, f"✓ Anonymized {anonymized_count} tags"
    
    except Exception as e:
        return False, f" Error during anonymization: {str(e)}"


# Test single file anonymization
if dicom_files:
    test_file = load_dicom(dicom_files[0])
    if test_file:
        print("BEFORE ANONYMIZATION:")
        print(f"Total tags: {len(test_file)}")
        if 'PatientName' in test_file:
            print(f"Patient Name: {test_file.PatientName}")
        if 'PatientID' in test_file:
            print(f"Patient ID: {test_file.PatientID}")
        
        success, message = anonymize_dicom(test_file)
        print(f"\n{message}")
        
        print("\nAFTER ANONYMIZATION:")
        print(f"Total tags: {len(test_file)}")
        if 'PatientName' in test_file:
            print(f"Patient Name: {test_file.PatientName}")
        if 'PatientID' in test_file:
            print(f"Patient ID: {test_file.PatientID}")

BEFORE ANONYMIZATION:
Total tags: 35
Patient Name: 16d7f894-55d7-4d95-8957-d18987f0e981
Patient ID: 16d7f894-55d7-4d95-8957-d18987f0e981

✓ Anonymized 12 tags

AFTER ANONYMIZATION:
Total tags: 23


## 6. Batch Anonymize Multiple DICOM Files

In [11]:
def batch_anonymize_dicom(input_dir: str, output_dir: str, rules: Dict = None) -> Dict:
    """
    Batch anonymize all DICOM files in a directory.
    
    Args:
        input_dir: Input directory containing DICOM files
        output_dir: Output directory for anonymized files
        rules: Anonymization rules dictionary
        
    Returns:
        Dictionary with processing statistics
    """
    if rules is None:
        rules = ANONYMIZATION_RULES
    
    # Create output directory
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)
    
    # Find all DICOM files
    dicom_files = find_dicom_files(input_dir)
    
    if not dicom_files:
        print("[+] No DICOM files found!")
        return {'status': 'failed', 'reason': 'No DICOM files found'}
    
    statistics = {
        'total_files': len(dicom_files),
        'successful': 0,
        'failed': 0,
        'total_tags_removed': 0,
        'errors': [],
        'output_dir': output_dir
    }
    
    print(f"\nProcessing {len(dicom_files)} DICOM files...\n")
    
    # Process each file
    for idx, file_path in enumerate(tqdm(dicom_files, desc="Anonymizing"), 1):
        try:
            # Load DICOM
            dicom_data = load_dicom(file_path)
            if dicom_data is None:
                statistics['failed'] += 1
                statistics['errors'].append(f"Failed to load: {file_path}")
                continue
            
            # Get original tag count
            original_tag_count = len(dicom_data)
            
            # Anonymize
            success, message = anonymize_dicom(dicom_data, rules)
            if not success:
                statistics['failed'] += 1
                statistics['errors'].append(f"{file_path}: {message}")
                continue
            
            # Save anonymized DICOM
            relative_path = Path(file_path).relative_to(input_dir)
            output_file_path = output_path / relative_path
            output_file_path.parent.mkdir(parents=True, exist_ok=True)
            
            dicom_data.save_as(str(output_file_path), write_like_original=False)
            
            statistics['successful'] += 1
            statistics['total_tags_removed'] += (original_tag_count - len(dicom_data))
        
        except Exception as e:
            statistics['failed'] += 1
            statistics['errors'].append(f"{file_path}: {str(e)}")
    
    return statistics


print("✓ Batch anonymization function ready")
print("  Usage: stats = batch_anonymize_dicom(input_dir, output_dir)")

✓ Batch anonymization function ready
  Usage: stats = batch_anonymize_dicom(input_dir, output_dir)


## 7. Verify Anonymization Results

In [13]:
def verify_anonymization(original_file_path: str, anonymized_file_path: str) -> Dict:
    """
    Compare original and anonymized DICOM files.
    
    Args:
        original_file_path: Path to original DICOM
        anonymized_file_path: Path to anonymized DICOM
        
    Returns:
        Verification results dictionary
    """
    original = load_dicom(original_file_path)
    anonymized = load_dicom(anonymized_file_path)
    
    if original is None or anonymized is None:
        return {'status': 'error', 'message': 'Could not load files'}
    
    # Sensitive tags to check
    sensitive_tags = [
        'PatientName', 'PatientID', 'PatientBirthDate',
        'ReferringPhysicianName', 'OperatorsName', 'InstitutionName',
        'StudyInstanceUID', 'SeriesInstanceUID'
    ]
    
    verification = {
        'sensitive_tags_removed': [],
        'sensitive_tags_still_present': [],
        'total_tags_original': len(original),
        'total_tags_anonymized': len(anonymized),
        'tags_removed': len(original) - len(anonymized),
        'pixel_data_present': 'PixelData' in anonymized,
        'image_dimensions': None
    }
    
    # Check sensitive tags
    for tag in sensitive_tags:
        if tag in anonymized:
            verification['sensitive_tags_still_present'].append(tag)
        else:
            verification['sensitive_tags_removed'].append(tag)
    
    # Check image dimensions
    try:
        if 'Rows' in anonymized and 'Columns' in anonymized:
            verification['image_dimensions'] = f"{anonymized.Rows}x{anonymized.Columns}"
    except:
        pass
    
    return verification


def display_verification_report(verification: Dict):
    """
    Display verification report in readable format.
    
    Args:
        verification: Verification results dictionary
    """
    print("\n" + "="*80)
    print("ANONYMIZATION VERIFICATION REPORT")
    print("="*80)
    
    print(f"\n✓ Tags Removed: {verification['tags_removed']}")
    print(f"  - Original: {verification['total_tags_original']} tags")
    print(f"  - Anonymized: {verification['total_tags_anonymized']} tags")
    
    print(f"\n✓ Pixel Data: {'Present (Image preserved)' if verification['pixel_data_present'] else 'Missing'}")
    
    if verification['image_dimensions']:
        print(f"✓ Image Dimensions: {verification['image_dimensions']}")
    
    print(f"\n✓ Sensitive Tags Removed: {len(verification['sensitive_tags_removed'])}")
    for tag in verification['sensitive_tags_removed']:
        print(f"  - {tag}")
    
    if verification['sensitive_tags_still_present']:
        print(f"\n⚠ Sensitive Tags Still Present: {len(verification['sensitive_tags_still_present'])}")
        for tag in verification['sensitive_tags_still_present']:
            print(f"  - {tag}")
    else:
        print(f"\n[+] NO SENSITIVE INFORMATION REMAINING!")


print("✓ Verification functions ready")

✓ Verification functions ready


## 8. Save Anonymized DICOM Files

In [14]:
# ============================================================================
# MAIN EXECUTION - Run your anonymization here
# ============================================================================

# Define paths
input_directory = r"c:\Users\THINKPAD\Desktop\DICOM-anonymizer\raw\train"
output_directory = r"c:\Users\THINKPAD\Desktop\DICOM-anonymizer\anonymized_output"

print("DICOM ANONYMIZER - MAIN EXECUTION")
print("="*80)
print(f"Input Directory: {input_directory}")
print(f"Output Directory: {output_directory}")
print("="*80)

# Run batch anonymization
print("\nStarting batch anonymization...\n")
stats = batch_anonymize_dicom(input_directory, output_directory)

# Display results
print("\n" + "="*80)
print("ANONYMIZATION RESULTS")
print("="*80)
print(f"\n✓ Total Files Processed: {stats['total_files']}")
print(f"✓ Successfully Anonymized: {stats['successful']}")
print(f"❌ Failed: {stats['failed']}")
print(f"✓ Total Tags Removed: {stats['total_tags_removed']}")
print(f"\n✓ Output Directory: {stats['output_dir']}")

if stats['errors']:
    print(f"\n⚠ Errors encountered ({len(stats['errors'])}):")
    for error in stats['errors'][:5]:
        print(f"  - {error}")
    if len(stats['errors']) > 5:
        print(f"  ... and {len(stats['errors']) - 5} more errors")

print("\n" + "="*80)
print("✅ ANONYMIZATION COMPLETE!")
print("="*80)

DICOM ANONYMIZER - MAIN EXECUTION
Input Directory: c:\Users\THINKPAD\Desktop\DICOM-anonymizer\raw\train
Output Directory: c:\Users\THINKPAD\Desktop\DICOM-anonymizer\anonymized_output

Starting batch anonymization...

✓ Found 250 DICOM files in c:\Users\THINKPAD\Desktop\DICOM-anonymizer\raw\train

Processing 250 DICOM files...



Anonymizing: 100%|██████████| 250/250 [00:02<00:00, 109.68it/s]


ANONYMIZATION RESULTS

✓ Total Files Processed: 250
✓ Successfully Anonymized: 250
❌ Failed: 0
✓ Total Tags Removed: 3000

✓ Output Directory: c:\Users\THINKPAD\Desktop\DICOM-anonymizer\anonymized_output

✅ ANONYMIZATION COMPLETE!





## Optional: Verify Individual Files

In [15]:
# Compare a single original file with its anonymized version
if dicom_files and os.path.exists(output_directory):
    original_file = dicom_files[0]
    
    # Find corresponding anonymized file
    relative_path = Path(original_file).relative_to(input_directory)
    anonymized_file = os.path.join(output_directory, str(relative_path))
    
    if os.path.exists(anonymized_file):
        print(f"\nComparing files:")
        print(f"Original: {original_file}")
        print(f"Anonymized: {anonymized_file}")
        
        verification = verify_anonymization(original_file, anonymized_file)
        display_verification_report(verification)
    else:
        print("❌ Anonymized file not found")
else:
    print("⚠ No DICOM files or output directory available for verification")


Comparing files:
Original: c:\Users\THINKPAD\Desktop\DICOM-anonymizer\raw\train\no_pneumothorax\000000.dcm
Anonymized: c:\Users\THINKPAD\Desktop\DICOM-anonymizer\anonymized_output\no_pneumothorax\000000.dcm

ANONYMIZATION VERIFICATION REPORT

✓ Tags Removed: 12
  - Original: 35 tags
  - Anonymized: 23 tags

✓ Pixel Data: Present (Image preserved)
✓ Image Dimensions: 1024x1024

✓ Sensitive Tags Removed: 8
  - PatientName
  - PatientID
  - PatientBirthDate
  - ReferringPhysicianName
  - OperatorsName
  - InstitutionName
  - StudyInstanceUID
  - SeriesInstanceUID

[+] NO SENSITIVE INFORMATION REMAINING!


## Additional Utilities: Custom Anonymization

In [16]:
# ============================================================================
# CUSTOM ANONYMIZATION FUNCTION
# Use this to anonymize a specific file with custom rules
# ============================================================================

def anonymize_single_file(file_path: str, output_path: str, custom_rules: Dict = None):
    """
    Anonymize a single DICOM file with optional custom rules.
    
    Args:
        file_path: Path to DICOM file to anonymize
        output_path: Path where anonymized file will be saved
        custom_rules: Optional custom anonymization rules
    """
    try:
        # Load DICOM
        dicom_data = load_dicom(file_path)
        if dicom_data is None:
            print("❌ Failed to load DICOM file")
            return
        
        print(f"Loaded DICOM with {len(dicom_data)} tags")
        
        # Anonymize
        rules = custom_rules if custom_rules else ANONYMIZATION_RULES
        success, message = anonymize_dicom(dicom_data, rules)
        
        if not success:
            print(f"❌ {message}")
            return
        
        print(f"✓ {message}")
        print(f"New tag count: {len(dicom_data)}")
        
        # Create output directory
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)
        
        # Save
        dicom_data.save_as(output_path, write_like_original=False)
        print(f"✓ Saved anonymized DICOM to: {output_path}")
        
        return True
    
    except Exception as e:
        print(f"❌ Error: {str(e)}")
        return False


print("✓ Custom anonymization function available")
print("Usage: anonymize_single_file('path/to/dicom.dcm', 'path/to/output.dcm')")

✓ Custom anonymization function available
Usage: anonymize_single_file('path/to/dicom.dcm', 'path/to/output.dcm')
