# Summarize DICOM Files

## Initialize


### Imports

In [None]:
from typing import Dict, Tuple, List

from datetime import date
from pathlib import Path
import re
from functools import partial

import numpy as np
import pandas as pd
import xlwings as xw
import pydicom

### File Paths

In [None]:
# Root folder of the exported DICOM data and its two sub-folders.
base_dir = Path(r'C:\DICOM EXPORT\For Limbus')
working_dir = base_dir.joinpath('Anonymized')
cleaned_dir = base_dir.joinpath('Cleaned')

# String form of the cleaned-data folder.
top_dir = f'{cleaned_dir}'
# The reference workbook is written to the current working directory.
save_file = Path.cwd().joinpath('DICOM References.xlsx')

### Utility Functions

#### Create a relative path string from two paths.

In [None]:
def make_relative_path(file_path: Path, top_dir: Path)->str:
    '''Build a relative path string (starts with ".") to the given file.

    A leading top_dir is removed from file_path and replaced with ".".  Only 
    a true leading folder match is replaced: text matching top_dir elsewhere 
    in the path, or a sibling folder whose name merely begins with the same 
    characters, is left untouched.

    Args:
        file_path (Path): The full path to the file.
        top_dir (Path): The path to a higher level folder containing file_path.
            A path string is also accepted.
    Returns:
        str: String representing the relative path from top_dir to file_path.  
            If file_path is not in top_dir, or a sub-directory of top_dir, the 
            full path is returned.
    '''
    try:
        remainder = Path(file_path).relative_to(top_dir)
    except ValueError:
        # file_path is not located under top_dir.
        return str(file_path)
    # Join onto a one character placeholder folder so that pathlib supplies 
    # the native separator, then swap the placeholder for ".".  
    # (Path('.') / remainder would silently drop the leading "." component.)
    return '.' + str(Path('_') / remainder)[1:]


#### Convert a list to a string for displaying in Excel

In [None]:
def list_to_str(structure_list: List[str])->str:
    '''Render a list of names as a single bracketed string.

    The result looks like the repr of a list, but without quotes around the 
    items: each item is converted with str(), the items are separated by 
    ", " and the whole is wrapped in square brackets.  It is intended to be 
    applied to a Pandas.DataFrame column.

    Args:
        structure_list (List[str]): List of structure names

    Returns:
        str: string representation of the list. An empty list gives "[]".
    '''
    joined_names = ', '.join(str(structure) for structure in structure_list)
    return f'[{joined_names}]'

#### Date conversion

In [None]:
def to_date(date_str: str)->date:
    '''Convert a date string to a date object.

    The date string is assumed to have the form: yyyymmdd.  The first four 
    characters are read as the year, the next two as the month and everything 
    after that as the day.

    Args:
        date_str (str): Date string in the format yyyymmdd.

    Returns:
        date: A corresponding python date object.
    '''
    return date(int(date_str[:4]), int(date_str[4:6]), int(date_str[6:]))

In [None]:
def from_date(date_val: date)->str:
    '''Convert a date object to a string with the form: yyyymmdd.

    The year is zero padded to four digits and the month and day to two 
    digits each.

    Args:
        date_val (date): A python date object.

    Returns:
        str: The corresponding string in the form: yyyymmdd.
    '''
    return '{0.year:04d}{0.month:02d}{0.day:02d}'.format(date_val)

In [None]:
def date_shift(date_str: str, date_offset: date, 
               ref_date=date(2000, 1, 1))->str:
    '''Shift the supplied date onto a new reference date.

    The date is moved so that the difference between the new date and 
    ref_date is the same as the difference between the original date and 
    date_offset.

    Args:
        date_str (str): Date string in the format yyyymmdd.
        date_offset (date): A reference date for date_str.
        ref_date (date, optional): The new reference date. 
            Defaults to date(2000, 1, 1).

    Returns:
        str: The adjusted date string in the format yyyymmdd.
    '''
    # Time elapsed between the original reference date and the supplied date.
    elapsed = to_date(date_str) - date_offset
    return from_date(ref_date + elapsed)

### Applicator code
Code indicates what applicator is used.

Code is built from private DICOM tags in the Oncentra plan.

**Code Format**<br>
> `RT_A`##`R`##`T`##

- **RT**: Ring & Tandem
- **A**: Angle of tandem in degrees
- **R**: Diameter of ring in mm
- **T**: Length of tandem in mm

Example: `RT_A60R30T40`

In [None]:
def build_appl_code(dataset: pydicom.Dataset)->Dict[str, str]:
    '''Construct an applicator code for a given RT-DICOM Plan Dataset.

    The applicator description is read from the private element (300B,1011) 
    of the first item in 'ApplicationSetupSequence'.

    If successful the returned dictionary will contain the following items:
    Ring: Ring diameter in mm.
    Tandem: Tandem length in mm.
    Angle: Tandem angle from ring in degrees.
    Applicator: Applicator Code (as described below)

    The documented Code Format is: RT_A##R##T##
    Where:
        - RT: Ring & Tandem
        - A: Angle of tandem in degrees
        - R: Diameter of ring in mm
        - T: length of tandem in mm
    Example: RT_A60R30T40

    NOTE(review): the template used below writes "L" in front of the tandem 
    length (e.g. RT_A60R30L40) rather than the documented "T".  The template 
    is left unchanged here; confirm which form is intended.

    If applicator information is present, but not in the expected format, the
    applicator string is returned as the only item in the dictionary.

    If applicator information is not present an empty dictionary is returned.

    Args:
        dataset (pydicom.Dataset): RT-DICOM Plan Dataset

    Returns:
        Dict[str, str]: Applicator parameters and Code String, the raw 
            applicator string if its format was not recognized, or an empty 
            dictionary if applicator information was not found.
    '''
    appl_ptrn = re.compile(
        r'.*?d=(?P<Ring>[0-9]+)mm'    ## Get ring diameter in mm
        r'.*?l=(?P<Tandem>[0-9]+)mm'  ## Get tandem length in mm
        r'.*?(?P<Angle>[0-9]+)°'      ## Get angle in degrees
        )
    code_template = 'RT_A{Angle}R{Ring}L{Tandem}'
    # Extract Applicator info from private tags in the DICOM header.
    applicator_seq = dataset.get('ApplicationSetupSequence')
    if not applicator_seq:
        # Bail if Applicator sequence not found
        return {}
    appl_item = applicator_seq[0].get_item((0x300B, 0x1011))
    if appl_item is None or not appl_item.value:
        # Bail if the private Applicator element is missing or empty
        return {}
    appl_str = appl_item.value
    if isinstance(appl_str, bytes):
        # The degree symbol is normally stored as the single byte 0xB0, 
        # which is not valid UTF-8. Try UTF-8 first so that text that is 
        # already UTF-8 encoded is not damaged, then fall back to Latin-1.
        try:
            appl_str = appl_str.decode('utf-8')
        except UnicodeDecodeError:
            appl_str = appl_str.decode('latin-1')
    # extract Angle, ring diameter, and tandem length from Applicator name.
    appl_match = appl_ptrn.match(appl_str)
    if appl_match is None:
        # Return the raw applicator string if the format was not recognized.
        return {'Applicator': appl_str}
    appl_param = appl_match.groupdict()
    # Make the code string
    appl_param['Applicator'] = code_template.format(**appl_param)
    return appl_param

In [None]:
def get_apl_info(dicom_ref: pd.DataFrame)-> pd.DataFrame:
    '''Get the applicator info for each fraction.

    Every file whose Modality contains 'RTPLAN' is read and its applicator 
    code is built.  Duplicate rows are removed and rows with any missing 
    value (for example plans without recognizable applicator info) are 
    dropped.

    Args:
        dicom_ref (pd.DataFrame): Reference data for all DICOM files. Must 
            contain the columns 'Modality', 'PatientID', 'StudyDate' and 
            'FilePath'.

    Returns:
        pd.DataFrame: Table of applicator info with 'PatientID' and 'StudyDate'
            as Index.  The table always has an 'Applicator' column, even 
            when no plan files or no applicator info were found.
    '''
    # na=False: files without a Modality value are simply not plan files.
    plan_files = dicom_ref.Modality.str.contains('RTPLAN', na=False)
    apl_ref = dicom_ref.loc[plan_files, ['PatientID', 'StudyDate', 'FilePath']]

    apl_list = []
    for patient_id, study_date, file in apl_ref.itertuples(index=False, 
                                                           name=None):
        apl_data = build_appl_code(pydicom.dcmread(file))
        apl_data['PatientID'] = patient_id
        apl_data['StudyDate'] = study_date
        apl_list.append(apl_data)
    apl_info = pd.DataFrame(apl_list)

    # With no plan files (or no applicator info in any plan) some columns are 
    # never created. Add them as empty columns so that indexing below, and 
    # callers that use the 'Applicator' column, keep working.
    required_cols = ['Applicator', 'PatientID', 'StudyDate']
    missing_cols = [col for col in required_cols 
                    if col not in apl_info.columns]
    apl_info = apl_info.reindex(columns=[*apl_info.columns, *missing_cols])

    apl_info = apl_info.drop_duplicates()
    apl_info = apl_info.set_index(['PatientID', 'StudyDate'])
    apl_info = apl_info.dropna()
    return apl_info

## Data Cleaning
As part of second-round data cleaning, need to make the following changes to 
the DICOM header:
- Replace with fixed Values:
> - `(0008,1048)	Physicians Of Record`
> - `(0008,1070)	Operators Name`
> - Date of birth

- Change tag from *BU0-045-KC8460_1* to *Oncentra* or *Eclipse*
> `(0002,0016)	Source Application Entity Title`
> 
- Change other dates to date relative to earliest date (which is set to 2000-01-01)
- Find these dates using the date value type
> 

After the changes are made, save the files to the revised directory structure:
	Patient → Applicator → Date → Modality → Source → Series Description (or a label built from the Series Number)



### Initial pass through data to collect aggregate information.
**Information Required:**
| Information           | Obtained from                      |
|-----------------------|------------------------------------|
| Data Source           | Eclipse or Oncentra in file path   |
| Earliest Patient Date | Study Dates                        |
| Practice plans        | Study Dates with no image sets     |
| Fraction Date         | Study Dates with plans             |
| Total Fractions       | Count of fraction dates            |
| Fraction Number       | Cumulative Count of fraction dates |
| Applicator Code       | RT plan with same study date       |
| MRI Images            | Is MR Modality in Study Date       |
| No applicator Images  | Study Date with no plan            |
| Number Of Slices      | Count of image files by series UID |
| Notes                 | Custom csv file with links         |



#### Data Source *Eclipse* or *Oncentra*

In [None]:
def get_source(dicom_ref: pd.DataFrame)->pd.DataFrame:
    '''Label each file with the system it came from, based on its folder.

    A 'Source' column is added to the supplied table (the table is modified 
    in place). The value is 'Eclipse' when the 'Folder' text contains 
    'Eclipse' and 'Oncentra' otherwise.

    Args:
        dicom_ref (pd.DataFrame): Reference data for all DICOM files.

    Returns:
        pd.DataFrame: The supplied table with a new column 'Source'.
    '''
    from_eclipse = dicom_ref['Folder'].str.contains('Eclipse')
    # Start with every file marked as Oncentra, then relabel Eclipse files.
    default_source = pd.Series('Oncentra', index=dicom_ref.index)
    dicom_ref['Source'] = default_source.mask(from_eclipse, 'Eclipse')
    return dicom_ref

### Scan DICOM files

In [None]:
def get_file_ref(file: Path)->Dict[str, str]:
    '''Extract key data from the DICOM header

    Only header elements are needed, so the file is read with 
    stop_before_pixels=True to avoid loading the pixel data of every image.

    The following information is extracted:
        'FilePath': DICOM file Path,
        'Folder': String path that DICOM file is in.,
        'FileName': The name of the DICOM file,
        'PatientID': Patient ID
        'Modality': Image or RT-DICOM modality,
        'StudyDate': Date study that the file is associated with,
        'Study_UID': Study Instance UID,
        'Series_UID': Series Instance UID,
        'Object_UID': Object Instance UID,
        'FrameOfReferenceUID': Frame Of Reference UID,
        'SeriesDescription': Series Description text,
        'SeriesNumber': Series number within Study
    Header items that are not present in the file are returned as None.

    Args:
        file (Path): Path to the DICOM file
    Returns:
        Dict[str, str]: Dictionary containing key information about the file.
    '''
    # Output column name -> DICOM keyword to read from the header.
    header_fields = {
        'PatientID': 'PatientID',
        'Modality': 'Modality',
        'StudyDate': 'StudyDate',
        'Study_UID': 'StudyInstanceUID',
        'Series_UID': 'SeriesInstanceUID',
        'Object_UID': 'SOPInstanceUID',
        'FrameOfReferenceUID': 'FrameOfReferenceUID',
        'SeriesDescription': 'SeriesDescription',
        'SeriesNumber': 'SeriesNumber'
        }
    dataset = pydicom.dcmread(file, stop_before_pixels=True)
    file_ref = {
        'FilePath': file,
        'Folder': str(file.parent),
        'FileName': file.name
        }
    for column, keyword in header_fields.items():
        file_ref[column] = dataset.get(keyword)
    return file_ref

In [None]:
def initial_scan(scan_dir: Path)->pd.DataFrame:
    '''Build a reference table for a collection of DICOM files.

    The folder and all of its sub-folders are searched for '*.dcm' files.  
    Key header information is extracted from each file found and collected 
    into a table with one row per file. A 'Source' column is then added.

    Args:
        scan_dir (Path): Path to the folder containing DICOM files

    Returns:
        pd.DataFrame: Table of key information for all DICOM files in the folder.
    '''
    file_records = [get_file_ref(dcm_file) 
                    for dcm_file in scan_dir.glob('**/*.dcm')]
    return get_source(pd.DataFrame(file_records))

### Aggregate info functions
The following functions are used to generate aggregate information based on the 
initial information extracted from the DICOM files.

#### Number of slices per image set

In [None]:
def count_slices(dicom_ref: pd.DataFrame)->pd.Series:
    '''Count the number of image files in each series.

    Only files with an image modality ('MR', 'CT' or 'PT') are counted. The 
    count is the number of files with a 'Folder' value for each Series UID.

    Args:
        dicom_ref (pd.DataFrame): Reference data for all DICOM files.

    Returns:
        pd.Series: Number of image files by Series UID, named 'NumOfSlices'.
    '''
    is_image = dicom_ref['Modality'].isin(['MR', 'CT', 'PT'])
    image_files = dicom_ref[is_image]
    files_per_series = image_files.groupby('Series_UID')['Folder'].count()
    return files_per_series.rename('NumOfSlices')

#### Find the earliest date to use for date offset.

In [None]:
def get_date_offset(dicom_ref: pd.DataFrame)->pd.Series:
    '''Find the earliest Study Date for each patient.

    The smallest 'StudyDate' string for each 'PatientID' is selected and 
    converted to a date object.

    Args:
        dicom_ref (pd.DataFrame): Reference data for all DICOM files.
        
    Returns:
        pd.Series: Earliest Study Date for each patient, named 'DateOffset'.
    '''
    earliest_date_str = dicom_ref.groupby('PatientID')['StudyDate'].min()
    return earliest_date_str.apply(to_date).rename('DateOffset')

#### Count Modalities
For each patient and date, count the number of files for each different modality.

In [None]:
def count_modalities(dicom_ref: pd.DataFrame)-> pd.DataFrame:
    '''Build table of number of files for each different modality by date.

    Date is used to distinguish between fractions and between treatment and 
    non-treatment dates.  We only care about image and plan counts so the 
    'REG', 'RTIMAGE', 'RTSTRUCT' and 'RTDOSE' modalities are dropped when 
    they are present. 

    Args:
        dicom_ref (pd.DataFrame): Reference data for all DICOM files.

    Returns:
        pd.DataFrame: Table of number of files by modality and date. The 
            index is ('PatientID', 'StudyDate') and there is one column per 
            remaining modality. Modalities that do not occur on a date have 
            a missing value.
    '''
    # Count files sub-grouped by patient, study date, and modality
    mod_grp = ['PatientID', 'StudyDate', 'Modality']
    # Do not need to count registration, structure set, dose or DRR files
    drop_cols = ['REG', 'RTIMAGE', 'RTSTRUCT', 'RTDOSE']

    # Count by modality for each date. Select one column to get a Series
    mod_cnt = dicom_ref.groupby(mod_grp)['Folder'].count()
    # Create a table of date vs number of files for each modality
    mod_cnt = mod_cnt.unstack('Modality')
    # Remove columns for modalities we don't care about counting. 
    # errors='ignore' because not every data set contains all of them.
    mod_cnt = mod_cnt.drop(columns=drop_cols, errors='ignore')
    return mod_cnt

#### Identify study dates where only *RTPLAN* modalities occur.
These will need to be discarded because they are practice or QA plans.

In [None]:
def solo_plans(mod_cnt: pd.DataFrame)-> pd.Series:
    '''Flag study dates that have no counted files other than RTPLAN.

    These will need to be discarded because they are practice or QA plans.

    A date is flagged when every column of mod_cnt other than 'RTPLAN' is 
    missing for that date.

    Args:
        mod_cnt (pd.DataFrame): Table of number of files by modality and date.

    Returns:
        pd.Series: Boolean series by PatientId and Study Date, named 
            'NoImages'. True if the Study Date has no counts in any 
            non-RTPLAN column.
    '''
    non_plan_counts = mod_cnt.drop(columns='RTPLAN')
    return non_plan_counts.isna().all(axis=1).rename('NoImages')

##### Fraction Number and Total Fractions

In [None]:
def count_fractions(mod_cnt: pd.DataFrame)->Tuple[pd.Series]:
    '''Determine the fraction number and total number of fractions for each 
    patient.

    Only Study Dates that have an RTPLAN count are considered. The total is 
    the number of those dates that also have a CT count. The fraction number 
    is the running position (starting at 1) of each of those dates within 
    the patient, in table order.

    NOTE(review): the total skips plan dates that have no CT count, while 
    the fraction number still numbers them, so the two can disagree for a 
    patient with a plan date that has no CT. Confirm which is intended.

    Args:
        mod_cnt (pd.DataFrame): Table of number of files by modality and date.

    Returns:
        Tuple[pd.Series]: Total Fractions by patient ('TotalFractions'), 
            Fraction number by PatientId and Study Date ('FractionNum').
    '''
    plan_dates = mod_cnt['RTPLAN'].notna()
    ct_on_plan_dates = mod_cnt.loc[plan_dates, 'CT']
    by_patient = ct_on_plan_dates.groupby(level='PatientID')

    total_fractions = by_patient.count().rename('TotalFractions')
    # cumcount() starts at zero; fractions are numbered from one.
    fraction_num = (by_patient.cumcount() + 1).rename('FractionNum')
    return total_fractions, fraction_num

##### Check for MR image sets

In [None]:
def dates_with_mr(mod_cnt: pd.DataFrame)->pd.Series:
    '''Identify Study Dates that contain MR imaging.

    If the table has no 'MR' column at all (no MR files were counted) every 
    Study Date is reported as False.

    Args:
        mod_cnt (pd.DataFrame): Table of number of files by modality and date.

    Returns:
        pd.Series: Boolean series by PatientId and Study Date, named 
            'HasMR'. True if Study Date contains an MR image set.
    '''
    if 'MR' in mod_cnt.columns:
        has_mr = mod_cnt['MR'].notna()
    else:
        # The modality table only has columns for modalities that occur.
        has_mr = pd.Series(False, index=mod_cnt.index)
    has_mr.name = 'HasMR'
    return has_mr

##### Check for patients with image sets where applicator is not present. 

In [None]:
def dates_wo_applicator(mod_cnt: pd.DataFrame)->pd.Series:
    '''Identify patients that have CT image sets on dates without a plan.

    Study Dates with no RTPLAN count are selected, and for each patient the 
    number of those dates that have a CT count is checked. Patients that 
    have no Study Date without a plan do not appear in the result.

    Args:
        mod_cnt (pd.DataFrame): Table of number of files by modality and date.

    Returns:
        pd.Series: Boolean series by PatientId, named 'HasReferenceImages'. 
            True if patient has image sets without applicators.
    '''
    no_plan = mod_cnt['RTPLAN'].isna()
    ct_without_plan = mod_cnt.loc[no_plan, 'CT']
    ct_dates_per_patient = ct_without_plan.groupby(level='PatientID').count()
    return ct_dates_per_patient.gt(0).rename('HasReferenceImages')

#### Create revised directory structure:
The file folder system has the following hierarchy:
> Patient → Applicator → Date → Modality → Source → Series Description

If the Series Description is missing, create an alternative label from the series 
number in the format: 
> Series Number *##*


In [None]:
def build_label(dicom_ref: pd.DataFrame)->pd.Series:
    '''Create a series label

    By default the label is the Series Description.  If the Series Description
    is missing, create a label from the series number in the format: 
        `Series Number ##`

    The supplied table is not modified.

    Args:
        dicom_ref (pd.DataFrame): Reference data for all DICOM files.
        
    Returns:
        pd.Series: Series label from Series Description or Series Number.
    '''
    def series_num_string(num):
        num_s = str(num)
        return f'Series Number {num_s}'

    series_num = dicom_ref['SeriesNumber'].apply(series_num_string)

    description = dicom_ref['SeriesDescription']
    # Use where() without inplace so that the 'SeriesDescription' column of 
    # the caller's table is left untouched.
    series_label = description.where(description.notna(), series_num)
    return series_label


def build_path(dicom_ref: pd.DataFrame, new_dir: Path)->pd.Series:    
    '''Create a new file path for saving the modified file.

    The file folder system has the following hierarchy:
	    PatientID → 
            Applicator → 
                StudyDate → 
                    Modality → 
                        Source → 
                            SeriesLabel → 
                                FileName
    The SeriesLabel is the Series Description or, if that is missing, an 
    alternative label built from the series number in the format: 
    `Series Number ##`

    A 'SeriesLabel' column is added to the supplied table as a side effect.
    The folder levels are joined with a backslash.

    Args:
        dicom_ref (pd.DataFrame): Reference data for all DICOM files.
        new_dir (Path): Top level directory to store modified DICOM files in.
        
    Returns:
        pd.Series: New save path for the DICOM file.
    '''    
    dicom_ref['SeriesLabel'] = build_label(dicom_ref)

    # Folder levels below the patient folder, ending with the file name.
    lower_levels = dicom_ref[['Applicator', 'StudyDate', 'Modality', 
                              'Source', 'SeriesLabel', 'FileName']]
    relative_path = dicom_ref['PatientID'].str.cat(others=lower_levels, 
                                                   sep='\\')
    return new_dir / relative_path

### Extract aggregate information

In [None]:
def extract_reference_info(dicom_files: pd.DataFrame, 
                           new_dir: Path)->pd.DataFrame:
    '''Get aggregate info from the initial reference data. 
    
    At the patient level, information collected is:
        The earliest Study date.
        The total number of fractions.
        Whether reference image sets without an applicator are present.
        
    At the Study Date level, information collected is:        
        The fraction number.
        The applicator used for the fraction.
        Whether an MR image set is available for the given fraction.
        Series to be discarded because they are for QA or practice planning.

    At the Series level, information collected is:        
        The number of slices in the series.
        
    Based on the aggregate info a new "Save Path" for each file is generated. 

    Files on Study Dates flagged as having no images are removed from the 
    returned table.

    Args:
        dicom_files (pd.DataFrame): Reference data for all DICOM files.
        new_dir (Path): Top level directory to store modified DICOM files in.

    Returns:
        pd.DataFrame: Reference data for all DICOM files with additional 
        aggregate info added.
    '''    
    apl_info = get_apl_info(dicom_files)
    date_offset = get_date_offset(dicom_files)
    mod_cnt = count_modalities(dicom_files)
    no_image = solo_plans(mod_cnt)
    # Drop Series Dates where only RTPLAN modalities occur.
    # These will need to be discarded because they are practice or QA plans.
    mod_cnt = mod_cnt.loc[~no_image, :].copy()

    total_fractions, fraction_num = count_fractions(mod_cnt)
    has_mr = dates_with_mr(mod_cnt)
    has_non_apl_img = dates_wo_applicator(mod_cnt)

    pt_lvl_info = pd.concat([date_offset, total_fractions, has_non_apl_img], 
                            axis='columns')
    # Patients with no plan-free dates are missing from has_non_apl_img.
    # Assign the filled column back: an in-place fillna on a column selected 
    # from the table is a chained operation and does not reliably update 
    # the table itself.
    pt_lvl_info['HasReferenceImages'] = (
        pt_lvl_info['HasReferenceImages'].fillna(False).astype(bool)
        )
    dicom_ref = dicom_files.join(pt_lvl_info, on='PatientID')

    dt_lvl_info = pd.concat([fraction_num, has_mr, apl_info, no_image], 
                            axis='columns')
    dicom_ref = dicom_ref.join(dt_lvl_info, on=['PatientID', 'StudyDate'])

    slice_cnt = count_slices(dicom_ref)
    dicom_ref = dicom_ref.join(slice_cnt, on='Series_UID')

    # Discard Series Dates where only RTPLAN modalities occur because they are 
    # practice or QA plans.
    dicom_ref = dicom_ref.loc[~dicom_ref.NoImages, :].copy()

    # Dates without applicator info get the folder name 'None'.
    dicom_ref['Applicator'] = dicom_ref['Applicator'].fillna('None')

    dicom_ref['NewPath'] = build_path(dicom_ref, new_dir)
    
    return dicom_ref

### Secondary cleaning
Functions that use aggregate information as part of anonymizing the data.

#### Convert dates
> 
- Change other dates to date relative to earliest date (which is set to 2000-01-01)
- Find these dates using the date value type `DA`.
> - `(0008, 0012) Instance Creation Date              DA: '20230414'`
> - `(0008, 0020) Study Date                          DA: '20220906'`
> - `(0008, 0021) Series Date                         DA: '20220906'`
> - `(0008, 0022) Acquisition Date                    DA: '20220906'`
> - `(0008, 0023) Content Date                        DA: '20220906'`
> - `(0010, 0030) Patient's Birth Date                DA: '19570514'`


In [None]:
def date_mod(dataset: pydicom.Dataset, data_element: pydicom.DataElement, 
             date_offset: date, ref_date=date(2000, 1, 1), 
             birth_date=date(1900, 1, 1)):
    '''A pydicom callback function root that anonymizes dates.

    Only elements with the value representation 'DA' are changed.

    If data_element is 'PatientBirthDate' change the value to birth_date.
    
    If data_element is any other date, shift the date by an offset from 
    ref_date that matches the difference between the data_element date and 
    date_offset.  Date elements with an empty value are left empty.
    
    Args:
        dataset (pydicom.Dataset): The DICOM dataset to be modified. Not 
            used directly; it is part of the callback signature.
        data_element (pydicom.DataElement): The DICOM element to be modified
        date_offset (date): The original reference date to compare found 
            dates with.
        ref_date (date, optional): The reference date to calculate the new date 
            offset from. Defaults to date(2000, 1, 1).
        birth_date (date, optional): The date to use as a replacement for 
            'PatientBirthDate'. Defaults to date(1900, 1, 1).
    '''
    if data_element.VR != 'DA':
        return
    if data_element.tag == 'PatientBirthDate':
        data_element.value = from_date(birth_date)
    elif data_element.value:
        # Empty date values cannot be parsed and carry no date to hide.
        new_date = date_shift(data_element.value, date_offset, ref_date)
        data_element.value = new_date


def anonymize_dates(dataset: pydicom.Dataset, date_offset: date, 
                    ref_date=date(2000, 1, 1), 
                    birth_date=date(1900, 1, 1))->pydicom.Dataset:
    '''Anonymize all dates in the dataset while keeping their relative 
    spacing.
    
    Every element of the dataset is visited with date_mod:
    The 'PatientBirthDate' element is changed to birth_date.
    All other dates are shifted so that they have the same time difference 
    from ref_date as the original date had from date_offset.

    Args:
        dataset (pydicom.Dataset): The DICOM dataset to be modified.
        date_offset (date): The original reference date to compare found 
            dates with.
        ref_date (date, optional): The reference date to calculate the new date 
            offset from. Defaults to date(2000, 1, 1).
        birth_date (date, optional): The date to use as a replacement for 
            'PatientBirthDate'. Defaults to date(1900, 1, 1).
            
    Returns:
        pydicom.Dataset: The supplied DICOM dataset with modified dates.
    '''
    def shift_dates_callback(current_ds, element):
        # Bind the date settings for this call onto the generic callback.
        date_mod(current_ds, element, date_offset=date_offset,
                 ref_date=ref_date, birth_date=birth_date)

    dataset.walk(shift_dates_callback)
    return dataset

#### Replace staff names with blank strings:
> - `(0008,1048)	Physicians Of Record`
> - `(0008,1070)	Operators Name`
> - `(0008,0090)	Referring Physician Name`

In [None]:
def anonymize_names(dataset: pydicom.Dataset)-> pydicom.Dataset:
    '''Blank out all staff names in the dataset.
    
    Every element of the dataset is visited. Elements with the tag 
    'OperatorsName', 'PhysiciansOfRecord' or 'ReferringPhysicianName' have 
    their values replaced with an empty string.

    Args:
        dataset (pydicom.Dataset): The DICOM dataset to be modified.
            
    Returns:
        pydicom.Dataset: The supplied DICOM dataset with the staff name 
            elements emptied.
    '''
    staff_name_tags = ('OperatorsName', 
                       'PhysiciansOfRecord',
                       'ReferringPhysicianName')

    def blank_staff_name(current_ds, element):
        if element.tag in staff_name_tags:
            element.value = ''

    dataset.walk(blank_staff_name)
    return dataset

#### Add the Source information to a DICOM tag

1. Locate the first item in 'Contributing Equipment Sequence', if it is present.
2. Verify that the value of the 'Contribution Description' is 'Cleaned' 
3. Find the 'Station Name' element and replace its value with `Source`.


```
> (0018, A001) Contributing Equipment Sequence     1 item(s) ---- 
>     (0008, 0070) Manufacturer                        LO: 'PixelMed'
>     (0008, 1010) Station Name                        SH: 'Oncentra'
>     (0008, 1090) Manufacturer's Model Name           LO: 'DicomCleaner'
>     (0018, 1020) Software Versions                   LO: 'Wed Dec 18 15:38:40 EST 2019'
>     (0018, a002) Contribution DateTime               DT: '20230417122754.967-0400'
>     (0018, a003) Contribution Description            ST: 'Cleaned'
>     (0040, a170)  Purpose of Reference Code Sequence  1 item(s) ---- 
>        (0008, 0100) Code Value                          SH: '109104'
>        (0008, 0102) Coding Scheme Designator            SH: 'DCM'
>        (0008, 0104) Code Meaning                        LO: 'De-identifying Equipment'
```

**Consider switching to**
`(0002,0016)	Source Application Entity Title	BU0-045-KC8460_1`

In [None]:
def add_source(dataset: pydicom.Dataset, source: str)-> pydicom.Dataset:
    '''Add the data source name to the dataset.

    Sets the 'StationName' value in the first 'ContributingEquipmentSequence' 
    item to source, provided that item's 'ContributionDescription' is 
    'Cleaned'.  If the item has no 'StationName' element one is added.

    The dataset is returned unchanged if the sequence is missing or empty, 
    or if the first item does not have the expected description.

    Args:
        dataset (pydicom.Dataset): The DICOM dataset to be modified.
        source (str): The DICOM file data source 'Eclipse' or 'Oncentra'.

    Returns:
        pydicom.Dataset: The supplied DICOM dataset with source added.
    '''
    sequence_tag = 'ContributingEquipmentSequence'
    description_tag = 'ContributionDescription'
    expected_description = 'Cleaned'
    source_tag = 'StationName'

    equipment_seq = dataset.get(sequence_tag)
    if not equipment_seq:
        # Sequence is absent or has no items: nothing to label.
        return dataset
    equipment_ds = equipment_seq[0]
    if equipment_ds.get(description_tag) == expected_description:
        # Attribute assignment updates the element or creates it if missing.
        setattr(equipment_ds, source_tag, source)
    return dataset

#### Apply Modifications and save modified files

In [None]:
def dicom_mod(dicom_ref: pd.DataFrame, ref_date=date(2000, 1, 1), 
              birth_date=date(1900, 1, 1)):
    '''Perform secondary anonymization of the DICOM files using aggregated 
    reference data.

    For every row of the reference table the file at 'FilePath' is read and:
        Patient date of birth is changed to birth_date.
        Other dates are converted to a new date relative to the row's 
            'DateOffset'.
        Staff names are replaced with blank strings.
        The row's 'Source' name is added to the dataset.
        The modified file is saved to 'NewPath'; the destination folder is 
            created first if it does not exist.

    Args:
        dicom_ref (pd.DataFrame): Reference data, including aggregate info for 
            all DICOM files.
        ref_date (date, optional): The reference date to calculate the new date 
            offset from. Defaults to date(2000, 1, 1).
        birth_date (date, optional): The date to use as a replacement for 
            'PatientBirthDate'. Defaults to date(1900, 1, 1).
    '''       
    for file_info in dicom_ref.itertuples(index=False):
        cleaned = pydicom.dcmread(file_info.FilePath)
        cleaned = anonymize_dates(cleaned, file_info.DateOffset, 
                                  ref_date=ref_date, birth_date=birth_date)
        cleaned = anonymize_names(cleaned)
        cleaned = add_source(cleaned, file_info.Source)

        # Save the modified file, creating the destination folder if needed.
        destination = file_info.NewPath
        destination_folder = destination.parent
        if not destination_folder.exists():
            destination_folder.mkdir(parents=True)
        cleaned.save_as(destination)


## Reference Tables

### Structure Set Lookup
*Create a table of Structure sets to be linked to image sets.*

Table contents:
|Column Name        |Description                                           |
|-------------------|------------------------------------------------------|
|FilePath           |Full path to the Structure Set DICOM file             |
|RelativePath       |Relative path string to be used by the Image Set table|
|SeriesReference    |Series index in the form: StudyId.SeriesNumber        |
|StructureSetID     |Label for the Structure Set                           |
|FrameOfReferenceUID|Frame Of Reference UID for the Structure Set          |
|ReferencedImageSet |Series UID for the associated image set               |


In [None]:
def build_structure_set_lookup(dicom_ref: pd.DataFrame, 
                               top_dir: Path)->pd.DataFrame:
    '''Create a table of Structure sets to be linked to image sets.

    Every row of dicom_ref with Modality 'RTSTRUCT' is read from its NewPath 
    and summarized as one row of the returned table.

    Args:
        dicom_ref (pd.DataFrame): Reference data, including aggregate info for 
            all DICOM files. The Modality, NewPath and Source columns are used.
        top_dir (Path): The path to a top level folder containing DICOM files.

    Returns:
        pd.DataFrame: One row per Structure Set file with the columns 
            FilePath, RelativePath, SeriesIndex, StructureSetID and 
            FrameOfReferenceUID, sorted and indexed on 
            (ReferencedImageSet, Source). When dicom_ref contains no 
            Structure Set files the table is empty, but keeps the same 
            columns and index levels.
    '''
    index_names = ['ReferencedImageSet', 'Source']
    # Naming the columns explicitly keeps the sort and index steps valid when 
    # no Structure Set files are found (an empty list yields no columns).
    column_names = [
        'FilePath',
        'RelativePath',
        'SeriesIndex',
        'StructureSetID',
        'FrameOfReferenceUID',
        'ReferencedImageSet',
        'Source'
        ]
    struct_set_mask = dicom_ref.Modality.isin(['RTSTRUCT'])
    struct_set_ref = dicom_ref.loc[struct_set_mask, :]
    structure_set_list = []
    for row in struct_set_ref.itertuples(index=False):
        structure_file = row.NewPath
        relative_path = make_relative_path(structure_file, top_dir)
        dataset = pydicom.dcmread(structure_file)
        # Build an index to the series in the form: StudyID.SeriesNumber
        structure_set_ref = '.'.join([
            str(dataset.get('StudyID','')), 
            str(dataset.get('SeriesNumber',''))
            ])
        # Only the first item of each nested reference sequence is used.
        fr_ds = dataset.get('ReferencedFrameOfReferenceSequence')[0]
        study_ref = fr_ds.get('RTReferencedStudySequence')[0]
        series_ref = study_ref.get('RTReferencedSeriesSequence')[0]   
        structure_set_list.append({
            'FilePath': structure_file,
            'RelativePath': relative_path,
            'SeriesIndex': structure_set_ref,
            'StructureSetID': dataset.get('StructureSetLabel'),
            'FrameOfReferenceUID': fr_ds.get('FrameOfReferenceUID'),
            'ReferencedImageSet': series_ref.get('SeriesInstanceUID'),
            'Source': row.Source
            })

    structure_set_lookup = pd.DataFrame(structure_set_list, 
                                        columns=column_names)
    structure_set_lookup.sort_values(index_names, inplace=True)    
    structure_set_lookup.set_index(index_names, inplace=True)
    return structure_set_lookup

### Image Set Table
| Column               |      Description                    |Example|
|----------------------|-------------------------------------|-------|
| SeriesIndex          | StudyID.SeriesID                    |38805.2|
| Modality             | Image modality CT or MR             |CT|
| Applicator           | Applicator code with type & size    |A60R34L40|
| PatientID            | CCSEO + Patient Code                |CCSEO005|
| Source               | Exported from Eclipse or Oncentra   |Eclipse|
| Notes                | Special info (to be added manually) |Prosthesis|
| SeriesDescription    | Free text label                     |SAG T2 HIRES|
| Slices               | Number of image slices              |304|
| SliceThickness       | Slice Thickness in mm               |2|
| ImageResolution      | Image Resolution in mm              |1.2|
| StructureSet         | Matching Structure Set label        |HDR FR3|
| StructureSetFile| Relative path to Structure Set Files|.\CCSEO003\ ... \####.dcm|
|StructureSetIndex     | SeriesIndex for Matching Structure Set|38040.21|
|Study_UID             | Image Set Study UID|1.2.840.113704.1.111.7264.16182.10|
|Series_UID            | Image Set Series UID|1.2.840.113704.1.111.7264.1682.10|
|FrameOfReferenceUID   | Image Set Frame of Reference UID|1.2.840.11.7264.12.10|
|Folder            | Relative path to Image Set Files|.\CCSEO003\ ... \####.dcm|

In [None]:
def add_structure_set_ref(structure_set_lookup: pd.DataFrame, series_uid: str, 
                          source: str)->Dict[str, str]:
    '''Get Structure set references for a given image series.

    Find matching Structure Set references for a given image series and source.
    If more than one Structure Set reference is present, combine them as a comma 
    separated string.

    Args:
        structure_set_lookup (pd.DataFrame): Table of Structure sets to be 
            linked to image sets
        series_uid (str): UID of the series. The first part of the 
            structure_set_lookup index.
        source (str): Series export source: 'Eclipse' or 'Oncentra' The second 
            part of the structure_set_lookup index.

    Returns:
        Dict[str, str]: The keys 'StructureSet', 'StructureSetFile' and 
            'StructureSetIndex', taken from the lookup columns 
            StructureSetID, RelativePath and SeriesIndex respectively.

    Raises:
        KeyError: If (series_uid, source) is not in the structure_set_lookup 
            index.
    '''
    def multi_set_merge(same_ref_col):
        return same_ref_col.str.cat(sep=', ')
    columns_names = {
        'StructureSetID': 'StructureSet',
        'RelativePath': 'StructureSetFile',
        'SeriesIndex': 'StructureSetIndex'
    }
    col_select = list(columns_names.keys())
    selection = structure_set_lookup.loc[(series_uid, source), col_select]
    if isinstance(selection, pd.Series):
        # With a unique index, pandas returns the single matching row as a 
        # Series whose labels are the selected column names.
        return selection.rename(index=columns_names).to_dict()
    # rename() returns a new table, so the lookup selection is never 
    # modified in place.
    structure_set_ref = selection.rename(columns=columns_names)
    if structure_set_ref.shape[0] > 1:
        merged_ref = structure_set_ref.apply(multi_set_merge)
        merged_dict = merged_ref.to_dict()
    else:
        merged_dict = structure_set_ref.to_dict(orient='records')[0]
    return merged_dict
        

In [None]:
def build_image_table(dicom_ref: pd.DataFrame, 
                      structure_set_lookup: pd.DataFrame, 
                      top_dir: Path)->pd.DataFrame:
    '''Build a table describing all Image sets.

    Every row of dicom_ref with Modality 'CT' or 'MR' is read from its NewPath 
    and summarized. Rows that end up identical are dropped, and missing 
    values are replaced with empty strings.

    Args:
        dicom_ref (pd.DataFrame): Reference data, including aggregate info for 
            all DICOM files.
        structure_set_lookup (pd.DataFrame): Table of Structure sets to be 
            linked to image sets
        top_dir (Path): The path to a top level folder containing DICOM files.

    Returns:
        pd.DataFrame: Table describing all Image sets, indexed on SeriesIndex. 
            The StructureSet, StructureSetFile and StructureSetIndex columns 
            are always present; they are blank for images without a matching 
            Structure Set. The table is empty when dicom_ref contains no CT 
            or MR files.
    '''
    # Column order of the finished table. Naming the columns up front keeps 
    # the table well formed when there are no image files, or when no image 
    # has a matching Structure Set.
    column_names = [
        'SeriesIndex',
        'Modality',
        'Applicator',
        'PatientID',
        'Source',
        'SeriesDescription',
        'Slices',
        'SliceThickness',
        'ImageResolution',
        'Study_UID',
        'Series_UID',
        'FrameOfReferenceUID',
        'Folder',
        'StructureSet',
        'StructureSetFile',
        'StructureSetIndex'
        ]
    image_set_mask = dicom_ref.Modality.isin(['CT', 'MR'])
    image_set_ref = dicom_ref.loc[image_set_mask, :]
    image_list = []
    for row in image_set_ref.itertuples(index=False):
        image_file_path = row.NewPath
        # Only header elements are used below, so the pixel data is not read.
        dataset = pydicom.dcmread(image_file_path, stop_before_pixels=True)
        # Build an index to the series in the form: StudyID.SeriesNumber
        series_ref = '.'.join([
            str(dataset.get('StudyID','')), 
            str(dataset.get('SeriesNumber',''))
            ])
        # Assuming the image is square, extract the resolution from one axis.
        res = dataset.get('PixelSpacing')
        if res:
            img_resolution = res[0]
        else:
            img_resolution = np.nan
        # Generate the relative path to the folder containing the image series.
        image_folder = image_file_path.parent
        relative_folder = make_relative_path(image_folder, top_dir)
        # Get the Series UID and source to search for a matching structure set
        series_uid = row.Series_UID
        source = row.Source
        # Begin building the table elements
        image_dict = {
            'SeriesIndex': series_ref,
            'Modality': row.Modality,
            'Applicator': row.Applicator,
            'PatientID': row.PatientID,
            'Source': source,
            'SeriesDescription': row.SeriesDescription,
            'Slices': row.NumOfSlices,
            'SliceThickness': dataset.get('SliceThickness'),
            'ImageResolution': img_resolution,
            'Study_UID': row.Study_UID,
            'Series_UID': series_uid,
            'FrameOfReferenceUID': row.FrameOfReferenceUID,
            'Folder': relative_folder
            }    
        # Add Structure set information if it exists
        if (series_uid, source) in structure_set_lookup.index:
            structure_set_dict = add_structure_set_ref(structure_set_lookup, 
                                                       series_uid, source)
            image_dict.update(structure_set_dict)
        image_list.append(image_dict)
    image_table = pd.DataFrame(image_list, columns=column_names)
    # Each slice file of a series produces the same row; keep only one.
    image_table.drop_duplicates(inplace=True)
    image_table.set_index('SeriesIndex', inplace=True)
    image_table.fillna('', inplace=True)
    return image_table

### Structure Set Table
| Column               |      Description                    |Example   |
|----------------------|-------------------------------------|----------|
| Structure ID           | Get from Structure DICOM file             |
| Structure Set ID     | Get from Structure DICOM file       |          |
| Series Description   | Free text label                     |SAG T2 HIRES|
| Image Modality       | Get from referenced series          |CT        |
| Applicator           | Applicator code with type & size    |A60R34L40 |
| Patient ID           | CCSEO + Patient Code                |CCSEO005  |
| Source               | Exported from Eclipse or Oncentra   |Eclipse   |
| Notes                | Special info (To be added manually) |Prosthesis|
| Slices Contoured       | Get from Structure DICOM file             |
| Structure Resolution   | If possible: Normal or High Res           |
| Structure Label        | Get from Structure DICOM file             |
| Code Scheme            | Get from Structure DICOM file             |
| Code Scheme Version    | Get from Structure DICOM file             |
| ROI Number             | Get from Structure DICOM file             |
| Structure Set UID      | Get from DICOM file                       |
| Image Series UID       | Get from DICOM file                       |
| FrameOfReferenceUID   | Image Set Frame of Reference UID|1.2.840.11.7264.12.10|
| StructureSetIndex     | SeriesIndex for Structure Set|38040.21|
| StructureSetFile| Relative path to Structure Set File|.\CCSEO003\ ... \####.dcm|


### Get contour information

In [None]:
def get_contour_info(dataset: pydicom.Dataset)->pd.DataFrame:
    '''Collect per-ROI information from a Structure Set dataset.

    Three sequences of the dataset are summarized and combined on the ROI 
    number:
        - StructureSetROISequence: ROI number and name.
        - ROIContourSequence: number of items in each ROI's ContourSequence.
        - RTROIObservationsSequence: observation label and, when present, the 
          first item of RTROIIdentificationCodeSequence.
    A sequence that is missing or empty contributes no rows instead of 
    raising an error.

    Args:
        dataset (pydicom.Dataset): Contents of a Structure Set DICOM file.

    Returns:
        pd.DataFrame: One row per ROI, indexed on StructureSetUID (the 
            dataset's SOPInstanceUID), with the columns ROI_Number, Name, 
            SlicesContoured and ID, plus StructureCode, StructureLabel, 
            CodeScheme and CodeSchemeVersion when at least one ROI has an 
            identification code.
    '''
    def roi_table(records, columns):
        '''Build a table indexed on ROI_Number, even when records is empty.'''
        if records:
            table = pd.DataFrame(records)
        else:
            # An empty list gives a table without columns, which could not 
            # be indexed on ROI_Number.
            table = pd.DataFrame(columns=columns)
        return table.set_index('ROI_Number')

    def get_roi_number(dataset):
        '''ROI number and name from StructureSetROISequence.'''
        roi_seq = dataset.get('StructureSetROISequence') or []
        roi_list = [
            {'ROI_Number': roi.get('ROINumber'), 'Name': roi.get('ROIName')}
            for roi in roi_seq
            ]
        return roi_table(roi_list, ['ROI_Number', 'Name'])
    
    def get_roi_contours(dataset):
        '''Number of ContourSequence items for each ROI.'''
        roi_contours = []
        for roi in dataset.get('ROIContourSequence') or []:
            slice_seq = roi.get('ContourSequence')
            roi_contours.append({
                'ROI_Number': roi.get('ReferencedROINumber'),
                'SlicesContoured': len(slice_seq) if slice_seq else 0
                })
        return roi_table(roi_contours, ['ROI_Number', 'SlicesContoured'])
    
    def get_roi_ref(dataset):
        '''Observation label and identification code for each ROI.'''
        roi_ref = []
        for roi in dataset.get('RTROIObservationsSequence') or []:
            roi_dict = {'ROI_Number': roi.get('ReferencedROINumber'),
                        'ID': roi.get('ROIObservationLabel')}
            roi_code_seq = roi.get('RTROIIdentificationCodeSequence')
            if roi_code_seq:
                # Only the first identification code is reported.
                roi_code = roi_code_seq[0]
                roi_dict['StructureCode'] = roi_code.get('CodeValue')
                roi_dict['StructureLabel'] = roi_code.get('CodeMeaning')
                roi_dict['CodeScheme'] = roi_code.get('CodingSchemeDesignator')
                roi_dict['CodeSchemeVersion'] = roi_code.get('CodingSchemeVersion')
            roi_ref.append(roi_dict)
        return roi_table(roi_ref, ['ROI_Number', 'ID'])
    
    struct_data = [
        get_roi_number(dataset),
        get_roi_contours(dataset),
        get_roi_ref(dataset)
        ]
    # Align the three tables on their shared ROI_Number index.
    contours = pd.concat(struct_data, axis='columns')
    contours.reset_index(inplace=True)
    contours['StructureSetUID'] = dataset.get('SOPInstanceUID')
    contours.set_index('StructureSetUID', inplace=True)
    return contours

In [None]:
def scan_structure_files(structure_set_lookup, image_table):
    '''Build a table of contoured structures for all linked Structure Sets.

    Each Structure Set is matched to its image set on (Series UID, Source), 
    its DICOM file is read, and the per-ROI contour information is combined 
    with the Structure Set and image set details. Only ROIs with at least 
    one contoured slice are kept.

    Args:
        structure_set_lookup (pd.DataFrame): Table of Structure sets, indexed 
            on (ReferencedImageSet, Source).
        image_table (pd.DataFrame): Table describing all Image sets.

    Returns:
        pd.DataFrame: One row per contoured structure. The columns are those 
            of column_order (below) that are available. When no Structure 
            Set matches an image set, an empty table with all of the 
            column_order columns is returned.
    '''
    column_order = [
        'StructureID',
        'StructureSetID',
        'SeriesDescription',
        'ImageModality',
        'Applicator',
        'PatientID',
        'Source',
        'SlicesContoured',
        'StructureResolution',
        'StructureLabel',
        'CodeScheme',
        'CodeSchemeVersion',
        'ROI_Number',
        'StructureSetUID',
        'ImageSeriesUID',
        'FrameOfReferenceUID',
        'StructureSetIndex',
        'StructureSetFile'
        ]   
    image_lookup = image_table.set_index(['Series_UID', 'Source'], drop=False)
    # Both indexes must use identical level names. When the names differ, 
    # pandas joins only on the levels whose names overlap ('Source'), which 
    # pairs every Structure Set with every image set from the same source.
    image_lookup.index.names = structure_set_lookup.index.names
    structure_set_info = structure_set_lookup.join(image_lookup, how='inner', 
                                                   rsuffix='Image')
    structure_set_list = []
    contours_list = []
    for row in structure_set_info.itertuples(index=False):
        dataset = pydicom.dcmread(row.FilePath)
        structure_set_dict = {
            'StructureSetID': row.StructureSetID,
            'SeriesDescription': row.SeriesDescription,
            'ImageModality': row.Modality,
            'Applicator': row.Applicator,
            'PatientID': row.PatientID,
            'Source': row.Source,
            'StructureSetUID': dataset.get('SOPInstanceUID'),
            'ImageSeriesUID': row.Series_UID,
            'FrameOfReferenceUID': row.FrameOfReferenceUID,
            'StructureSetIndex': row.StructureSetIndex,
            'StructureSetFile': row.StructureSetFile
            }
        structure_set_list.append(structure_set_dict)
        contours_list.append(get_contour_info(dataset))
    if not contours_list:
        # pd.concat raises ValueError when given nothing to concatenate.
        return pd.DataFrame(columns=column_order)
    contour_info = pd.concat(contours_list, axis='rows')
    structure_set_table = pd.DataFrame(structure_set_list)
    structure_set_table.set_index('StructureSetUID', drop=False, inplace=True)
    contour_table = contour_info.join(structure_set_table)
    row_selection = contour_table.SlicesContoured > 0
    # NOTE(review): 'StructureID' and 'StructureResolution' are not produced 
    # by get_contour_info or by the dictionary built above, so the filter 
    # below drops them. Confirm whether the ROI 'Name' column was meant to 
    # be reported as 'StructureID'.
    column_selection = [column for column in column_order 
                       if column in contour_table.columns]
    contour_table = contour_table.loc[row_selection, column_selection]
    contour_table.drop_duplicates(inplace=True)
    return contour_table

### Registrations Table
<style type="text/css">
.tg  {border-collapse:collapse;border-spacing:0;}
.tg td{border-color:black;border-style:solid;border-width:1px;font-family:Arial, sans-serif;font-size:14px;
  overflow:hidden;padding:10px 5px;word-break:normal;}
.tg th{border-color:black;border-style:solid;border-width:1px;font-family:Arial, sans-serif;font-size:14px;
  font-weight:normal;overflow:hidden;padding:10px 5px;word-break:normal;}
.tg .tg-vwm9{background-color:#329a9d;color:#000000;font-weight:bold;text-align:center;vertical-align:middle}
.tg .tg-da89{background-color:#96fffb;color:#000000;text-align:left;vertical-align:middle}
.tg .tg-5szv{background-color:#e2efda;color:#000000;font-family:Arial, Helvetica, sans-serif !important;text-align:left;
  vertical-align:middle}
.tg .tg-00be{background-color:#5b9bd5;color:#000000;font-family:Arial, Helvetica, sans-serif !important;font-weight:bold;
  text-align:center;vertical-align:middle}
.tg .tg-uivm{background-color:#ddebf7;color:#000000;font-family:Arial, Helvetica, sans-serif !important;text-align:left;
  vertical-align:middle}
.tg .tg-p9fm{background-color:#70ad47;color:#000000;font-family:Arial, Helvetica, sans-serif !important;font-weight:bold;
  text-align:center;vertical-align:middle}
.tg .tg-ndfp{background-color:#e2efda;color:#000000;text-align:left;vertical-align:middle}
.tg .tg-a350{background-color:#ffc000;color:#000000;font-weight:bold;text-align:center;vertical-align:middle}
.tg .tg-xzj8{background-color:#fff2cc;color:#000000;text-align:left;vertical-align:middle}
</style>
<table class="tg">
<thead>
  <tr>
    <th class="tg-00be" rowspan="3">Index Info</th>
    <th class="tg-uivm">SeriesIndex</th>
  </tr>
  <tr>
    <th class="tg-uivm">PatientID</th>
  </tr>
  <tr>
    <th class="tg-uivm">File</th>
  </tr>
</thead>
<tbody>
  <tr>
    <td class="tg-p9fm" rowspan="3">Registration Info</td>
    <td class="tg-5szv">Type</td>
  </tr>
  <tr>
    <td class="tg-ndfp">Method</td>
  </tr>
  <tr>
    <td class="tg-ndfp">Matrix</td>
  </tr>
  <tr>
    <td class="tg-a350" rowspan="5">From</td>
    <td class="tg-xzj8">Modality</td>
  </tr>
  <tr>
    <td class="tg-xzj8">Applicator</td>
  </tr>
  <tr>
    <td class="tg-xzj8">SeriesIndex</td>
  </tr>
  <tr>
    <td class="tg-xzj8">Series_UID</td>
  </tr>
  <tr>
    <td class="tg-xzj8">FrameOfReference_UID</td>
  </tr>
  <tr>
    <td class="tg-vwm9" rowspan="5">To</td>
    <td class="tg-da89">Modality</td>
  </tr>
  <tr>
    <td class="tg-da89">Applicator</td>
  </tr>
  <tr>
    <td class="tg-da89">SeriesIndex</td>
  </tr>
  <tr>
    <td class="tg-da89">Series_UID</td>
  </tr>
  <tr>
    <td class="tg-da89">FrameOfReference_UID</td>
  </tr>
</tbody>
</table>

In [None]:
def get_referenced_series(reg_dataset: pydicom.Dataset)->Dict[str, str]:
    '''Find the two Image Series linked by an image registration.

    The 'From' series is the first item of the dataset's own 
    ReferencedSeriesSequence. The 'To' series is the first item of the 
    ReferencedSeriesSequence found in the first item of 
    StudiesContainingOtherReferencedInstancesSequence.

    Args:
        reg_dataset (pydicom.Dataset): A DICOM Registration dataset.

    Returns:
        Dict[str, str]: The Series Instance UIDs of the two image sets, under 
            the keys 'RegistrationFromSeries_UID' and 
            'RegistrationToSeries_UID'.
    '''
    from_series_seq = reg_dataset.get('ReferencedSeriesSequence')
    other_study = reg_dataset.get(
        'StudiesContainingOtherReferencedInstancesSequence')[0]
    to_series_seq = other_study.get('ReferencedSeriesSequence')
    series_sequences = (
        ('RegistrationFromSeries_UID', from_series_seq),
        ('RegistrationToSeries_UID', to_series_seq)
        )
    series_uids = {}
    for uid_key, series_seq in series_sequences:
        # Only the first referenced series of each sequence is reported.
        series_uids[uid_key] = series_seq[0].get('SeriesInstanceUID')
    return series_uids

In [None]:
def get_registration_info(reg_dataset: pydicom.Dataset)->Dict[str, str]:
    '''Summarize the transformation stored in an image registration.

    The first two items of RegistrationSequence supply the 'From' and 'To' 
    Frame Of Reference UIDs. The registration type, method and matrix are 
    read from the second item.

    Args:
        reg_dataset (pydicom.Dataset): A DICOM Registration dataset.

    Returns:
        Dict[str, str]: Registration details with the keys:
            - 'FromFrameOfReference_UID' and 'ToFrameOfReference_UID'
            - 'RegistrationType' e.g. 'RIGID'
            - 'RegistrationMethod' e.g. 'Visual Alignment'
            - 'RegistrationMatrix': the matrix converted with list_to_str
    '''
    registrations = reg_dataset.get('RegistrationSequence')
    # NOTE(review): the matrix is always taken from the second registration 
    # item - confirm that this holds for every export source.
    to_registration = registrations[1]
    matrix_registration = to_registration.get('MatrixRegistrationSequence')[0]
    matrix_items = matrix_registration.get('MatrixSequence')
    method_codes = matrix_registration.get('RegistrationTypeCodeSequence')       
    matrix_item = matrix_items[0]
    matrix_text = list_to_str(
        matrix_item.get('FrameOfReferenceTransformationMatrix'))

    details = {}
    details['FromFrameOfReference_UID'] = registrations[0].get(
        'FrameOfReferenceUID')
    details['ToFrameOfReference_UID'] = to_registration.get(
        'FrameOfReferenceUID')
    details['RegistrationType'] = matrix_item.get(
        'FrameOfReferenceTransformationMatrixType')
    details['RegistrationMethod'] = method_codes[0].get('CodeMeaning')
    details['RegistrationMatrix'] = matrix_text
    return details


In [None]:
def make_image_set_lookup(image_table: pd.DataFrame)->pd.DataFrame:
    '''Convert the image table into one that can be searched by Series UID.
    
    Note: Eclipse and Oncentra use the same Image Set Series UID, but this table 
    only references the Eclipse image sets.  Rows are sorted by Series UID and 
    Source before duplicates are dropped, so when a series was exported from 
    both sources the 'Eclipse' row (which sorts before 'Oncentra') is the one 
    kept.

    Args:
        image_table (pd.DataFrame): Table describing all Image sets

    Returns:
        pd.DataFrame: Table describing all unique image sets, indexed on 
            Series UID and sorted by it. The original index of image_table 
            is kept as a regular column.
    '''
    image_set_lookup = image_table.reset_index()
    # sort_values returns a new table; the result must be kept for the sort 
    # to decide which duplicate is dropped.
    image_set_lookup = image_set_lookup.sort_values(['Series_UID', 'Source'])
    image_set_lookup = image_set_lookup.drop_duplicates('Series_UID')
    return image_set_lookup.set_index('Series_UID')


In [None]:
def get_image_ref(referenced_series: Dict[str, str], 
                  image_set_lookup: pd.DataFrame)->Dict[str, str]:
    '''Look up the two image sets linked by an image registration.

    Args:
        referenced_series (Dict[str, str]): The Series UIDs of the linked 
            image sets, under the keys 'RegistrationFromSeries_UID' and 
            'RegistrationToSeries_UID'.
        image_set_lookup (pd.DataFrame): Table describing all unique image 
            sets, indexed on Series UID.

    Returns:
        Dict[str, str]: Modality, Applicator, SeriesIndex and 
            StructureSetIndex of each image set. Every key is the field 
            name prefixed with 'From' or 'To', e.g. 'FromModality'.
    '''
    reported_fields = ('Modality', 'Applicator', 'SeriesIndex', 
                       'StructureSetIndex')
    # Fetch the lookup row of each image set; 'From' is looked up first.
    image_rows = {}
    for direction in ('From', 'To'):
        uid_key = 'Registration' + direction + 'Series_UID'
        image_rows[direction] = image_set_lookup.loc[
            referenced_series[uid_key], :]

    # Emit the 'From' and 'To' values of a field side by side.
    series_info = {}
    for field in reported_fields:
        for direction, image_row in image_rows.items():
            series_info[direction + field] = getattr(image_row, field)
    return series_info

In [None]:
def build_reg_table(dicom_ref: pd.DataFrame, 
                      image_set_lookup: pd.DataFrame, 
                      top_dir: Path)->pd.DataFrame:
    '''Build a table describing all image registrations.

    Every row of dicom_ref with Modality 'REG' is read from its NewPath. The 
    registration details are combined with reference info for the two image 
    sets that the registration links.

    Args:
        dicom_ref (pd.DataFrame): Reference data, including aggregate info for 
            all DICOM files.
        image_set_lookup (pd.DataFrame): Table describing all unique image 
            sets, indexed on Series UID.
        top_dir (Path): The path to a top level folder containing DICOM files.

    Returns:
        pd.DataFrame: One row per registration file.
    '''
    registration_rows = dicom_ref.loc[dicom_ref.Modality.isin(['REG']), :]
    registrations = []
    for reg_row in registration_rows.itertuples(index=False):
        reg_path = reg_row.NewPath
        reg_dataset = pydicom.dcmread(reg_path)
        # Index to the series in the form: StudyID.SeriesNumber
        study_id = str(reg_dataset.get('StudyID',''))
        series_number = str(reg_dataset.get('SeriesNumber',''))
        # Relative path to the registration file.
        reg_file = make_relative_path(reg_path, top_dir)
        entry = {
            'SeriesIndex': study_id + '.' + series_number,
            'PatientID': reg_row.PatientID,
            'File': reg_file
            }
        # Registration details, then image set info, then the series UIDs.
        entry.update(get_registration_info(reg_dataset))
        linked_series = get_referenced_series(reg_dataset)
        entry.update(get_image_ref(linked_series, image_set_lookup))
        entry.update(linked_series)
        registrations.append(entry)
    return pd.DataFrame(registrations)

In [None]:
# Extract information required for second cleaning.
# dicom_ref is the reference table passed to every table builder below; they 
# read its Modality, NewPath and Source columns, among others.
dicom_files = initial_scan(working_dir)
dicom_ref = extract_reference_info(dicom_files, cleaned_dir)

In [None]:
# Perform second cleaning and build new directory structure
#dicom_mod(dicom_ref)

In [None]:
# Build the image set table and link images with structure sets.
# Both builders read the files listed in dicom_ref.NewPath, so those files 
# must already exist (presumably written by the second cleaning step above).
structure_set_lookup = build_structure_set_lookup(dicom_ref, top_dir)
image_table = build_image_table(dicom_ref, structure_set_lookup, top_dir)

In [None]:
# Build the contours table: one row per contoured structure, taken from the 
# Structure Sets that are linked to an image set.
contour_table = scan_structure_files(structure_set_lookup, image_table)

In [None]:
# Build the Image Registrations table.
# The image table is first re-indexed on Series UID so that the image sets 
# referenced by each registration can be looked up.
image_set_lookup = make_image_set_lookup(image_table)
reg_table = build_reg_table(dicom_ref, image_set_lookup, top_dir)

In [None]:
# Create the workbook and save it once so that it has a file name.
wb = xw.Book()
wb.save(save_file)

# The file path columns are left out of the exported tables (presumably 
# because Path objects cannot be written to Excel cells - TODO confirm).
dicom_ref_for_excel = dicom_ref.drop(columns=['NewPath', 'FilePath'])
structure_set_lookup_for_excel = structure_set_lookup.drop(columns=['FilePath'])

# One worksheet per table, added in this order.
sheet_contents = [
    ('Files', dicom_ref_for_excel),
    ('Structure Sets', structure_set_lookup_for_excel),
    ('Contours', contour_table),
    ('Registration Links', reg_table),
    ('Image Info', image_table)
    ]
for sheet_name, table in sheet_contents:
    active_sheet = wb.sheets.add(sheet_name)
    xw.view(table, sheet=active_sheet)

wb.save()