<a href="https://colab.research.google.com/github/ImagingDataCommons/Cloud-Resources-Workflows/blob/notebooks2/Notebooks/Totalsegmentator/postProcessingExtractPerframe.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **This notebook extracts the DICOM attribute PerFrameFunctionalGroupsSequence from DICOM SEG objects. It takes the destinations of the SEG files, downloads and decompresses them, extracts the attribute, and flattens it.**

### **Note: Since we cannot create service accounts in the Terra GCP project, the files are first copied from Terra's bucket to a bucket in another project where we can create service accounts. That copy is only possible with the credentials obtained by logging in interactively with `gcloud auth login`.**

### **Installing Packages**

In [None]:
# %%capture
# !sudo apt-get update \
#   && apt-get install -y --no-install-recommends \
#   lz4
#
# Optional setup (disabled): would install the `lz4` command-line tool that the
# decompression steps later in this notebook invoke.
# Presumably left commented out because the runtime image already ships lz4 -- TODO confirm.
# NOTE(review): only `apt-get update` is prefixed with sudo; the `apt-get install`
# after `&&` is not, so it only works when the cell already runs as root.

In [None]:
# %%capture
# !pip install pydicom \
#    google-cloud-bigquery \
#    pyarrow \
#    db_dtypes
#
# Optional setup (disabled): would pip-install pydicom, google-cloud-bigquery,
# pyarrow and db_dtypes (all unpinned).
# Of these, only pydicom is imported by the cells visible in this notebook.

### **Importing Packages**

In [None]:
import os
import shutil
import pandas as pd
import pydicom
import traceback
import logging
from tqdm import tqdm
import subprocess
import pydicom
from tqdm import tqdm

### **Parameters for papermill**

In [None]:
# Path of the CSV manifest; it is loaded with pandas further down and must
# provide a `new_destination` column holding the SEG archive URLs.
segFilesCsv = ''

# Path of the service-account JSON key handed to
# `gcloud auth activate-service-account --key-file=...`.
jsonServiceAccountFile = ''

### **Local Testing**
When testing locally, uncomment the cell below and comment out all the cells after it.

In [None]:
# !wget -q https://github.com/ImagingDataCommons/Cloud-Resources-Workflows/raw/notebooks/sampleManifests/dicomsegAndRadiomicsSR_DICOMsegFiles.tar.lz4
# combined_data = []
# try:
#         try:
#             shutil.rmtree('itkimage2segimage')
#             shutil.rmtree('decompressedSegmentationsDICOM')
#         except OSError:
#             pass

#         os.mkdir('decompressedSegmentationsDICOM')
#         # Download and process the data
#         try:
#             # Download and decompress data
#             #!gsutil cp {url} . > /dev/null 2>&1
#             !lz4 -d --rm dicomsegAndRadiomicsSR_DICOMsegFiles.tar.lz4 -c | tar --strip-components=1 -xvf - > /dev/null 2>&1
#             !find ./itkimage2segimage -name '*.dcm.lz4' -exec mv -t decompressedSegmentationsDICOM {} + > /dev/null 2>&1
#             !lz4 -d -m --rm "decompressedSegmentationsDICOM"/*.lz4 > /dev/null 2>&1
#             print('files successfully decompressed')
#         except Exception as download_error:
#             logging.error('Error during download and decompression: %s', str(download_error))
#             #continue  # Skip this URL and continue to the next

#         # Find all series IDs and add them to the combined_data list
#         series_ids = [filename.split('_')[0] for filename in os.listdir('decompressedSegmentationsDICOM')]

#         for series_id in series_ids:
#             pffgs = pydicom.dcmread(f'decompressedSegmentationsDICOM/{series_id}', specific_tags=["ReferencedSeriesSequence","PerFrameFunctionalGroupsSequence"], stop_before_pixels=True)
#             referencedSeriesInstanceUID = pffgs.ReferencedSeriesSequence[0].SeriesInstanceUID
#             data = []

#             # Extract data from Per-frame Functional Groups Sequence
#             if "PerFrameFunctionalGroupsSequence" in pffgs:
#                 for item in pffgs.PerFrameFunctionalGroupsSequence:
#                     frame_data = {
#                         'ReferencedSeriesSequence_SeriesInstanceUID': referencedSeriesInstanceUID,
#                         'FrameContentSequence_DimensionIndexValues': [str(s) for s in list(item.FrameContentSequence[0].DimensionIndexValues)],
#                         'PlanePositionSequence_ImagePositionPatient': [str(s) for s in list(item.PlanePositionSequence[0].ImagePositionPatient)],
#                         'SegmentIdentificationSequence_ReferencedSegmentNumber': item.SegmentIdentificationSequence[0].ReferencedSegmentNumber
#                     }

#                     # Extract attributes from Derivation Image Sequence
#                     derivation_image_sequence = item.DerivationImageSequence
#                     if derivation_image_sequence:
#                         source_image_sequence = derivation_image_sequence[0].SourceImageSequence
#                         if source_image_sequence:
#                             frame_data['DerivationImageSequence_SourceImageSequence_ReferencedSOPClassUID'] = source_image_sequence[0].ReferencedSOPClassUID
#                             frame_data['DerivationImageSequence_SourceImageSequence_ReferencedSOPInstanceUID'] = source_image_sequence[0].ReferencedSOPInstanceUID
#                             purpose_of_reference_code_sequence = source_image_sequence[0].PurposeOfReferenceCodeSequence
#                             if purpose_of_reference_code_sequence:
#                                 frame_data['DerivationImageSequence_SourceImageSequence_PurposeOfReferenceCodeSequence_CodeValue'] = purpose_of_reference_code_sequence[0].CodeValue
#                                 frame_data['DerivationImageSequence_SourceImageSequence_PurposeOfReferenceCodeSequence_CodingSchemeDesignator'] = purpose_of_reference_code_sequence[0].CodingSchemeDesignator
#                                 frame_data['DerivationImageSequence_SourceImageSequence_PurposeOfReferenceCodeSequence_CodeMeaning'] = purpose_of_reference_code_sequence[0].CodeMeaning

#                     # Extract attributes from Derivation Code Sequence
#                     derivation_code_sequence = derivation_image_sequence[0].DerivationCodeSequence
#                     if derivation_code_sequence:
#                         frame_data['DerivationImageSequence_DerivationCodeSequence_CodeValue'] = derivation_code_sequence[0].CodeValue
#                         frame_data['DerivationImageSequence_DerivationCodeSequence_CodingSchemeDesignator'] = derivation_code_sequence[0].CodingSchemeDesignator
#                         frame_data['DerivationImageSequence_DerivationCodeSequence_CodeMeaning'] = derivation_code_sequence[0].CodeMeaning

#                     data.append(frame_data)

#             # Add data from this series to the combined_data list
#             combined_data.extend(data)

# except Exception as e:
#     logging.error('An error occurred: %s', str(e))

# finally:
#     # Create a DataFrame from the combined_data list
#     df = pd.DataFrame(combined_data)

#     # Generate a single CSV file for all the data
#     csv_filename = 'perFrameFunctionalGroupSequence.csv'
#     df.to_csv(csv_filename, index=False)
#     !lz4 --rm 'perFrameFunctionalGroupSequence.csv' 'perFrameFunctionalGroupSequence.csv.lz4'

#     logging.info('Processing complete.')

### **Authenticate with a service account so we can download files from a non-Terra bucket**

In [None]:
!gcloud auth activate-service-account --key-file={jsonServiceAccountFile}

In [None]:
# Load the manifest named by the `segFilesCsv` parameter into a DataFrame.
data = pd.read_csv(filepath_or_buffer=segFilesCsv)

# Bare last expression: the notebook renders the DataFrame as the cell output.
data

### **Extract PerFrameFunctionalGroupsSequence**

In [None]:
# Route log records of level INFO and above to a file on disk.
logging.basicConfig(level=logging.INFO, filename="console_output.txt")

# Ensure the 'output' directory exists; an already existing directory is fine.
# NOTE(review): no cell visible in this notebook writes into it -- confirm it is still needed.
output_dir = 'output'
os.makedirs(name=output_dir, exist_ok=True)

# One download URL per manifest row, taken from the `new_destination` column.
seg_download_urls = data['new_destination'].tolist()

In [None]:
# Download every SEG archive listed in `seg_download_urls`, decompress it, and
# flatten the PerFrameFunctionalGroupsSequence of each SEG file into one row per
# frame. All rows end up in a single lz4-compressed CSV.
#
# Failure handling:
#   * a failed download/decompression skips that URL only;
#   * an unreadable or malformed SEG file skips that file only;
#   * the CSV is always written (in `finally`) with whatever was collected.

ARCHIVE_NAME = 'dicomsegAndRadiomicsSR_DICOMsegFiles.tar.lz4'
EXTRACTED_DIR = 'itkimage2segimage'
SEG_DIR = 'decompressedSegmentationsDICOM'


def _run_quiet(command, **run_kwargs):
    """Run an external command with its output discarded.

    Raises subprocess.CalledProcessError on a non-zero exit status (and OSError
    when the executable cannot be started), so callers can detect failures --
    unlike `!` shell lines, which never raise.
    """
    subprocess.run(
        command,
        check=True,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        **run_kwargs,
    )


def _download_and_decompress(url):
    """Fetch one archive and leave its decompressed SEG files in SEG_DIR.

    Raises on the first step that fails; the caller skips the URL in that case.
    """
    # Argument list (no shell): the URL is passed verbatim and never shell-parsed.
    _run_quiet(['gsutil', 'cp', url, '.'])

    # The lz4 -> tar pipeline needs a shell; the command string is built from
    # constants only. The reported exit status is that of `tar`, which also fails
    # when lz4 produced no valid archive on its stdout.
    _run_quiet(f'lz4 -d --rm {ARCHIVE_NAME} -c | tar --strip-components=1 -xvf -', shell=True)

    _run_quiet(['find', f'./{EXTRACTED_DIR}', '-name', '*.dcm.lz4',
                '-exec', 'mv', '-t', SEG_DIR, '{}', '+'])

    compressed_files = sorted(
        os.path.join(SEG_DIR, name) for name in os.listdir(SEG_DIR) if name.endswith('.lz4')
    )
    if not compressed_files:
        raise FileNotFoundError(f'no .lz4 files found in {SEG_DIR} for {url}')
    _run_quiet(['lz4', '-d', '-m', '--rm', *compressed_files])


def _first_item(dataset, keyword):
    """Return the first item of sequence `keyword` in `dataset`, or None if it is absent or empty."""
    sequence = getattr(dataset, keyword, None)
    return sequence[0] if sequence else None


def _flatten_frame(item, referenced_series_uid):
    """Flatten one PerFrameFunctionalGroupsSequence item into a single-level dict.

    The derivation-related keys are only added when the corresponding (optional)
    sequences are present and non-empty.
    """
    frame_data = {
        'ReferencedSeriesSequence_SeriesInstanceUID': referenced_series_uid,
        'FrameContentSequence_DimensionIndexValues': [str(s) for s in list(item.FrameContentSequence[0].DimensionIndexValues)],
        'PlanePositionSequence_ImagePositionPatient': [str(s) for s in list(item.PlanePositionSequence[0].ImagePositionPatient)],
        'SegmentIdentificationSequence_ReferencedSegmentNumber': item.SegmentIdentificationSequence[0].ReferencedSegmentNumber
    }

    derivation_image = _first_item(item, 'DerivationImageSequence')
    if derivation_image is None:
        return frame_data

    # Attributes from the source image referenced by the derivation image.
    source_image = _first_item(derivation_image, 'SourceImageSequence')
    if source_image is not None:
        frame_data['DerivationImageSequence_SourceImageSequence_ReferencedSOPClassUID'] = source_image.ReferencedSOPClassUID
        frame_data['DerivationImageSequence_SourceImageSequence_ReferencedSOPInstanceUID'] = source_image.ReferencedSOPInstanceUID
        purpose_code = _first_item(source_image, 'PurposeOfReferenceCodeSequence')
        if purpose_code is not None:
            frame_data['DerivationImageSequence_SourceImageSequence_PurposeOfReferenceCodeSequence_CodeValue'] = purpose_code.CodeValue
            frame_data['DerivationImageSequence_SourceImageSequence_PurposeOfReferenceCodeSequence_CodingSchemeDesignator'] = purpose_code.CodingSchemeDesignator
            frame_data['DerivationImageSequence_SourceImageSequence_PurposeOfReferenceCodeSequence_CodeMeaning'] = purpose_code.CodeMeaning

    # Attributes from the derivation code; only reachable when the derivation
    # image exists, so an empty DerivationImageSequence no longer raises.
    derivation_code = _first_item(derivation_image, 'DerivationCodeSequence')
    if derivation_code is not None:
        frame_data['DerivationImageSequence_DerivationCodeSequence_CodeValue'] = derivation_code.CodeValue
        frame_data['DerivationImageSequence_DerivationCodeSequence_CodingSchemeDesignator'] = derivation_code.CodingSchemeDesignator
        frame_data['DerivationImageSequence_DerivationCodeSequence_CodeMeaning'] = derivation_code.CodeMeaning

    return frame_data


# Rows collected from all URLs.
combined_data = []

try:
    for url in tqdm(seg_download_urls):
        logging.info('Processing URL: %s', url)

        # Remove leftovers of the previous iteration. Each directory is removed
        # independently so a missing one cannot prevent cleanup of the other.
        shutil.rmtree(EXTRACTED_DIR, ignore_errors=True)
        shutil.rmtree(SEG_DIR, ignore_errors=True)
        os.makedirs(SEG_DIR, exist_ok=True)

        # Download and decompress the data
        try:
            _download_and_decompress(url)
            print('files successfully decompressed')
        except Exception as download_error:
            logging.error('Error during download and decompression: %s', str(download_error))
            continue  # Skip this URL and continue to the next

        for filename in os.listdir(SEG_DIR):
            # Read the file under its real name instead of a name rebuilt from a
            # prefix, which would not exist when the filename contains '_'.
            seg_path = os.path.join(SEG_DIR, filename)
            try:
                pffgs = pydicom.dcmread(seg_path, specific_tags=["ReferencedSeriesSequence","PerFrameFunctionalGroupsSequence"], stop_before_pixels=True)
                referencedSeriesInstanceUID = pffgs.ReferencedSeriesSequence[0].SeriesInstanceUID

                # Named `series_rows` (not `data`) so the manifest DataFrame loaded
                # earlier in the notebook is not overwritten.
                series_rows = []
                if "PerFrameFunctionalGroupsSequence" in pffgs:
                    series_rows = [
                        _flatten_frame(item, referencedSeriesInstanceUID)
                        for item in pffgs.PerFrameFunctionalGroupsSequence
                    ]
            except Exception:
                # One bad file must not discard the remaining files and URLs.
                logging.exception('Skipping %s (from %s): could not extract per-frame data', seg_path, url)
                continue

            # Add data from this series to the combined_data list
            combined_data.extend(series_rows)

except Exception as e:
    logging.error('An error occurred: %s', str(e))

finally:
    # Create a DataFrame from the combined_data list
    df = pd.DataFrame(combined_data)

    # Generate a single CSV file for all the data
    csv_filename = 'perFrameFunctionalGroupSequence.csv'
    df.to_csv(csv_filename, index=False)

    # Compress the CSV; failures are logged rather than raised so that the
    # uncompressed CSV written above is still available.
    try:
        lz4_result = subprocess.run(
            ['lz4', '--rm', csv_filename, csv_filename + '.lz4'],
            stdin=subprocess.DEVNULL,
            capture_output=True,
            text=True,
        )
        if lz4_result.returncode != 0:
            logging.error('lz4 compression of %s failed: %s', csv_filename, lz4_result.stderr.strip())
    except OSError as lz4_error:
        logging.error('lz4 compression of %s could not be started: %s', csv_filename, str(lz4_error))

    logging.info('Processing complete.')
