<a href="https://colab.research.google.com/github/ImagingDataCommons/Cloud-Resources-Workflows/blob/notebooks2/Notebooks/Totalsegmentator/PostProcessingTerra.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **This notebook processes the data generated by twoVmWorkflowOnTerra and pushes it to Google Cloud Storage buckets.**

The steps are:
- The lz4-compressed DICOM SEG and SR files are downloaded from the Terra workspace bucket.
- They are decompressed with lz4 and moved to a temporary directory.
- The uncompressed DICOM SEG and SR files are then pushed to Google Cloud Storage buckets, from which they can be imported into a DICOM store.
- The process is repeated for each batch.

## **Install lz4**

In [None]:
%%capture
!sudo apt-get update \
  && apt-get install -y --no-install-recommends \
    lz4\
  && rm -rf /var/lib/apt/lists/*

## **Authenticate gcloud**

In [None]:
# Google Cloud project that the gcloud CLI is pointed at in the cells below.
project_id = "my-test-project"

In [None]:
# Authorize the gcloud CLI; the gsutil commands later in the notebook
# presumably rely on these credentials -- TODO confirm for your environment.
!gcloud auth login

In [None]:
!gcloud config set project $project_id

## **Import packages**

In [None]:
import pandas as pd
import shutil
import os
from tqdm import tqdm
import traceback

## **Load the Terra table containing links to artifacts generated by the twoVmWorkflow**

In [None]:
# Read the tab-separated Terra table export into a DataFrame and display it.
data = pd.read_csv('twoVM_2023_06_10_00_22.tsv', sep='\t')
data

## **Decompress DICOM SEG files and push to Cloud Storage buckets**

In [None]:
# One entry per table row; each entry is later passed to `gsutil cp` as the
# location of a batch's compressed SEG archive.
dicom_seg_download_urls = data['dicomsegAndRadiomicsSR_CompressedFiles'].tolist()
dicom_seg_download_urls

In [None]:
# Cloud Storage bucket (name only, without the gs:// prefix) that receives the
# decompressed DICOM files.
destination_bucket_name = "total_segmentator_nlst_sample_061023"

In [None]:
for url in tqdm(dicom_seg_download_urls):
    print('processing url:'+url)
    try:
      shutil.rmtree(f'dicom_seg_objects')
      shutil.rmtree(f'itkimage2segimage')
    except OSError:
      pass
    try:
        os.mkdir(f'dicom_seg_objects')
        !gsutil cp {url} . > /dev/null 2>&1
        !lz4 -d --rm dicomsegAndRadiomicsSR_DICOMsegFiles.tar.lz4 -c | tar --strip-components=1  -xvf - > /dev/null 2>&1
        !find ./itkimage2segimage -name '*.dcm.lz4' -exec mv -t dicom_seg_objects {} + > /dev/null 2>&1
        !lz4 -d -m --rm "dicom_seg_objects"/*.lz4 > /dev/null 2>&1
        !gsutil -m cp -r dicom_seg_objects/* gs://$destination_bucket_name/DICOM_SEGS/ > /dev/null 2>&1
    except Exception as e:
        print(f'Error processing {url}: {e}')
        traceback.print_exc()
shutil.rmtree(f'dicom_seg_objects')
shutil.rmtree(f'itkimage2segimage')

## **Decompress DICOM SR files and push to Cloud Storage buckets**

In [None]:
# One entry per table row; each entry is later passed to `gsutil cp` as the
# location of a batch's compressed structured-report archive.
sr_download_urls = data['structuredReportsDICOM'].tolist()

In [None]:
series_df = pd.DataFrame(columns=['series_id'])

batch_count = 1  # Counter for batch folders

for url in tqdm(sr_download_urls):
    print('processing url:' + url)
    try:
        shutil.rmtree(f'structuredReportsDICOM')
        shutil.rmtree(f'decompressedStructuredReportsDICOM')
    except OSError:
        pass
    os.mkdir(f'decompressedStructuredReportsDICOM')
    try:
        !gsutil cp {url} . > /dev/null 2>&1
        !lz4 -d --rm structuredReportsDICOM.tar.lz4 -c | tar --strip-components=1 -xvf - > /dev/null 2>&1
        !find ./structuredReportsDICOM -name '*.dcm.lz4' -exec mv -t decompressedStructuredReportsDICOM {} + > /dev/null 2>&1
        !lz4 -d -m --rm "decompressedStructuredReportsDICOM"/*.lz4 > /dev/null 2>&1
        !gsutil -m cp -r decompressedStructuredReportsDICOM/* gs://$destination_bucket_name/decompressedStructuredReportsDICOM/batch_{batch_count}/ > /dev/null 2>&1

        # Find all series IDs and add them to the DataFrame
        series_ids = [filename.split('_')[0] for filename in os.listdir('decompressedStructuredReportsDICOM')]
        url_series_df = pd.DataFrame({'series_id': series_ids})

        # Append the current DataFrame to the main DataFrame
        series_df = pd.concat([series_df, url_series_df], ignore_index=True)

    except Exception as e:
        print(f'Error processing {url}: {e}')
        traceback.print_exc()

#the below steps are for importing dicom files to dicom store from command line.
    # try:
    #     # Upload shape_sr.dcm files in the batch
    #     gcs_uri = f"gs://$destination_bucket_name/decompressedStructuredReportsDICOM/batch_{batch_count}/*.dcm"
    #     !gcloud healthcare dicom-stores import gcs 10k-series --dataset=total_segmentator_nlst_sample_061023 --location=us-central1 --gcs-uri={gcs_uri}
    # except Exception as e:
    #     print(f'Error processing {url}: {e}')
    #     traceback.print_exc()

    # Increment the batch counter
    batch_count += 1


In [None]:
# Remove both SR scratch directories so the notebook can be re-run from a clean
# state. ignore_errors keeps the cell from raising when a directory was never
# created (for example when no archive could be extracted).
for sr_scratch_dir in ('structuredReportsDICOM', 'decompressedStructuredReportsDICOM'):
    shutil.rmtree(sr_scratch_dir, ignore_errors=True)