# About

# Prerequisites

In [1]:
# Google Cloud project ID passed to the BigQuery magic (--project) and to gsutil (-u) in later cells.
my_project_id = "idc-pathomics-000"

In [5]:
# Import all required python libraries for this use case
import os
import pandas as pd

!pip install pydata_google_auth
import pydata_google_auth
from google.cloud.bigquery import magics 



In [3]:
# List the credentialed gcloud accounts, switch the active account, then list again to confirm the change.
!gcloud auth list
!gcloud config set account dschacherer.fme@gmail.com
!gcloud auth list

                   Credentialed Accounts
ACTIVE  ACCOUNT
*       1020787117084-compute@developer.gserviceaccount.com
        dschacherer.fme@gmail.com

To set the active account, run:
    $ gcloud config set account `ACCOUNT`

Updated property [core/account].
                   Credentialed Accounts
ACTIVE  ACCOUNT
        1020787117084-compute@developer.gserviceaccount.com
*       dschacherer.fme@gmail.com

To set the active account, run:
    $ gcloud config set account `ACCOUNT`



In [6]:
# Request user credentials with the scopes needed for BigQuery and Cloud Storage access.
# OAuth scope reference: https://developers.google.com/identity/protocols/oauth2/scopes
# pydata-google-auth documentation: https://pydata-google-auth.readthedocs.io/_/downloads/en/latest/pdf/
credentials = pydata_google_auth.get_user_credentials(
    [
        'https://www.googleapis.com/auth/bigquery',
        'https://www.googleapis.com/auth/devstorage.full_control',
    ],
)
# Hand the credentials to the BigQuery magics context; they are used later for BigQuery queries.
magics.context.credentials = credentials

In [4]:
# BigQuery authentication: https://cloud.google.com/docs/authentication/getting-started --> set the environment variable
# https://medium.com/john-lewis-software-engineering/authenticating-jupyter-notebook-against-bigquery-957884f78527
# from command line
#!gcloud auth application-default login --yes
# when using google.colab
#from google.colab import auth 
#auth.authenticate_user()

# Environment setup

In [5]:
# Capture the working directory, hostname and user name by running shell commands.
# Each `!cmd` assignment stores the command's output lines; the last line of each is printed below.
curr_dir = !pwd
curr_droid = !hostname
curr_pilot = !whoami

print("Current directory :", curr_dir[-1])
print("Hostname          :", curr_droid[-1])
print("Username          :", curr_pilot[-1])

Current directory : /home/jupyter/idc-pathomics-use-case-1/src
Hostname          : idc-patho-vm
Username          : jupyter


In [12]:
# Refresh the apt package index, then install the python3-openslide package without recommended extras.
!sudo apt-get update
!sudo apt-get install --no-install-recommends -y python3-openslide

Hit:1 http://deb.debian.org/debian buster InRelease
Hit:2 http://security.debian.org/debian-security buster/updates InRelease
Hit:3 http://deb.debian.org/debian buster-updates InRelease                    
Hit:4 http://deb.debian.org/debian buster-backports InRelease                  
Hit:5 https://nvidia.github.io/libnvidia-container/stable/debian10/amd64  InRelease
Hit:6 https://nvidia.github.io/nvidia-container-runtime/stable/debian10/amd64  InRelease
Hit:7 https://download.docker.com/linux/debian buster InRelease                
Hit:8 https://nvidia.github.io/nvidia-docker/debian10/amd64  InRelease
Hit:10 http://packages.cloud.google.com/apt cloud-sdk-buster InRelease
Hit:11 https://packages.cloud.google.com/apt google-fast-socket InRelease
Hit:12 http://packages.cloud.google.com/apt google-cloud-packages-archive-keyring-buster InRelease
Hit:13 http://packages.cloud.google.com/apt gcsfuse-buster InRelease
Hit:9 https://packages.cloud.google.com/apt kubernetes-xenial InRelease
Hit:1

In [13]:
# Install the remaining requirements. TODO: check whether they are already available in this environment.
!sudo pip3 install -r ../requirements.txt



In [14]:
# Make the /usr/local dist-packages directory importable; otherwise OpenSlide cannot be loaded.
import sys

DIST_PACKAGES_DIR = '/usr/local/lib/python3.7/dist-packages'
# Only append when missing, so re-running this cell does not add duplicate sys.path entries.
if DIST_PACKAGES_DIR not in sys.path:
    sys.path.append(DIST_PACKAGES_DIR)

# Dataset selection and exploration using BigQuery

In [18]:
# TODO: explain where the attributes are obtained from, explain the BigQuery magic, and mention the alternative of using the BigQuery client; see https://cloud.google.com/bigquery/docs/visualize-jupyter?hl=de#pip

In [7]:
%%bigquery cohort_df --project=$my_project_id 

WITH largest_image AS (
    -- ContainerIdentifier is not unique if a slide was scanned twice.
    -- Therefore, slides are identified by both ContainerIdentifier and FrameOfReferenceUID,
    -- and the largest pixel matrix size is determined per such pair.
    SELECT
        ContainerIdentifier,
        FrameOfReferenceUID,
        MAX(TotalPixelMatrixColumns * TotalPixelMatrixRows) AS max_size
    FROM idc-dev-etl.idc_v3.dicom_metadata
    WHERE
        ContainerIdentifier IS NOT NULL
        AND ClinicalTrialProtocolID IN ("CPTAC-LUAD", "CPTAC-LSCC")
    GROUP BY ContainerIdentifier, FrameOfReferenceUID
)
SELECT
    meta.ContainerIdentifier AS slide_id,
    meta.PatientID AS patient_id,
    meta.ClinicalTrialProtocolID AS tumor_subtype,
    meta.TotalPixelMatrixColumns AS width,
    meta.TotalPixelMatrixRows AS height,
    meta.gcs_url
FROM largest_image
JOIN idc-dev-etl.idc_v3.dicom_all AS meta
    ON meta.ContainerIdentifier = largest_image.ContainerIdentifier
    AND meta.FrameOfReferenceUID = largest_image.FrameOfReferenceUID
-- Keep only the instance whose pixel matrix matches the largest size of its slide.
WHERE largest_image.max_size = meta.TotalPixelMatrixColumns * meta.TotalPixelMatrixRows

Query complete after 0.00s: 100%|██████████| 1/1 [00:00<00:00, 1078.50query/s]
Downloading: 100%|██████████| 2218/2218 [00:01<00:00, 1252.47rows/s]


In [20]:
# Show the first rows of the cohort followed by the total number of rows.
print(cohort_df.head(), len(cohort_df))

       slide_id patient_id tumor_subtype  width  height  \
0  C3L-04784-27  C3L-04784    CPTAC-LUAD  73704   34427   
1  C3N-04457-21  C3N-04457    CPTAC-LSCC  73704   33679   
2  C3L-02627-27  C3L-02627    CPTAC-LSCC  69719   22057   
3  C3N-02434-26  C3N-02434    CPTAC-LSCC  51791   21419   
4  C3N-02285-30  C3N-02285    CPTAC-LSCC  19919   21038   

                                             gcs_url  
0  gs://idc_v3_cptac_luad/0eb19c55-3bb1-4448-8302...  
1  gs://idc_v3_cptac_lscc/46e1a340-c7f0-434e-ac5c...  
2  gs://idc_v3_cptac_lscc/b8e15005-14bd-4c2b-83c6...  
3  gs://idc_v3_cptac_lscc/4285b4ed-01d6-45b1-b7aa...  
4  gs://idc_v3_cptac_lscc/86f1e27d-34f2-41f9-9414...   2218


In [21]:
# Persist the queried cohort table as CSV, without the index column.
cohort_df.to_csv('/home/jupyter/idc_input/cohort.csv', index=False)

In [25]:
# Load the slide metadata table that is used below to look up the specimen type of each slide.
tissue_type_data = pd.read_csv('/home/jupyter/idc_input/CPTAC_LUAD-LSCC_metadata_from_TCIA.csv')

In [27]:
def add_tissue_type_information(cohort_df, tissue_type_data):
    """Return a copy of the cohort annotated with a ``tissue_type`` column.

    For every row of ``cohort_df`` the specimen type is looked up in
    ``tissue_type_data`` by slide ID. Rows whose slide ID has no match, or more
    than one match, in ``tissue_type_data`` are dropped. Afterwards the
    ``tumor_subtype`` and ``tissue_type`` labels are shortened for clarity.

    Neither input frame is modified, so the function can safely be called
    repeatedly on the same ``cohort_df``.

    Args:
        cohort_df: DataFrame with at least the columns ``slide_id`` and
            ``tumor_subtype`` (and at least three columns in total, because the
            new column is inserted at position 3).
        tissue_type_data: DataFrame with the columns ``Slide_ID`` and
            ``Specimen_Type``.

    Returns:
        A new DataFrame containing the matched rows of ``cohort_df`` (original
        index labels preserved) with ``tissue_type`` inserted at position 3.

    Raises:
        KeyError: if one of the required columns is missing. Such errors are
            no longer swallowed silently.
    """
    # Only slide IDs that occur exactly once yield a single, unambiguous
    # specimen type; missing IDs are excluded so that NaN never matches NaN.
    specimen_by_slide = (
        tissue_type_data
        .dropna(subset=['Slide_ID'])
        .drop_duplicates(subset='Slide_ID', keep=False)
        .set_index('Slide_ID')['Specimen_Type']
    )
    has_unique_match = cohort_df['slide_id'].isin(specimen_by_slide.index)

    # Work on a copy so that the caller's frame is never altered.
    matched_df = cohort_df.loc[has_unique_match].copy()
    tissue_types = matched_df['slide_id'].map(specimen_by_slide).tolist()
    complete_df = _add_column_to_dataframe(matched_df, tissue_types)

    # Replace certain column values for clarity
    complete_df['tumor_subtype'] = complete_df['tumor_subtype'].replace(
        {'CPTAC-LSCC': 'LSCC', 'CPTAC-LUAD': 'LUAD'}
    )
    complete_df['tissue_type'] = complete_df['tissue_type'].replace(
        {'normal_tissue': 'normal', 'tumor_tissue': 'tumor'}
    )
    return complete_df


def _add_column_to_dataframe(dataframe, column):
    """Insert ``column`` as ``tissue_type`` at position 3 of ``dataframe``.

    The frame is modified in place and the same object is returned.

    Raises:
        AssertionError: if the number of values differs from the number of rows.
    """
    assert len(dataframe) == len(column), 'Number of new column values not matching length of dataframe.'
    dataframe.insert(3, 'tissue_type', column)
    return dataframe

In [28]:
# Annotate the cohort with tissue types and store the result as slides_metadata.csv.
slides_metadata = add_tissue_type_information(cohort_df, tissue_type_data)
slides_metadata.to_csv('/home/jupyter/idc_input/slides_metadata.csv', index=False)

In [29]:
slides_metadata.head()

Unnamed: 0,slide_id,patient_id,tumor_subtype,tissue_type,width,height,gcs_url
1,C3N-04457-21,C3N-04457,LSCC,tumor,73704,33679,gs://idc_v3_cptac_lscc/46e1a340-c7f0-434e-ac5c...
2,C3L-02627-27,C3L-02627,LSCC,normal,69719,22057,gs://idc_v3_cptac_lscc/b8e15005-14bd-4c2b-83c6...
3,C3N-02434-26,C3N-02434,LSCC,normal,51791,21419,gs://idc_v3_cptac_lscc/4285b4ed-01d6-45b1-b7aa...
4,C3N-02285-30,C3N-02285,LSCC,normal,19919,21038,gs://idc_v3_cptac_lscc/86f1e27d-34f2-41f9-9414...
5,C3N-03441-21,C3N-03441,LSCC,tumor,59759,19141,gs://idc_v3_cptac_lscc/111e0317-009c-4167-8d43...


In [None]:
# IDC viewer for some pathomics data? 
def get_idc_viewer_url(study_UID):
    """Build the IDC viewer URL for a study by appending its UID to the viewer base address."""
    viewer_base = "https://viewer.imaging.datacommons.cancer.gov/viewer/"
    return viewer_base + study_UID

# Temporary Data Download 

In [8]:
# Write the GCS URLs of the first 10 cohort rows to a text file (one URL per line, no header) and print the file.
cohort_df['gcs_url'][:10].to_csv('/home/jupyter/gcs_paths.txt', header=False, index=False)
!cat /home/jupyter/gcs_paths.txt

gs://idc_v3_cptac_luad/0eb19c55-3bb1-4448-8302-1c432c69a653.dcm
gs://idc_v3_cptac_lscc/46e1a340-c7f0-434e-ac5c-59390f960a63.dcm
gs://idc_v3_cptac_lscc/b8e15005-14bd-4c2b-83c6-fe6e63daff42.dcm
gs://idc_v3_cptac_lscc/4285b4ed-01d6-45b1-b7aa-143ee31f0647.dcm
gs://idc_v3_cptac_lscc/86f1e27d-34f2-41f9-9414-093f049d3f3f.dcm
gs://idc_v3_cptac_lscc/111e0317-009c-4167-8d43-a07faa6a9251.dcm
gs://idc_v3_cptac_lscc/a9f76626-12e0-4a91-9ed4-e89f5e418c26.dcm
gs://idc_v3_cptac_luad/264ab7ce-a9d0-41a1-8c0d-faf19f013232.dcm
gs://idc_v3_cptac_luad/e08b9910-bdb8-4829-8048-e04e295c4780.dcm
gs://idc_v3_cptac_luad/8324067b-7036-46b6-ab23-5755437da523.dcm


In [9]:
!mkdir /home/jupyter/downloaded_cohort
!cat /home/jupyter/gcs_paths.txt | gsutil -u $my_project_id -m cp -I /home/jupyter/downloaded_cohort
# make use of dicomsort??? 

mkdir: cannot create directory ‘/home/jupyter/downloaded_cohort’: File exists
Copying gs://idc_v3_cptac_luad/0eb19c55-3bb1-4448-8302-1c432c69a653.dcm...
Copying gs://idc_v3_cptac_lscc/46e1a340-c7f0-434e-ac5c-59390f960a63.dcm...      
Copying gs://idc_v3_cptac_lscc/b8e15005-14bd-4c2b-83c6-fe6e63daff42.dcm...      
Copying gs://idc_v3_cptac_lscc/4285b4ed-01d6-45b1-b7aa-143ee31f0647.dcm...      
Copying gs://idc_v3_cptac_lscc/86f1e27d-34f2-41f9-9414-093f049d3f3f.dcm...      
Copying gs://idc_v3_cptac_lscc/111e0317-009c-4167-8d43-a07faa6a9251.dcm...      
Copying gs://idc_v3_cptac_lscc/a9f76626-12e0-4a91-9ed4-e89f5e418c26.dcm...      
Copying gs://idc_v3_cptac_luad/264ab7ce-a9d0-41a1-8c0d-faf19f013232.dcm...      
Copying gs://idc_v3_cptac_luad/e08b9910-bdb8-4829-8048-e04e295c4780.dcm...      
Copying gs://idc_v3_cptac_luad/8324067b-7036-46b6-ab23-5755437da523.dcm...
^C[2/10 files][  1.2 GiB/  3.0 GiB]  38% Done                                   


# Pathomics use case
## Preprocessing

In [16]:
# Directories for the preprocessing steps: slides and tiles live below the input directory.
#input_dir = os.environ['IDC_INPUT_DATA_DIR']
input_dir = '/home/jupyter/idc_input/'
slides_dir = os.path.join(input_dir, 'cptac_slides')
tiles_dir = os.path.join(input_dir, 'cptac_tiles')
# TODO: create the respective folders here? And delete the slides folder afterwards, because it is empty?

In [17]:
# NOTE: slow. The recorded output shows each slide being copied from GCS and processed one after another.
from data.tile_generation_cptac import generate_tiles

# Pass the shared project ID variable instead of repeating the literal, so the project
# only has to be changed in one place.
generate_tiles(slides_dir, os.path.join(input_dir, 'slides_metadata.csv'), tiles_dir, my_project_id)

Reading input data from /home/jupyter/idc_input/cptac_slides
Slide C3N-04457-21 already tiled
['gsutil -u idc-pathomics-000  cp gs://idc_v3_cptac_lscc/b8e15005-14bd-4c2b-83c6-fe6e63daff42.dcm /home/jupyter/idc_input/cptac_slides']


Copying gs://idc_v3_cptac_lscc/b8e15005-14bd-4c2b-83c6-fe6e63daff42.dcm...
- [1 files][236.6 MiB/236.6 MiB]                                                
Operation completed over 1 objects/236.6 MiB.                                    


Processing: C3L-02627-27
20235 20235
['gsutil -u idc-pathomics-000  cp gs://idc_v3_cptac_lscc/4285b4ed-01d6-45b1-b7aa-143ee31f0647.dcm /home/jupyter/idc_input/cptac_slides']


Copying gs://idc_v3_cptac_lscc/4285b4ed-01d6-45b1-b7aa-143ee31f0647.dcm...
| [1 files][144.9 MiB/144.9 MiB]                                                
Operation completed over 1 objects/144.9 MiB.                                    


Processing: C3N-02434-26
20235 20235
['gsutil -u idc-pathomics-000  cp gs://idc_v3_cptac_lscc/86f1e27d-34f2-41f9-9414-093f049d3f3f.dcm /home/jupyter/idc_input/cptac_slides']


Copying gs://idc_v3_cptac_lscc/86f1e27d-34f2-41f9-9414-093f049d3f3f.dcm...
- [1 files][ 64.6 MiB/ 64.6 MiB]                                                
Operation completed over 1 objects/64.6 MiB.                                     


Processing: C3N-02285-30
20235 20235
['gsutil -u idc-pathomics-000  cp gs://idc_v3_cptac_lscc/111e0317-009c-4167-8d43-a07faa6a9251.dcm /home/jupyter/idc_input/cptac_slides']


Copying gs://idc_v3_cptac_lscc/111e0317-009c-4167-8d43-a07faa6a9251.dcm...
\ [1 files][129.1 MiB/129.1 MiB]                                                
Operation completed over 1 objects/129.1 MiB.                                    


Processing: C3N-03441-21
20235 20235
['gsutil -u idc-pathomics-000  cp gs://idc_v3_cptac_lscc/a9f76626-12e0-4a91-9ed4-e89f5e418c26.dcm /home/jupyter/idc_input/cptac_slides']


Copying gs://idc_v3_cptac_lscc/a9f76626-12e0-4a91-9ed4-e89f5e418c26.dcm...
- [1 files][531.5 MiB/531.5 MiB]                                                
Operation completed over 1 objects/531.5 MiB.                                    


Processing: C3L-05423-24
19877 19877
['gsutil -u idc-pathomics-000  cp gs://idc_v3_cptac_luad/264ab7ce-a9d0-41a1-8c0d-faf19f013232.dcm /home/jupyter/idc_input/cptac_slides']


Copying gs://idc_v3_cptac_luad/264ab7ce-a9d0-41a1-8c0d-faf19f013232.dcm...
| [1 files][934.1 MiB/934.1 MiB]   60.7 MiB/s                                   
Operation completed over 1 objects/934.1 MiB.                                    


Processing: C3N-02920-23
20235 20235
['gsutil -u idc-pathomics-000  cp gs://idc_v3_cptac_luad/e08b9910-bdb8-4829-8048-e04e295c4780.dcm /home/jupyter/idc_input/cptac_slides']


Copying gs://idc_v3_cptac_luad/e08b9910-bdb8-4829-8048-e04e295c4780.dcm...
\ [1 files][117.4 MiB/117.4 MiB]                                                
Operation completed over 1 objects/117.4 MiB.                                    


Processing: C3N-03063-22
20235 20235
['gsutil -u idc-pathomics-000  cp gs://idc_v3_cptac_luad/8324067b-7036-46b6-ab23-5755437da523.dcm /home/jupyter/idc_input/cptac_slides']


Copying gs://idc_v3_cptac_luad/8324067b-7036-46b6-ab23-5755437da523.dcm...
/ [1 files][ 12.1 MiB/ 12.1 MiB]                                                
Operation completed over 1 objects/12.1 MiB.                                     


Processing: C3L-03262-22
20235 20235
['gsutil -u idc-pathomics-000  cp gs://idc_v3_cptac_luad/4b4881d1-3b22-4f31-a373-b623e5b76a12.dcm /home/jupyter/idc_input/cptac_slides']


Copying gs://idc_v3_cptac_luad/4b4881d1-3b22-4f31-a373-b623e5b76a12.dcm...
/ [1 files][180.4 MiB/180.4 MiB]                                                
Operation completed over 1 objects/180.4 MiB.                                    


Processing: C3N-02423-23
20235 20235
['gsutil -u idc-pathomics-000  cp gs://idc_v3_cptac_luad/c2fac6cd-ed5f-4910-a776-71f161b5034d.dcm /home/jupyter/idc_input/cptac_slides']


Copying gs://idc_v3_cptac_luad/c2fac6cd-ed5f-4910-a776-71f161b5034d.dcm...
\ [1 files][538.2 MiB/538.2 MiB]                                                
Operation completed over 1 objects/538.2 MiB.                                    


Processing: C3N-02919-26
20235 20235
['gsutil -u idc-pathomics-000  cp gs://idc_v3_cptac_lscc/c0c552bb-8bbb-47c3-9203-3b1b008a0e93.dcm /home/jupyter/idc_input/cptac_slides']


Copying gs://idc_v3_cptac_lscc/c0c552bb-8bbb-47c3-9203-3b1b008a0e93.dcm...
| [1 files][347.7 MiB/347.7 MiB]                                                
Operation completed over 1 objects/347.7 MiB.                                    


Processing: C3L-05417-27
19877 19877
['gsutil -u idc-pathomics-000  cp gs://idc_v3_cptac_luad/3189c082-60e5-4d56-906b-4365b13fbd6d.dcm /home/jupyter/idc_input/cptac_slides']


Copying gs://idc_v3_cptac_luad/3189c082-60e5-4d56-906b-4365b13fbd6d.dcm...
/ [1 files][470.4 MiB/470.4 MiB]                                                
Operation completed over 1 objects/470.4 MiB.                                    


Processing: C3N-02155-26
20235 20235


KeyboardInterrupt: 

In [None]:
from data.tile_sorting_cptac import sort_tiles

# NOTE(review): the sort_tiles calls below are disabled. They refer to a metadata JSON file that is
# not created anywhere in the visible notebook; confirm the intended input before enabling them.
#metadata_file = os.path.join(input_dir, 'metadata.cart.2017-03-02T00_36_30.276824.json')
#sort_tiles(tiles_dir, metadata_file, input_dir, 'norm_cancer', magnification=5.0)
#sort_tiles(tiles_dir, metadata_file, input_dir, 'luad_lusc', magnification=5.0)
#sort_tiles(tiles_dir, metadata_file, input_dir, 'norm_luad_lusc', magnification=5.0)

## Training
## Evaluation