# About

In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Prerequisites

In [2]:
# Google Cloud project ID; passed to the BigQuery cell magic (--project) and to gsutil (-u) further down.
my_project_id = "idc-pathomics-000"

In [3]:
# Import all required python libraries for this use case
import os
import pandas as pd

In [4]:
# BigQuery authentication: https://cloud.google.com/docs/authentication/getting-started --> set the environment variable
# https://medium.com/john-lewis-software-engineering/authenticating-jupyter-notebook-against-bigquery-957884f78527
# from command line
#!gcloud auth application-default login --yes
# when using google.colab
#from google.colab import auth 
#auth.authenticate_user()

#!gcloud auth list
#!gcloud config set account <your-account-email>
#!gcloud auth list

# need to get the right scope here: https://developers.google.com/identity/protocols/oauth2/scopes, pydata Documentation: https://pydata-google-auth.readthedocs.io/_/downloads/en/latest/pdf/
#credentials = pydata_google_auth.get_user_credentials(['https://www.googleapis.com/auth/bigquery', 'https://www.googleapis.com/auth/devstorage.full_control'],)
#magics.context.credentials = credentials # credentials will be used later for BigQuery queries

# Environment setup

In [5]:
# Capture the output of three shell commands (working directory, host name, user name).
curr_dir = !pwd
curr_droid = !hostname
curr_pilot = !whoami

# Each capture is indexable; the last element holds the line of interest.
print("Current directory :", curr_dir[-1])
print("Hostname          :", curr_droid[-1])
print("Username          :", curr_pilot[-1])

Current directory : /home/jupyter/idc-pathomics-use-case-1/src
Hostname          : idc-patho-vm
Username          : jupyter


In [6]:
# Refresh the apt package index, then install the python3-openslide system package
# (without recommended extras, non-interactively).
!sudo apt-get update
!sudo apt-get install --no-install-recommends -y python3-openslide

Hit:1 http://deb.debian.org/debian buster InRelease
Hit:2 http://deb.debian.org/debian buster-updates InRelease                    
Hit:3 http://security.debian.org/debian-security buster/updates InRelease      
Hit:4 http://deb.debian.org/debian buster-backports InRelease                  
Hit:5 https://nvidia.github.io/libnvidia-container/stable/debian10/amd64  InRelease
Hit:6 https://download.docker.com/linux/debian buster InRelease                
Hit:7 https://nvidia.github.io/nvidia-container-runtime/stable/debian10/amd64  InRelease
Hit:8 https://nvidia.github.io/nvidia-docker/debian10/amd64  InRelease         
Hit:9 http://packages.cloud.google.com/apt cloud-sdk-buster InRelease
Hit:11 https://packages.cloud.google.com/apt google-fast-socket InRelease
Hit:12 http://packages.cloud.google.com/apt google-cloud-packages-archive-keyring-buster InRelease
Hit:10 https://packages.cloud.google.com/apt kubernetes-xenial InRelease
Hit:13 http://packages.cloud.google.com/apt gcsfuse-buster 

In [7]:
# Install the remaining Python requirements listed in ../requirements.txt.
# TODO: check whether these packages are already available in the environment.
# NOTE(review): `sudo pip3` presumably installs into the system Python rather than the
# kernel's environment - confirm this is intended.
!sudo pip3 install -r ../requirements.txt



In [8]:
# Make the system-wide dist-packages directory importable; otherwise OpenSlide
# cannot be loaded.
# NOTE(review): the path is tied to Python 3.7 - confirm it matches the interpreter in use.
import sys

system_dist_packages = '/usr/local/lib/python3.7/dist-packages'
# Guard against duplicates so that re-running this cell does not keep growing sys.path.
if system_dist_packages not in sys.path:
    sys.path.append(system_dist_packages)

# Dataset selection and exploration using BigQuery

In [26]:
# TODO: explain where the attributes are obtained from and how the BigQuery cell magic works; alternative: use the BigQuery client, see https://cloud.google.com/bigquery/docs/visualize-jupyter?hl=de#pip

In [27]:
%%bigquery cohort_df --project=$my_project_id

-- Extend every row of dicom_all with its pixel spacing as a float, taken from
-- the first entry of each nested sequence.
WITH dicom_all_extended AS (
    SELECT
        *,
        CAST(
            SharedFunctionalGroupsSequence[OFFSET(0)].
            PixelMeasuresSequence[OFFSET(0)].
            PixelSpacing[OFFSET(0)]
            AS FLOAT64
        ) AS pixel_spacing
    FROM `idc-dev-etl.idc_v3.dicom_all`
)
-- One result row per matching DICOM file, with the columns renamed for the cohort table.
SELECT
    ContainerIdentifier     AS slide_id,
    PatientID               AS patient_id,
    ClinicalTrialProtocolID AS cancer_subtype,
    TotalPixelMatrixColumns AS width,
    TotalPixelMatrixRows    AS height,
    pixel_spacing,
    gcs_url
FROM dicom_all_extended
WHERE
    ContainerIdentifier IS NOT NULL
    -- restrict the cohort to the two CPTAC lung collections
    AND ClinicalTrialProtocolID IN ("CPTAC-LUAD", "CPTAC-LSCC")
    -- pixel spacing between 0.00025 and 0.00051 mm corresponds to 20x magnification
    AND pixel_spacing > 0.00025
    AND pixel_spacing < 0.00051

Query complete after 0.00s: 100%|██████████| 1/1 [00:00<00:00, 726.54query/s] 
Downloading: 100%|██████████| 2218/2218 [00:01<00:00, 1586.79rows/s]


In [28]:
# Report the number of rows in the cohort, then show the first rows as the
# cell's rich output instead of printing the frame's plain-text repr.
print(len(cohort_df))
cohort_df.head()

       slide_id patient_id tumor_subtype  width  height  \
0  C3L-02629-26  C3L-02629    CPTAC-LSCC  73703   20873   
1  C3L-02660-24  C3L-02660    CPTAC-LSCC  59759   28076   
2  C3L-04086-27  C3L-04086    CPTAC-LSCC  13944   17721   
3  C3L-04749-26  C3L-04749    CPTAC-LSCC  75696   35827   
4  C3N-03093-24  C3N-03093    CPTAC-LSCC  35855   30619   

                                             gcs_url  
0  gs://idc_v3_cptac_lscc/79d63483-77aa-4b23-ab72...  
1  gs://idc_v3_cptac_lscc/77b8a091-3e05-4bde-aeaf...  
2  gs://idc_v3_cptac_lscc/ec1fed8c-be3d-49bb-9cb4...  
3  gs://idc_v3_cptac_lscc/94b5b0bf-c514-4d74-9f84...  
4  gs://idc_v3_cptac_lscc/0384023f-0ec9-4c9e-919b...   2218


In [29]:
# Persist the queried cohort as CSV, without the index column.
# NOTE(review): hard-coded absolute path - consider deriving it from a shared input-directory variable.
cohort_df.to_csv('/home/jupyter/idc_input/cohort.csv', index=False)

In [30]:
# Load the slide-level metadata table; its 'Slide_ID' and 'Specimen_Type' columns are
# used to look up the tissue type of each slide (presumably exported from TCIA, per the file name).
tissue_type_data = pd.read_csv('/home/jupyter/idc_input/CPTAC_LUAD-LSCC_metadata_from_TCIA.csv')

In [31]:
def add_tissue_type_information(cohort_df, tissue_type_data):
    """Return a copy of the cohort with a ``tissue_type`` column and shortened labels.

    For every cohort row, the tissue type is the ``Specimen_Type`` of the single
    row in ``tissue_type_data`` whose ``Slide_ID`` equals the row's ``slide_id``.
    Cohort rows whose slide ID has no match, or more than one match, are dropped.
    The input frames are left unmodified, so the function can be called
    repeatedly on the same cohort.

    Args:
        cohort_df: DataFrame with a ``slide_id`` column and at least three
            columns in total (the new column is inserted at position 3).
        tissue_type_data: DataFrame with ``Slide_ID`` and ``Specimen_Type`` columns.

    Returns:
        A new DataFrame holding the matched rows (original index labels kept),
        the added ``tissue_type`` column, and the values 'CPTAC-LSCC'/'CPTAC-LUAD'
        in ``cancer_subtype`` and 'normal_tissue'/'tumor_tissue' in ``tissue_type``
        replaced by 'LSCC'/'LUAD' and 'normal'/'tumor'.

    Raises:
        KeyError: if one of the required columns is missing.
    """
    # Only slide IDs occurring exactly once in the metadata give an unambiguous tissue type.
    id_counts = tissue_type_data['Slide_ID'].value_counts()
    unambiguous_ids = id_counts[id_counts == 1].index
    specimen_by_slide = (
        tissue_type_data[tissue_type_data['Slide_ID'].isin(unambiguous_ids)]
        .set_index('Slide_ID')['Specimen_Type']
    )

    # Work on a copy so the caller's frame is never mutated by the insert below.
    has_tissue_type = cohort_df['slide_id'].isin(specimen_by_slide.index)
    matched_df = cohort_df[has_tissue_type].copy()
    tissue_types = matched_df['slide_id'].map(specimen_by_slide).tolist()
    complete_df = _add_column_to_dataframe(matched_df, tissue_types)

    # Replace certain column values for clarity
    complete_df = complete_df.replace({'cancer_subtype': 'CPTAC-LSCC'}, 'LSCC')
    complete_df = complete_df.replace({'cancer_subtype': 'CPTAC-LUAD'}, 'LUAD')
    complete_df = complete_df.replace({'tissue_type': 'normal_tissue'}, 'normal')
    complete_df = complete_df.replace({'tissue_type': 'tumor_tissue'}, 'tumor')
    return complete_df


def _add_column_to_dataframe(dataframe, column):
    """Insert ``column`` as 'tissue_type' at position 3 of ``dataframe`` and return it.

    The frame is modified in place; ``column`` must hold one value per row.
    """
    assert len(dataframe) == len(column), 'Number of new column values not matching length of dataframe.'
    dataframe.insert(3, 'tissue_type', column)
    return dataframe

In [32]:
# Target file for the cohort table enriched with tissue types.
slides_metadata_path = '/home/jupyter/idc_input/slides_metadata.csv'

# Build the enriched table and store it as CSV without the index column.
slides_metadata = add_tissue_type_information(cohort_df, tissue_type_data)
slides_metadata.to_csv(slides_metadata_path, index=False)

In [33]:
# Preview the first rows of the enriched slide metadata.
slides_metadata.head()

Unnamed: 0,slide_id,patient_id,tumor_subtype,tissue_type,width,height,gcs_url
0,C3L-02629-26,C3L-02629,LSCC,normal,73703,20873,gs://idc_v3_cptac_lscc/79d63483-77aa-4b23-ab72...
1,C3L-02660-24,C3L-02660,LSCC,tumor,59759,28076,gs://idc_v3_cptac_lscc/77b8a091-3e05-4bde-aeaf...
3,C3L-04749-26,C3L-04749,LSCC,normal,75696,35827,gs://idc_v3_cptac_lscc/94b5b0bf-c514-4d74-9f84...
4,C3N-03093-24,C3N-03093,LSCC,tumor,35855,30619,gs://idc_v3_cptac_lscc/0384023f-0ec9-4c9e-919b...
5,C3N-00175-27,C3N-00175,LUAD,normal,43823,35540,gs://idc_v3_cptac_luad/e1f9c5be-b90e-4ded-b0b9...


In [None]:
# IDC viewer for some pathomics data? 
def get_idc_viewer_url(study_UID):
    """Return the IDC viewer URL formed by appending ``study_UID`` to the viewer base URL."""
    viewer_base_url = "https://viewer.imaging.datacommons.cancer.gov/viewer/"
    return "".join((viewer_base_url, study_UID))

# Temporary Data Download 

In [8]:
# Write the first ten GCS URLs of the cohort to a text file (no header, no index column),
# then print the file for inspection.
cohort_df['gcs_url'][:10].to_csv('/home/jupyter/gcs_paths.txt', header=False, index=False)
!cat /home/jupyter/gcs_paths.txt

gs://idc_v3_cptac_luad/0eb19c55-3bb1-4448-8302-1c432c69a653.dcm
gs://idc_v3_cptac_lscc/46e1a340-c7f0-434e-ac5c-59390f960a63.dcm
gs://idc_v3_cptac_lscc/b8e15005-14bd-4c2b-83c6-fe6e63daff42.dcm
gs://idc_v3_cptac_lscc/4285b4ed-01d6-45b1-b7aa-143ee31f0647.dcm
gs://idc_v3_cptac_lscc/86f1e27d-34f2-41f9-9414-093f049d3f3f.dcm
gs://idc_v3_cptac_lscc/111e0317-009c-4167-8d43-a07faa6a9251.dcm
gs://idc_v3_cptac_lscc/a9f76626-12e0-4a91-9ed4-e89f5e418c26.dcm
gs://idc_v3_cptac_luad/264ab7ce-a9d0-41a1-8c0d-faf19f013232.dcm
gs://idc_v3_cptac_luad/e08b9910-bdb8-4829-8048-e04e295c4780.dcm
gs://idc_v3_cptac_luad/8324067b-7036-46b6-ab23-5755437da523.dcm


In [9]:
!mkdir /home/jupyter/downloaded_cohort
!cat /home/jupyter/gcs_paths.txt | gsutil -u $my_project_id -m cp -I /home/jupyter/downloaded_cohort
# make use of dicomsort??? 

mkdir: cannot create directory ‘/home/jupyter/downloaded_cohort’: File exists
Copying gs://idc_v3_cptac_luad/0eb19c55-3bb1-4448-8302-1c432c69a653.dcm...
Copying gs://idc_v3_cptac_lscc/46e1a340-c7f0-434e-ac5c-59390f960a63.dcm...      
Copying gs://idc_v3_cptac_lscc/b8e15005-14bd-4c2b-83c6-fe6e63daff42.dcm...      
Copying gs://idc_v3_cptac_lscc/4285b4ed-01d6-45b1-b7aa-143ee31f0647.dcm...      
Copying gs://idc_v3_cptac_lscc/86f1e27d-34f2-41f9-9414-093f049d3f3f.dcm...      
Copying gs://idc_v3_cptac_lscc/111e0317-009c-4167-8d43-a07faa6a9251.dcm...      
Copying gs://idc_v3_cptac_lscc/a9f76626-12e0-4a91-9ed4-e89f5e418c26.dcm...      
Copying gs://idc_v3_cptac_luad/264ab7ce-a9d0-41a1-8c0d-faf19f013232.dcm...      
Copying gs://idc_v3_cptac_luad/e08b9910-bdb8-4829-8048-e04e295c4780.dcm...      
Copying gs://idc_v3_cptac_luad/8324067b-7036-46b6-ab23-5755437da523.dcm...
^C[2/10 files][  1.2 GiB/  3.0 GiB]  38% Done                                   


# Pathomics use case
## Preprocessing

In [9]:
# Directory layout for preprocessing: slides and tiles live in sub-folders of the input directory.
# TODO: read the input directory from the IDC_INPUT_DATA_DIR environment variable instead of hard-coding it.
#input_dir = os.environ['IDC_INPUT_DATA_DIR']
input_dir = '/home/jupyter/idc_input/'
slides_dir = os.path.join(input_dir, 'cptac_slides')
tiles_dir = os.path.join(input_dir, 'cptac_tiles')
# TODO: decide whether to create these folders here and whether to delete the slide folder afterwards if it ends up empty.

In [15]:
# NOTE: slow. Judging from the recorded output, each slide is copied from GCS
# before it is processed.
from data.tile_generation_cptac import generate_tiles

# Reuse the project ID configured at the top of the notebook instead of repeating the literal.
generate_tiles(slides_dir, os.path.join(input_dir, 'slides_metadata.csv'), tiles_dir, my_project_id)

Reading input data from /home/jupyter/idc_input/cptac_slides


Copying gs://idc_v3_cptac_lscc/79d63483-77aa-4b23-ab72-d7092dcf85fa.dcm...
/ [1 files][177.8 MiB/177.8 MiB]                                                
Operation completed over 1 objects/177.8 MiB.                                    


Processing: C3L-02629-26
15


Copying gs://idc_v3_cptac_lscc/77b8a091-3e05-4bde-aeaf-03d3b17fff7b.dcm...
| [1 files][436.4 MiB/436.4 MiB]                                                
Operation completed over 1 objects/436.4 MiB.                                    


Processing: C3L-02660-24
14


Copying gs://idc_v3_cptac_lscc/94b5b0bf-c514-4d74-9f84-d5ae6b328079.dcm...
/ [1 files][778.1 MiB/778.1 MiB]   55.6 MiB/s                                   
Operation completed over 1 objects/778.1 MiB.                                    


Processing: C3L-04749-26
15


KeyboardInterrupt: 

In [36]:
from data.tile_sorting_cptac import sort_tiles

#metadata_file = os.path.join(input_dir, 'metadata.cart.2017-03-02T00_36_30.276824.json')
#sort_tiles(tiles_dir, metadata_file, input_dir, 'norm_cancer', magnification=5.0)
#sort_tiles(tiles_dir, metadata_file, input_dir, 'luad_lusc', magnification=5.0)
#sort_tiles(tiles_dir, metadata_file, input_dir, 'norm_luad_lusc', magnification=5.0)

## Training
## Evaluation