# About

# Prerequisites

In [1]:
# Google Cloud project ID; it is passed to the %%bigquery cell magic below via --project.
my_project_id = "idc-pathomics-000"

In [2]:
# Import all required python libraries for this use case
import os
import pandas as pd
import pydata_google_auth
from google.cloud.bigquery import magics 

In [3]:
# Request user credentials restricted to the BigQuery OAuth scope.
credentials = pydata_google_auth.get_user_credentials(
    ['https://www.googleapis.com/auth/bigquery']
)
# Register the credentials on the BigQuery magics context so that later
# BigQuery queries in this notebook use them.
magics.context.credentials = credentials

In [4]:
# BigQuery authentication: https://cloud.google.com/docs/authentication/getting-started --> set the environment variable
# https://medium.com/john-lewis-software-engineering/authenticating-jupyter-notebook-against-bigquery-957884f78527
# from command line
#!gcloud auth application-default login --yes
# when using google.colab
#from google.colab import auth 
#auth.authenticate_user()

# Environment setup

In [5]:
# Capture the output of three shell commands (working directory, hostname, user name);
# each `name = !cmd` assignment stores the command's output lines as a list.
curr_dir = !pwd
curr_droid = !hostname
curr_pilot = !whoami

# Show the last output line of each command.
print("Current directory :", curr_dir[-1])
print("Hostname          :", curr_droid[-1])
print("Username          :", curr_pilot[-1])

Current directory : /home/jupyter/idc-pathomics-use-case-1/src
Hostname          : idc-patho-vm-02
Username          : jupyter


In [6]:
%%capture
# Refresh the apt package index, then install python3-openslide without recommended
# extra packages; %%capture keeps the installer output out of the notebook.
!sudo apt-get update
!sudo apt-get install --no-install-recommends -y python3-openslide

# Install other requirements or are these already available? 

# Dataset selection and exploration using BigQuery

In [7]:
# TODO: Explain where the attributes come from and how the BigQuery cell magic works; mention the BigQuery client as an alternative, see https://cloud.google.com/bigquery/docs/visualize-jupyter?hl=de#pip

In [4]:
%%bigquery cohort_df --project=$my_project_id

-- For each CPTAC-LUAD / CPTAC-LSCC container, keep the instance(s) whose total pixel
-- matrix (columns * rows) equals the largest one found for that ContainerIdentifier.
WITH largest_per_container AS (
    SELECT
        ContainerIdentifier,
        MAX(TotalPixelMatrixColumns * TotalPixelMatrixRows) AS max_size
    FROM idc-dev-etl.idc_v3.dicom_metadata
    WHERE
        ContainerIdentifier IS NOT NULL
        AND ClinicalTrialProtocolID IN ("CPTAC-LUAD", "CPTAC-LSCC")
    GROUP BY ContainerIdentifier
)
SELECT
    inst.ContainerIdentifier AS slide_id,
    inst.PatientID AS patient_id,
    inst.ClinicalTrialProtocolID AS tumor_subtype,
    inst.TotalPixelMatrixColumns AS width,
    inst.TotalPixelMatrixRows AS height,
    inst.gcs_url
FROM largest_per_container AS largest
JOIN idc-dev-etl.idc_v3.dicom_all AS inst
    ON inst.ContainerIdentifier = largest.ContainerIdentifier
WHERE largest.max_size = inst.TotalPixelMatrixColumns * inst.TotalPixelMatrixRows

Query complete after 0.00s: 100%|██████████| 1/1 [00:00<00:00, 902.97query/s] 
Downloading: 100%|██████████| 2218/2218 [00:02<00:00, 1069.55rows/s]


In [9]:
# Preview the first rows of the cohort table. A bare last expression is rendered by
# Jupyter as a rich table, whereas print() wraps wide frames into plain-text chunks.
cohort_df.head()

       slide_id patient_id tumor_subtype  width  height  \
0  C3L-02650-22  C3L-02650    CPTAC-LSCC  77687   29127   
1  C3L-04863-26  C3L-04863    CPTAC-LSCC  49800   38400   
2  C3L-01663-26  C3L-01663    CPTAC-LSCC  27887   20078   
3  C3L-04873-24  C3L-04873    CPTAC-LSCC  83664   41490   
4  C3L-02616-23  C3L-02616    CPTAC-LUAD  59759   29816   

                                             gcs_url  
0  gs://idc_v3_cptac_lscc/2ac6fcc6-c424-4401-bf9a...  
1  gs://idc_v3_cptac_lscc/5ce6ce44-986a-4f90-82d7...  
2  gs://idc_v3_cptac_lscc/5ac6c7a8-7369-4ab5-b3c7...  
3  gs://idc_v3_cptac_lscc/07f1d820-01d2-495d-b350...  
4  gs://idc_v3_cptac_luad/379ea225-b8cc-4f45-9d1b...  


In [None]:
# IDC viewer for some pathomics data? 
def get_idc_viewer_url(study_UID):
    """Build the IDC viewer URL for a study.

    Args:
        study_UID: Study identifier as a string; it is appended unchanged to
            the viewer base address.

    Returns:
        The viewer base address followed by ``study_UID``.

    Raises:
        TypeError: If ``study_UID`` is not a string.
    """
    # NOTE(review): whether this viewer route also serves slide microscopy
    # studies is not established here -- confirm before relying on it.
    viewer_root = "https://viewer.imaging.datacommons.cancer.gov/viewer/"
    return "".join((viewer_root, study_UID))

# Pathomics use case
## Preprocessing

In [None]:
# The data root is taken from the IDC_INPUT_DATA_DIR environment variable
# (a KeyError is raised when the variable is not set).
input_dir = os.environ['IDC_INPUT_DATA_DIR']
# Paths of the 'cptac_slides' and 'cptac_tiles' sub-folders of the data root.
slides_dir, tiles_dir = (
    os.path.join(input_dir, subfolder)
    for subfolder in ('cptac_slides', 'cptac_tiles')
)

In [None]:
from data.tile_generation_cptac import generate_tiles

# Run tile generation with slides_dir, tiles_dir and a desired magnification of 20.0.
# NOTE(review): presumably reads slides from slides_dir and writes tiles into tiles_dir --
# confirm against data.tile_generation_cptac.
generate_tiles(slides_dir, tiles_dir, desired_magnification=20.0)

In [None]:
#from data.tile_sorting import sort_tiles

#metadata_file = os.path.join(input_dir, 'metadata.cart.2017-03-02T00_36_30.276824.json')
#sort_tiles(tiles_dir, metadata_file, input_dir, 'norm_cancer', magnification=5.0)
#sort_tiles(tiles_dir, metadata_file, input_dir, 'luad_lusc', magnification=5.0)
#sort_tiles(tiles_dir, metadata_file, input_dir, 'norm_luad_lusc', magnification=5.0)

## Training
## Evaluation