# About

In [1]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before each cell execution,
# so edits to the project's .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Prerequisites and Environment setup

In [2]:
# Google Cloud project ID; passed to the BigQuery cell magic below via --project.
my_project_id = "idc-pathomics-000"

In [3]:
# Import all required python libraries for this use case
import os
import sys
import pandas as pd

In [4]:
# BigQuery authentication: https://cloud.google.com/docs/authentication/getting-started --> set the environment variable
# https://medium.com/john-lewis-software-engineering/authenticating-jupyter-notebook-against-bigquery-957884f78527
# from command line
#!gcloud auth application-default login --yes
# when using google.colab
#from google.colab import auth 
#auth.authenticate_user()

#!gcloud auth list
#!gcloud config set account dschacherer.fme@gmail.com
#!gcloud auth list

# need to get the right scope here: https://developers.google.com/identity/protocols/oauth2/scopes, pydata Documentation: https://pydata-google-auth.readthedocs.io/_/downloads/en/latest/pdf/
#credentials = pydata_google_auth.get_user_credentials(['https://www.googleapis.com/auth/bigquery', 'https://www.googleapis.com/auth/devstorage.full_control'],)
#magics.context.credentials = credentials # credentials will be used later for BigQuery queries

In [5]:
curr_dir = !pwd
curr_droid = !hostname
curr_pilot = !whoami

print("Current directory :", curr_dir[-1])
print("Hostname          :", curr_droid[-1])
print("Username          :", curr_pilot[-1])

Current directory : /home/jupyter/idc-pathomics-use-case-1/src
Hostname          : idc-patho-vm
Username          : jupyter


In [6]:
!sudo apt-get update
!sudo apt-get install --no-install-recommends -y python3-openslide
!sudo pip3 install -r ../requirements_cptac.txt # check whether all are required
sys.path.append('/usr/local/lib/python3.7/dist-packages') # otherwise Openslide can not be loaded. 

Hit:1 http://deb.debian.org/debian buster InRelease
Hit:2 http://deb.debian.org/debian buster-updates InRelease                    
Hit:3 http://security.debian.org/debian-security buster/updates InRelease      
Hit:4 http://deb.debian.org/debian buster-backports InRelease                  
Hit:5 https://download.docker.com/linux/debian buster InRelease                
Hit:6 https://nvidia.github.io/libnvidia-container/stable/debian10/amd64  InRelease
Hit:7 https://nvidia.github.io/nvidia-container-runtime/stable/debian10/amd64  InRelease
Get:8 http://packages.cloud.google.com/apt cloud-sdk-buster InRelease [6774 B]
Hit:9 https://nvidia.github.io/nvidia-docker/debian10/amd64  InRelease         
Hit:11 https://packages.cloud.google.com/apt google-fast-socket InRelease
Get:12 http://packages.cloud.google.com/apt google-cloud-packages-archive-keyring-buster InRelease [5553 B]
Hit:10 https://packages.cloud.google.com/apt kubernetes-xenial InRelease
Hit:13 http://packages.cloud.google.com/a

# Dataset selection and exploration using BigQuery

In [26]:
# TODO: explain where the attributes are obtained from and how the BigQuery cell magic works; mention the alternative (BigQuery client), see https://cloud.google.com/bigquery/docs/visualize-jupyter?hl=de#pip

In [13]:
%%bigquery cohort_df --project=$my_project_id 

-- Cohort query: CPTAC-LUAD and CPTAC-LSCC entries with a container identifier
-- and a pixel spacing in the 20x magnification range. The result is stored in cohort_df.
WITH dicom_all_extended AS (
    SELECT
        *,
        -- first PixelSpacing value of the first PixelMeasuresSequence item, as a float
        CAST(
            SharedFunctionalGroupsSequence[OFFSET(0)].
            PixelMeasuresSequence[OFFSET(0)].
            PixelSpacing[OFFSET(0)]
            AS FLOAT64
        ) AS pixel_spacing,
    FROM idc-dev-etl.idc_v3.dicom_all
)
SELECT
    ContainerIdentifier     AS slide_id,
    PatientID               AS patient_id,
    ClinicalTrialProtocolID AS cancer_subtype,
    TotalPixelMatrixColumns AS width,
    TotalPixelMatrixRows    AS height,
    pixel_spacing,
    gcs_url
FROM dicom_all_extended
WHERE
    ContainerIdentifier IS NOT NULL
    AND ClinicalTrialProtocolID IN ("CPTAC-LUAD", "CPTAC-LSCC")
    -- pixel spacing between 0.00025 and 0.00051 mm corresponds to 20x magnification
    AND pixel_spacing > 0.00025
    AND pixel_spacing < 0.00051

Query complete after 0.01s: 100%|██████████| 4/4 [00:00<00:00, 2247.45query/s]                        
Downloading: 100%|██████████| 2218/2218 [00:01<00:00, 1444.20rows/s]


In [41]:
# Show the first rows of the query result followed by its total number of rows.
print(cohort_df.head(), len(cohort_df))

       slide_id patient_id cancer_subtype  width  height  pixel_spacing  \
0  C3L-00415-26  C3L-00415     CPTAC-LSCC  35855   25822       0.000494   
1  C3L-00503-26  C3L-00503     CPTAC-LSCC  21911   28220       0.000494   
2  C3L-00568-22  C3L-00568     CPTAC-LSCC  17927   20798       0.000494   
3  C3L-00604-22  C3L-00604     CPTAC-LUAD  19919   19378       0.000494   
4  C3L-00904-21  C3L-00904     CPTAC-LSCC  37847   50544       0.000494   

                                             gcs_url  
0  gs://idc_v3_cptac_lscc/f087784e-9bae-407a-aa12...  
1  gs://idc_v3_cptac_lscc/cf1a52ff-1a1b-41e3-9845...  
2  gs://idc_v3_cptac_lscc/16dc6535-ff33-4ddf-937d...  
3  gs://idc_v3_cptac_luad/ea1c7876-c547-4abf-ab99...  
4  gs://idc_v3_cptac_lscc/770fd990-2125-4ec9-b518...   2218


In [42]:
# Persist the queried cohort as CSV; index=False leaves out the DataFrame index column.
cohort_df.to_csv('/home/jupyter/idc_input/cohort.csv', index=False)

In [43]:
# Load the CPTAC LUAD/LSCC metadata CSV; its 'Slide_ID' and 'Specimen_Type' columns are
# used below to attach a tissue type to every slide.
tissue_type_data = pd.read_csv('/home/jupyter/idc_input/CPTAC_LUAD-LSCC_metadata_from_TCIA.csv')

In [50]:
def add_tissue_type_information(cohort_df, tissue_type_data):
    """Attach the tissue type of every slide to the cohort table.

    A slide is kept only if its ``slide_id`` occurs exactly once in the
    ``Slide_ID`` column of ``tissue_type_data``; slides with no match or with
    several (ambiguous) matches are dropped.

    Args:
        cohort_df: DataFrame with at least the columns ``slide_id`` and
            ``cancer_subtype`` and at least three columns in total (the new
            column is inserted at position 3). It is not modified.
        tissue_type_data: DataFrame with the columns ``Slide_ID`` and
            ``Specimen_Type``.

    Returns:
        A new DataFrame, sorted by ``slide_id``, with an additional
        ``tissue_type`` column and shortened labels ('lscc'/'luad' and
        'normal'/'tumor').

    Raises:
        KeyError: if one of the required columns is missing.
    """
    # Only slide IDs that occur exactly once have an unambiguous specimen type.
    known = tissue_type_data.dropna(subset=['Slide_ID'])
    unambiguous = known.drop_duplicates(subset='Slide_ID', keep=False)
    specimen_by_slide = unambiguous.set_index('Slide_ID')['Specimen_Type']

    # Select by boolean mask (robust against duplicate index labels) and copy,
    # so the caller's frame is never modified and the cell can be re-run safely.
    has_tissue_type = cohort_df['slide_id'].isin(specimen_by_slide.index)
    complete_df = cohort_df.loc[has_tissue_type].copy()
    tissue_types = complete_df['slide_id'].map(specimen_by_slide)
    # Plain array instead of a Series, so no index alignment takes place on insert.
    complete_df.insert(3, 'tissue_type', tissue_types.to_numpy())

    # Replace certain column values for clarity
    complete_df = complete_df.replace({
        'cancer_subtype': {'CPTAC-LSCC': 'lscc', 'CPTAC-LUAD': 'luad'},
        'tissue_type': {'normal_tissue': 'normal', 'tumor_tissue': 'tumor'},
    })

    # sort_values returns a new frame; mergesort keeps ties in their original order.
    return complete_df.sort_values('slide_id', kind='mergesort')

def _add_column_to_dataframe(dataframe, column): 
    assert len(dataframe) == len(column), 'Number of new column values not matching length of dataframe.'
    dataframe.insert(3, 'tissue_type', column)
    return dataframe    

In [51]:
# Combine the cohort with the tissue type information and store the result as CSV
# (without the index column).
slides_metadata = add_tissue_type_information(cohort_df, tissue_type_data)
slides_metadata_path = '/home/jupyter/idc_input/slides_metadata.csv'
slides_metadata.to_csv(slides_metadata_path, index=False)

In [52]:
# Display the first rows of the combined slide metadata.
slides_metadata.head()

Unnamed: 0,slide_id,patient_id,cancer_subtype,tissue_type,width,height,pixel_spacing,gcs_url
0,C3L-00415-26,C3L-00415,lscc,normal,35855,25822,0.000494,gs://idc_v3_cptac_lscc/f087784e-9bae-407a-aa12...
1,C3L-00503-26,C3L-00503,lscc,normal,21911,28220,0.000494,gs://idc_v3_cptac_lscc/cf1a52ff-1a1b-41e3-9845...
2,C3L-00568-22,C3L-00568,lscc,tumor,17927,20798,0.000494,gs://idc_v3_cptac_lscc/16dc6535-ff33-4ddf-937d...
3,C3L-00604-22,C3L-00604,luad,tumor,19919,19378,0.000494,gs://idc_v3_cptac_luad/ea1c7876-c547-4abf-ab99...
4,C3L-00904-21,C3L-00904,lscc,tumor,37847,50544,0.000494,gs://idc_v3_cptac_lscc/770fd990-2125-4ec9-b518...


In [None]:
# IDC viewer for some pathomics data? 
def get_idc_viewer_url(study_UID):
    """Build the IDC viewer URL for a study.

    Args:
        study_UID: string appended to the viewer base URL
            (presumably a DICOM StudyInstanceUID -- TODO confirm).

    Returns:
        The viewer base URL followed by ``study_UID``.
    """
    viewer_base_url = "https://viewer.imaging.datacommons.cancer.gov/viewer/"
    return viewer_base_url + study_UID

# Pathomics use case
## Preprocessing

In [47]:
# Root directories for input and output data.
# Alternative (currently disabled): read both locations from environment variables.
#input_dir = os.environ['IDC_INPUT_DATA_DIR']
#output_dir = os.environ['IDC_OUTPUT_DATA_DIR']
input_dir = '/home/jupyter/idc_input/'
output_dir = '/home/jupyter/idc_output/'

# Sub-directories of input_dir that are passed to the tile generation and tile sorting steps below.
slides_dir = os.path.join(input_dir, 'cptac_slides')
tiles_dir = os.path.join(input_dir, 'cptac_tiles_512')
# TODO: create the respective folders here? And delete the slide folder afterwards, because it is empty?

In [15]:
# SLOW :( -- this step takes a long time for the full cohort.
from data.tile_generation_cptac import generate_tiles

# Pass the project ID configured at the top of the notebook instead of repeating the literal,
# so changing my_project_id in one place is sufficient.
generate_tiles(slides_dir, os.path.join(input_dir, 'slides_metadata.csv'), tiles_dir, my_project_id)

Reading input data from /home/jupyter/idc_input/cptac_slides


Copying gs://idc_v3_cptac_lscc/79d63483-77aa-4b23-ab72-d7092dcf85fa.dcm...
/ [1 files][177.8 MiB/177.8 MiB]                                                
Operation completed over 1 objects/177.8 MiB.                                    


Processing: C3L-02629-26
15


Copying gs://idc_v3_cptac_lscc/77b8a091-3e05-4bde-aeaf-03d3b17fff7b.dcm...
| [1 files][436.4 MiB/436.4 MiB]                                                
Operation completed over 1 objects/436.4 MiB.                                    


Processing: C3L-02660-24
14


Copying gs://idc_v3_cptac_lscc/94b5b0bf-c514-4d74-9f84-d5ae6b328079.dcm...
/ [1 files][778.1 MiB/778.1 MiB]   55.6 MiB/s                                   
Operation completed over 1 objects/778.1 MiB.                                    


Processing: C3L-04749-26
15


KeyboardInterrupt: 

In [56]:
from data.tile_sorting_cptac import sort_tiles

# Same metadata file that was written in the dataset selection section.
slides_metadata_path = '/home/jupyter/idc_input/slides_metadata.csv'
# The last argument selects the sorting option; the two alternatives are kept disabled here.
#sort_tiles(tiles_dir, slides_metadata_path, input_dir, 'norm_cancer')
#sort_tiles(tiles_dir, slides_metadata_path, input_dir, 'luad_lscc')
sort_tiles(tiles_dir, slides_metadata_path, input_dir, 'norm_luad_lscc')

## Training

In [None]:
# `time` is not part of the notebook's import cell, so import it here to avoid a NameError.
import time

# NOTE(review): `Dataset` and `InceptionModel` are not imported anywhere in the visible part of
# this notebook -- import them from the project package before running this cell on a fresh kernel.

# Timestamped directory for this experiment; makedirs also creates output_dir if it is missing.
output_dir_experiment = os.path.join(output_dir, time.strftime("%Y%m%d_%H%M%S"))
os.makedirs(output_dir_experiment, exist_ok=True)

# Three-class setup, matching the 'norm_luad_lscc' CSV files.
dataset_train = Dataset(os.path.join(input_dir, 'train_norm_luad_lscc.csv'), num_classes=3)
dataset_valid = Dataset(os.path.join(input_dir, 'valid_norm_luad_lscc.csv'), num_classes=3)
model = InceptionModel(num_classes=3, input_shape=(128,128,3), learning_rate=0.01)
print(model)

# NOTE(review): training output and the saved model go to output_dir, not to the
# output_dir_experiment created above -- confirm which location is intended.
model.train(dataset_train, batch_size=64, epochs=10, output_path=output_dir, validation_dataset=dataset_valid)
model.save(output_dir)

## Evaluation