# About

# Prerequisites

In [6]:
# Google Cloud project ID; passed to the BigQuery cell magic (--project) and to gsutil (-u) further below.
my_project_id = "idc-pathomics-000"

In [7]:
# Import all required python libraries for this use case
import os
import pandas as pd
import pydata_google_auth
from google.cloud.bigquery import magics 

In [8]:
# Request user credentials limited to the BigQuery OAuth scope.
credentials = pydata_google_auth.get_user_credentials(
    ['https://www.googleapis.com/auth/bigquery'],
)
# Register the credentials on the BigQuery magics context; they will be used later for BigQuery queries.
magics.context.credentials = credentials

In [9]:
# BigQuery authentication: https://cloud.google.com/docs/authentication/getting-started --> set the environment variable
# https://medium.com/john-lewis-software-engineering/authenticating-jupyter-notebook-against-bigquery-957884f78527
# from command line
#!gcloud auth application-default login --yes
# when using google.colab
#from google.colab import auth 
#auth.authenticate_user()

# Environment setup

In [10]:
# `name = !cmd` runs the shell command and captures its output as a list of lines.
curr_dir = !pwd
curr_droid = !hostname
curr_pilot = !whoami

# Print the last captured output line of each command.
print("Current directory :", curr_dir[-1])
print("Hostname          :", curr_droid[-1])
print("Username          :", curr_pilot[-1])

Current directory : /home/jupyter/idc-pathomics-use-case-1/src
Hostname          : idc-patho-vm-02
Username          : jupyter


In [11]:
%%capture
# Install the python3-openslide system package via apt-get;
# the %%capture cell magic suppresses the command output of this cell.
!sudo apt-get update
!sudo apt-get install --no-install-recommends -y python3-openslide

# Install other requirements or are these already available? 

# Dataset selection and exploration using BigQuery

In [12]:
# TODO: explain where the attributes are obtained from, explain the BigQuery cell magic, and mention the alternative of using the BigQuery client, see https://cloud.google.com/bigquery/docs/visualize-jupyter?hl=de#pip

In [13]:
%%bigquery cohort_df --project=$my_project_id 

-- For every scanned slide of the CPTAC-LUAD and CPTAC-LSCC collections, select the
-- image(s) whose total pixel matrix (columns * rows) is the largest one of that slide.
-- The query result is stored in the dataframe `cohort_df`.
SELECT
    b.ContainerIdentifier AS slide_id,
    b.PatientID AS patient_id,
    -- The collection name (CPTAC-LUAD / CPTAC-LSCC) is used as the tumor subtype label.
    b.ClinicalTrialProtocolID AS tumor_subtype,
    b.TotalPixelMatrixColumns AS width,
    b.TotalPixelMatrixRows AS height,
    b.gcs_url
FROM
    -- ContainerIdentifier is not unique if slide was scanned twice.
    -- Therefore, identify slides by both ContainerIdentifier and FrameOfReferenceUID.
    (
        -- Subquery `a`: largest pixel count per (ContainerIdentifier, FrameOfReferenceUID).
        SELECT
            ContainerIdentifier,
            FrameOfReferenceUID,
            MAX(TotalPixelMatrixColumns * TotalPixelMatrixRows) AS max_size
        FROM idc-dev-etl.idc_v3.dicom_metadata
        WHERE
            NOT (ContainerIdentifier IS NULL)
            AND (ClinicalTrialProtocolID = "CPTAC-LUAD"
                OR ClinicalTrialProtocolID = "CPTAC-LSCC")
        GROUP BY ContainerIdentifier, FrameOfReferenceUID 
    ) AS a
    -- Join back to the full table to fetch the remaining attributes of the matching rows.
    JOIN idc-dev-etl.idc_v3.dicom_all AS b ON
        b.ContainerIdentifier = a.ContainerIdentifier
        AND b.FrameOfReferenceUID = a.FrameOfReferenceUID
-- Keep only the rows whose pixel count equals the per-slide maximum.
WHERE a.max_size = b.TotalPixelMatrixColumns * b.TotalPixelMatrixRows

Query complete after 0.00s: 100%|██████████| 1/1 [00:00<00:00, 736.10query/s] 
Downloading: 100%|██████████| 2218/2218 [00:02<00:00, 876.97rows/s] 


In [14]:
# Show the first rows of the cohort followed by the total number of rows.
print(cohort_df.head(), len(cohort_df))

       slide_id patient_id tumor_subtype  width  height  \
0  C3L-04784-27  C3L-04784    CPTAC-LUAD  73704   34427   
1  C3N-04457-21  C3N-04457    CPTAC-LSCC  73704   33679   
2  C3L-02627-27  C3L-02627    CPTAC-LSCC  69719   22057   
3  C3N-02434-26  C3N-02434    CPTAC-LSCC  51791   21419   
4  C3N-02285-30  C3N-02285    CPTAC-LSCC  19919   21038   

                                             gcs_url  
0  gs://idc_v3_cptac_luad/0eb19c55-3bb1-4448-8302...  
1  gs://idc_v3_cptac_lscc/46e1a340-c7f0-434e-ac5c...  
2  gs://idc_v3_cptac_lscc/b8e15005-14bd-4c2b-83c6...  
3  gs://idc_v3_cptac_lscc/4285b4ed-01d6-45b1-b7aa...  
4  gs://idc_v3_cptac_lscc/86f1e27d-34f2-41f9-9414...   2218


In [15]:
# Persist the query result as CSV (without the index column).
cohort_df.to_csv('/home/jupyter/cohort.csv', index=False)

In [16]:
# Load the slide metadata table; its 'Slide_ID' and 'Specimen_Type' columns are used below
# to look up the tissue type of each slide.
tissue_type_data = pd.read_csv('/home/jupyter/CPTAC_LUAD-LSCC_metadata_from_TCIA.csv')

In [19]:
def add_tissue_type_information(cohort_df, tissue_type_data):
    """Return a copy of the cohort annotated with a 'tissue_type' column.

    Every slide in ``cohort_df`` is looked up by its 'slide_id' in the
    'Slide_ID' column of ``tissue_type_data``. A slide with exactly one
    matching metadata row receives that row's 'Specimen_Type' as its tissue
    type. Slides with no match or with several (ambiguous) matches are
    dropped. Afterwards the labels are shortened for clarity:
    'CPTAC-LSCC' -> 'LSCC', 'CPTAC-LUAD' -> 'LUAD' in 'tumor_subtype' and
    'normal_tissue' -> 'normal', 'tumor_tissue' -> 'tumor' in 'tissue_type'.

    Neither input is modified, so the calling notebook cell can be re-run
    without a "column already exists" error.

    Args:
        cohort_df: DataFrame with at least the columns 'slide_id' and
            'tumor_subtype' and at least three columns in total (the new
            column is inserted at position 3).
        tissue_type_data: DataFrame with the columns 'Slide_ID' and
            'Specimen_Type'.

    Returns:
        A new DataFrame containing the retained rows of ``cohort_df`` (index
        labels preserved) plus the 'tissue_type' column.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    # Keep only metadata rows whose slide ID is present and unambiguous;
    # keep=False discards *every* row of a duplicated slide ID.
    unique_metadata = (
        tissue_type_data
        .dropna(subset=['Slide_ID'])
        .drop_duplicates(subset='Slide_ID', keep=False)
    )
    specimen_by_slide = unique_metadata.set_index('Slide_ID')['Specimen_Type']

    # Select the slides that have a tissue type and work on a copy so that
    # the caller's dataframe stays untouched.
    has_tissue_type = cohort_df['slide_id'].isin(specimen_by_slide.index)
    annotated_df = cohort_df.loc[has_tissue_type].copy()
    tissue_types = annotated_df['slide_id'].map(specimen_by_slide).tolist()
    complete_df = _add_column_to_dataframe(annotated_df, tissue_types)

    # Replace certain column values for clarity (returns a new dataframe).
    return complete_df.replace({
        'tumor_subtype': {'CPTAC-LSCC': 'LSCC', 'CPTAC-LUAD': 'LUAD'},
        'tissue_type': {'normal_tissue': 'normal', 'tumor_tissue': 'tumor'},
    })


def _add_column_to_dataframe(dataframe, column):
    """Insert ``column`` as 'tissue_type' at column position 3 of ``dataframe``.

    The dataframe is modified in place and returned for convenience.

    Raises:
        AssertionError: If ``column`` does not have one value per row.
    """
    assert len(dataframe) == len(column), 'Number of new column values not matching length of dataframe.'
    dataframe.insert(3, 'tissue_type', column)
    return dataframe

In [20]:
# Annotate the cohort with tissue types and save the result as CSV (without the index column).
slides_metadata = add_tissue_type_information(cohort_df, tissue_type_data)
slides_metadata.to_csv('/home/jupyter/slides_metadata.csv', index=False)

In [22]:
# Preview the first rows of the annotated slide metadata.
slides_metadata.head()

Unnamed: 0,slide_id,patient_id,tumor_subtype,tissue_type,width,height,gcs_url
2212,C3L-04888-28,C3L-04888,LSCC,normal,39840,45168,gs://idc_v3_cptac_lscc/15eb8461-ca86-4fd8-be1e...
2213,C3L-05022-26,C3L-05022,LSCC,normal,21912,19502,gs://idc_v3_cptac_lscc/675db2c4-b4fb-4a19-9af5...
2214,C3L-02669-25,C3L-02669,LSCC,tumor,77687,20030,gs://idc_v3_cptac_lscc/d291390c-20a6-46e5-946f...
2216,C3L-00993-26,C3L-00993,LSCC,normal,37847,36071,gs://idc_v3_cptac_lscc/4f477204-3ecc-4c2e-ad17...
2217,C3N-03038-21,C3N-03038,LUAD,tumor,41831,16268,gs://idc_v3_cptac_luad/0252539c-a4aa-46e0-863e...


In [None]:
# IDC viewer for some pathomics data? 
def get_idc_viewer_url(study_UID):
    """Build the IDC viewer URL for a study.

    Args:
        study_UID: Study identifier as a string; it is appended to the
            viewer base URL unchanged.

    Returns:
        The viewer URL as a string.
    """
    viewer_base_url = "https://viewer.imaging.datacommons.cancer.gov/viewer/"
    return viewer_base_url + study_UID

# Temporary Data Download 

In [72]:
# Write the GCS URLs of the first 10 cohort rows to a text file (one URL per line,
# no header, no index) and print the file for inspection.
cohort_df['gcs_url'][:10].to_csv('/home/jupyter/gcs_paths.txt', header=False, index=False)
!cat /home/jupyter/gcs_paths.txt

gs://idc_v3_cptac_lscc/cf647e08-1b9c-4e5d-8ee1-5abf166560a2.dcm
gs://idc_v3_cptac_lscc/457c48a2-6392-4ff1-92d4-7baff70561a2.dcm
gs://idc_v3_cptac_lscc/5ec9dd12-432a-416b-a196-c0d0f9819b5f.dcm
gs://idc_v3_cptac_lscc/4423f531-e087-4e39-b865-1b95ccc85ee8.dcm
gs://idc_v3_cptac_luad/38db2080-5cf5-4799-9724-786decfffb27.dcm
gs://idc_v3_cptac_lscc/d45a691e-3510-480b-a1b1-d19b0da9c23d.dcm
gs://idc_v3_cptac_lscc/d905d5f8-04ce-413a-8a7c-528361b30757.dcm
gs://idc_v3_cptac_lscc/1523696b-949e-4da1-9996-4f1855ac836a.dcm
gs://idc_v3_cptac_lscc/2b431901-91d1-4e58-8d38-6c48abdd6e27.dcm
gs://idc_v3_cptac_luad/de63298e-4bbb-460c-8a76-1fee1e91fd4c.dcm


In [73]:
!mkdir /home/jupyter/downloaded_cohort
!cat /home/jupyter/gcs_paths.txt | gsutil -u $my_project_id -m cp -I /home/jupyter/downloaded_cohort
# make use of dicomsort??? 

Copying gs://idc_v3_cptac_lscc/cf647e08-1b9c-4e5d-8ee1-5abf166560a2.dcm...
Copying gs://idc_v3_cptac_lscc/457c48a2-6392-4ff1-92d4-7baff70561a2.dcm...      
Copying gs://idc_v3_cptac_lscc/5ec9dd12-432a-416b-a196-c0d0f9819b5f.dcm...      
Copying gs://idc_v3_cptac_lscc/4423f531-e087-4e39-b865-1b95ccc85ee8.dcm...      
Copying gs://idc_v3_cptac_luad/38db2080-5cf5-4799-9724-786decfffb27.dcm...      
Copying gs://idc_v3_cptac_lscc/d45a691e-3510-480b-a1b1-d19b0da9c23d.dcm...      
Copying gs://idc_v3_cptac_lscc/d905d5f8-04ce-413a-8a7c-528361b30757.dcm...      
Copying gs://idc_v3_cptac_lscc/1523696b-949e-4da1-9996-4f1855ac836a.dcm...      
Copying gs://idc_v3_cptac_lscc/2b431901-91d1-4e58-8d38-6c48abdd6e27.dcm...      
Copying gs://idc_v3_cptac_luad/de63298e-4bbb-460c-8a76-1fee1e91fd4c.dcm...      
| [10/10 files][  3.4 GiB/  3.4 GiB] 100% Done  49.7 MiB/s ETA 00:00:00         
Operation completed over 10 objects/3.4 GiB.                                     


# Pathomics use case
## Preprocessing

In [None]:
# Base directory of the input data, read from the IDC_INPUT_DATA_DIR environment variable
# (raises KeyError if the variable is not set).
input_dir = os.environ['IDC_INPUT_DATA_DIR']
# Sub-directories of the base directory for the slides and for the tiles.
slides_dir = os.path.join(input_dir, 'cptac_slides')
tiles_dir = os.path.join(input_dir, 'cptac_tiles')

In [None]:
# Project-local helper; presumably reads the slides in slides_dir and writes tiles
# to tiles_dir — TODO confirm against data/tile_generation_cptac.
from data.tile_generation_cptac import generate_tiles

generate_tiles(slides_dir, tiles_dir)

In [None]:
#from data.tile_sorting import sort_tiles

#metadata_file = os.path.join(input_dir, 'metadata.cart.2017-03-02T00_36_30.276824.json')
#sort_tiles(tiles_dir, metadata_file, input_dir, 'norm_cancer', magnification=5.0)
#sort_tiles(tiles_dir, metadata_file, input_dir, 'luad_lusc', magnification=5.0)
#sort_tiles(tiles_dir, metadata_file, input_dir, 'norm_luad_lusc', magnification=5.0)

## Training
## Evaluation