# About

# Prerequisites

In [12]:
# Google Cloud project ID; passed below to the BigQuery cell magic (--project)
# and to gsutil (-u).
my_project_id = "idc-pathomics-000"

In [13]:
# Import all required python libraries for this use case
import os
import pandas as pd
!pip install pydata_google_auth
import pydata_google_auth
from google.cloud.bigquery import magics 



In [14]:
# Obtain user credentials restricted to the BigQuery scope.
credentials = pydata_google_auth.get_user_credentials(
    scopes=['https://www.googleapis.com/auth/bigquery'],
)
# Hand the credentials to the BigQuery magics context; they will be used
# later for BigQuery queries.
magics.context.credentials = credentials

In [15]:
# BigQuery authentication: https://cloud.google.com/docs/authentication/getting-started --> set the environment variable
# https://medium.com/john-lewis-software-engineering/authenticating-jupyter-notebook-against-bigquery-957884f78527
# from command line
#!gcloud auth application-default login --yes
# when using google.colab
#from google.colab import auth 
#auth.authenticate_user()

# Environment setup

In [16]:
# `name = !cmd` runs the shell command and captures its output lines in a
# list-like object.
curr_dir = !pwd
curr_droid = !hostname
curr_pilot = !whoami

# Report the last output line of each command.
print("Current directory :", curr_dir[-1])
print("Hostname          :", curr_droid[-1])
print("Username          :", curr_pilot[-1])

Current directory : /home/jupyter/idc-pathomics-use-case-1/src
Hostname          : idc-patho-vm
Username          : jupyter


In [17]:
%%capture
# Refresh the apt package index and install the python3-openslide system
# package; %%capture hides the command output.
# NOTE(review): the tile generation cell further down failed with
# "No module named 'openslide'" -- confirm that this apt package is importable
# from the Python interpreter used by the notebook kernel.
!sudo apt-get update
!sudo apt-get install --no-install-recommends -y python3-openslide

In [43]:
# Install other requirements or are these already available? Check!
!sudo pip3 install -r ../requirements.txt

Collecting gsutil (from -r ../requirements.txt (line 6))
  Using cached https://files.pythonhosted.org/packages/78/bf/9665adca43740ca4a0adde1525da1590d9ce65189cec17abd86a7ab30f75/gsutil-4.65.tar.gz
Collecting gcs-oauth2-boto-plugin>=2.7 (from gsutil->-r ../requirements.txt (line 6))
  Using cached https://files.pythonhosted.org/packages/f7/ab/3cc16742de84b76aa328c4b9e09fbf88447027827c12fb3913c5907be23b/gcs-oauth2-boto-plugin-2.7.tar.gz
Collecting google-apitools>=0.5.32 (from gsutil->-r ../requirements.txt (line 6))
  Using cached https://files.pythonhosted.org/packages/5e/cb/cb0311f2ec371c83d6510847476c665edc9cc97564a51923557bc8f0b680/google_apitools-0.5.32-py3-none-any.whl
Collecting mock==2.0.0 (from gsutil->-r ../requirements.txt (line 6))
  Using cached https://files.pythonhosted.org/packages/e6/35/f187bdf23be87092bd0f1200d43d23076cee4d0dec109f195173fd3ebc79/mock-2.0.0-py2.py3-none-any.whl
Collecting monotonic>=1.4 (from gsutil->-r ../requirements.txt (line 6))
  Using cached http

In [42]:
!pip install --upgrade pip

Collecting pip
  Downloading pip-21.2.1-py3-none-any.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 7.9 MB/s eta 0:00:01
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 21.1.3
    Uninstalling pip-21.1.3:
      Successfully uninstalled pip-21.1.3
Successfully installed pip-21.2.1


# Dataset selection and exploration using BigQuery

In [18]:
# TODO: explain where the attributes are obtained from, explain the BigQuery cell magic, and mention the alternative of using the BigQuery client, see https://cloud.google.com/bigquery/docs/visualize-jupyter?hl=de#pip

In [19]:
%%bigquery cohort_df --project=$my_project_id

WITH largest_per_scan AS (
    -- ContainerIdentifier is not unique if a slide was scanned twice, so a scan
    -- is identified by ContainerIdentifier together with FrameOfReferenceUID.
    -- For each scan, compute the pixel count of its largest image.
    SELECT
        ContainerIdentifier,
        FrameOfReferenceUID,
        MAX(TotalPixelMatrixColumns * TotalPixelMatrixRows) AS max_size
    FROM idc-dev-etl.idc_v3.dicom_metadata
    WHERE
        ContainerIdentifier IS NOT NULL
        AND ClinicalTrialProtocolID IN ("CPTAC-LUAD", "CPTAC-LSCC")
    GROUP BY ContainerIdentifier, FrameOfReferenceUID
)
SELECT
    img.ContainerIdentifier AS slide_id,
    img.PatientID AS patient_id,
    img.ClinicalTrialProtocolID AS tumor_subtype,
    img.TotalPixelMatrixColumns AS width,
    img.TotalPixelMatrixRows AS height,
    img.gcs_url
FROM largest_per_scan AS scan
JOIN idc-dev-etl.idc_v3.dicom_all AS img
    ON img.ContainerIdentifier = scan.ContainerIdentifier
    AND img.FrameOfReferenceUID = scan.FrameOfReferenceUID
-- Keep only the image(s) whose pixel count equals the maximum of their scan.
WHERE img.TotalPixelMatrixColumns * img.TotalPixelMatrixRows = scan.max_size

Query complete after 0.00s: 100%|██████████| 1/1 [00:00<00:00, 866.05query/s] 
Downloading: 100%|██████████| 2218/2218 [00:01<00:00, 1914.14rows/s]


In [20]:
# Print the first rows of the cohort table followed by its total number of rows.
print(cohort_df.head(), len(cohort_df))

       slide_id patient_id tumor_subtype  width  height  \
0  C3L-04784-27  C3L-04784    CPTAC-LUAD  73704   34427   
1  C3N-04457-21  C3N-04457    CPTAC-LSCC  73704   33679   
2  C3L-02627-27  C3L-02627    CPTAC-LSCC  69719   22057   
3  C3N-02434-26  C3N-02434    CPTAC-LSCC  51791   21419   
4  C3N-02285-30  C3N-02285    CPTAC-LSCC  19919   21038   

                                             gcs_url  
0  gs://idc_v3_cptac_luad/0eb19c55-3bb1-4448-8302...  
1  gs://idc_v3_cptac_lscc/46e1a340-c7f0-434e-ac5c...  
2  gs://idc_v3_cptac_lscc/b8e15005-14bd-4c2b-83c6...  
3  gs://idc_v3_cptac_lscc/4285b4ed-01d6-45b1-b7aa...  
4  gs://idc_v3_cptac_lscc/86f1e27d-34f2-41f9-9414...   2218


In [21]:
# Save the cohort table as CSV; the index column is omitted.
cohort_df.to_csv('/home/jupyter/idc_input/cohort.csv', index=False)

In [25]:
# Load the slide metadata CSV; its 'Slide_ID' and 'Specimen_Type' columns are
# read by add_tissue_type_information.
tissue_type_data = pd.read_csv('/home/jupyter/idc_input/CPTAC_LUAD-LSCC_metadata_from_TCIA.csv')

In [27]:
def add_tissue_type_information(cohort_df, tissue_type_data):
    """Return a copy of ``cohort_df`` extended by a ``tissue_type`` column.

    The tissue type of a slide is looked up in ``tissue_type_data``: the row
    whose ``Slide_ID`` equals the slide's ``slide_id`` provides the value of
    ``Specimen_Type``. Slides without an unambiguous lookup result (no matching
    row, or more than one matching row) are left out of the result.

    Afterwards some labels are shortened for clarity: ``CPTAC-LSCC`` and
    ``CPTAC-LUAD`` in ``tumor_subtype`` become ``LSCC`` and ``LUAD``, and
    ``normal_tissue`` and ``tumor_tissue`` in ``tissue_type`` become ``normal``
    and ``tumor``.

    Neither input frame is modified, so the cell calling this function can be
    re-run safely.

    Parameters
    ----------
    cohort_df : pandas.DataFrame
        Cohort table; must contain a ``slide_id`` column and at least three
        columns overall (the new column is inserted at position 3).
    tissue_type_data : pandas.DataFrame
        Metadata table with the columns ``Slide_ID`` and ``Specimen_Type``.

    Returns
    -------
    pandas.DataFrame
        The matched rows of ``cohort_df`` (original index labels preserved)
        with the additional ``tissue_type`` column.

    Raises
    ------
    KeyError
        If ``slide_id``, ``Slide_ID`` or ``Specimen_Type`` is missing. (A bare
        ``except`` used to hide this and silently dropped every row.)
    ValueError
        If ``cohort_df`` already contains a ``tissue_type`` column.
    """
    # Only slide IDs that occur exactly once give an unambiguous tissue type;
    # rows without a slide ID can never match.
    unambiguous = (
        tissue_type_data
        .dropna(subset=['Slide_ID'])
        .drop_duplicates(subset='Slide_ID', keep=False)
    )
    specimen_by_slide = unambiguous.set_index('Slide_ID')['Specimen_Type']

    # Work on a copy of the matched rows so the caller's frame stays untouched.
    has_tissue_type = cohort_df['slide_id'].isin(specimen_by_slide.index)
    matched_df = cohort_df.loc[has_tissue_type].copy()
    tissue_types = matched_df['slide_id'].map(specimen_by_slide).tolist()
    complete_df = _add_column_to_dataframe(matched_df, tissue_types)

    # Replace certain column values for clarity
    return complete_df.replace({
        'tumor_subtype': {'CPTAC-LSCC': 'LSCC', 'CPTAC-LUAD': 'LUAD'},
        'tissue_type': {'normal_tissue': 'normal', 'tumor_tissue': 'tumor'},
    })


def _add_column_to_dataframe(dataframe, column):
    """Insert ``column`` as ``tissue_type`` at position 3 of ``dataframe``.

    The frame is modified in place and returned; pass a copy if the original
    must stay unchanged. ``column`` needs one value per row of ``dataframe``.
    """
    assert len(dataframe) == len(column), 'Number of new column values not matching length of dataframe.'
    dataframe.insert(3, 'tissue_type', column)
    return dataframe

In [28]:
# Attach the tissue type to the cohort slides and save the resulting table as
# CSV; the index column is omitted.
slides_metadata = add_tissue_type_information(cohort_df, tissue_type_data)
slides_metadata.to_csv('/home/jupyter/idc_input/slides_metadata.csv', index=False)

In [29]:
# Show the first rows of the slide metadata table.
slides_metadata.head()

Unnamed: 0,slide_id,patient_id,tumor_subtype,tissue_type,width,height,gcs_url
1,C3N-04457-21,C3N-04457,LSCC,tumor,73704,33679,gs://idc_v3_cptac_lscc/46e1a340-c7f0-434e-ac5c...
2,C3L-02627-27,C3L-02627,LSCC,normal,69719,22057,gs://idc_v3_cptac_lscc/b8e15005-14bd-4c2b-83c6...
3,C3N-02434-26,C3N-02434,LSCC,normal,51791,21419,gs://idc_v3_cptac_lscc/4285b4ed-01d6-45b1-b7aa...
4,C3N-02285-30,C3N-02285,LSCC,normal,19919,21038,gs://idc_v3_cptac_lscc/86f1e27d-34f2-41f9-9414...
5,C3N-03441-21,C3N-03441,LSCC,tumor,59759,19141,gs://idc_v3_cptac_lscc/111e0317-009c-4167-8d43...


In [None]:
# IDC viewer for some pathomics data? 
def get_idc_viewer_url(study_UID):
    """Build the IDC viewer URL for a study by appending its UID to the viewer base URL."""
    viewer_base_url = "https://viewer.imaging.datacommons.cancer.gov/viewer/"
    return "".join((viewer_base_url, study_UID))

# Temporary Data Download 

In [72]:
cohort_df['gcs_url'][:10].to_csv('/home/jupyter/gcs_paths.txt', header=False, index=False)
!cat /home/jupyter/gcs_paths.txt

gs://idc_v3_cptac_lscc/cf647e08-1b9c-4e5d-8ee1-5abf166560a2.dcm
gs://idc_v3_cptac_lscc/457c48a2-6392-4ff1-92d4-7baff70561a2.dcm
gs://idc_v3_cptac_lscc/5ec9dd12-432a-416b-a196-c0d0f9819b5f.dcm
gs://idc_v3_cptac_lscc/4423f531-e087-4e39-b865-1b95ccc85ee8.dcm
gs://idc_v3_cptac_luad/38db2080-5cf5-4799-9724-786decfffb27.dcm
gs://idc_v3_cptac_lscc/d45a691e-3510-480b-a1b1-d19b0da9c23d.dcm
gs://idc_v3_cptac_lscc/d905d5f8-04ce-413a-8a7c-528361b30757.dcm
gs://idc_v3_cptac_lscc/1523696b-949e-4da1-9996-4f1855ac836a.dcm
gs://idc_v3_cptac_lscc/2b431901-91d1-4e58-8d38-6c48abdd6e27.dcm
gs://idc_v3_cptac_luad/de63298e-4bbb-460c-8a76-1fee1e91fd4c.dcm


In [73]:
!mkdir /home/jupyter/downloaded_cohort
!cat /home/jupyter/gcs_paths.txt | gsutil -u $my_project_id -m cp -I /home/jupyter/downloaded_cohort
# make use of dicomsort??? 

Copying gs://idc_v3_cptac_lscc/cf647e08-1b9c-4e5d-8ee1-5abf166560a2.dcm...
Copying gs://idc_v3_cptac_lscc/457c48a2-6392-4ff1-92d4-7baff70561a2.dcm...      
Copying gs://idc_v3_cptac_lscc/5ec9dd12-432a-416b-a196-c0d0f9819b5f.dcm...      
Copying gs://idc_v3_cptac_lscc/4423f531-e087-4e39-b865-1b95ccc85ee8.dcm...      
Copying gs://idc_v3_cptac_luad/38db2080-5cf5-4799-9724-786decfffb27.dcm...      
Copying gs://idc_v3_cptac_lscc/d45a691e-3510-480b-a1b1-d19b0da9c23d.dcm...      
Copying gs://idc_v3_cptac_lscc/d905d5f8-04ce-413a-8a7c-528361b30757.dcm...      
Copying gs://idc_v3_cptac_lscc/1523696b-949e-4da1-9996-4f1855ac836a.dcm...      
Copying gs://idc_v3_cptac_lscc/2b431901-91d1-4e58-8d38-6c48abdd6e27.dcm...      
Copying gs://idc_v3_cptac_luad/de63298e-4bbb-460c-8a76-1fee1e91fd4c.dcm...      
| [10/10 files][  3.4 GiB/  3.4 GiB] 100% Done  49.7 MiB/s ETA 00:00:00         
Operation completed over 10 objects/3.4 GiB.                                     


# Pathomics use case
## Preprocessing

In [34]:
# Alternative: read the input directory from the environment instead of hardcoding it.
#input_dir = os.environ['IDC_INPUT_DATA_DIR']
input_dir = '/home/jupyter/idc_input/'
# Sub-directories of input_dir for the slides and for the tiles.
slides_dir = os.path.join(input_dir, 'cptac_slides')
tiles_dir = os.path.join(input_dir, 'cptac_tiles')
# TODO: create the respective folders here?

In [37]:
# generate_tiles comes from the project-local module data.tile_generation_cptac.
from data.tile_generation_cptac import generate_tiles

# Presumably reads the slides in slides_dir and writes tiles to tiles_dir --
# TODO confirm against the implementation of generate_tiles.
generate_tiles(slides_dir, tiles_dir)

ModuleNotFoundError: No module named 'openslide'

In [None]:
#from data.tile_sorting import sort_tiles

#metadata_file = os.path.join(input_dir, 'metadata.cart.2017-03-02T00_36_30.276824.json')
#sort_tiles(tiles_dir, metadata_file, input_dir, 'norm_cancer', magnification=5.0)
#sort_tiles(tiles_dir, metadata_file, input_dir, 'luad_lusc', magnification=5.0)
#sort_tiles(tiles_dir, metadata_file, input_dir, 'norm_luad_lusc', magnification=5.0)

## Training
## Evaluation