### 目的
- RNA-seqのデータを用いたがん種予測アルゴリズムの構築
- TOP RNA panelデータを変換しがん種予測を行う

CaSPRE: Cancer subtype Prediction with RNA Expression

In [None]:
# necessary modules

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, classification_report, recall_score, log_loss
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn import preprocessing
import sklearn.model_selection
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_classif, SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve, matthews_corrcoef
import numpy as np
import arviz
import tensorflow as tf
from scipy import linalg
from scipy.stats import norm
from statsmodels.stats.outliers_influence import summary_table
from statsmodels.sandbox.regression.predstd import wls_prediction_std

import keras
from tensorflow.keras import backend as K 
from keras.utils import np_utils
from tensorflow.keras.models import Model, load_model, model_from_json, Sequential, model_from_config
from tensorflow.keras.layers import concatenate, ThresholdedReLU, PReLU, LeakyReLU, ReLU, Activation, Conv1D, Conv2D, SeparableConv1D, SeparableConv2D, MaxPooling1D, MaxPooling2D,Input,Dense,AveragePooling2D, Dropout, Flatten, Activation, GlobalAveragePooling2D, Lambda, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint, Callback ,TensorBoard, LearningRateScheduler
from tensorflow.keras.preprocessing import image as images
from tensorflow.keras.preprocessing.image import array_to_img, img_to_array, load_img
from tensorflow.keras import optimizers
from tensorflow.keras import regularizers
from tensorflow.keras import initializers
from tensorflow.keras.metrics import TruePositives, TrueNegatives, FalsePositives, FalseNegatives, Precision, Recall
from tensorflow.python.framework import ops
import cv2
import gc

from sklearn.datasets import load_boston
import statsmodels.api as sm

import importlib
import pystan
import warnings
import requests
import pandas as pd
import cv2
import sys
import json
import os, os.path
import pathlib
import shutil
import math
import codecs
import time
import matplotlib

#########
matplotlib.use('Agg')

#%matplotlib inline
#plt.rc("figure", figsize=(16,8))
#plt.rc("font", size=14)
#########

import matplotlib.pyplot as plt
from PIL import Image
from glob import glob
import seaborn as sns


In [288]:
# settings

###################
# CHANGE the home directory for your environment.
home_dir = '/mnt/HDD8TB/CaSPRE'
###################

seed1 = 1
seed2 = 2

# setting for k-fold cross validation
n_Splits = 5
split = 0.25

# delete very rare classes 
factor_no = 9

data_dir =  "/mnt/HDD8TB/CaSPRE/TCGA_sort"
result_dir =  "/mnt/HDD8TB/CaSPRE/TCGA_TOP_genes"
learning = 0.0001
X_size = 525
Y_size = 1
batch_size = 64
epochs = 1024
patience = 32

acc_name = result_dir + "/accuracy.txt"
acc_table = result_dir + "/acc_table.tsv"


if not os.path.exists(data_dir):
    os.mkdir(data_dir)
if not os.path.exists(result_dir):
    os.mkdir(result_dir)

n_categories = len(os.listdir(data_dir))

def cal_weight(class_name_list,IN_DIR):
    amounts_of_class_dict = {}
    mx = 0
    tot = 0
    for class_name in class_name_list:
        class_dir = IN_DIR + os.sep + class_name
        file_list = os.listdir(class_dir)
        amounts_of_class_dict[class_name] = len(file_list)
        if mx < len(file_list):
            mx = len(file_list)
    class_weights = {}
    count = 0
    for class_name in class_name_list:
        class_weights[count] = round(float(math.pow(amounts_of_class_dict[class_name]/mx, -1)),2)
        # weight = 1/(data nubmer/ MAX data number)
        count += 1
    return class_weights


In [None]:
df_genes = pd.read_csv(home_dir + '/gene_list.csv', header=None, )
df_ENSG = pd.read_table(home_dir + '/ENSG_to_NCBI.tsv', header=0, index_col=0)
df_FPKM = pd.read_table(home_dir + '/FPKM_test.tsv', header=None)

ENSG_list = (df_ENSG[df_ENSG["external_gene_name"].isin(df_genes[0])]).index.drop_duplicates()
df_FPKM["id"] = df_FPKM[0].str.split(pat = ".", expand = True)[0]
ID_list = df_FPKM[df_FPKM["id"].isin(ENSG_list)][1]

np.save("ID_list.npy", ID_list)


In [None]:
# TCGA FPKM data
ID_list = np.load("ID_list.npy")
datapath = "/mnt/HDD8TB/CaSPRE/FPKM_calc/expression_data/*/*FPKM.txt.gz"
files = glob(datapath)  
files = sorted(files)

dir = '/mnt/HDD8TB/CaSPRE/TCGA_values'

In [None]:
# Normal sample RNA-seq FPKM data
ID_list = np.load("ID_list.npy")
datapath = "/mnt/HDD8TB/CaSPRE/RNA_normal_FPKM/*FPKM.txt"
files = glob(datapath)  
files = sorted(files)

dir = '/mnt/HDD8TB/CaSPRE/Normal_RNA_values'

In [None]:
# Normal sample TOP-RNA FPKM data
ID_list = np.load("ID_list.npy")
datapath = "/mnt/HDD8TB/CaSPRE/TOP_normal_FPKM/*FPKM.txt"
files = glob(datapath)  
files = sorted(files)

dir = '/mnt/HDD8TB/CaSPRE/Normal_TOP_values'

In [None]:
# Tumor sample TOP-RNA FPKM data
ID_list = np.load("ID_list.npy")
datapath = "/mnt/HDD8TB/CaSPRE/TOP_FPKM/*FPKM.txt"
files = glob(datapath)  
files = sorted(files)

dir = '/mnt/HDD8TB/CaSPRE/Tumor_TOP_values'

In [None]:
# Save FPKM of selected genes in npy file
for dataNb in range(len(files)):
    filename = files[dataNb]
    basename = os.path.splitext(os.path.basename(filename))[0]
    tmp = pd.read_csv(filename,delimiter="\t", header=None)
    tmp.columns = ['name','value']
    tmp = tmp.set_index('name')
    tmp = tmp.sort_index()
    tmp = tmp["value"] + 1
    tmp = tmp[ID_list]
    tmp = np.log2(tmp) / 16 - 0.5
    filename = os.path.join(basename + ".npy")
    print(dataNb, filename, ' / ', len(files))
    np.save(os.path.join(dir, filename), tmp)

In [None]:
# get data file

filelist = "/mnt/HDD8TB/CaSPRE/files.json"
clinical = "/mnt/HDD8TB/CaSPRE/clinical.tsv"
sorted_patho = "/mnt/HDD8TB/CaSPRE/TCGA_sort"
datapath = "/mnt/HDD8TB/CaSPRE/TCGA_values/*.FPKM.txt.npy"

files = glob(datapath)
df_clin = pd.read_table(clinical, header=0, index_col=0)
f = open(filelist, 'r')
jdata = json.load(f)
cnt = 0
if not os.path.exists(sorted_patho):
    os.makedirs(sorted_patho)

for fileName in files:
    imgRootName = os.path.basename(fileName).replace(".FPKM.txt.npy", "")[-36:]
    ID = -1
    for TotImg in range(len(jdata)):
        if jdata[TotImg]['file_name'].startswith(imgRootName):
            case = jdata[TotImg]['cases'][0]['case_id']
            ID = TotImg
            break

    if ID == -1:
        print("File name not found in json file.")
        continue
    cnt += 1
    print("***** ", fileName, cnt)

    project = df_clin.loc[case].loc["project_id"]
    # gender = df_clin.loc[case].loc["gender"]
    # vital = df_clin.loc[case].loc["vital_status"]
    origin = df_clin.loc[case].loc["tissue_or_organ_of_origin"]
    # prior = df_clin.loc[case].loc["prior_malignancy"]
    patho = df_clin.loc[case].loc["primary_diagnosis"]
    # stage = df_clin.loc[case].loc["tumor_stage"]
    # ann_stage = df_clin.loc[case].loc["ann_arbor_clinical_stage"]

    # vital = vital.replace(" ", "_")
    origin = origin.replace(" ", "_")
    origin = origin.replace(",_NOS", "")
    # prior = prior.replace(" ", "_")
    # stage = str(stage)
    # stage = stage.replace(" ", "")
    # stage = stage.replace("stage", "")
    # stage = stage.replace('a','')
    # stage = stage.replace('b','')
    # ann_stage = ann_stage.replace(" ", "")
    # ann_stage = ann_stage.replace("stage", "")
    # ann_stage = ann_stage.replace('a','')
    # ann_stage = ann_stage.replace('b','')


    # pathology classification based on OncoTree

    # exclude: Acute Leukemias of Ambiguous Lineage, because of its ambiguous disease entity
    # exclude: Undifferentiated pleomorphic sarcoma, called a diagnosis of exclusion

    patho = project + '_' + patho
    patho = patho.replace(" ", "_")
    patho = patho.replace(",_NOS", "")

    patho = patho.replace("BEATAML1.0-COHORT_Acute_monoblastic_and_monocytic_leukemia", "Acute Myeloid Leukemia")
    patho = patho.replace("BEATAML1.0-COHORT_Acute_myeloid_leukemia,_CBF-beta", "Acute Myeloid Leukemia")
    patho = patho.replace("BEATAML1.0-COHORT_Acute_myeloid_leukemia_with_inv(3)(q21q26.2)_or_t(3;3)(q21;q26.2);_RPN1-EVI1", "Acute Myeloid Leukemia")
    patho = patho.replace("BEATAML1.0-COHORT_Acute_myeloid_leukemia_with_mutated_CEBPA", "Acute Myeloid Leukemia")
    patho = patho.replace("BEATAML1.0-COHORT_Mixed_phenotype_acute_leukemia,_T", "other")
    patho = patho.replace("BEATAML1.0-COHORT_Mixed_phenotype_acute_leukemia,_B", "other")
    patho = patho.replace("BEATAML1.0-COHORT_Acute_myeloid_leukemia_with_mutated_NPM1", "Acute Myeloid Leukemia")
    patho = patho.replace("BEATAML1.0-COHORT_Acute_myeloid_leukemia_with_myelodysplasia-related_changes", "Acute Myeloid Leukemia")
    patho = patho.replace("BEATAML1.0-COHORT_Acute_myeloid_leukemia_with_t(8;21)(q22;q22);_RUNX1-RUNX1T1", "Acute Myeloid Leukemia")
    patho = patho.replace("BEATAML1.0-COHORT_Acute_myeloid_leukemia_with_t(9;11)(p22;q23);_MLLT3-MLL", "Acute Myeloid Leukemia")
    patho = patho.replace("BEATAML1.0-COHORT_Myeloid_sarcoma", "other")
    patho = patho.replace("BEATAML1.0-COHORT_Acute_myelomonocytic_leukemia", "Acute Myeloid Leukemia")
    patho = patho.replace("BEATAML1.0-COHORT_Acute_promyelocytic_leukaemia,_PML-RAR-alpha", "Acute Myeloid Leukemia")
    patho = patho.replace("BEATAML1.0-COHORT_Acute_erythroid_leukaemia", "Acute Myeloid Leukemia")
    patho = patho.replace("BEATAML1.0-COHORT_Myeloid_leukemia_associated_with_Down_Syndrome", "Acute Myeloid Leukemia")
    patho = patho.replace("BEATAML1.0-COHORT_Acute_myeloid_leukemia", "Acute Myeloid Leukemia")
    patho = patho.replace("CGCI-BLGSP_--", "other")
    patho = patho.replace("CGCI-BLGSP_Burkitt-like_lymphoma", "other")
    patho = patho.replace("CGCI-BLGSP_Burkitt_lymphoma", "Burkitt_lymphoma")
    patho = patho.replace("CPTAC-3_Adenocarcinoma", "Lung Adenocarcinoma")
    patho = patho.replace("CPTAC-3_Endometrioid_adenocarcinoma", "Uterine Endometrial Carcinoma")
    patho = patho.replace("CPTAC-3_Renal_cell_carcinoma", "Kidney_Renal Clear Cell Carcinoma")
    patho = patho.replace("CTSP-DLBCL1_Diffuse_large_B-cell_lymphoma", "Diffuse Large B-Cell Lymphoma")
    patho = patho.replace("HCMI-CMDC_Adenocarcinoma", origin + " Adenocarcinoma")
    patho = patho.replace("HCMI-CMDC_Glioblastoma", "Brain_Glioblastoma")
    patho = patho.replace("MMRF-COMMPASS_--", "other")
    patho = patho.replace("MMRF-COMMPASS_Multiple_myeloma", "Multiple_Myeloma")
    patho = patho.replace("NCICCR-DLBCL_Diffuse_large_B-cell_lymphoma", "Diffuse Large B-Cell Lymphoma")        
    patho = patho.replace("TARGET-ALL-P1_Mixed_phenotype_acute_leukemia,_T/myeloid", "other")
    patho = patho.replace("TARGET-ALL-P1_T_lymphoblastic_leukemia/lymphoma", "T-Lymphoblastic Leukemia-Lymphoma")
    patho = patho.replace("TARGET-ALL-P1_Precursor_B-cell_lymphoblastic_leukemia", "B-Lymphoblastic Leukemia-Lymphoma")
    patho = patho.replace("TARGET-ALL-P1_Mixed_phenotype_acute_leukemia,_B/myeloid", "other")
    patho = patho.replace("TARGET-ALL-P1_Mixed_phenotype_acute_leukemia_with_t(v;11q23);_MLL_rearranged", "other")
    patho = patho.replace("TARGET-ALL-P1_Undifferentiated_leukaemia", "other")
    patho = patho.replace("TARGET-ALL-P1_Mixed_phenotype_acute_leukemia_with_t(9;22)(q34;q11.2);_BCR-ABL1", "other")
    patho = patho.replace("TARGET-ALL-P1_Leukemia", "other")
    patho = patho.replace("TARGET-ALL-P1_B_lymphoblastic_leukemia/lymphoma", "B-Lymphoblastic Leukemia-Lymphoma")
    patho = patho.replace("TARGET-ALL-P1_Juvenile_myelomonocytic_leukemia", "other")
    patho = patho.replace("TARGET-ALL-P1_--", "other")
    patho = patho.replace("TARGET-ALL-P2_Mixed_phenotype_acute_leukemia,_T/myeloid", "other")
    patho = patho.replace("TARGET-ALL-P2_T_lymphoblastic_leukemia/lymphoma", "T-Lymphoblastic Leukemia-Lymphoma")
    patho = patho.replace("TARGET-ALL-P2_Precursor_B-cell_lymphoblastic_leukemia", "B-Lymphoblastic Leukemia-Lymphoma")
    patho = patho.replace("TARGET-ALL-P2_Mixed_phenotype_acute_leukemia,_B/myeloid", "other")
    patho = patho.replace("TARGET-ALL-P2_Mixed_phenotype_acute_leukemia_with_t(v;11q23);_MLL_rearranged", "other")
    patho = patho.replace("TARGET-ALL-P2_Undifferentiated_leukaemia", "other")
    patho = patho.replace("TARGET-ALL-P2_Mixed_phenotype_acute_leukemia_with_t(9;22)(q34;q11.2);_BCR-ABL1", "other")
    patho = patho.replace("TARGET-ALL-P2_Leukemia", "other")
    patho = patho.replace("TARGET-ALL-P2_B_lymphoblastic_leukemia/lymphoma", "B-Lymphoblastic Leukemia-Lymphoma")
    patho = patho.replace("TARGET-ALL-P2_Juvenile_myelomonocytic_leukemia", "other")
    patho = patho.replace("TARGET-ALL-P3_Mixed_phenotype_acute_leukemia,_T/myeloid", "other")
    patho = patho.replace("TARGET-ALL-P3_T_lymphoblastic_leukemia/lymphoma", "T-Lymphoblastic Leukemia-Lymphoma")
    patho = patho.replace("TARGET-ALL-P3_Precursor_B-cell_lymphoblastic_leukemia", "B-Lymphoblastic Leukemia-Lymphoma")
    patho = patho.replace("TARGET-ALL-P3_Mixed_phenotype_acute_leukemia,_B/myeloid", "other")
    patho = patho.replace("TARGET-ALL-P3_Mixed_phenotype_acute_leukemia_with_t(v;11q23);_MLL_rearranged", "other")
    patho = patho.replace("TARGET-ALL-P3_Undifferentiated_leukaemia", "other")
    patho = patho.replace("TARGET-ALL-P3_Mixed_phenotype_acute_leukemia_with_t(9;22)(q34;q11.2);_BCR-ABL1", "other")
    patho = patho.replace("TARGET-ALL-P3_Leukemia", "other")
    patho = patho.replace("TARGET-ALL-P3_B_lymphoblastic_leukemia/lymphoma", "B-Lymphoblastic Leukemia-Lymphoma")
    patho = patho.replace("TARGET-ALL-P3_Juvenile_myelomonocytic_leukemia", "other")
    patho = patho.replace("TARGET-ALL-P3_Not_Reported", "other")
    patho = patho.replace("TARGET-ALL-P3_Mixed_phenotype_acute_leukemia_with_t(9;22)(q34;q11.2);_BCR-ABL1", "B-Lymphoblastic Leukemia-Lymphoma")
    patho = patho.replace("TARGET-ALL-P3_Mixed_phenotype_acute_leukemia_with_t(v;11q23);_MLL_rearranged", "B-Lymphoblastic Leukemia-Lymphoma")
    patho = patho.replace("TARGET-ALL-P3_Acute_myeloid_leukemia", "Acute_Myeloid_Leukemia")
    patho = patho.replace("TARGET-AML_Acute_myeloid_leukemia", "Acute Myeloid Leukemia")
    patho = patho.replace("TARGET-CCSK_Clear_cell_sarcoma_of_kidney", "Kidney_Clear Cell Sarcoma of Kidney")
    patho = patho.replace("TARGET-NBL_Neuroblastoma", "Neuroblastoma-Ganglioneuroblastoma")
    patho = patho.replace("TARGET-NBL_Ganglioneuroblastoma", "Neuroblastoma-Ganglioneuroblastoma")
    patho = patho.replace("TARGET-OS_Osteosarcoma", "Bone_Osteosarcoma")
    patho = patho.replace("TARGET-RT_Malignant_rhabdoid_tumor", "Rhabdoid Cancer")
    patho = patho.replace("TARGET-WT_Wilms_tumor", "Wilms Tumor")
    patho = patho.replace("TCGA-ACC_Adrenal_cortical_carcinoma", "Adrenocortical Carcinoma")
    patho = patho.replace("TCGA-BLCA_Carcinoma", "other")
    patho = patho.replace("TCGA-BLCA_Papillary_adenocarcinoma", "other")
    patho = patho.replace("TCGA-BLCA_Papillary_transitional_cell_carcinoma", "Bladder Urothelial Carcinoma")
    patho = patho.replace("TCGA-BLCA_Squamous_cell_carcinoma", "other")
    patho = patho.replace("TCGA-BLCA_Transitional_cell_carcinoma", "Bladder Urothelial Carcinoma")        
    patho = patho.replace("TCGA-BRCA_--", "other")
    patho = patho.replace("TCGA-BRCA_Adenoid_cystic_carcinoma", "Breast_Invasive Breast Carcinoma")
    patho = patho.replace("TCGA-BRCA_Apocrine_adenocarcinoma", "other")
    patho = patho.replace("TCGA-BRCA_Basal_cell_carcinoma", "other")
    patho = patho.replace("TCGA-BRCA_Carcinoma", "other")
    patho = patho.replace("TCGA-BRCA_Cribriform_carcinoma", "other")
    patho = patho.replace("TCGA-BRCA_Infiltrating_duct_and_lobular_carcinoma", "Breast_Invasive Breast Carcinoma")
    patho = patho.replace("TCGA-BRCA_Infiltrating_duct_carcinoma", "Breast_Invasive Breast Carcinoma")
    patho = patho.replace("TCGA-BRCA_Infiltrating_duct_mixed_with_other_types_of_carcinoma", "Breast_Invasive Breast Carcinoma")
    patho = patho.replace("TCGA-BRCA_Infiltrating_lobular_mixed_with_other_types_of_carcinoma", "Breast_Invasive Breast Carcinoma")
    patho = patho.replace("TCGA-BRCA_Intraductal_micropapillary_carcinoma", "Breast_Invasive Breast Carcinoma")
    patho = patho.replace("TCGA-BRCA_Intraductal_papillary_adenocarcinoma_with_invasion", "Breast_Invasive Breast Carcinoma")        
    patho = patho.replace("TCGA-BRCA_Large_cell_neuroendocrine_carcinoma", "other")
    patho = patho.replace("TCGA-BRCA_Lobular_carcinoma", "Breast_Invasive Breast Carcinoma")
    patho = patho.replace("TCGA-BRCA_Medullary_carcinoma", "other")
    patho = patho.replace("TCGA-BRCA_Metaplastic_carcinoma", "other")
    patho = patho.replace("TCGA-BRCA_Mucinous_adenocarcinoma", "Breast_Invasive Breast Carcinoma")
    patho = patho.replace("TCGA-BRCA_Paget_disease_and_infiltrating_duct_carcinoma_of_breast", "Breast_Invasive Breast Carcinoma")
    patho = patho.replace("TCGA-BRCA_Papillary_carcinoma", "Breast_Invasive Breast Carcinoma")
    patho = patho.replace("TCGA-BRCA_Phyllodes_tumor", "other")
    patho = patho.replace("TCGA-BRCA_Pleomorphic_carcinoma", "other")
    patho = patho.replace("TCGA-BRCA_Secretory_carcinoma_of_breast", "other")
    patho = patho.replace("TCGA-BRCA_Tubular_adenocarcinoma", "Breast_Invasive Breast Carcinoma")
    patho = patho.replace("TCGA-CESC_Adenocarcinoma", "other")
    patho = patho.replace("TCGA-CESC_Adenosquamous_carcinoma", "other")
    patho = patho.replace("TCGA-CESC_Basaloid_squamous_cell_carcinoma", "Cervical Squamous Cell Carcinoma")
    patho = patho.replace("TCGA-CESC_Endometrioid_adenocarcinoma", "Cervical Adenocarcinoma")
    patho = patho.replace("TCGA-CESC_Mucinous_adenocarcinoma", "Cervical Adenocarcinoma")
    patho = patho.replace("TCGA-CESC_Papillary_squamous_cell_carcinoma", "Cervical Squamous Cell Carcinoma")
    patho = patho.replace("TCGA-CESC_Squamous_cell_carcinoma", "Cervical Squamous Cell Carcinoma")
    patho = patho.replace("TCGA-CHOL_Cholangiocarcinoma", "Cholangiocarcinoma")
    patho = patho.replace("TCGA-COAD_--", "other")
    patho = patho.replace("TCGA-COAD_Adenocarcinoma", "Colorectal Adenocarcinoma")
    patho = patho.replace("TCGA-COAD_Adenosquamous_carcinoma", "other")        
    patho = patho.replace("TCGA-COAD_Carcinoma", "other")
    patho = patho.replace("TCGA-COAD_Mucinous_adenocarcinoma", "Colorectal Adenocarcinoma")
    patho = patho.replace("TCGA-COAD_Papillary_adenocarcinoma", "Colorectal Adenocarcinoma")
    patho = patho.replace("TCGA-DLBC_Diffuse_large_B-cell_lymphoma", "Diffuse Large B-Cell Lymphoma")
    patho = patho.replace("TCGA-DLBC_Malignant_lymphoma,_large_B-cell,_diffuse", "Diffuse Large B-Cell Lymphoma")
    patho = patho.replace("TCGA-ESCA_Adenocarcinoma", "Esophagogastric Adenocarcinoma")
    patho = patho.replace("TCGA-ESCA_Basaloid_squamous_cell_carcinoma", origin + " Squamous Cell Carcinoma")
    patho = patho.replace("TCGA-ESCA_Mucinous_adenocarcinoma", "Esophagogastric Adenocarcinoma")
    patho = patho.replace("TCGA-ESCA_Squamous_cell_carcinoma", origin + " Squamous Cell Carcinoma")
    patho = patho.replace("TCGA-ESCA_Tubular_adenocarcinoma", "Esophagogastric Adenocarcinoma")
    patho = patho.replace("TCGA-GBM_--", "other")
    patho = patho.replace("TCGA-GBM_Glioblastoma", "Brain_Glioblastoma")        
    patho = patho.replace("TCGA-HNSC_Basaloid_squamous_cell_carcinoma", "Head and Neck Squamous Cell Carcinoma")
    patho = patho.replace("TCGA-HNSC_Squamous_cell_carcinoma", "Head and Neck Squamous Cell Carcinoma")
    patho = patho.replace("TCGA-KICH_Renal_cell_carcinoma", "Kidney_Chromophobe Renal Cell Carcinoma")
    patho = patho.replace("TCGA-KIRC_Clear_cell_adenocarcinoma", "Kidney_Renal Clear Cell Carcinoma")
    patho = patho.replace("TCGA-KIRC_Renal_cell_carcinoma", "other")
    patho = patho.replace("TCGA-KIRP_Papillary_adenocarcinoma", "Kidney_Papillary Renal Cell Carcinoma")
    patho = patho.replace("TCGA-LAML_Acute_myeloid_leukemia", "Acute Myeloid Leukemia")
    patho = patho.replace("TCGA-LGG_--", "other")
    patho = patho.replace("TCGA-LGG_Astrocytoma", "Brain_Oligodendroglioma-Astrocytoma")
    patho = patho.replace("TCGA-LGG_Mixed_glioma", "Brain_Oligodendroglioma-Astrocytoma")
    patho = patho.replace("TCGA-LGG_Oligodendroglioma", "Brain_Oligodendroglioma-Astrocytoma")
    patho = patho.replace("TCGA-LIHC_Clear_cell_adenocarcinoma", "other")        
    patho = patho.replace("TCGA-LIHC_Combined_hepatocellular_carcinoma_and_cholangiocarcinoma", "other")
    patho = patho.replace("TCGA-LIHC_Hepatocellular_carcinoma", "Hepatocellular Carcinoma")
    patho = patho.replace("TCGA-LUAD_Acinar_cell_carcinoma", "Lung Adenocarcinoma")
    patho = patho.replace("TCGA-LUAD_Adenocarcinoma", "Lung Adenocarcinoma")
    patho = patho.replace("TCGA-LUAD_Bronchio-alveolar_carcinoma,_mucinous", "other")
    patho = patho.replace("TCGA-LUAD_Bronchiolo-alveolar_adenocarcinoma", "other")
    patho = patho.replace("TCGA-LUAD_Bronchiolo-alveolar_carcinoma,_non-mucinous", "other")
    patho = patho.replace("TCGA-LUAD_Clear_cell_adenocarcinoma", "Lung Adenocarcinoma")
    patho = patho.replace("TCGA-LUAD_Micropapillary_carcinoma", "Lung Adenocarcinoma")        
    patho = patho.replace("TCGA-LUAD_Mucinous_adenocarcinoma", "Lung Adenocarcinoma")
    patho = patho.replace("TCGA-LUAD_Papillary_adenocarcinoma", "Lung Adenocarcinoma")
    patho = patho.replace("TCGA-LUAD_Signet_ring_cell_carcinoma", "Lung Adenocarcinoma")
    patho = patho.replace("TCGA-LUAD_Solid_carcinoma", "Lung Adenocarcinoma")
    patho = patho.replace("TCGA-LUSC_Basaloid_squamous_cell_carcinoma", "Lung Squamous Cell Carcinoma")
    patho = patho.replace("TCGA-LUSC_Papillary_squamous_cell_carcinoma", "Lung Squamous Cell Carcinoma")
    patho = patho.replace("TCGA-LUSC_Squamous_cell_carcinoma,_keratinizing", "Lung Squamous Cell Carcinoma")
    patho = patho.replace("TCGA-LUSC_Squamous_cell_carcinoma,_large_cell,_nonkeratinizing", "Lung Squamous Cell Carcinoma")
    patho = patho.replace("TCGA-LUSC_Squamous_cell_carcinoma,_small_cell,_nonkeratinizing", "Lung Squamous Cell Carcinoma")
    patho = patho.replace("TCGA-LUSC_Squamous_cell_carcinoma", "Lung Squamous Cell Carcinoma")
    patho = patho.replace("TCGA-MESO_Epithelioid_mesothelioma,_malignant", "Pleural Mesothelioma")        
    patho = patho.replace("TCGA-MESO_Fibrous_mesothelioma,_malignant", "Pleural Mesothelioma")
    patho = patho.replace("TCGA-MESO_Mesothelioma,_biphasic,_malignant", "Pleural Mesothelioma")
    patho = patho.replace("TCGA-MESO_Mesothelioma,_malignant", "Pleural Mesothelioma")
    patho = patho.replace("TCGA-OV_Papillary_serous_cystadenocarcinoma", "Ovary_Serous Ovarian Cancer")
    patho = patho.replace("TCGA-OV_Serous_cystadenocarcinoma", "Ovary_Serous Ovarian Cancer")
    patho = patho.replace("TCGA-PAAD_Adenocarcinoma_with_mixed_subtypes", "Pancreatic Adenocarcinoma")
    patho = patho.replace("TCGA-PAAD_Adenocarcinoma", "Pancreatic Adenocarcinoma")
    patho = patho.replace("TCGA-PAAD_Carcinoma,_undifferentiated", "other")
    patho = patho.replace("TCGA-PAAD_Infiltrating_duct_carcinoma", "Pancreatic Adenocarcinoma")
    patho = patho.replace("TCGA-PAAD_Mucinous_adenocarcinoma", "Pancreatic Adenocarcinoma")
    patho = patho.replace("TCGA-PAAD_Neuroendocrine_carcinoma", "other")        
    patho = patho.replace("TCGA-PCPG_Extra-adrenal_paraganglioma,_malignant", "Pheochromocytoma-Paraganglioma")
    patho = patho.replace("TCGA-PCPG_Extra-adrenal_paraganglioma", "Pheochromocytoma-Paraganglioma")
    patho = patho.replace("TCGA-PCPG_Paraganglioma,_malignant", "Pheochromocytoma-Paraganglioma")
    patho = patho.replace("TCGA-PCPG_Paraganglioma", "Pheochromocytoma-Paraganglioma")
    patho = patho.replace("TCGA-PCPG_Pheochromocytoma,_malignant", "Pheochromocytoma-Paraganglioma")
    patho = patho.replace("TCGA-PCPG_Pheochromocytoma", "Pheochromocytoma-Paraganglioma")
    patho = patho.replace("TCGA-PRAD_Adenocarcinoma_with_mixed_subtypes", "Prostate Adenocarcinoma")
    patho = patho.replace("TCGA-PRAD_Adenocarcinoma", "Prostate Adenocarcinoma")
    patho = patho.replace("TCGA-PRAD_Infiltrating_duct_carcinoma", "Prostate Adenocarcinoma")
    patho = patho.replace("TCGA-PRAD_Mucinous_adenocarcinoma", "Prostate Adenocarcinoma")
    patho = patho.replace("TCGA-READ_--", "other")
    patho = patho.replace("TCGA-READ_Adenocarcinoma_in_tubolovillous_adenoma", "Colorectal Adenocarcinoma")
    patho = patho.replace("TCGA-READ_Adenocarcinoma_with_mixed_subtypes", "Colorectal Adenocarcinoma")
    patho = patho.replace("TCGA-READ_Adenocarcinoma", "Colorectal Adenocarcinoma")
    patho = patho.replace("TCGA-READ_Mucinous_adenocarcinoma", "Colorectal Adenocarcinoma")
    patho = patho.replace("TCGA-READ_Tubular_adenocarcinoma", "Colorectal Adenocarcinoma")
    patho = patho.replace("TCGA-SARC_Abdominal_fibromatosis", "other")
    patho = patho.replace("TCGA-SARC_Aggressive_fibromatosis", "other")
    patho = patho.replace("TCGA-SARC_Dedifferentiated_liposarcoma", "STS_Dedifferentiated liposarcoma")
    patho = patho.replace("TCGA-SARC_Fibromyxosarcoma", "STS_Myxofibrosarcoma")
    patho = patho.replace("TCGA-SARC_Giant_cell_sarcoma", "other")
    patho = patho.replace("TCGA-SARC_Leiomyosarcoma", "STS_Leiomyosarcoma")
    patho = patho.replace("TCGA-SARC_Liposarcoma,_well_differentiated", "other")
    patho = patho.replace("TCGA-SARC_Malignant_fibrous_histiocytoma", "other")
    patho = patho.replace("TCGA-SARC_Malignant_peripheral_nerve_sheath_tumor", "other")
    patho = patho.replace("TCGA-SARC_Myxoid_leiomyosarcoma", "STS_Leiomyosarcoma")
    patho = patho.replace("TCGA-SARC_Pleomorphic_liposarcoma", "other")
    patho = patho.replace("TCGA-SARC_Synovial_sarcoma,_biphasic", "STS_Synovial Sarcoma")
    patho = patho.replace("TCGA-SARC_Synovial_sarcoma,_spindle_cell", "STS_Synovial Sarcoma")
    patho = patho.replace("TCGA-SARC_Synovial_sarcoma", "STS_Synovial Sarcoma")
    patho = patho.replace("TCGA-SARC_Undifferentiated_sarcoma", "other")
    patho = patho.replace("TCGA-SKCM_Acral_lentiginous_melanoma,_malignant", "Cutaneous Melanoma")
    patho = patho.replace("TCGA-SKCM_Desmoplastic_melanoma,_malignant", "Cutaneous Melanoma")
    patho = patho.replace("TCGA-SKCM_Amelanotic_melanoma", "Cutaneous Melanoma")        
    patho = patho.replace("TCGA-SKCM_Epithelioid_cell_melanoma", "Cutaneous Melanoma")
    patho = patho.replace("TCGA-SKCM_Lentigo_maligna_melanoma", "Cutaneous Melanoma")
    patho = patho.replace("TCGA-SKCM_Malignant_melanoma", "Cutaneous Melanoma")
    patho = patho.replace("TCGA-SKCM_Mixed_epithelioid_and_spindle_cell_melanoma", "Cutaneous Melanoma")
    patho = patho.replace("TCGA-SKCM_Nodular_melanoma", "Cutaneous Melanoma")
    patho = patho.replace("TCGA-SKCM_Spindle_cell_melanoma", "Cutaneous Melanoma")
    patho = patho.replace("TCGA-SKCM_Superficial_spreading_melanoma", "Cutaneous Melanoma")
    patho = patho.replace("TCGA-STAD_Adenocarcinoma_with_mixed_subtypes", "Esophagogastric Adenocarcinoma")
    patho = patho.replace("TCGA-STAD_Adenocarcinoma,_intestinal_type", "Esophagogastric Adenocarcinoma")
    patho = patho.replace("TCGA-STAD_Adenocarcinoma", "Esophagogastric Adenocarcinoma")
    patho = patho.replace("TCGA-STAD_Carcinoma,_diffuse_type", "Esophagogastric Adenocarcinoma")        
    patho = patho.replace("TCGA-STAD_Mucinous_adenocarcinoma", "Esophagogastric Adenocarcinoma")
    patho = patho.replace("TCGA-STAD_Papillary_adenocarcinoma", "Esophagogastric Adenocarcinoma")
    patho = patho.replace("TCGA-STAD_Signet_ring_cell_carcinoma", "Esophagogastric Adenocarcinoma")
    patho = patho.replace("TCGA-STAD_Tubular_adenocarcinoma", "Esophagogastric Adenocarcinoma")
    patho = patho.replace("TCGA-TGCT_--", "other")
    patho = patho.replace("TCGA-TGCT_Embryonal_carcinoma", "GCT_Non-Seminomatous Germ Cell Tumor")
    patho = patho.replace("TCGA-TGCT_Mixed_germ_cell_tumor", "GCT_Non-Seminomatous Germ Cell Tumor")
    patho = patho.replace("TCGA-TGCT_Seminoma", "GCT_Seminoma")
    patho = patho.replace("TCGA-TGCT_Teratocarcinoma", "GCT_Non-Seminomatous Germ Cell Tumor")
    patho = patho.replace("TCGA-TGCT_Teratoma,_benign", "GCT_Non-Seminomatous Germ Cell Tumor")
    patho = patho.replace("TCGA-TGCT_Teratoma,_malignant", "GCT_Non-Seminomatous Germ Cell Tumor")
    patho = patho.replace("TCGA-TGCT_Yolk_sac_tumor", "GCT_Non-Seminomatous Germ Cell Tumor")
    patho = patho.replace("TCGA-THCA_Carcinoma", "other")
    patho = patho.replace("TCGA-THCA_Follicular_carcinoma,_minimally_invasive", "other")
    patho = patho.replace("TCGA-THCA_Follicular_adenocarcinoma", "other")
    patho = patho.replace("TCGA-THCA_Nonencapsulated_sclerosing_carcinoma", "other")
    patho = patho.replace("TCGA-THCA_Oxyphilic_adenocarcinoma", "other")
    patho = patho.replace("TCGA-THCA_Papillary_carcinoma,_columnar_cell", "Papillary Thyroid Cancer")        
    patho = patho.replace("TCGA-THCA_Papillary_carcinoma,_follicular_variant", "Papillary Thyroid Cancer")
    patho = patho.replace("TCGA-THCA_Papillary_adenocarcinoma", "Papillary Thyroid Cancer")
    patho = patho.replace("TCGA-THYM_Thymic_carcinoma", "Thymic Epithelial Tumor")
    patho = patho.replace("TCGA-THYM_Thymoma,_type_AB,_malignant", "Thymic Epithelial Tumor")
    patho = patho.replace("TCGA-THYM_Thymoma,_type_AB", "Thymic Epithelial Tumor")
    patho = patho.replace("TCGA-THYM_Thymoma,_type_A,_malignant", "Thymic Epithelial Tumor")
    patho = patho.replace("TCGA-THYM_Thymoma,_type_A", "Thymic Epithelial Tumor")
    patho = patho.replace("TCGA-THYM_Thymoma,_type_B1,_malignant", "Thymic Epithelial Tumor")
    patho = patho.replace("TCGA-THYM_Thymoma,_type_B1", "Thymic Epithelial Tumor")
    patho = patho.replace("TCGA-THYM_Thymoma,_type_B2,_malignant", "Thymic Epithelial Tumor")
    patho = patho.replace("TCGA-THYM_Thymoma,_type_B2", "Thymic Epithelial Tumor")
    patho = patho.replace("TCGA-THYM_Thymoma,_type_B3,_malignant", "Thymic Epithelial Tumor")
    patho = patho.replace("TCGA-UCEC_Adenocarcinoma", "Uterine Endometrial Carcinoma")
    patho = patho.replace("TCGA-UCEC_Carcinoma,_undifferentiated", "Uterine Endometrial Carcinoma")
    patho = patho.replace("TCGA-UCEC_Clear_cell_adenocarcinoma", "Uterine Endometrial Carcinoma")
    patho = patho.replace("TCGA-UCEC_Endometrioid_adenocarcinoma,_secretory_variant", "Uterine Endometrial Carcinoma")
    patho = patho.replace("TCGA-UCEC_Endometrioid_adenocarcinoma", "Uterine Endometrial Carcinoma")
    patho = patho.replace("TCGA-UCEC_Not_Reported", "other")
    patho = patho.replace("TCGA-UCEC_Papillary_serous_cystadenocarcinoma", "Uterine Endometrial Carcinoma")
    patho = patho.replace("TCGA-UCEC_Serous_cystadenocarcinoma", "Uterine Endometrial Carcinoma")
    patho = patho.replace("TCGA-UCEC_Serous_surface_papillary_carcinoma", "Uterine Endometrial Carcinoma")
    patho = patho.replace("TCGA-UCS_Carcinosarcoma", "Uterine Carcinosarcoma")
    patho = patho.replace("TCGA-UCS_Mesodermal_mixed_tumor", "Uterine Carcinosarcoma")
    patho = patho.replace("TCGA-UCS_Mullerian_mixed_tumor", "Uterine Carcinosarcoma")
    patho = patho.replace("TCGA-UVM_Epithelioid_cell_melanoma", "Uveal Melanoma")
    patho = patho.replace("TCGA-UVM_Malignant_melanoma", "Uveal Melanoma")
    patho = patho.replace("TCGA-UVM_Mixed_epithelioid_and_spindle_cell_melanoma", "Uveal Melanoma")
    patho = patho.replace("TCGA-UVM_Spindle_cell_melanoma,_type_B", "Uveal Melanoma")
    patho = patho.replace("TCGA-UVM_Spindle_cell_melanoma", "Uveal Melanoma")

    patho = patho.replace("Esophagus", "Esophageal")
    patho = patho.replace("Rectum", "Rectal")
    patho = patho.replace("Rectosigmoid junction", "Rectal")
    patho = patho.replace("Stomach Squamous Cell Carcinoma", "other")
    patho = patho.replace("Unknown primary site Mucinous adenocarcinoma", "other")

    patho = patho.replace(" ", "_")

    patho = patho.replace(",_anaplastic", "")
    patho = patho.replace("Head_and_Neck_Squamous_Cell_Carcinoma,_keratinizing", "Head_and_Neck_Squamous_Cell_Carcinoma")
    patho = patho.replace("Head_and_Neck_Squamous_Cell_Carcinoma,_large_cell,_nonkeratinizing", "Head_and_Neck_Squamous_Cell_Carcinoma")
    patho = patho.replace("Head_and_Neck_Squamous_Cell_Carcinoma,_spindle_cell", "Head_and_Neck_Squamous_Cell_Carcinoma")
    patho = patho.replace(",_malignant", "")
    patho = patho.replace("Cardia_Squamous_Cell_Carcinoma", "other")
    patho = patho.replace("Cardia_Adenocarcinoma", "other")
    patho = patho.replace("Cecum_Adenocarcinoma", "Colorectal_Adenocarcinoma")
    patho = patho.replace("Cervical_Squamous_Cell_Carcinoma,_keratinizing", "Cervical_Squamous_Cell_Carcinoma")
    patho = patho.replace("Cervical_Squamous_Cell_Carcinoma,_large_cell,_nonkeratinizing", "Cervical_Squamous_Cell_Carcinoma")
    patho = patho.replace(",_endocervical_type", "")

    patho = patho.replace("Rectal_Adenocarcinoma", "Colorectal_Adenocarcinoma")
    patho = patho.replace("Colon_Adenocarcinoma", "Colorectal_Adenocarcinoma")

    patho = patho.replace("Acute_Myeloid_Leukemia,_minimal_differentiation", "Acute_Myeloid_Leukemia")
    patho = patho.replace("Acute_Myeloid_Leukemia_with_maturation", "Acute_Myeloid_Leukemia")
    patho = patho.replace("Acute_Myeloid_Leukemia_without_maturation", "Acute_Myeloid_Leukemia")

    patho = patho.replace("Colorectal_Adenocarcinoma_with_mixed_subtypes", "Colorectal_Adenocarcinoma")
    patho = patho.replace("Colorectal_Adenocarcinoma_with_neuroendocrine_differentiation", "Colorectal_Adenocarcinoma")
    patho = patho.replace("Descending_colon_Adenocarcinoma", "Colorectal_Adenocarcinoma")
    patho = patho.replace("Esophageal_Squamous_Cell_Carcinoma,_keratinizing", "Esophageal_Squamous_Cell_Carcinoma")
    patho = patho.replace("Floor_of_mouth_Squamous_Cell_Carcinoma,_keratinizing", "Head and Neck Squamous Cell Carcinoma")
    patho = patho.replace("Floor_of_mouth_Squamous_Cell_Carcinoma", "Head and Neck Squamous Cell Carcinoma")
    patho = patho.replace("Gum_Squamous_Cell_Carcinoma,_keratinizing", "Head and Neck Squamous Cell Carcinoma")
    patho = patho.replace("Gum_Squamous_Cell_Carcinoma", "Head and Neck Squamous Cell Carcinoma")
    patho = patho.replace("Hard_palate_Squamous_Cell_Carcinoma", "Head and Neck Squamous Cell Carcinoma")
    patho = patho.replace("Head_and_Neck_Squamous_Cell_Carcinoma", "Head and Neck Squamous Cell Carcinoma")
    patho = patho.replace("Hepatic_flexure_of_colon_Adenocarcinoma", "Colorectal_Adenocarcinoma")
    patho = patho.replace("Hepatocellular_Carcinoma,_clear_cell_type", "Hepatocellular Carcinoma")
    patho = patho.replace("Hepatocellular_Carcinoma,_fibrolamellar", "Hepatocellular Carcinoma")
    patho = patho.replace("Hepatocellular_Carcinoma,_spindle_cell_variant", "Hepatocellular Carcinoma")
    patho = patho.replace("Hypopharynx_Squamous_Cell_Carcinoma,_large_cell,_nonkeratinizing", "Head and Neck Squamous Cell Carcinoma")
    patho = patho.replace("Hypopharynx_Squamous_Cell_Carcinoma,_large_cell,_nonkeratinizing", "Head and Neck Squamous Cell Carcinoma")
    patho = patho.replace("Chromophobe_Renal_Cell_Carcinoma,_chromophobe_type", "Chromophobe_Renal_Cell_Carcinoma")
    patho = patho.replace("Larynx_Squamous_Cell_Carcinoma,_large_cell,_nonkeratinizing", "Head and Neck Squamous Cell Carcinoma")
    patho = patho.replace("Lip_Squamous_Cell_Carcinoma,_keratinizing", "Head and Neck Squamous Cell Carcinoma")
    patho = patho.replace("Lip_Squamous_Cell_Carcinoma", "Head and Neck Squamous Cell Carcinoma")
    patho = patho.replace("Lower_gum_Squamous_Cell_Carcinoma", "Head and Neck Squamous Cell Carcinoma")
    patho = patho.replace("Lower_third_of_esophagus_Squamous_Cell_Carcinoma,_keratinizing", "Esophageal Squamous Cell Carcinoma")
    patho = patho.replace("Lower_third_of_esophagus_Adenocarcinoma", "Esophagogastric Adenocarcinoma")
    patho = patho.replace("Lower_third_of_esophagus_Squamous_Cell_Carcinoma", "Esophageal Squamous Cell Carcinoma")
    patho = patho.replace("Lung_Adenocarcinoma_with_mixed_subtypes", "Lung_Adenocarcinoma")
    patho = patho.replace("Mandible_Squamous_Cell_Carcinoma", "Head and Neck Squamous Cell Carcinoma")
    patho = patho.replace("Middle_third_of_esophagus_Squamous_Cell_Carcinoma", "Esophageal Squamous Cell Carcinoma")
    patho = patho.replace("Middle_third_of_esophagus_Adenocarcinoma", "Esophagogastric Adenocarcinoma")
    patho = patho.replace("Mouth_Squamous_Cell_Carcinoma,_keratinizing", "Head and Neck Squamous Cell Carcinoma")
    patho = patho.replace("Mouth_Squamous_Cell_Carcinoma", "Head and Neck Squamous Cell Carcinoma")
    patho = patho.replace("Mucinous_Adenocarcinoma_of_the_Colon_and_Rectal", "Colorectal Adenocarcinoma")
    patho = patho.replace("Burkitt_lymphoma_(Includes_all_variants)", "Burkitt_lymphoma")
    patho = patho.replace("Oropharynx_Squamous_Cell_Carcinoma,_keratinizing", "Head and Neck Squamous Cell Carcinoma")
    patho = patho.replace("Oropharynx_Squamous_Cell_Carcinoma,_spindle_cell", "Head and Neck Squamous Cell Carcinoma")
    patho = patho.replace("Overlapping_lesion_of_lip,_oral_cavity_and_pharynx_Squamous_Cell_Carcinoma", "Head and Neck Squamous Cell Carcinoma")
    patho = patho.replace("Palate_Squamous_Cell_Carcinoma,_keratinizing", "Head and Neck Squamous Cell Carcinoma")
    patho = patho.replace("Pharynx_Squamous_Cell_Carcinoma", "Head and Neck Squamous Cell Carcinoma")
    patho = patho.replace("Phyllodes_Tumor_of_the_Breast,_malignant", "other")
    patho = patho.replace("Posterior_wall_of_oropharynx_Squamous_Cell_Carcinoma,_keratinizing", "Head and Neck Squamous Cell Carcinoma")
    patho = patho.replace("Rectosigmoid_junction_Adenocarcinoma", "Colorectal_Adenocarcinoma")
    patho = patho.replace("Renal_Non-Clear_Cell_Carcinoma,_chromophobe_type", "Renal_Non-Clear_Cell_Carcinoma")
    patho = patho.replace("Retromolar_area_Squamous_Cell_Carcinoma,_keratinizing", "Head and Neck Squamous Cell Carcinoma")
    patho = patho.replace("Sigmoid_colon_Adenocarcinoma", "Colorectal_Adenocarcinoma")
    patho = patho.replace("Splenic_flexure_of_colon_Adenocarcinoma", "Colorectal_Adenocarcinoma")
    patho = patho.replace("Supraglottis_Squamous_Cell_Carcinoma", "Head and Neck Squamous Cell Carcinoma")
    patho = patho.replace("Thoracic_esophagus_Adenocarcinoma", "Esophagogastric Adenocarcinoma")
    patho = patho.replace("Tongue_Squamous_Cell_Carcinoma,_keratinizing", "Head and Neck Squamous Cell Carcinoma")
    patho = patho.replace("Tongue_Squamous_Cell_Carcinoma", "Head and Neck Squamous Cell Carcinoma")
    patho = patho.replace("Tonsil_Squamous_Cell_Carcinoma,_keratinizing", "Head and Neck Squamous Cell Carcinoma")
    patho = patho.replace("Tonsil_Squamous_Cell_Carcinoma,_large_cell,_nonkeratinizing", "Head and Neck Squamous Cell Carcinoma")
    patho = patho.replace("Tonsil_Squamous_Cell_Carcinoma", "Head and Neck Squamous Cell Carcinoma")
    patho = patho.replace("Transverse_colon_Adenocarcinoma", "Colorectal_Adenocarcinoma")
    patho = patho.replace("Unknown_primary_site_Adenocarcinoma", "other")
    patho = patho.replace("Upper_Gum_Squamous_Cell_Carcinoma", "Head and Neck Squamous Cell Carcinoma")
    patho = patho.replace("Upper_third_of_esophagus_Squamous_Cell_Carcinoma", "Esophageal Squamous Cell Carcinoma")
    patho = patho.replace("Ventral_surface_of_tongue_Squamous_Cell_Carcinoma", "Head and Neck Squamous Cell Carcinoma")
    patho = patho.replace("Connective,_subcutaneous_and_other_soft_tissues_of_abdomen_Adenocarcinoma", "other")

    patho = patho.replace("Acute_Myeloid_Leukemia", "Myeloid_Acute_Myeloid_Leukemia")
    patho = patho.replace("Thymic_Epithelial_Tumor", "Thymus_Thymic_Epithelial_Tumor")
    patho = patho.replace("Burkitt_lymphoma", "Lymphoid_Burkitt_lymphoma")
    patho = patho.replace("B-Lymphoblastic_Leukemia-Lymphoma", "Lymphoid_B-Lymphoblastic_Leukemia-Lymphoma")
    patho = patho.replace("Diffuse_Large_B-Cell_Lymphoma", "Lymphoid_Diffuse_Large_B-Cell_Lymphoma")
    patho = patho.replace("Multiple_Myeloma", "Lymphoid_Multiple_Myeloma")
    patho = patho.replace("Papillary_Thyroid_Cancer", "Thyroid_Papillary_Cancer")
    patho = patho.replace("T-Lymphoblastic_Leukemia-Lymphoma", "Lymphoid_T-Lymphoblastic_Leukemia-Lymphoma")

    patho = patho.replace(" ", "_")

    SubDir = sorted_patho + '/' + patho
    if not os.path.exists(SubDir):
        os.makedirs(SubDir)
    NewImageDir = os.path.join(SubDir, ("{}_{}.npy".format(os.path.basename(fileName).replace(".FPKM.txt.npy", ""), project)))
    os.symlink(fileName, NewImageDir)


In [None]:
data_dir =  "/mnt/HDD8TB/CaSPRE/TCGA_sort"
result_dir =  "/mnt/HDD8TB/CaSPRE/TCGA_TOP_genes"
if not os.path.exists(result_dir):
    os.mkdir(result_dir)

# loading training data
path = data_dir + "/*/*.npy"
files = glob(path)
files.sort()
trainImg = []
trainLabel = []
j = 1
num = len(files)

# making a classification list
for img in files:
    print("\r" + str(j) + "/" + str(num) , end="")
    label = pathlib.Path(img).parent.name
    trainImg.append(np.expand_dims(np.load(img), axis=0))
    trainLabel.append(label)
    j += 1
image_list = np.asarray(trainImg)
trainLabel = pd.DataFrame(trainLabel)

# saving a file
np.savez_compressed(result_dir + "/TCGA_image.npz", image_list)
trainLabel.to_csv(result_dir + "/TCGA_label.tsv", index=False, sep='\t')


In [289]:
n_categories

46

In [290]:
##### learning with 1D-CNNs
# TOP v6, 525 genes
# only primary tumor samples
# major 46 pathogolic types from GDC database: 11,286 samples

accuracy_table = []
index_list = np.load("/mnt/HDD8TB/CaSPRE/index_list.npy")
trainLabel = pd.read_table(result_dir + "/TCGA_label.tsv", header=0)
label_List = np.asarray(trainLabel).reshape(-1,)
le = LabelEncoder()
le = le.fit(label_List)
label_List = le.transform(label_List)
tmp = trainLabel["0"].unique()
tmp.sort()
tmp1 = pd.DataFrame(tmp)
cal_weights = cal_weight(tmp, data_dir)

kf = StratifiedKFold(n_splits=n_Splits, shuffle=True, random_state = seed1)

image_list = np.load(result_dir + "/TCGA_image.npz")['arr_0'].astype(np.float32)
image_list = np.reshape(image_list, (-1, 1, 1375))[:,:,index_list]

gc.collect()

# 5-fold cross validation
# train:valid:test = 60:20:20
# same separation in any batch
cv_batch = -1
for train_valid_index, test_index in kf.split(image_list, label_List):
    cv_batch += 1
    if(cv_batch != 5):
        label_list = np_utils.to_categorical(label_List, n_categories)
        np.save(result_dir + "/test_index_" + str(cv_batch) + ".npy", test_index)
        X_train, X_valid, y_train, y_valid = train_test_split(image_list[train_valid_index], label_list[train_valid_index], random_state=seed2, test_size=split, stratify=label_list[train_valid_index])
        X_test = image_list[test_index]
        y_test = label_list[test_index]

        gc.collect()

        # training
        inputs = Input(shape=(1, X_size))
        x = Conv1D(256, 16, strides=1, padding='same', kernel_initializer='he_normal')(inputs)
        x = ReLU(max_value=None)(x)
        x = BatchNormalization(axis=-1)(x,training=True)
        x = Conv1D(32, 64, strides=1, padding='same', kernel_initializer='he_normal')(x)
        x = ReLU(max_value=None)(x)
        x = BatchNormalization(axis=-1)(x,training=True)
        x = Conv1D(16, 128, strides=1, padding='same', kernel_initializer='he_normal')(x)
        x = ReLU(max_value=None)(x)
        x = BatchNormalization(axis=-1)(x,training=True)
        x = Flatten(name='flatten')(x)
        x = Dense(512, activation='relu', name='fc1')(x)
        x = Dropout(0.1, name='dropout1')(x,training=True)

        predictions = Dense(n_categories, activation='softmax', name='predictions')(x)
        VGG_model = Model(inputs=inputs, outputs=predictions)
        VGG_model.compile(loss='categorical_crossentropy',
                  optimizer=tf.keras.optimizers.SGD(lr=learning, momentum=0.9, decay=1e-4, nesterov=True),
                  metrics=["acc", Precision(), Recall()])
        if cv_batch == 0:
            VGG_model.save_weights(result_dir + "/initial_weights.h5")
        else:
            VGG_model.load_weights(result_dir + "/initial_weights.h5")

        gc.collect()
        early_stopping = EarlyStopping(monitor='val_loss', patience=patience , verbose=0, restore_best_weights=True)
        checkpointer = ModelCheckpoint(monitor='val_loss', filepath = result_dir + "/weight_" + str(cv_batch) + ".h5", save_weights_only=True, verbose=0, mode='auto', rsave_best_only=True)
        reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=4, min_delta=0.001, min_lr=0.000001)
        print("\rtraining: " + str(cv_batch), end='')

        history = VGG_model.fit(X_train, y_train,
                            epochs=epochs,
                            class_weight=cal_weights,
                            validation_data=(X_valid, y_valid),
                            verbose=0,
                            shuffle=True,
                            callbacks=[reduce_lr, early_stopping, checkpointer])
        del VGG_model
        del x, inputs, predictions

        inputs = Input(shape=(1, X_size))
        x = Conv1D(256, 16, strides=1, padding='same', kernel_initializer='he_normal')(inputs)
        x = ReLU(max_value=None)(x)
        x = BatchNormalization(axis=-1)(x,training=False)
        x = Conv1D(32, 64, strides=1, padding='same', kernel_initializer='he_normal')(x)
        x = ReLU(max_value=None)(x)
        x = BatchNormalization(axis=-1)(x,training=False)
        x = Conv1D(16, 128, strides=1, padding='same', kernel_initializer='he_normal')(x)
        x = ReLU(max_value=None)(x)
        x = BatchNormalization(axis=-1)(x,training=False)
        x = Flatten(name='flatten')(x)
        x = Dense(512, activation='relu', name='fc1')(x)
        x = Dropout(0.1, name='dropout1')(x,training=False)

        predictions = Dense(n_categories, activation='softmax', name='predictions')(x)
        VGG_model = Model(inputs=inputs, outputs=predictions)

        VGG_model.load_weights(result_dir + "/weight_" + str(cv_batch) + ".h5")
        y_pred = VGG_model.predict(X_test, batch_size=batch_size, verbose=0)
        tmp2 = pd.DataFrame(confusion_matrix(np.argmax(y_test, axis=1), np.argmax(y_pred, axis=1)))
        pd.concat([tmp1, tmp2], axis=1).to_csv(result_dir + "/confusion_matrix_" + str(cv_batch) + ".tsv", index=False, sep='\t')
        np.save(result_dir + "/predict_" + str(cv_batch) + ".npy", y_pred)

        xlab = np.arange(0, len(history.history["acc"])) + 1

        # model evaluation
        acc_txt = str(accuracy_score(np.argmax(y_test, axis=1), np.argmax(y_pred, axis=1)))
        f1_txt = str(f1_score(np.argmax(y_test, axis=1), np.argmax(y_pred, axis=1), average="macro"))
        save_txt = "Cross_validation_" + str(cv_batch) + ', Test accuracy/F1 score : ' + acc_txt + ' / ' + f1_txt + "\n"
        f = open(acc_name, 'a')
        f.write(save_txt)
        f.close()
        accuracy_table.append([cv_batch, accuracy_score(np.argmax(y_test, axis=1), np.argmax(y_pred, axis=1)),f1_score(np.argmax(y_test, axis=1), np.argmax(y_pred, axis=1), average="macro")])
        pd.DataFrame(np.asarray(accuracy_table)).to_csv(acc_table, index=False, sep='\t')

        #plot accuracy
        plt.plot(range(1, len(history.history["acc"])+1), history.history["acc"], label = "Training acc" )
        plt.plot(range(1, len(history.history["val_acc"])+1), history.history["val_acc"], label = "Validation acc")
        plt.title("Training and Validation accuracy")
        plt.legend()
        plt.savefig(result_dir + "/accuracy_" + str(cv_batch) + ".png")
        plt.clf()

        #plot loss
        plt.plot(range(1, len(history.history["loss"])+1), history.history["loss"],  label = "Training loss" )
        plt.plot(range(1, len(history.history["val_loss"])+1), history.history["val_loss"], label = "Validation loss")
        plt.title("Training and Validation loss")
        plt.legend()
        plt.savefig(result_dir + "/loss_" + str(cv_batch) + ".png")
        plt.clf()

        K.clear_session()
        gc.collect()
        del VGG_model
        del x, inputs, predictions
        del history
        del early_stopping
        del checkpointer, reduce_lr
        del X_train, X_valid, X_test


# ensembling
print("ensembling")
predict_ensemble = np.zeros((len(label_list),n_categories))

for cv_batch in range(0, n_Splits):
    test_index = np.load(result_dir + "/test_index_" + str(cv_batch) + ".npy")
    predict = np.load(result_dir + "/predict_" + str(cv_batch) + ".npy")
    for i in range(len(test_index)):
        predict_ensemble[test_index[i]] = predict[i]
y_test = label_list
tmp2 = pd.DataFrame(confusion_matrix(np.argmax(y_test, axis=1), np.argmax(predict_ensemble, axis=1)))
pd.concat([tmp1, tmp2], axis=1).to_csv(result_dir + "/confusion_matrix_total.tsv", index=False, sep='\t')
np.save(result_dir + "/predict_total.npy", predict_ensemble)

acc_txt = str(accuracy_score(np.argmax(y_test, axis=1), np.argmax(predict_ensemble, axis=1)))
f1_txt = str(f1_score(np.argmax(y_test, axis=1), np.argmax(predict_ensemble, axis=1), average="macro"))
save_txt = 'Total, Test accuracy/F1 score : ' + acc_txt + ' / ' + f1_txt +  "\n"
f = open(acc_name, 'a')
f.write(save_txt)
f.close()


training: 4ensembling


In [None]:
# Normal sample RNA-seq FPKM data
datapath = "/mnt/HDD8TB/CaSPRE/Normal_RNA_values/*.npy"
files = glob(datapath)  
files = sorted(files)

Normal_RNA = np.zeros((len(files),1375))

for dataNb in range(len(files)):
    filename = files[dataNb]
    Normal_RNA[dataNb, :] = np.load(filename)

# Normal sample TOP-RNA FPKM data
datapath = "/mnt/HDD8TB/CaSPRE/Normal_TOP_values/*.npy"
files = glob(datapath)  
files = sorted(files)

Normal_TOP = np.zeros((len(files),1375))

for dataNb in range(len(files)):
    filename = files[dataNb]
    Normal_TOP[dataNb, :] = np.load(filename)



In [None]:
const_mean = np.zeros(1375)
const_se = np.zeros(1375)
intercept_mean = np.zeros(1375)
intercept_se = np.zeros(1375)
pred_mean_se = np.zeros(1375)
pred_residual_se = np.zeros(1375)
index_list = []
a = 1

for i in range(1375):
    print("\r" + str(i) + "/ 1375", end="")
    x = np.power(16, Normal_TOP[:, i] + 0.5) - 1
    y = np.power(16, Normal_RNA[:, i] + 0.5) - 1
    index = x * y > 0
    y = y[index]
    x = x[index]

    if len(y) >= 30:
        X = sm.add_constant(x)
        olsmod = sm.OLS(y, X)
        olsres = olsmod.fit()

        st, data, ss2 = summary_table(olsres, alpha=0.05)
        prstd, iv_l, iv_u = wls_prediction_std(olsres)

        fittedvalues = data[:, 2]
        predict_mean_se  = data[:, 3]
        predict_mean_ci_low, predict_mean_ci_upp = data[:, 4:6].T
        predict_ci_low, predict_ci_upp = data[:, 6:8].T
        predict_se  = data[:, 9]

        if olsres.rsquared > 0.8:
            index_list.append(i)
            const_mean[i] = olsres.params[0]
            const_se[i] = olsres.bse[0]
            intercept_mean[i] = olsres.params[1]
            intercept_se[i] = olsres.bse[1]
            pred_mean_se[i] = predict_mean_se.mean()
            pred_residual_se[i] = predict_se.mean()

            if a == 0:
                fig = plt.figure()
                plt.clf()
                plt.plot(x, y, 'o')
                plt.plot(x, fittedvalues, '-', lw=2)
                plt.plot(x, predict_ci_low, 'r--', lw=2)
                plt.plot(x, predict_ci_upp, 'r--', lw=2)
                plt.plot(x, predict_mean_ci_low, 'r--', lw=2)
                plt.plot(x, predict_mean_ci_upp, 'r--', lw=2)
                plt.title('FPKM - X: TOP-RNA, Y: RNA-seq')
                fig.savefig("/mnt/HDD8TB/CaSPRE/figure_regression/regression_{:0>4}.png".format(i))
    
np.save("/mnt/HDD8TB/CaSPRE/index_list.npy", np.array(index_list))

In [None]:
# Transform tumor sample TOP-RNA FPKM data to RNA-seq FPKM
# Considering the residual errors in prediction
# Tentatively, 1,000 expression sets per sample were generated
# Tumor TOP files should be saved in tumor name directory

# loading training data
path = "/mnt/HDD8TB/CaSPRE/Tumor_TOP_values/*/*.npy"
files = glob(path)
files.sort()
trainImg = []
trainLabel = []
j = 0
num = len(files)
replicate = 100

# expression data generation
for img in files:
    j = j + 1
    for repli in range(replicate): 
        print("\r" + str(repli) + "/" + str(replicate) + "," + str(j) + "/" + str(num) , end="")
        tmp = np.load(img)
        # 正規分布でのランダムサンプリングでベクトル作成
        error_norm_1 = np.random.normal(loc = 0, scale = 1, size = 1375)
        error_norm_2 = np.random.normal(loc = 0, scale = 1, size = 1375)
        tmp = np.power(16, tmp + 0.5) - 1
        tmp = const_mean + tmp * intercept_mean + pred_mean_se * error_norm_1 + error_norm_2 * pred_residual_se
        tmp[tmp < 0] = 0
        tmp = np.log2(tmp + 1) / 16 - 0.5
        trainImg.append(np.expand_dims(tmp, axis=0))
        image_list = np.asarray(trainImg)
        label = pathlib.Path(img).parent.name
        if label == "Brain_Oligodendroglioma":
            label = "Brain_Oligodendroglioma-Astrocytoma"
        trainLabel.append(label)
np.savez_compressed("/mnt/HDD8TB/CaSPRE/Tumor_transformed/Tumor_TOP_image.npz", image_list)
trainLabel = pd.DataFrame(trainLabel)
trainLabel.to_csv("/mnt/HDD8TB/CaSPRE/Tumor_transformed/Tumor_TOP_label.tsv", index=False, sep='\t')



In [291]:
##### test for Tumor TOP data
# TOP v6, 1,097 genes
# subclassidfied to major 46 pathogolic types

accuracy_table = []
index_list = np.load("/mnt/HDD8TB/CaSPRE/index_list.npy")
testLabel = pd.read_table("/mnt/HDD8TB/CaSPRE/Tumor_transformed/Tumor_TOP_label.tsv", header=0)
trainLabel = pd.read_table(result_dir + "/TCGA_label.tsv", header=0)
label_List_full = np.asarray(trainLabel).reshape(-1,)
le = LabelEncoder()
le = le.fit(label_List_full)

label_List = np.asarray(testLabel).reshape(-1,)
label_List = le.transform(label_List)
tmp = trainLabel["0"].unique()
tmp.sort()
tmp1 = pd.DataFrame(tmp)
y_test = np_utils.to_categorical(label_List, n_categories)

image_list = np.load("/mnt/HDD8TB/CaSPRE/Tumor_transformed/Tumor_TOP_image.npz")['arr_0'].astype(np.float32)
X_test = np.reshape(image_list, (-1, 1, 1375))[:,:,index_list]

gc.collect()

# 5-fold cross validation
# train:valid:test = 60:20:20
# same separation in any batch

for cv_batch in range(n_Splits):
    inputs = Input(shape=(1, X_size))
    x = Conv1D(256, 16, strides=1, padding='same', kernel_initializer='he_normal')(inputs)
    x = ReLU(max_value=None)(x)
    x = BatchNormalization(axis=-1)(x,training=False)
    x = Conv1D(32, 64, strides=1, padding='same', kernel_initializer='he_normal')(x)
    x = ReLU(max_value=None)(x)
    x = BatchNormalization(axis=-1)(x,training=False)
    x = Conv1D(16, 128, strides=1, padding='same', kernel_initializer='he_normal')(x)
    x = ReLU(max_value=None)(x)
    x = BatchNormalization(axis=-1)(x,training=False)
    x = Flatten(name='flatten')(x)
    x = Dense(512, activation='relu', name='fc1')(x)
    x = Dropout(0.1, name='dropout1')(x,training=False)

    predictions = Dense(n_categories, activation='softmax', name='predictions')(x)
    VGG_model = Model(inputs=inputs, outputs=predictions)

    VGG_model.load_weights(result_dir + "/weight_" + str(cv_batch) + ".h5")
    y_pred = VGG_model.predict(X_test, batch_size=1, verbose=1)
    tmp2 = pd.DataFrame(confusion_matrix(np.argmax(y_test, axis=1), np.argmax(y_pred, axis=1)))
    pd.concat([tmp1, tmp2], axis=1).to_csv(result_dir + "/test_confusion_matrix_" + str(cv_batch) + ".tsv", index=False, sep='\t')
    np.save(result_dir + "/test_predict_" + str(cv_batch) + ".npy", y_pred)

    # model evaluation
    acc_txt = str(accuracy_score(np.argmax(y_test, axis=1), np.argmax(y_pred, axis=1)))
    f1_txt = str(f1_score(np.argmax(y_test, axis=1), np.argmax(y_pred, axis=1), average="macro"))
    save_txt = "test: Cross_validation_" + str(cv_batch) + ', Test accuracy/F1 score : ' + acc_txt + ' / ' + f1_txt + "\n"
    f = open(acc_name, 'a')
    f.write(save_txt)
    f.close()
    accuracy_table.append([cv_batch, accuracy_score(np.argmax(y_test, axis=1), np.argmax(y_pred, axis=1)),f1_score(np.argmax(y_test, axis=1), np.argmax(y_pred, axis=1), average="macro")])
    pd.DataFrame(np.asarray(accuracy_table)).to_csv(acc_table, index=False, sep='\t')

    K.clear_session()
    gc.collect()
    del VGG_model
    del x, inputs, predictions


# ensembling
print("ensembling")
predict_ensemble = np.zeros((len(testLabel),n_categories))
predict_total = np.zeros(int(len(testLabel) / replicate))


for cv_batch in range(0, n_Splits):
    predict = np.load(result_dir + "/test_predict_" + str(cv_batch) + ".npy")
    predict_ensemble += predict

for i in range(0, int(len(testLabel) / replicate)):
    predict_total[i] = np.argmax(predict_ensemble[(i * 100):((i + 1) * 100)].sum(axis=0))

#tmp2 = pd.DataFrame(confusion_matrix(np.argmax(y_test, axis=1), np.argmax(predict_ensemble, axis=1), labels = tmp))
#pd.concat([tmp1, tmp2], axis=1).to_csv(result_dir + "/test_confusion_matrix_total.tsv", index=False, sep='\t')
#np.save(result_dir + "/test_predict_total.npy", predict_ensemble)

#acc_txt = str(accuracy_score(np.argmax(y_test, axis=1), np.argmax(predict_ensemble, axis=1)))
#f1_txt = str(f1_score(np.argmax(y_test, axis=1), np.argmax(predict_ensemble, axis=1), average="macro"))
#save_txt = 'test: Total, Test accuracy/F1 score : ' + acc_txt + ' / ' + f1_txt +  "\n"
#f = open(acc_name, 'a')
#f.write(save_txt)
#f.close()



ensembling


In [292]:
predict_total

array([ 9., 20., 20., 20.,  9., 20., 21., 21., 21., 20., 21., 20., 21.,
       21., 21., 21.,  9., 36., 21., 21., 20., 21.])

In [None]:
H17-14170-1	000101	57	男	FFPE 軟部 右大腿	Undifferentiated pleomorphic sarcoma
H17-13949-21	000103	58	男	FFPE S状結腸	大腸癌(腺癌)
H17-13858-11-1	000114	67	男	FFPE  肺	1 大細胞神経内分泌癌
H17-10301-4	000022	80	男	FFPE 肺	扁平上皮癌
H17-09418-26	000104	43	女	FFPE 子宮体部1	類内膜腺癌
H17-08856-10	000199 	42	女	FFPE 卵巣(右) 	卵巣癌(高異型度漿液性腺癌) 
H17-07831-1	000082	50	男	FFPE 肺	肺腺癌
H17-06836-1	000078	48	男	FFPE 肺	肺非小細胞癌(腺癌疑い)
H17-04518-1	000080	84	男	FFPE リンパ節	肺腺癌のリンパ節転移
H17-03631-3	000018	73	男	FFPE 肺	混合型小細胞癌の小細胞癌成分
H17-02218-4	000067	74	男	FFPE 肺	肺腺癌
H16-15379-54	000170 	49	女	FFPE 左卵巣 	卵巣癌(高悪性度漿液性腺癌) 
H16-03407-9	000175	40	女	FFPE 右卵巣	卵巣癌(高悪性度漿液性腺癌)
H15-07997-51	000124	62	男	FFPE S状結腸	大腸癌(腺癌)
H12-05689-30	000141	68	女	FFPE 子宮体部	子宮体癌(低分化腺癌)

In [293]:
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 5797536497120142750, name: "/device:XLA_CPU:0"
 device_type: "XLA_CPU"
 memory_limit: 17179869184
 locality {
 }
 incarnation: 7646843998793670716
 physical_device_desc: "device: XLA_CPU device", name: "/device:XLA_GPU:0"
 device_type: "XLA_GPU"
 memory_limit: 17179869184
 locality {
 }
 incarnation: 14743669432060117660
 physical_device_desc: "device: XLA_GPU device"]