In [1]:
import numpy as np
import cptac
import pandas as pd
from predict_protein import download_cptac, select_features, train_model
from sklearn.preprocessing import StandardScaler, RobustScaler

In [2]:
# List the datasets currently offered by the cptac package. The returned table
# (description, data reuse status, publication link) is shown as the cell output.
cptac.list_datasets()


Unnamed: 0_level_0,Description,Data reuse status,Publication link
Dataset name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Brca,breast cancer,no restrictions,https://pubmed.ncbi.nlm.nih.gov/33212010/
Ccrcc,clear cell renal cell carcinoma (kidney),no restrictions,https://pubmed.ncbi.nlm.nih.gov/31675502/
Colon,colorectal cancer,no restrictions,https://pubmed.ncbi.nlm.nih.gov/31031003/
Endometrial,endometrial carcinoma (uterine),no restrictions,https://pubmed.ncbi.nlm.nih.gov/32059776/
Gbm,glioblastoma,no restrictions,https://pubmed.ncbi.nlm.nih.gov/33577785/
Hnscc,head and neck squamous cell carcinoma,no restrictions,https://pubmed.ncbi.nlm.nih.gov/33417831/
Lscc,lung squamous cell carcinoma,no restrictions,https://pubmed.ncbi.nlm.nih.gov/34358469/
Luad,lung adenocarcinoma,no restrictions,https://pubmed.ncbi.nlm.nih.gov/32649874/
Ovarian,high grade serous ovarian cancer,no restrictions,https://pubmed.ncbi.nlm.nih.gov/27372738/
Pdac,pancreatic ductal adenocarcinoma,password access only,unpublished


In [3]:
# Fetch (or refresh) the local copy of every cohort used in this notebook.
# The cohorts are listed once so that adding or removing one is a one-line change.
dataset_names = [
    "endometrial",
    "ovarian",
    "colon",
    "brca",
    "luad",
    "ccrcc",
    "gbm",
    "lscc",
    "hnscc",
]

# Attempt every download (no short-circuiting) and keep each call's result.
# NOTE(review): cptac.download appears to return a success flag -- confirm.
download_ok = [cptac.download(dataset=name) for name in dataset_names]

# Show whether *all* downloads succeeded, rather than only the last one's status.
all(download_ok)

Checking that hnscc index is up-to-date....                                                                                                                                                                                                                                                                                                                                                

True

In [4]:
# Load each CPTAC cohort and join its transcriptomics and proteomics tables into
# one frame per cohort (a..i). The cohorts are currently handled one by one; we
# may want to turn this into a function.
#
# NOTE(review): droplevel(1) is applied only to the cohorts whose joined frame
# presumably carries two-level column labels -- confirm against the cptac output.

en = cptac.Endometrial()
ov = cptac.Ovarian()
co = cptac.Colon()
br = cptac.Brca()
lu = cptac.Luad()
cc = cptac.Ccrcc()
gb = cptac.Gbm()
ls = cptac.Lscc()
hn = cptac.Hnscc()

# Endometrial (separate RNA / protein tables are not loaded for this cohort)
#en_rna = en.get_transcriptomics()
#en_pro = en.get_proteomics()
a = en.join_omics_to_omics('transcriptomics', 'proteomics')

# Ovarian
#ov_rna = ov.get_transcriptomics()
#ov_pro = ov.get_proteomics()
b = ov.join_omics_to_omics('transcriptomics', 'proteomics')
b.columns = b.columns.droplevel(1)

# Colon
#co_rna = co.get_transcriptomics()
#co_pro = co.get_proteomics()
c = co.join_omics_to_omics('transcriptomics', 'proteomics')

# Breast
br_rna = br.get_transcriptomics()
br_pro = br.get_proteomics()
d = br.join_omics_to_omics('transcriptomics', 'proteomics')
d.columns = d.columns.droplevel(1)

# Lung adenocarcinoma
lu_rna = lu.get_transcriptomics()
lu_pro = lu.get_proteomics()
# Fix: join from the LUAD dataset (`lu`). This previously joined `br`, which
# made the "lung" frame a duplicate of the breast frame.
e = lu.join_omics_to_omics('transcriptomics', 'proteomics')
e.columns = e.columns.droplevel(1)

# Clear cell renal cell carcinoma
cc_rna = cc.get_transcriptomics()
cc_pro = cc.get_proteomics()
f = cc.join_omics_to_omics('transcriptomics', 'proteomics')
f.columns = f.columns.droplevel(1)

# Glioblastoma
gb_rna = gb.get_transcriptomics()
gb_pro = gb.get_proteomics()
g = gb.join_omics_to_omics('transcriptomics', 'proteomics')
g.columns = g.columns.droplevel(1)

# Lung squamous cell carcinoma
ls_rna = ls.get_transcriptomics()
ls_pro = ls.get_proteomics()
h = ls.join_omics_to_omics('transcriptomics', 'proteomics')
h.columns = h.columns.droplevel(1)

# Head and neck squamous cell carcinoma
hn_rna = hn.get_transcriptomics()
hn_pro = hn.get_proteomics()
i = hn.join_omics_to_omics('transcriptomics', 'proteomics')

Checking that hnscc index is up-to-date....   Loading endometrial v2.1.1....Loading endometrial v2.1.1.......Loading endometrial v2.1.1.........Loading endometrial v2.1.1............                                                                    Loading ovarian v0.0.1...Loading ovarian v0.0.1......Loading ovarian v0.0.1........                                                                  Loading colon v0.0.1....Loading colon v0.0.1......Loading colon v0.0.1........Loading colon v0.0.1...........                                                                 Loading brca v5.4..Loading brca v5.4....Loading brca v5.4......                                                                 Loading luad v3.1.1...Loading luad v3.1.1.....Loading luad v3.1.1.......Loading luad v3.1.1.........Loading luad v3.1.1...........                                                                  Loading ccrcc v0.1.1..Loading ccrcc v0.1.1....Loading ccrcc v0.1.1......Loading ccrcc v0.1.1.........



Formatting dataframes.....              Loading hnscc v2.0......Loading hnscc v2.0........                                                   



# Transform

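Note: The transcriptomics data appear to be on a log or variance-stabilized
(VST) scale, whereas the proteomics data are already standardized protein-wise.
The cell below therefore standardizes only the transcriptomics columns, and does
so separately within each tumor type.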

In [5]:
def _standardize_transcriptomics(joined, prefix, dedupe=False):
    """Return a standardized copy of one cohort's joined omics frame.

    Parameters
    ----------
    joined : pandas.DataFrame
        Output of ``join_omics_to_omics('transcriptomics', 'proteomics')`` for
        one cohort. Column labels must be plain strings, since transcript
        columns are picked out by the suffix ``'transcriptomics'``.
    prefix : str
        Tumor-type tag prepended to every sample ID so that IDs stay distinct
        when cohorts are concatenated later.
    dedupe : bool, default False
        If True, keep only the first occurrence of each duplicated column label
        before scaling.

    Returns
    -------
    (pandas.DataFrame, list)
        The standardized copy and the list of transcriptomics column labels.
        The input frame is not modified.
    """
    std = joined.copy()
    if dedupe:
        std = std.loc[:, ~std.columns.duplicated(keep='first')]
    tx_cols = [col for col in std.columns if col.endswith('transcriptomics')]
    # Only the transcript columns are rescaled; protein columns are left as-is.
    std[tx_cols] = StandardScaler().fit_transform(std[tx_cols])
    std.index = prefix + std.index
    return std, tx_cols


# Duplicate columns are dropped for exactly the cohorts the original cell
# de-duplicated (b, d, e, f, g, h).
a_std, a_tx_cols = _standardize_transcriptomics(a, 'EN')
b_std, b_tx_cols = _standardize_transcriptomics(b, 'OV', dedupe=True)
c_std, c_tx_cols = _standardize_transcriptomics(c, 'CO')
d_std, d_tx_cols = _standardize_transcriptomics(d, 'BR', dedupe=True)
e_std, e_tx_cols = _standardize_transcriptomics(e, 'LU', dedupe=True)
f_std, f_tx_cols = _standardize_transcriptomics(f, 'CC', dedupe=True)
# Fix: GBM samples get their own 'GB' tag. They were previously tagged 'CC',
# colliding with the CCRCC prefix above.
g_std, g_tx_cols = _standardize_transcriptomics(g, 'GB', dedupe=True)
h_std, h_tx_cols = _standardize_transcriptomics(h, 'LS', dedupe=True)
i_std, i_tx_cols = _standardize_transcriptomics(i, 'HN')


# Completion marker for the cell.
print(True)

True


In [6]:
# Example combining 2 tumors then learn against self using an elastic net.
# b_std is the ovarian frame and d_std the breast (BRCA) frame built earlier in
# this notebook; they are stacked row-wise into one frame.
# TODO: Can we speed this up?

z_df_2tumors =  pd.concat([b_std, d_std])
comb_2tumors = train_model.LearnCPTAC(z_df_2tumors)
# The training call is currently disabled.
# NOTE(review): the elastic-net log saved under this cell presumably comes from
# an earlier run of the call below -- confirm, and clear it if it is stale.
# self_elastic_result = comb_2tumors.learn_all_proteins(tx_to_include="string",
#                                                      train_method="elastic")

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

3000: FHAD1, r: -0.015, R2: -1.233, med.r: 0.339, med.R2: -0.019, med.NRMSE: 0.473


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

4200: KIAA1551, r: 0, R2: -0.077, med.r: 0.348, med.R2: -0.014, med.NRMSE: 0.473


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
 47%|████▋     | 5101/10921 [1:01:03<1:19:12,  1.22it/s]

5100: MSANTD2, r: -0.142, R2: -0.202, med.r: 0.362, med.R2: -0.01, med.NRMSE: 0.476


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

6700: PTPRS, r: 0.162, R2: -0.135, med.r: 0.357, med.R2: -0.013, med.NRMSE: 0.476


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

8000: SPAG7, r: 0.425, R2: -0.079, med.r: 0.36, med.R2: -0.016, med.NRMSE: 0.481


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_desce

9200: USMG5, r: -0.021, R2: -0.32, med.r: 0.351, med.R2: -0.016, med.NRMSE: 0.488


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
 89%|██████

9700: ZNF552, r: 0.893, R2: 0.785, med.r: 0.343, med.R2: -0.021, med.NRMSE: 0.488


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

In [17]:
# r2_out = [a[2]['r2_test'] for a in self_elastic_result]
# len(r2_out)

661

In [7]:
# self_forest_result = comb_2tumors.learn_all_proteins(tx_to_include="string",
#                                                       train_method="forest")
# print(self_forest_result)


  warn(
  warn(
  warn(
  0%|          | 49/10921 [00:36<2:16:26,  1.33it/s]


KeyboardInterrupt: 