In [9]:
import numpy as np
import cptac
import pandas as pd
from predict_protein import get_proteins, learn_cptac
from sklearn.preprocessing import StandardScaler, RobustScaler

In [10]:
# List the datasets cptac currently offers; the displayed table gives each
# dataset's description, data reuse status, and publication link.
cptac.list_datasets()


Unnamed: 0_level_0,Description,Data reuse status,Publication link
Dataset name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Brca,breast cancer,no restrictions,https://pubmed.ncbi.nlm.nih.gov/33212010/
Ccrcc,clear cell renal cell carcinoma (kidney),no restrictions,https://pubmed.ncbi.nlm.nih.gov/31675502/
Colon,colorectal cancer,no restrictions,https://pubmed.ncbi.nlm.nih.gov/31031003/
Endometrial,endometrial carcinoma (uterine),no restrictions,https://pubmed.ncbi.nlm.nih.gov/32059776/
Gbm,glioblastoma,no restrictions,https://pubmed.ncbi.nlm.nih.gov/33577785/
Hnscc,head and neck squamous cell carcinoma,no restrictions,https://pubmed.ncbi.nlm.nih.gov/33417831/
Lscc,lung squamous cell carcinoma,no restrictions,https://pubmed.ncbi.nlm.nih.gov/34358469/
Luad,lung adenocarcinoma,no restrictions,https://pubmed.ncbi.nlm.nih.gov/32649874/
Ovarian,high grade serous ovarian cancer,no restrictions,https://pubmed.ncbi.nlm.nih.gov/27372738/
Pdac,pancreatic ductal adenocarcinoma,password access only,unpublished


In [11]:
# Download (or refresh) the local copy of every cohort used in this notebook.
# Each call's return value is kept so that a problem with any one dataset is
# visible, rather than only the status of whichever call happened to run last.
dataset_names = ["endometrial", "ovarian", "colon", "brca", "luad", "ccrcc", "gbm"]
download_ok = {name: cptac.download(dataset=name) for name in dataset_names}

# Displayed as the cell result: True only when every download call returned a truthy status.
all(download_ok.values())

Checking that gbm index is up-to-date.....                                                                                                                                                                                                                                                             

True

In [12]:
# Load each CPTAC cohort and build one joined transcriptomics + proteomics
# table per cohort (a..g).


def join_rna_protein(dataset):
    """Return the cohort's transcriptomics joined to its proteomics.

    Parameters
    ----------
    dataset : cptac dataset object
        Any object exposing ``join_omics_to_omics(omics1, omics2)``.

    Returns
    -------
    pandas.DataFrame
        The joined table. If the join comes back with MultiIndex columns,
        level 1 is dropped so that columns are addressed by level 0 alone;
        tables whose columns are already flat are returned unchanged.
        NOTE(review): level 1 is presumably a database identifier - confirm
        against the cptac documentation.
    """
    joined = dataset.join_omics_to_omics('transcriptomics', 'proteomics')
    if isinstance(joined.columns, pd.MultiIndex):
        joined.columns = joined.columns.droplevel(1)
    return joined


en = cptac.Endometrial()
ov = cptac.Ovarian()
co = cptac.Colon()
br = cptac.Brca()
lu = cptac.Luad()
cc = cptac.Ccrcc()
gb = cptac.Gbm()

# Endometrial
en_rna = en.get_transcriptomics()
en_pro = en.get_proteomics()
a = join_rna_protein(en)

# Ovarian
ov_rna = ov.get_transcriptomics()
ov_pro = ov.get_proteomics()
b = join_rna_protein(ov)

# Colon
co_rna = co.get_transcriptomics()
co_pro = co.get_proteomics()
c = join_rna_protein(co)

# Breast
br_rna = br.get_transcriptomics()
br_pro = br.get_proteomics()
d = join_rna_protein(br)

# Lung adenocarcinoma. The join must come from `lu`: joining `br` here would
# silently produce a second copy of the breast-cancer table under the lung name.
lu_rna = lu.get_transcriptomics()
lu_pro = lu.get_proteomics()
e = join_rna_protein(lu)

# Clear cell renal cell carcinoma
cc_rna = cc.get_transcriptomics()
cc_pro = cc.get_proteomics()
f = join_rna_protein(cc)

# Glioblastoma
gb_rna = gb.get_transcriptomics()
gb_pro = gb.get_proteomics()
g = join_rna_protein(gb)

Formatting dataframes..........date.....   Loading endometrial v2.1.1....Loading endometrial v2.1.1.......Loading endometrial v2.1.1.........Loading endometrial v2.1.1............                                                                    Loading ovarian v0.0.1...Loading ovarian v0.0.1......Loading ovarian v0.0.1........                                                                  Loading colon v0.0.1....Loading colon v0.0.1......Loading colon v0.0.1........Loading colon v0.0.1...........                                                                 Loading brca v5.4..Loading brca v5.4....Loading brca v5.4......                                                                 Loading luad v3.1.1...Loading luad v3.1.1.....Loading luad v3.1.1.......Loading luad v3.1.1.........Loading luad v3.1.1...........                                                                  Loading ccrcc v0.1.1..Loading ccrcc v0.1.1....Loading ccrcc v0.1.1......Loading ccrcc v0.1.1.........



# Transform

Note: The transcriptomics data appear to be log- or VST-transformed values,
whereas the proteomics data are already standardized protein-wise. The next
cell therefore standardizes only the transcriptomics columns.

In [13]:
def standardize_transcripts(joined, sample_prefix):
    """Standardize the transcriptomics columns of a joined table and tag its samples.

    Parameters
    ----------
    joined : pandas.DataFrame
        Joined transcriptomics + proteomics table; transcript columns are
        recognised by the 'transcriptomics' name suffix.
    sample_prefix : str
        Short cohort code prepended to every index label so that samples from
        different cohorts stay distinguishable after concatenation.

    Returns
    -------
    (pandas.DataFrame, list)
        A new frame (the input is not modified) and the list of transcript
        column names that were rescaled. Proteomics columns are left as-is.
    """
    # Keep only the first occurrence of any repeated column name; this is a
    # no-op for tables whose column names are already unique.
    frame = joined.loc[:, ~joined.columns.duplicated(keep='first')].copy()
    tx_cols = [col for col in frame.columns if col.endswith('transcriptomics')]
    # Column-wise scaling, fitted separately for each cohort.
    frame[tx_cols] = StandardScaler().fit_transform(frame[tx_cols])
    frame.index = sample_prefix + frame.index
    return frame, tx_cols


a_std, a_tx_cols = standardize_transcripts(a, 'EN')
b_std, b_tx_cols = standardize_transcripts(b, 'OV')
c_std, c_tx_cols = standardize_transcripts(c, 'CO')
d_std, d_tx_cols = standardize_transcripts(d, 'BR')
e_std, e_tx_cols = standardize_transcripts(e, 'LU')
f_std, f_tx_cols = standardize_transcripts(f, 'CC')
# GBM gets its own 'GB' code: reusing 'CC' would make its sample IDs
# indistinguishable from the ccRCC ones in f_std.
g_std, g_tx_cols = standardize_transcripts(g, 'GB')

print(True)

True


In [None]:
# Example: stack three tumor cohorts into a single table, then learn every
# protein against "self" with an elastic net.
# TODO: Could this be sped up (multithreading, numba), or does scikit-learn already handle that?

z_df_3tumors = pd.concat(
    [
        a_std,
        b_std,
        c_std,
    ]
)
comb_3tumors = learn_cptac.LearnCPTAC(z_df_3tumors)
self_elastic_result = comb_3tumors.learn_all_proteins(
    train_method="elastic",
    tx_to_include="self",
)


  0%|          | 1/11924 [00:00<3:09:40,  1.05it/s]

0: A1BG, r: 0.69, R2: 0.305, med.r: 0.69, med.R2: 0.305, med.NRMSE: 0.391


  1%|          | 101/11924 [00:48<1:38:25,  2.00it/s]

100: ACOT13, r: 0.255, R2: 0.034, med.r: 0.452, med.R2: 0.116, med.NRMSE: 0.282


  2%|▏         | 201/11924 [01:35<1:33:33,  2.09it/s]

200: ADH4, r: 0.072, R2: -0.029, med.r: 0.47, med.R2: 0.124, med.NRMSE: 0.29


  3%|▎         | 301/11924 [02:22<1:34:16,  2.05it/s]

300: AKAP9, r: 0.477, R2: 0.226, med.r: 0.469, med.R2: 0.118, med.NRMSE: 0.286


  3%|▎         | 401/11924 [03:10<1:35:56,  2.00it/s]

400: ANAPC4, r: 0.226, R2: 0.04, med.r: 0.463, med.R2: 0.114, med.NRMSE: 0.289


  4%|▍         | 501/11924 [03:58<1:31:21,  2.08it/s]

500: AP4S1, r: 0.109, R2: -0.114, med.r: 0.463, med.R2: 0.107, med.NRMSE: 0.286


  5%|▌         | 601/11924 [04:45<1:21:46,  2.31it/s]

600: ARHGAP32, r: 0.38, R2: 0.053, med.r: 0.451, med.R2: 0.098, med.NRMSE: 0.284


  6%|▌         | 701/11924 [05:33<1:35:23,  1.96it/s]

700: ARX, r: 0.227, R2: -1.684, med.r: 0.445, med.R2: 0.092, med.NRMSE: 0.282


  7%|▋         | 801/11924 [06:23<1:29:38,  2.07it/s]

800: ATP5MF, r: 0, R2: -0.098, med.r: 0.431, med.R2: 0.077, med.NRMSE: 0.284


  8%|▊         | 901/11924 [07:12<1:28:58,  2.06it/s]

900: BARD1, r: 0.666, R2: 0.313, med.r: 0.426, med.R2: 0.074, med.NRMSE: 0.285


  8%|▊         | 1001/11924 [08:00<1:32:43,  1.96it/s]

1000: BMT2, r: -0.314, R2: -1.982, med.r: 0.423, med.R2: 0.077, med.NRMSE: 0.285


  9%|▉         | 1101/11924 [08:47<1:30:21,  2.00it/s]

1100: C12orf43, r: 0.207, R2: -0.006, med.r: 0.416, med.R2: 0.074, med.NRMSE: 0.284


 10%|█         | 1201/11924 [09:34<1:10:35,  2.53it/s]

1200: C5orf15, r: 0.374, R2: -0.083, med.r: 0.413, med.R2: 0.071, med.NRMSE: 0.286


 11%|█         | 1301/11924 [10:20<1:26:43,  2.04it/s]

1300: CANX, r: 0.037, R2: -0.064, med.r: 0.415, med.R2: 0.072, med.NRMSE: 0.288


 12%|█▏        | 1375/11924 [10:56<1:22:33,  2.13it/s]