In [1]:
# 导入必要的库
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier
import torch
import numpy as np
import random
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.impute import KNNImputer

In [2]:
# Fix the seed of every random number generator imported by this notebook
# so that repeated runs start from the same state.
seed = 42

# Called in this order: Python's `random`, NumPy's global RNG, then the
# three torch seeding entry points.
for _seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_fn(seed)
del _seed_fn  # keep the helper name out of the notebook namespace

# cuDNN: request deterministic algorithms and switch off the benchmark mode.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [3]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer

def drop_and_impute_knn(dataframe, threshold=0.2, n_neighbors=5):
    """
    Drop zero-heavy features and KNN-impute the zeros that remain.

    The first column is set aside and passed through unchanged. Every other
    column whose fraction of values equal to 0 is strictly greater than
    ``threshold`` is removed. In the surviving columns each 0 is replaced
    with NaN and then filled in by ``KNNImputer``. Columns that end up with
    no observed value at all are removed before imputation, and the row
    index of the input is preserved in the result.

    :param dataframe: pandas DataFrame to process. All columns after the
        first are handed to ``KNNImputer`` and therefore must be numeric.
    :param threshold: float, maximum tolerated fraction of zeros per
        feature (default 0.2, i.e. 20%).
    :param n_neighbors: int, number of neighbours used by ``KNNImputer``
        (default 5).
    :return: pandas DataFrame made of the first column followed by the
        imputed feature columns. If no feature survives, only the first
        column is returned.
    :raises ValueError: if ``dataframe`` is not a pandas DataFrame.
    """
    # Reject anything that is not a pandas DataFrame.
    if not isinstance(dataframe, pd.DataFrame):
        raise ValueError("数据应该是 pandas DataFrame 类型。")

    # Separate the first column from the feature columns.
    first_col = dataframe.iloc[:, 0]
    other_cols = dataframe.iloc[:, 1:]

    # Fraction of exact zeros in each feature.
    zero_ratio = (other_cols == 0).mean()

    # Features whose zero fraction exceeds the threshold are discarded.
    cols_to_drop = zero_ratio[zero_ratio > threshold].index.tolist()
    dataframe_remaining = other_cols.drop(columns=cols_to_drop)

    # Remaining zeros are treated as missing values for the imputer.
    dataframe_remaining = dataframe_remaining.replace(0, np.nan)

    # KNNImputer discards features without a single observed value by
    # default; dropping them here keeps the column labels in sync with the
    # imputed array instead of failing with a shape mismatch.
    dataframe_remaining = dataframe_remaining.dropna(axis=1, how="all")

    # Nothing left to impute: KNNImputer cannot be fitted on zero features.
    if dataframe_remaining.shape[1] == 0:
        return first_col.to_frame()

    imputer = KNNImputer(n_neighbors=n_neighbors)
    dataframe_imputed = pd.DataFrame(
        imputer.fit_transform(dataframe_remaining),
        columns=dataframe_remaining.columns,
        # Reuse the original row labels: concat(axis=1) aligns on the index,
        # so a fresh RangeIndex would misalign rows for non-default indexes.
        index=dataframe_remaining.index,
    )

    # Put the untouched first column back in front of the imputed features.
    result = pd.concat([first_col, dataframe_imputed], axis=1)

    return result


In [4]:
# Load the GO enrichment table for methylation, order it by ascending
# p-value and keep only the rows with pvalue < 0.05.
# NOTE(review): the filter uses the raw `pvalue` column, not `p.adjust` -- confirm this is intended.
Methylation_pathgway = (
    pd.read_csv('../../DATA/BRCA/GO_BRCA_Methylation.csv')
    .sort_values('pvalue')
    .loc[lambda table: table['pvalue'] < 0.05]
)
Methylation_pathgway

Unnamed: 0.1,Unnamed: 0,ID,Description,GeneRatio,BgRatio,RichFactor,FoldEnrichment,zScore,pvalue,p.adjust,qvalue,geneID,Count
0,GO:0048568,GO:0048568,embryonic organ development,428/14958,455/18888,0.940659,1.187804,7.911061,2.560457e-19,1.627682e-15,9.624621e-16,PROX1/SH2B3/RSPO3/HOXD3/HMX2/SHOX2/GJB6/MAPK1/...,428
1,GO:0060562,GO:0060562,epithelial tube morphogenesis,321/14958,335/18888,0.958209,1.209965,7.564556,1.207893e-18,2.841966e-15,1.680478e-15,SFRP1/PROX1/APAF1/COL4A1/WNT6/CSMD1/MTSS1/RNF2...,321
2,GO:0048880,GO:0048880,sensory system development,381/14958,403/18888,0.945409,1.193802,7.672263,1.765703e-18,2.841966e-15,1.680478e-15,NPHP4/PROX1/ATOH7/TTC8/COL4A1/BARHL2/WNT6/NRL/...,381
3,GO:0009410,GO:0009410,response to xenobiotic stimulus,417/14958,444/18888,0.939189,1.185948,7.735291,1.788243e-18,2.841966e-15,1.680478e-15,SFRP1/PTK2B/FECH/THBS1/EHMT2/ABCC1/TP53I13/TGI...,417
4,GO:0150063,GO:0150063,visual system development,375/14958,397/18888,0.944584,1.192760,7.572771,5.252381e-18,6.677877e-15,3.948685e-15,NPHP4/PROX1/ATOH7/TTC8/COL4A1/BARHL2/WNT6/NRL/...,375
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2781,GO:0060038,GO:0060038,cardiac muscle cell proliferation,53/14958,60/18888,0.883333,1.115416,1.746881,4.959483e-02,1.133265e-01,6.701091e-02,YAP1/HEY2/JARID2/ZFPM2/NOTCH1/GATA6/MAPK11/ABL...,53
2782,GO:0032507,GO:0032507,maintenance of protein location in cell,58/14958,66/18888,0.878788,1.109677,1.741309,4.976146e-02,1.135440e-01,6.713953e-02,RER1/ANK3/TAF8/SYNE1/PARK7/VPS13D/OS9/TOPORS/T...,58
2783,GO:0050771,GO:0050771,negative regulation of axonogenesis,58/14958,66/18888,0.878788,1.109677,1.741309,4.976146e-02,1.135440e-01,6.713953e-02,DAB1/DCC/SEMA6B/EPHA7/MAP2/WNT3A/SPP1/SEMA5B/S...,58
2784,GO:0050994,GO:0050994,regulation of lipid catabolic process,58/14958,66/18888,0.878788,1.109677,1.741309,4.976146e-02,1.135440e-01,6.713953e-02,THRA/ALK/DAGLB/ADORA1/IRS2/TWIST1/SCT/ABCB11/P...,58


In [5]:
# Read the tab-separated methylation file and pass it straight through
# drop_and_impute_knn with its default threshold and neighbour count.
Methylation = drop_and_impute_knn(
    pd.read_csv(
        '../../DATA/BRCA/Human__TCGA_BRCA__JHU_USC__Methylation__Meth450__01_28_2016__BI__Gene__Firehose_Methylation_Prepocessor_common_samples.tsv',
        sep='\t',
    )
)
# Display the processed DataFrame
Methylation

Unnamed: 0.1,Unnamed: 0,RBL2,VDAC3,ACTN1,ATP2A1,SFRP1,NIPA2,MAN1B1,TSEN34,LRRC16A,...,BAGE5,BAGE,BAGE2,BAGE4,BAGE3,MIR637,LOC100130932,GOLGA8F,GOLGA8G,MIR7-1
0,TCGA.D8.A1JH,-0.4509,0.4213,0.0930,0.3937,-0.4201,-0.4868,0.3541,-0.4127,-0.4370,...,0.2392,0.2392,0.2392,0.2392,0.2392,0.4185,0.3816,0.0083,0.0083,-0.31684
1,TCGA.OL.A5RW,-0.4254,-0.4610,0.2416,0.3397,-0.4719,-0.4823,0.4368,-0.4491,-0.4721,...,0.0235,0.0235,0.0235,0.0235,0.0235,0.3673,0.4267,0.0293,0.0293,-0.28100
2,TCGA.XX.A899,-0.4500,-0.4721,0.2597,0.4152,0.0244,-0.4843,0.4211,-0.4391,-0.4622,...,0.1998,0.1998,0.1998,0.1998,0.1998,0.3908,0.4161,-0.0702,-0.0702,-0.25980
3,TCGA.S3.A6ZH,-0.4691,-0.4842,0.3224,0.3773,0.0580,-0.4843,0.4313,-0.4730,-0.4724,...,-0.1330,-0.1330,-0.1330,-0.1330,-0.1330,0.3684,0.3430,-0.2010,-0.2010,-0.20920
4,TCGA.AQ.A54N,-0.4274,-0.4688,0.3504,0.4143,-0.4804,-0.4821,0.4365,-0.4529,-0.4526,...,0.0714,0.0714,0.0714,0.0714,0.0714,0.4140,0.3309,0.0635,0.0635,-0.32324
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
765,TCGA.GM.A5PX,-0.4509,-0.4651,0.1044,0.3838,-0.2495,-0.4849,0.4210,-0.4537,-0.4714,...,0.0580,0.0580,0.0580,0.0580,0.0580,0.3742,0.3665,-0.1194,-0.1194,-0.31230
766,TCGA.AC.A5XU,-0.4519,-0.4496,0.2791,0.3522,0.0507,-0.4827,0.3946,-0.4647,-0.4718,...,-0.1007,-0.1007,-0.1007,-0.1007,-0.1007,0.3618,0.4120,-0.2217,-0.2217,-0.36300
767,TCGA.AR.A1AN,-0.4365,-0.4557,0.2503,0.3619,-0.3741,-0.4859,0.4141,-0.4222,-0.4679,...,0.1202,0.1202,0.1202,0.1202,0.1202,0.3812,0.4325,-0.0542,-0.0542,-0.35290
768,TCGA.E9.A1RG,-0.4411,-0.4636,0.3023,0.3655,-0.4125,-0.4859,0.4484,-0.4481,-0.4766,...,-0.1603,-0.1603,-0.1603,-0.1603,-0.1603,0.4360,0.4096,-0.1610,-0.1610,-0.35610


In [6]:
# Turn each pathway's '/'-separated geneID string into a list of gene IDs.
# NOTE: `gene_ids` is reassigned later for the other omics tables, so the
# cells that consume it must be run in notebook order.
gene_ids = Methylation_pathgway['geneID'].map(lambda gene_field: gene_field.split('/'))
gene_ids

0       [PROX1, SH2B3, RSPO3, HOXD3, HMX2, SHOX2, GJB6...
1       [SFRP1, PROX1, APAF1, COL4A1, WNT6, CSMD1, MTS...
2       [NPHP4, PROX1, ATOH7, TTC8, COL4A1, BARHL2, WN...
3       [SFRP1, PTK2B, FECH, THBS1, EHMT2, ABCC1, TP53...
4       [NPHP4, PROX1, ATOH7, TTC8, COL4A1, BARHL2, WN...
                              ...                        
2781    [YAP1, HEY2, JARID2, ZFPM2, NOTCH1, GATA6, MAP...
2782    [RER1, ANK3, TAF8, SYNE1, PARK7, VPS13D, OS9, ...
2783    [DAB1, DCC, SEMA6B, EPHA7, MAP2, WNT3A, SPP1, ...
2784    [THRA, ALK, DAGLB, ADORA1, IRS2, TWIST1, SCT, ...
2785    [TRIM56, IL12B, SASH1, NMI, RC3H2, RC3H1, TRIM...
Name: geneID, Length: 2786, dtype: object

In [7]:
# # 创建一个空的数据框来存储所有PCA结果
# BRCA_Methylation = pd.DataFrame()

# # 设置累计解释方差的阈值
# explained_variance_threshold = 0.95

# # 对每一行的基因数据进行PCA
# for i, gene_list in enumerate(gene_ids):
#     # 找到Methylation数据框中与gene_list对应的列
#     matching_columns = [col for col in Methylation.columns if col in gene_list]
#     # 提取这些列的数据
#     data = Methylation[matching_columns]
    
#     # 确保数据足够进行PCA（至少需要两个基因）
#     if len(matching_columns) > 1:
#         # 初始化PCA
#         pca = PCA()
#         pca.fit(data)
        
#         # 计算累计解释方差
#         cumulative_explained_variance = pca.explained_variance_ratio_.cumsum()
        
#         # 确定达到阈值的主成分数k
#         k = next(i for i, total_variance in enumerate(cumulative_explained_variance) if total_variance >= explained_variance_threshold) + 1
        
#         # 重新运行PCA，保留前k个主成分
#         pca = PCA(n_components=k)
#         pca_result = pca.fit_transform(data)
        
#         # 将PCA结果转换为DataFrame，并添加索引信息
#         pca_df = pd.DataFrame(pca_result, columns=[f'PC{j+1}_row_{i}' for j in range(k)])
        
#         # 合并PCA结果到总数据框
#         BRCA_Methylation = pd.concat([BRCA_Methylation, pca_df], axis=1)
#     else:
#         print(f"Row {i} does not have enough genes for PCA.")

# # 显示最终合并的PCA结果
# BRCA_Methylation

In [8]:
# # 创建一个空的数据框来存储所有PCA结果
# BRCA_Methylation = pd.DataFrame()

# # 对每一行的基因数据进行PCA
# for i, gene_list in enumerate(gene_ids):
#     # 找到Methylation数据框中与gene_list对应的列
#     matching_columns = [col for col in Methylation.columns if col in gene_list]
#     # 提取这些列的数据
#     data = Methylation[matching_columns]
    
#     # 确保数据足够进行PCA（至少需要两个基因）
#     if len(matching_columns) > 1:
#         # 初始化PCA，并指定n_components为1
#         pca = PCA(n_components=1)
#         pca_result = pca.fit_transform(data)
        
#         # 将PCA结果转换为DataFrame，并添加索引信息
#         pca_df = pd.DataFrame(pca_result, columns=[f'PC1_row_{i}'])
        
#         # 合并PCA结果到总数据框
#         BRCA_Methylation = pd.concat([BRCA_Methylation, pca_df], axis=1)
#     else:
#         print(f"Row {i} does not have enough genes for PCA.")

# # 显示最终合并的PCA结果
# BRCA_Methylation


In [9]:
# Collect, in first-seen order, every Methylation column that belongs to at
# least one pathway gene list. The labels are gathered first and the frame is
# sliced once at the end, instead of growing a DataFrame with one pd.concat
# per pathway (which recopies the accumulated data on every iteration).
selected_columns = []   # column labels to keep, in output order
seen_columns = set()    # the same labels, for constant-time membership tests

# `i` is the position of the pathway in gene_ids, not its index label.
for i, gene_list in enumerate(gene_ids):
    pathway_genes = set(gene_list)

    # Matching columns in Methylation's own column order; dict.fromkeys
    # removes repeated labels while preserving that order.
    unique_columns = list(dict.fromkeys(
        col for col in Methylation.columns if col in pathway_genes
    ))

    if not unique_columns:
        print(f"Row {i} does not have matching genes to concatenate.")
        continue

    # Keep only the columns no earlier pathway has contributed yet.
    unique_columns_to_add = [col for col in unique_columns if col not in seen_columns]
    if not unique_columns_to_add:
        print(f"Row {i} does not have new unique columns to add.")
        continue

    selected_columns.extend(unique_columns_to_add)
    seen_columns.update(unique_columns_to_add)

# An empty frame is kept when nothing matched, as the incremental version did.
BRCA_Methylation = Methylation[selected_columns] if selected_columns else pd.DataFrame()

# Display the assembled result
BRCA_Methylation


Row 4 does not have new unique columns to add.
Row 5 does not have new unique columns to add.
Row 7 does not have new unique columns to add.
Row 10 does not have new unique columns to add.
Row 14 does not have new unique columns to add.
Row 16 does not have new unique columns to add.
Row 26 does not have new unique columns to add.
Row 35 does not have new unique columns to add.
Row 40 does not have new unique columns to add.
Row 43 does not have new unique columns to add.
Row 47 does not have new unique columns to add.
Row 50 does not have new unique columns to add.
Row 51 does not have new unique columns to add.
Row 62 does not have new unique columns to add.
Row 64 does not have new unique columns to add.
Row 65 does not have new unique columns to add.
Row 67 does not have new unique columns to add.
Row 68 does not have new unique columns to add.
Row 69 does not have new unique columns to add.
Row 70 does not have new unique columns to add.
Row 73 does not have new unique columns to 

Unnamed: 0,PROX1,SH2B3,RSPO3,HOXD3,HMX2,SHOX2,GJB6,MAPK1,PDGFRA,TEAD3,...,NOP56,PIH1D2,EXOSC1,REXO4,TRMT112,PRTFDC1,PRDM2,MTFMT,EIF1AD,FCGRT
0,-0.3947,0.3193,0.1971,-0.0426,-0.1288,-0.1590,-0.2084,-0.4558,-0.2251,-0.2197,...,-0.0756,-0.4602,-0.4225,-0.0336,-0.3139,-0.4096,-0.4102,-0.4736,-0.4522,-0.1503
1,-0.4754,0.3408,0.0615,0.1237,-0.4644,-0.0780,-0.4734,-0.4738,-0.3987,-0.3110,...,-0.3802,-0.4293,-0.4798,-0.0455,-0.3757,-0.4630,-0.4789,-0.4707,-0.4813,-0.3234
2,-0.0863,0.3551,0.1403,0.1394,-0.2340,-0.0532,-0.1977,-0.4763,-0.0906,-0.2305,...,-0.3447,-0.3862,-0.4814,0.0116,-0.3964,-0.3886,-0.4801,-0.4754,-0.4828,-0.0965
3,-0.2583,0.3964,0.0686,0.3516,-0.4030,0.0850,-0.3241,-0.4781,0.1752,-0.3810,...,-0.4279,-0.3667,-0.4777,0.1632,-0.4000,-0.4271,-0.4775,-0.4725,-0.4843,-0.3937
4,-0.4699,0.3242,-0.2212,0.3626,-0.4097,0.3497,-0.4797,-0.4764,-0.3897,-0.3840,...,-0.3979,-0.4168,-0.4850,0.0333,-0.3639,-0.4688,-0.4766,-0.4649,-0.4830,-0.3935
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
765,-0.2350,0.3391,0.1727,0.2844,-0.0159,0.0174,0.0221,-0.4761,-0.1010,-0.2657,...,-0.1847,-0.4424,-0.4846,-0.0152,-0.3879,0.0175,-0.4792,-0.4734,-0.4823,-0.0147
766,-0.0497,0.3121,0.3168,0.3675,0.0914,0.1603,-0.0460,-0.4770,0.0527,-0.2806,...,-0.4468,-0.4221,-0.4839,-0.0538,-0.3578,-0.4032,-0.4811,-0.4659,-0.4825,0.1493
767,0.0042,0.3174,0.2500,0.3374,-0.4054,0.0424,-0.4743,-0.4672,-0.0599,-0.3227,...,-0.3062,-0.4441,-0.4820,-0.0172,-0.3558,-0.0246,-0.4811,-0.4745,-0.4823,0.0561
768,-0.2187,0.3242,0.3495,0.3104,-0.2608,-0.3840,-0.4116,-0.4801,0.0197,-0.3545,...,0.0776,-0.4326,-0.4855,0.0375,-0.3625,-0.0044,-0.4801,-0.4746,-0.4842,-0.2489


In [10]:
# Sanity check: report any column label that occurs more than once in
# BRCA_Methylation.
columns = BRCA_Methylation.columns
duplicate_columns = columns[columns.duplicated()]

if duplicate_columns.empty:
    print("No duplicate column names found.")
else:
    print("Duplicate column names found:", duplicate_columns.tolist())


No duplicate column names found.


In [11]:
# Persist the selected methylation features as CSV without the row index.
# NOTE(review): rows are then identified by position only -- confirm that
# downstream readers do not need a sample identifier column.
BRCA_Methylation.to_csv(
    path_or_buf='../../DATA/BRCA/BRCA_Methylation.csv',
    index=False,
)

In [12]:
# Load the GO enrichment table for SCNA, order it by ascending p-value and
# keep only the rows with pvalue < 0.05.
# NOTE(review): the filter uses the raw `pvalue` column, not `p.adjust` -- confirm this is intended.
SCNA_pathgway = (
    pd.read_csv('../../DATA/BRCA/GO_BRCA_SCNA.csv')
    .sort_values('pvalue')
    .loc[lambda table: table['pvalue'] < 0.05]
)
SCNA_pathgway

Unnamed: 0.1,Unnamed: 0,ID,Description,GeneRatio,BgRatio,RichFactor,FoldEnrichment,zScore,pvalue,p.adjust,qvalue,geneID,Count
0,GO:1903131,GO:1903131,mononuclear cell differentiation,473/16655,484/18888,0.977273,1.108299,6.591773,4.752687e-15,3.021283e-11,2.151216e-11,PRKCZ/RPL22/TNFRSF9/PIK3CD/CTNNBIP1/MTOR/PLA2G...,473
1,GO:0099177,GO:0099177,regulation of trans-synaptic signaling,481/16655,494/18888,0.973684,1.104230,6.411014,4.791826e-14,1.038009e-10,7.390838e-11,DVL1/PRKCZ/TPRG1L/CLSTN1/FBXO2/PINK1/EPHB2/CNR...,481
2,GO:0050804,GO:0050804,modulation of chemical synaptic transmission,480/16655,493/18888,0.973631,1.104169,6.400628,5.308743e-14,1.038009e-10,7.390838e-11,DVL1/PRKCZ/TPRG1L/CLSTN1/FBXO2/PINK1/EPHB2/CNR...,480
3,GO:0060562,GO:0060562,epithelial tube morphogenesis,331/16655,335/18888,0.988060,1.120533,6.078964,6.531438e-14,1.038009e-10,7.390838e-11,DVL1/HES5/SKI/RNF207/PIK3CD/CTNNBIP1/MTHFR/EPH...,331
4,GO:0048880,GO:0048880,sensory system development,395/16655,403/18888,0.980149,1.111561,6.182520,1.252380e-13,1.334611e-10,9.502704e-11,GNB1/HES5/SAMD11/SKI/NPHP4/ANGPTL7/MFN2/EPHA2/...,395
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2049,GO:0006906,GO:0006906,vesicle fusion,111/16655,119/18888,0.932773,1.057834,1.728393,4.885606e-02,1.510779e-01,1.075706e-01,VAMP3/ATP13A2/PLA2G5/STX12/VAV3/SEC22B/SNAPIN/...,111
2050,GO:0031644,GO:0031644,regulation of nervous system process,111/16655,119/18888,0.932773,1.057834,1.728393,4.885606e-02,1.510779e-01,1.075706e-01,DVL1/PRKCZ/TNFRSF1B/NCMAP/DLGAP3/RGS4/TNR/ADOR...,111
2055,GO:0060390,GO:0060390,regulation of SMAD protein signal transduction,71/16655,75/18888,0.946667,1.073590,1.743931,4.886207e-02,1.510779e-01,1.075706e-01,SKI/TGFBR3/TGFB2/PARP1/GDF7/EMILIN1/SPTBN1/BMP...,71
2054,GO:0045123,GO:0045123,cellular extravasation,71/16655,75/18888,0.946667,1.073590,1.743931,4.886207e-02,1.510779e-01,1.075706e-01,PIK3CD/PTAFR/VCAM1/F11R/SELP/SELL/SELE/ADD2/IL...,71


In [13]:
# Read the tab-separated SCNA file and pass it straight through
# drop_and_impute_knn with its default threshold and neighbour count.
# NOTE(review): drop_and_impute_knn treats every 0 as a missing value;
# confirm that is appropriate for copy-number data, where 0 may be a
# genuine measurement.
SCNA = drop_and_impute_knn(
    pd.read_csv(
        '../../DATA/BRCA/Human__TCGA_BRCA__BI__SCNA__SNP_6.0__01_28_2016__BI__Gene__Firehose_GISTIC2_common_samples.tsv',
        sep='\t',
    )
)
# Display the processed DataFrame
SCNA

Unnamed: 0.1,Unnamed: 0,ACAP3,ACTRT2,AGRN,ANKRD65,ATAD3A,ATAD3B,ATAD3C,AURKAIP1,B3GALT6,...,SMIM9,SNORA36A,SNORA56,TMLHE,VBP1,IL9R|ENSG00000124334.12,SPRY3|ENSG00000168939.6,VAMP7|ENSG00000124333.10,WASH6P|ENSG00000182484.10,WASIR1|ENSG00000185203.7
0,TCGA.D8.A1JH,-0.001,-0.001,-0.001,-0.001,-0.001,-0.001,-0.001,-0.001,-0.001,...,0.030,0.030,0.030,0.030,0.030,0.030,0.030,0.030,0.030,0.030
1,TCGA.OL.A5RW,0.304,0.304,0.304,0.304,0.304,0.304,0.304,0.304,0.304,...,0.670,0.670,0.670,0.670,0.670,0.670,0.670,0.670,0.670,0.670
2,TCGA.XX.A899,-0.071,-0.071,-0.071,-0.071,-0.071,-0.071,-0.071,-0.071,-0.071,...,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001
3,TCGA.S3.A6ZH,-0.423,-0.423,-0.423,-0.423,-0.423,-0.423,-0.423,-0.423,-0.423,...,-0.150,-0.150,-0.150,-0.150,-0.150,-0.150,-0.150,-0.150,-0.150,-0.150
4,TCGA.AQ.A54N,0.100,0.100,0.100,0.100,0.100,0.100,0.100,0.100,0.100,...,-0.462,-0.462,-0.462,-0.462,-0.462,-0.462,-0.462,-0.462,-0.462,-0.462
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
765,TCGA.GM.A5PX,-0.013,-0.013,-0.013,-0.013,-0.013,-0.013,-0.013,-0.013,-0.013,...,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001
766,TCGA.AC.A5XU,-0.525,-0.525,-0.525,-0.525,-0.525,-0.525,-0.525,-0.525,-0.525,...,0.022,0.022,0.022,0.022,0.022,0.022,0.022,0.022,0.022,0.022
767,TCGA.AR.A1AN,0.053,0.053,0.053,0.053,0.053,0.053,0.053,0.053,0.053,...,-0.060,-0.060,-0.060,-0.060,-0.060,-0.060,-0.060,-0.060,-0.060,-0.060
768,TCGA.E9.A1RG,-0.373,-0.373,-0.373,-0.373,-0.373,-0.373,-0.373,-0.373,-0.373,...,-0.150,-0.150,-0.150,-0.150,-0.150,-0.150,-0.150,-0.150,-0.150,-0.150


In [14]:
# Turn each pathway's '/'-separated geneID string into a list of gene IDs.
# NOTE: this overwrites the `gene_ids` built for an earlier omics table, so
# the cells that consume it must be run in notebook order.
gene_ids = SCNA_pathgway['geneID'].map(lambda gene_field: gene_field.split('/'))
gene_ids

0       [PRKCZ, RPL22, TNFRSF9, PIK3CD, CTNNBIP1, MTOR...
1       [DVL1, PRKCZ, TPRG1L, CLSTN1, FBXO2, PINK1, EP...
2       [DVL1, PRKCZ, TPRG1L, CLSTN1, FBXO2, PINK1, EP...
3       [DVL1, HES5, SKI, RNF207, PIK3CD, CTNNBIP1, MT...
4       [GNB1, HES5, SAMD11, SKI, NPHP4, ANGPTL7, MFN2...
                              ...                        
2049    [VAMP3, ATP13A2, PLA2G5, STX12, VAV3, SEC22B, ...
2050    [DVL1, PRKCZ, TNFRSF1B, NCMAP, DLGAP3, RGS4, T...
2055    [SKI, TGFBR3, TGFB2, PARP1, GDF7, EMILIN1, SPT...
2054    [PIK3CD, PTAFR, VCAM1, F11R, SELP, SELL, SELE,...
2056    [CHD5, MTOR, KDM1A, RPL11, ZMPSTE24, PLK3, MIR...
Name: geneID, Length: 2057, dtype: object

In [15]:
# # 创建一个空的数据框来存储所有PCA结果
# BRCA_SCNA = pd.DataFrame()

# # 设置累计解释方差的阈值
# explained_variance_threshold = 0.95

# # 对每一行的基因数据进行PCA
# for i, gene_list in enumerate(gene_ids):
#     # 找到SCNA数据框中与gene_list对应的列
#     matching_columns = [col for col in SCNA.columns if col in gene_list]
#     # 提取这些列的数据
#     data = SCNA[matching_columns]
    
#     # 确保数据足够进行PCA（至少需要两个基因）
#     if len(matching_columns) > 1:
#         # 初始化PCA
#         pca = PCA()
#         pca.fit(data)
        
#         # 计算累计解释方差
#         cumulative_explained_variance = pca.explained_variance_ratio_.cumsum()
        
#         # 确定达到阈值的主成分数k
#         k = next(i for i, total_variance in enumerate(cumulative_explained_variance) if total_variance >= explained_variance_threshold) + 1
        
#         # 重新运行PCA，保留前k个主成分
#         pca = PCA(n_components=k)
#         pca_result = pca.fit_transform(data)
        
#         # 将PCA结果转换为DataFrame，并添加索引信息
#         pca_df = pd.DataFrame(pca_result, columns=[f'PC{j+1}_row_{i}' for j in range(k)])
        
#         # 合并PCA结果到总数据框
#         BRCA_SCNA = pd.concat([BRCA_SCNA, pca_df], axis=1)
#     else:
#         print(f"Row {i} does not have enough genes for PCA.")

# # 显示最终合并的PCA结果
# BRCA_SCNA

In [16]:
# # 创建一个空的数据框来存储所有PCA结果
# BRCA_SCNA = pd.DataFrame()

# # 对每一行的基因数据进行PCA
# for i, gene_list in enumerate(gene_ids):
#     # Find the columns of the SCNA data frame that match gene_list
#     matching_columns = [col for col in SCNA.columns if col in gene_list]
#     # 提取这些列的数据
#     data = SCNA[matching_columns]
    
#     # 确保数据足够进行PCA（至少需要两个基因）
#     if len(matching_columns) > 1:
#         # 初始化PCA，并指定n_components为1
#         pca = PCA(n_components=1)
#         pca_result = pca.fit_transform(data)
        
#         # 将PCA结果转换为DataFrame，并添加索引信息
#         pca_df = pd.DataFrame(pca_result, columns=[f'PC1_row_{i}'])
        
#         # 合并PCA结果到总数据框
#         BRCA_SCNA = pd.concat([BRCA_SCNA, pca_df], axis=1)
#     else:
#         print(f"Row {i} does not have enough genes for PCA.")

# # 显示最终合并的PCA结果
# BRCA_SCNA


In [17]:
# Collect, in first-seen order, every SCNA column that belongs to at least one
# pathway gene list. The labels are gathered first and the frame is sliced
# once at the end, instead of growing a DataFrame with one pd.concat per
# pathway (which recopies the accumulated data on every iteration).
selected_columns = []   # column labels to keep, in output order
seen_columns = set()    # the same labels, for constant-time membership tests

# `i` is the position of the pathway in gene_ids, not its index label.
for i, gene_list in enumerate(gene_ids):
    pathway_genes = set(gene_list)

    # Matching columns in SCNA's own column order; dict.fromkeys removes
    # repeated labels while preserving that order.
    unique_columns = list(dict.fromkeys(
        col for col in SCNA.columns if col in pathway_genes
    ))

    if not unique_columns:
        print(f"Row {i} does not have matching genes to concatenate.")
        continue

    # Keep only the columns no earlier pathway has contributed yet.
    unique_columns_to_add = [col for col in unique_columns if col not in seen_columns]
    if not unique_columns_to_add:
        print(f"Row {i} does not have new unique columns to add.")
        continue

    selected_columns.extend(unique_columns_to_add)
    seen_columns.update(unique_columns_to_add)

# An empty frame is kept when nothing matched, as the incremental version did.
BRCA_SCNA = SCNA[selected_columns] if selected_columns else pd.DataFrame()

# Display the assembled result
BRCA_SCNA


Row 2 does not have new unique columns to add.
Row 7 does not have new unique columns to add.
Row 8 does not have new unique columns to add.
Row 9 does not have new unique columns to add.
Row 16 does not have new unique columns to add.
Row 20 does not have new unique columns to add.
Row 25 does not have new unique columns to add.
Row 28 does not have new unique columns to add.
Row 29 does not have new unique columns to add.
Row 41 does not have new unique columns to add.
Row 45 does not have new unique columns to add.
Row 52 does not have new unique columns to add.
Row 54 does not have new unique columns to add.
Row 61 does not have new unique columns to add.
Row 64 does not have new unique columns to add.
Row 66 does not have new unique columns to add.
Row 71 does not have new unique columns to add.
Row 75 does not have new unique columns to add.
Row 76 does not have new unique columns to add.
Row 77 does not have new unique columns to add.
Row 80 does not have new unique columns to a

Unnamed: 0,PRKCZ,RPL22,TNFRSF9,PIK3CD,CTNNBIP1,MTOR,PLA2G2D,WNT4,RUNX3,ZNF683,...,TOPAZ1,AKR7A3,HYI,GRHPR,MSTO1,ARMC1,CLUH,GPR101,SMR3A,SMR3B
0,-0.001,-0.001,-0.001,-0.001,-0.001,-0.001,-0.001,-0.001,-0.001,-0.001,...,-0.018,-0.001,-0.001,-0.008,-0.001,0.1920,0.002,0.0300,0.1170,0.1170
1,0.304,0.363,0.333,0.338,0.338,0.338,0.338,0.338,0.262,0.262,...,-0.629,0.338,0.310,0.258,1.207,-0.0180,-0.709,0.7000,-0.7490,-0.7490
2,-0.071,-0.071,-0.071,-0.071,-0.071,-0.071,-0.071,-0.071,-0.071,-0.071,...,0.001,-0.071,-0.071,-0.069,0.275,0.1520,0.008,0.0010,-0.0062,-0.0062
3,-0.423,-0.423,-0.423,-0.423,-0.423,-0.423,-0.427,-0.427,-0.387,-0.387,...,-0.753,-0.427,-0.387,-0.312,1.199,0.1170,-0.837,-0.1500,-0.2660,-0.2660
4,0.100,0.100,0.100,0.100,0.100,0.100,0.108,0.083,0.709,0.033,...,-0.604,0.108,0.065,-0.593,1.086,0.6550,0.630,0.2420,0.0330,0.0330
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
765,-0.013,-0.013,-0.013,-0.013,-0.013,-0.013,-0.013,-0.013,-0.013,-0.013,...,0.004,-0.013,-0.016,-0.002,1.034,0.0074,-0.020,0.0010,0.0040,0.0040
766,-0.525,-0.525,-0.525,-0.525,-0.525,-0.525,-0.525,-0.525,-0.525,-0.525,...,0.255,-0.525,-0.525,-0.526,0.638,-0.5190,-0.257,0.0358,-0.0050,-0.0050
767,0.053,0.053,0.053,0.053,0.053,0.053,0.053,0.053,0.053,0.053,...,0.012,0.053,0.019,0.011,0.441,-0.0100,-0.449,-0.0600,1.2350,1.2350
768,-0.373,-0.373,-0.373,-0.373,-0.373,-0.373,-0.373,-0.365,-0.365,-0.365,...,-0.385,-0.373,-0.370,-0.384,0.502,0.5370,-0.372,0.3820,0.0660,0.0660


In [18]:
# Persist the selected SCNA features as CSV without the row index.
# NOTE(review): rows are then identified by position only -- confirm that
# downstream readers do not need a sample identifier column.
BRCA_SCNA.to_csv(
    path_or_buf='../../DATA/BRCA/BRCA_SCNA.csv',
    index=False,
)

In [19]:
# Load the GO enrichment table for RNAseq, order it by ascending p-value and
# keep only the rows with pvalue < 0.05.
# NOTE(review): the filter uses the raw `pvalue` column, not `p.adjust` -- confirm this is intended.
RNAseq_pathgway = (
    pd.read_csv('../../DATA/BRCA/GO_BRCA_RNAseq.csv')
    .sort_values('pvalue')
    .loc[lambda table: table['pvalue'] < 0.05]
)
RNAseq_pathgway

Unnamed: 0.1,Unnamed: 0,ID,Description,GeneRatio,BgRatio,RichFactor,FoldEnrichment,zScore,pvalue,p.adjust,qvalue,geneID,Count
0,GO:0048732,GO:0048732,gland development,433/15221,449/18888,0.964365,1.196697,8.594106,5.119744e-24,3.252573e-20,1.866820e-20,ABL1/ACADM/ACAT1/ACER1/ACO2/ADA/AIRE/AKT1/AKT2...,433
1,GO:0048568,GO:0048568,embryonic organ development,438/15221,455/18888,0.962637,1.194553,8.558423,1.087599e-23,3.454758e-20,1.982865e-20,A2M/ACVR1/ADA/ADM/AHI1/AKT1/ALDH1A2/ALDH1A3/AL...,438
2,GO:0007264,GO:0007264,small GTPase-mediated signal transduction,476/15221,500/18888,0.952000,1.181353,8.373180,5.390264e-22,1.141478e-18,6.551534e-19,ABCA1/ABI2/ABL1/ABL2/ABRA/ABR/ADCYAP1R1/ADRA2A...,476
3,GO:0060562,GO:0060562,epithelial tube morphogenesis,327/15221,335/18888,0.976119,1.211283,7.949287,1.030394e-21,1.636523e-18,9.392852e-19,ABL1/ACVR1/ACVRL1/ADAMTS12/ADAMTS16/ADM/AGTR2/...,327
4,GO:1903131,GO:1903131,mononuclear cell differentiation,461/15221,484/18888,0.952479,1.181948,8.261547,1.741060e-21,2.212190e-18,1.269691e-18,ABL1/ACIN1/ACTB/ACTL6A/ACTL6B/ADAM17/ADAM8/ADA...,461
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2838,GO:0032908,GO:0032908,regulation of transforming growth factor beta1...,14/15221,14/18888,1.000000,1.240917,1.837162,4.865154e-02,1.070975e-01,6.146878e-02,ADAM8/AGT/ATP6AP2/CD2AP/CX3CL1/FOXP3/FURIN/GAT...,14
2844,GO:0036151,GO:0036151,phosphatidylcholine acyl-chain remodeling,14/15221,14/18888,1.000000,1.240917,1.837162,4.865154e-02,1.070975e-01,6.146878e-02,DBI/LPCAT1/LPCAT2/LPCAT3/LPCAT4/MBOAT2/MBOAT7/...,14
2886,GO:0120032,GO:0120032,regulation of plasma membrane bounded cell pro...,174/15221,204/18888,0.852941,1.058429,1.709459,4.907938e-02,1.080018e-01,6.198785e-02,ABI2/ABI3/ACTR2/ACTR3/ADAMTS16/AGRN/AKIRIN1/AK...,174
2887,GO:0090090,GO:0090090,negative regulation of canonical Wnt signaling...,125/15221,145/18888,0.862069,1.069756,1.717887,4.918841e-02,1.082043e-01,6.210404e-02,AMFR/ANKRD6/APC2/APC/APOE/AXIN1/AXIN2/BICC1/BM...,125


In [20]:
# Read the tab-separated RNAseq file and pass it straight through
# drop_and_impute_knn with its default threshold and neighbour count.
RNAseq = drop_and_impute_knn(
    pd.read_csv(
        '../../DATA/BRCA/Human__TCGA_BRCA__UNC__RNAseq__HiSeq_RNA__01_28_2016__BI__Gene__Firehose_RSEM_log2_common_samples.tsv',
        sep='\t',
    )
)
# Display the processed DataFrame
RNAseq

Unnamed: 0.1,Unnamed: 0,A1BG,A2LD1,A2ML1,A2M,A4GALT,AAAS,AACS,AADAT,AAGAB,...,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,psiTPTE22
0,TCGA.D8.A1JH,6.6980,6.3376,2.60830,14.2918,8.0993,9.3871,10.0371,5.6856,10.7372,...,8.4062,6.5275,10.0332,10.2976,5.1870,10.1598,11.5054,10.8429,10.3800,8.4436
1,TCGA.OL.A5RW,7.7634,5.8225,11.03610,11.5957,7.2305,9.4180,9.4474,7.0779,10.0665,...,9.4494,4.8136,7.7614,10.4659,3.1757,7.8520,11.5437,8.7513,8.4453,8.1613
2,TCGA.XX.A899,7.8088,7.0469,0.47890,14.3127,8.9845,9.4678,10.1770,5.4173,10.5380,...,8.8395,6.1672,9.3325,10.3078,4.4764,9.4288,12.3217,10.9180,9.3903,6.6355
3,TCGA.S3.A6ZH,7.7030,6.6799,1.53090,12.6917,7.5527,9.5746,10.2573,5.0770,11.5846,...,10.6381,5.7270,8.8121,10.3413,7.1729,9.4000,11.3007,9.3480,9.4683,4.4660
4,TCGA.AQ.A54N,8.7511,5.4406,4.51570,12.1222,5.2704,10.3992,8.9339,8.6147,8.6569,...,9.7255,4.2822,7.5725,10.3845,7.5569,8.9157,12.8822,10.0081,9.1350,8.3291
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
765,TCGA.GM.A5PX,7.7700,7.1346,0.47590,13.7090,8.7206,9.8015,9.4073,5.5217,10.5575,...,7.5973,4.9029,8.7059,9.7945,6.4489,9.4401,11.6096,9.2053,9.6349,7.2050
766,TCGA.AC.A5XU,9.2189,6.9065,0.47380,13.6766,8.8052,10.2893,10.2266,5.4389,10.4232,...,9.3056,6.8925,10.1809,9.8913,3.3689,8.8202,12.4825,10.7307,9.3012,5.0047
767,TCGA.AR.A1AN,6.5663,6.7836,1.96188,13.6565,8.1580,9.2231,10.9819,5.7909,10.5450,...,8.8524,5.7450,9.0406,10.1056,7.6446,10.0345,11.8261,10.2996,10.2447,6.0082
768,TCGA.E9.A1RG,8.8370,5.6676,2.60480,12.4005,9.8105,10.3453,10.0707,4.5913,10.9778,...,9.8428,4.8443,7.9174,9.7010,5.3647,8.6875,11.9177,9.7606,7.9393,4.6720


In [21]:
# Turn each pathway's '/'-separated geneID string into a list of gene IDs.
# NOTE: this overwrites the `gene_ids` built for an earlier omics table, so
# the cells that consume it must be run in notebook order.
gene_ids = RNAseq_pathgway['geneID'].map(lambda gene_field: gene_field.split('/'))
gene_ids

0       [ABL1, ACADM, ACAT1, ACER1, ACO2, ADA, AIRE, A...
1       [A2M, ACVR1, ADA, ADM, AHI1, AKT1, ALDH1A2, AL...
2       [ABCA1, ABI2, ABL1, ABL2, ABRA, ABR, ADCYAP1R1...
3       [ABL1, ACVR1, ACVRL1, ADAMTS12, ADAMTS16, ADM,...
4       [ABL1, ACIN1, ACTB, ACTL6A, ACTL6B, ADAM17, AD...
                              ...                        
2838    [ADAM8, AGT, ATP6AP2, CD2AP, CX3CL1, FOXP3, FU...
2844    [DBI, LPCAT1, LPCAT2, LPCAT3, LPCAT4, MBOAT2, ...
2886    [ABI2, ABI3, ACTR2, ACTR3, ADAMTS16, AGRN, AKI...
2887    [AMFR, ANKRD6, APC2, APC, APOE, AXIN1, AXIN2, ...
2888    [ACTN2, ADD1, ADD2, ADD3, AVIL, CAPG, CAPZA1, ...
Name: geneID, Length: 2889, dtype: object

In [22]:
# # 创建一个空的数据框来存储所有PCA结果
# BRCA_RNAseq = pd.DataFrame()

# # 设置累计解释方差的阈值
# explained_variance_threshold = 0.95

# # 对每一行的基因数据进行PCA
# for i, gene_list in enumerate(gene_ids):
#     # 找到RNAseq数据框中与gene_list对应的列
#     matching_columns = [col for col in RNAseq.columns if col in gene_list]
#     # 提取这些列的数据
#     data = RNAseq[matching_columns]
    
#     # 确保数据足够进行PCA（至少需要两个基因）
#     if len(matching_columns) > 1:
#         # 初始化PCA
#         pca = PCA()
#         pca.fit(data)
        
#         # 计算累计解释方差
#         cumulative_explained_variance = pca.explained_variance_ratio_.cumsum()
        
#         # 确定达到阈值的主成分数k
#         k = next(i for i, total_variance in enumerate(cumulative_explained_variance) if total_variance >= explained_variance_threshold) + 1
        
#         # 重新运行PCA，保留前k个主成分
#         pca = PCA(n_components=k)
#         pca_result = pca.fit_transform(data)
        
#         # 将PCA结果转换为DataFrame，并添加索引信息
#         pca_df = pd.DataFrame(pca_result, columns=[f'PC{j+1}_row_{i}' for j in range(k)])
        
#         # 合并PCA结果到总数据框
#         BRCA_RNAseq = pd.concat([BRCA_RNAseq, pca_df], axis=1)
#     else:
#         print(f"Row {i} does not have enough genes for PCA.")

# # 显示最终合并的PCA结果
# BRCA_RNAseq

In [23]:
# # 创建一个空的数据框来存储所有PCA结果
# BRCA_RNAseq = pd.DataFrame()

# # 对每一行的基因数据进行PCA
# for i, gene_list in enumerate(gene_ids):
#     # Find the columns of the RNAseq data frame that match gene_list
#     matching_columns = [col for col in RNAseq.columns if col in gene_list]
#     # 提取这些列的数据
#     data = RNAseq[matching_columns]
    
#     # 确保数据足够进行PCA（至少需要两个基因）
#     if len(matching_columns) > 1:
#         # 初始化PCA，并指定n_components为1
#         pca = PCA(n_components=1)
#         pca_result = pca.fit_transform(data)
        
#         # 将PCA结果转换为DataFrame，并添加索引信息
#         pca_df = pd.DataFrame(pca_result, columns=[f'PC1_row_{i}'])
        
#         # 合并PCA结果到总数据框
#         BRCA_RNAseq = pd.concat([BRCA_RNAseq, pca_df], axis=1)
#     else:
#         print(f"Row {i} does not have enough genes for PCA.")

# # 显示最终合并的PCA结果
# BRCA_RNAseq


In [24]:
# Collect, in first-seen order, every RNAseq column that belongs to at least
# one pathway gene list. The labels are gathered first and the frame is sliced
# once at the end, instead of growing a DataFrame with one pd.concat per
# pathway (which recopies the accumulated data on every iteration).
selected_columns = []   # column labels to keep, in output order
seen_columns = set()    # the same labels, for constant-time membership tests

# `i` is the position of the pathway in gene_ids, not its index label.
for i, gene_list in enumerate(gene_ids):
    pathway_genes = set(gene_list)

    # Matching columns in RNAseq's own column order; dict.fromkeys removes
    # repeated labels while preserving that order.
    unique_columns = list(dict.fromkeys(
        col for col in RNAseq.columns if col in pathway_genes
    ))

    if not unique_columns:
        print(f"Row {i} does not have matching genes to concatenate.")
        continue

    # Keep only the columns no earlier pathway has contributed yet.
    unique_columns_to_add = [col for col in unique_columns if col not in seen_columns]
    if not unique_columns_to_add:
        print(f"Row {i} does not have new unique columns to add.")
        continue

    selected_columns.extend(unique_columns_to_add)
    seen_columns.update(unique_columns_to_add)

# An empty frame is kept when nothing matched, as the incremental version did.
BRCA_RNAseq = RNAseq[selected_columns] if selected_columns else pd.DataFrame()

# Display the assembled result
BRCA_RNAseq


Row 6 does not have new unique columns to add.
Row 7 does not have new unique columns to add.
Row 8 does not have new unique columns to add.
Row 15 does not have new unique columns to add.
Row 17 does not have new unique columns to add.
Row 20 does not have new unique columns to add.
Row 28 does not have new unique columns to add.
Row 30 does not have new unique columns to add.
Row 34 does not have new unique columns to add.
Row 38 does not have new unique columns to add.
Row 44 does not have new unique columns to add.
Row 45 does not have new unique columns to add.
Row 48 does not have new unique columns to add.
Row 49 does not have new unique columns to add.
Row 51 does not have new unique columns to add.
Row 55 does not have new unique columns to add.
Row 58 does not have new unique columns to add.
Row 59 does not have new unique columns to add.
Row 65 does not have new unique columns to add.
Row 67 does not have new unique columns to add.
Row 71 does not have new unique columns to 

Unnamed: 0,ABL1,ACADM,ACAT1,ACO2,ADA,AKT1,AKT2,ALDH1A2,ALDH1A3,ALOX15B,...,SURF1,UQCR10,UQCR11,UQCRB,UQCRC2,UQCRFS1,UQCRHL,UQCRH,WDR93,CGB7
0,11.1922,9.8565,10.6013,11.0607,6.9935,11.8391,11.0237,5.5665,9.8314,8.3599,...,9.3959,9.3948,9.5402,11.0453,11.5041,10.3084,7.2733,9.3964,5.4703,1.01410
1,9.7755,8.5877,9.6203,11.5186,7.9402,12.8907,11.4956,4.8889,8.2605,5.8519,...,9.9592,11.0420,10.9705,12.3146,11.3483,11.9523,10.7069,12.8206,3.4653,5.16440
2,11.3155,10.4236,9.8686,11.3374,7.7755,11.6033,11.6151,6.9614,9.1619,12.6663,...,9.3351,9.8759,10.1307,11.3989,11.6124,10.0885,8.3823,10.2502,4.5986,2.39290
3,10.6537,11.4840,8.8371,11.3729,5.5148,12.2656,12.0471,5.8103,6.1662,3.7653,...,9.8281,10.5420,10.3690,11.6772,12.0496,11.1781,8.7979,10.6588,3.5877,0.39500
4,10.2955,9.0557,9.3345,10.4181,8.6322,12.0699,11.4128,1.9008,9.5649,4.5994,...,9.3037,10.5910,10.1829,12.3978,10.3072,9.6428,9.7032,11.9100,4.0035,0.75150
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
765,10.4632,10.0001,11.4173,10.7544,7.8649,11.6967,10.9855,5.8199,7.9579,9.4490,...,9.9846,10.9903,10.8845,11.8077,11.8706,10.5575,8.4381,10.8952,6.7766,1.56260
766,11.6356,8.6519,10.7598,11.9914,7.5397,12.1868,12.8083,3.1336,8.6952,5.0729,...,9.7506,10.5215,10.0663,10.9821,11.9018,9.7408,7.7471,10.0362,5.9552,1.11530
767,11.4497,10.7140,10.3352,11.2001,6.6691,12.7403,11.6774,7.7411,8.1145,5.0381,...,9.3969,9.6158,10.3063,11.4330,11.1893,10.5109,9.0697,11.0288,3.6354,1.43788
768,10.4021,9.2604,10.6261,11.6921,8.7099,13.1232,11.0644,3.5396,6.0385,6.8356,...,10.5980,11.4647,11.2425,12.1177,12.2192,11.1764,9.0552,10.9535,5.6958,3.63070


In [25]:
# Persist the selected RNAseq features as CSV without the row index.
# NOTE(review): rows are then identified by position only -- confirm that
# downstream readers do not need a sample identifier column.
BRCA_RNAseq.to_csv(
    path_or_buf='../../DATA/BRCA/BRCA_RNAseq.csv',
    index=False,
)