In [1]:
import pandas as pd
import numpy as np
import os
import pickle
from sklearn.model_selection import StratifiedKFold


In [2]:
# Select strata
# Each TCGA cohort code is mapped to a flag saying whether it is selected.
# Flip a flag to include/exclude a cohort; the resulting list keeps the key
# order below. NOTE: TUMOR_TYPE_COMBINATION is reassigned later in the
# notebook to the merged group labels.
_COHORT_SELECTED = {
    "BLCA": False,
    "BRCA": True,
    "CESC": False,
    "COAD": True,
    "DLBC": False,
    "GBM": True,
    "HNSC": False,
    "KICH": True,
    "KIRC": True,
    "KIRP": True,
    "LAML": False,
    "LGG": True,
    "LIHC": False,
    "LUAD": False,
    "LUSC": False,
    "OV": False,
    "PRAD": False,
    "READ": True,
    "SKCM": False,
    "STAD": False,
    "THCA": False,
    "UCEC": False,
}
TUMOR_TYPE_COMBINATION = [code for code, keep in _COHORT_SELECTED.items() if keep]

### Generating stacked data

In [3]:
def Stacking(DataSet):
    """Reshape a wide table into long format, one row per (row, column) cell.

    Parameters
    ----------
    DataSet : pandas.DataFrame
        Wide table to stack.

    Returns
    -------
    pandas.DataFrame
        ``DataSet.stack()`` with its index moved back into columns; the value
        column that ``reset_index`` labels ``0`` is renamed to ``'GeneCount'``.
        Rows stay in the order produced by ``stack`` (no shuffling is applied).
    """
    long_format = DataSet.stack().reset_index()
    return long_format.rename(columns={0: 'GeneCount'})




### Load dataset

In [4]:
# Load the merged data set from the rsubread source folder.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only run this against a trusted pickle.
with open('./SourceData/rsubread/complete_data_merged.pickle', 'rb') as source_file:
    TotalData = pickle.load(source_file)



## Re-stratification and selection

In [5]:
# Merged strata kept for the analysis, in alphabetical order.
TUMOR_TYPE_COMBINATION = sorted(["COLO", "BRCA", "GLIOMA", "KIPAN"])

# Relabel each individual cohort with the merged group it is folded into.
merged_label_of = {
    "GBM": "GLIOMA",
    "LGG": "GLIOMA",
    "KIRP": "KIPAN",
    "KICH": "KIPAN",
    "KIRC": "KIPAN",
    "COAD": "COLO",
    "READ": "COLO",
}
for cohort, merged_label in merged_label_of.items():
    TotalData.loc[TotalData['tumor_type'] == cohort, 'tumor_type'] = merged_label

# Keep only the rows whose (relabelled) tumor type is one of the selected strata.
in_selected_strata = TotalData['tumor_type'].isin(TUMOR_TYPE_COMBINATION)
TotalData = TotalData[in_selected_strata].copy()


## Variables which have small values should be removed.
## Statistics are taken over every column from the fifth one onward.
RemCheckVar = TotalData.iloc[:, 4:].var()
RemCheckSum = TotalData.iloc[:, 4:].sum()  # only used by the disabled alternative below

# Drop the columns whose variance lies below the 25% quantile of all variances.
variance_cutoff = RemCheckVar.quantile(0.25)
RemList = RemCheckVar[RemCheckVar < variance_cutoff].index.to_list()
TotalData = TotalData.drop(columns=RemList)

# Disabled alternative: drop a column when either its sum or its variance is
# below the respective 25% quantile.
#RemList = RemCheckSum[ (RemCheckSum<RemCheckSum.quantile(0.25)) | (RemCheckVar<RemCheckVar.quantile(0.25)) ].index.to_list()
#TotalData = TotalData.drop(columns=RemList)


### Log transformation and normalization

In [6]:
# Apply log2(x + 1) to every column from the fifth one onward; the first four
# columns are carried over unchanged. Both parts get a fresh 0..n-1 index so
# they line up row by row when concatenated side by side.
value_part = TotalData.iloc[:, 4:]
leading_part = TotalData.iloc[:, :4].reset_index(drop=True)

LogTotalData = pd.concat(
    [
        leading_part,
        pd.DataFrame(np.log2(value_part.values + 1), columns=value_part.columns),
    ],
    axis=1,
)

In [7]:

# Min-max normalise the log-transformed value columns separately for each
# tumor type. The minimum and maximum are scalars taken over ALL value columns
# of the group (one pair per group), not one pair per column.
normalized_groups = []

for Type in LogTotalData['tumor_type'].unique():
    print(Type)
    DataSub = LogTotalData[LogTotalData['tumor_type'] == Type].copy()

    SubMin = np.min(DataSub.iloc[:, 4:].values)
    SubMax = np.max(DataSub.iloc[:, 4:].values)

    ## Normalization
    DataSub.iloc[:, 4:] = (DataSub.iloc[:, 4:] - SubMin) / (SubMax - SubMin)
    normalized_groups.append(DataSub)

# DataFrame.append was removed in pandas 2.0 and copied the whole accumulated
# frame on every call; collect the groups and concatenate once instead. The
# original row index of each group is preserved, as it was with append.
TotalData = pd.concat(normalized_groups)

# Disabled alternative: per-column min-max over all samples at once.
# TotalData = LogTotalData.copy()
# Min = TotalData.iloc[:, 4:].min()
# Max = TotalData.iloc[:, 4:].max()
# TotalData.iloc[:, 4:] = (TotalData.iloc[:, 4:] -Min) / (Max - Min)

# Sort: rows with event == True first, then rows with event == False, each
# part ordered by ascending time; afterwards the index is renumbered 0..n-1.
event_rows = TotalData[TotalData['event'] == True].sort_values('time', ascending=True)
non_event_rows = TotalData[TotalData['event'] == False].sort_values('time', ascending=True)
TotalData = pd.concat([event_rows, non_event_rows], axis=0)
TotalData = TotalData.reset_index(drop=True)

# Normalised value columns (fifth column onward) in the sorted row order.
gene_counts = TotalData.iloc[:, 4:]

# Log-transformed (not normalised) values joined onto the sorted identifying
# columns, so LogAnalData follows the row order of TotalData.
LogAnalData = pd.merge(
    TotalData[['patient_id', 'tumor_type', 'time', 'event']],
    LogTotalData,
    on=['patient_id', 'tumor_type', 'time', 'event'],
)
    

GLIOMA
BRCA
COLO
KIPAN


In [8]:
# Display the normalised, sorted frame (pandas abbreviates long/wide frames in the rendered output).
TotalData

Unnamed: 0,patient_id,tumor_type,time,event,A1BG,A1CF,A2LD1,A2M,A2ML1,A4GALT,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
0,TCGA-BP-4337-01A-01R-1289-07,KIPAN,2,True,0.372765,0.296615,0.364335,0.754706,0.176668,0.581997,...,0.396232,0.410301,0.390014,0.406087,0.532682,0.107742,0.490680,0.630963,0.540191,0.473624
1,TCGA-19-2624-01A-01R-1850-01,GLIOMA,5,True,0.388842,0.124311,0.298515,0.624113,0.282289,0.305840,...,0.482360,0.503715,0.364162,0.413671,0.517444,0.114463,0.524488,0.626440,0.528998,0.473164
2,TCGA-41-4097-01A-01R-1850-01,GLIOMA,6,True,0.365222,0.070183,0.329208,0.663275,0.344271,0.351698,...,0.379477,0.357408,0.347401,0.399759,0.488959,0.114463,0.460271,0.573262,0.502158,0.437810
3,TCGA-HT-7616-01A-11R-2256-07,GLIOMA,7,True,0.259394,0.132841,0.276661,0.690430,0.293551,0.351698,...,0.343978,0.340671,0.403560,0.440388,0.527805,0.044280,0.502232,0.533708,0.560698,0.443363
4,TCGA-B0-4813-01A-01R-1277-07,KIPAN,18,True,0.331288,0.522034,0.375273,0.666009,0.212750,0.550322,...,0.377690,0.419549,0.367172,0.389019,0.470017,0.203811,0.457788,0.601250,0.466457,0.470671
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2937,TCGA-GM-A2DA-01A-11R-A18M-07,BRCA,5909,False,0.365059,0.069453,0.273783,0.596392,0.138905,0.393257,...,0.367561,0.444871,0.274609,0.345949,0.441094,0.279353,0.420011,0.525123,0.433788,0.419349
2938,TCGA-DU-6392-01A-11R-1708-07,GLIOMA,6423,False,0.387522,0.102816,0.309963,0.737942,0.245959,0.393056,...,0.469286,0.486458,0.410233,0.469080,0.548212,0.343978,0.567444,0.640349,0.572636,0.502010
2939,TCGA-B6-A0RE-01A-11R-A056-07,BRCA,6435,False,0.363267,0.043820,0.338037,0.597876,0.566444,0.415755,...,0.457789,0.497468,0.378296,0.459177,0.517835,0.425525,0.477905,0.614130,0.485792,0.486716
2940,TCGA-B6-A0IA-01A-11R-A034-07,BRCA,6719,False,0.465456,0.000000,0.318429,0.490982,0.123018,0.510251,...,0.447908,0.530550,0.331709,0.394870,0.493322,0.322791,0.477475,0.587375,0.477042,0.438751


### Feature generation

In [18]:
# Time-to-event vector (natural log of 'time', float32) and the pairwise
# dissimilarity matrix derived from it.
TTE = np.log(TotalData['time'].values.astype('float32'))
tte_col, tte_row = TTE[:, None], TTE[None]
TTEXY = tte_col @ tte_row

# a^2 + b^2 - 2ab for every pair, i.e. the squared difference of log-times.
# NOTE(review): in float32 this expansion can lose precision when a and b are
# close; confirm whether downstream code is sensitive to that.
DisimInd = tte_col**2 + tte_row**2 - 2 * TTEXY
# Floor at 1e-7 so the square root never receives a zero or negative value.
DisimInd = np.sqrt(np.maximum(DisimInd, 1e-7))

Event = TotalData['event'].values.astype('int32')

# Row labels of the value table are shifted to start at 1 instead of 0.
GeneCount = gene_counts.reset_index(drop=True)
GeneCount.index += 1


# Two-way lookup between column names and 1-based integer ids.
GeneList = GeneCount.columns.to_list()
GeneToInt = {gene: gene_id for gene_id, gene in enumerate(GeneList, start=1)}

IntToGene = {gene_id: gene for gene_id, gene in enumerate(GeneList, start=1)}
# Replace the column names with their integer ids before stacking.
GeneCount.columns = GeneToInt.values()

StakedgData = Stacking(GeneCount)
    

### Save

In [19]:
# Persist the processed artefacts.
# np.save and DataFrame.to_pickle do not create missing directories, so make
# sure the output folder exists before writing (no-op when it already does).
os.makedirs('./ProcessedData', exist_ok=True)

# StakedgData (a DataFrame) is written as an array; GeneToInt / IntToGene are
# dicts, which np.save stores as pickled object arrays, so reading them back
# requires np.load(..., allow_pickle=True).
np.save('./ProcessedData/StakedgData_GroupNorm.npy', StakedgData)
np.save('./ProcessedData/GeneToInt_GroupNorm.npy', GeneToInt)
np.save('./ProcessedData/IntToGene_GroupNorm.npy', IntToGene)

np.save('./ProcessedData/DisimInd_GroupNorm.npy', DisimInd)
np.save('./ProcessedData/TTE_GroupNorm.npy', TTE)
np.save('./ProcessedData/Event_GroupNorm.npy', Event)
LogAnalData.to_pickle("./ProcessedData/LogAnalData.pickle")

In [9]:
# Read back the merged log-transformed table written by the save cell.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only run this against a trusted pickle.
with open('./ProcessedData/LogAnalData.pickle', 'rb') as merged_file:
    LogAnalData = pickle.load(merged_file)