In [1]:
import pandas as pd
import numpy as np
import os
import pickle
from sklearn.model_selection import StratifiedKFold


In [2]:
# Select strata: tumor-type codes that take part in the analysis.
# Codes listed in the notebook but currently switched off:
#   BLCA, CESC, DLBC, HNSC, LAML, LIHC, LUAD, LUSC, OV, PRAD, SKCM, STAD, THCA, UCEC
# To enable one of them, add its code to the string below.
TUMOR_TYPE_COMBINATION = "BRCA COAD GBM KICH KIRC KIRP LGG READ".split()

### Generating stacked data

In [3]:
def Stacking(DataSet):
    """Reshape a wide table into long format, one row per (row label, column label) pair.

    ``DataSet.stack()`` folds the column labels into the innermost index
    level, ``reset_index()`` turns every index level into an ordinary column,
    and the stacked values (labelled ``0`` at that point) are renamed to
    ``'GeneCount'``.

    No log transform, min-max scaling or shuffling happens here: the values
    are passed through untouched and the rows keep the order produced by
    ``stack()``.

    Parameters
    ----------
    DataSet : pandas.DataFrame
        Wide table to reshape.

    Returns
    -------
    pandas.DataFrame
        Long-format table containing the former index level(s), the former
        column labels and a ``'GeneCount'`` value column.
    """
    long_table = DataSet.stack().reset_index()
    return long_table.rename(columns={0: 'GeneCount'})




### Load dataset

In [5]:
# Load the merged source table from disk.
# NOTE(review): pickle.load can execute code embedded in the file, so this
# path must only ever point at a trusted, locally produced pickle.
with open('./SourceData/rsubread/complete_data_merged.pickle', 'rb') as source_file:
    TotalData = pickle.load(source_file)



## Re-stratification and selection

In [6]:
# Merged strata used from here on; this replaces the earlier per-code selection.
TUMOR_TYPE_COMBINATION = sorted(["COLO", "BRCA", "GLIOMA", "KIPAN"])

# Relabel every original tumor type with the merged stratum it belongs to.
# "BRCA" is not relabelled and keeps its own name.
for merged_label, source_labels in (
    ("GLIOMA", ("GBM", "LGG")),
    ("KIPAN", ("KIRP", "KICH", "KIRC")),
    ("COLO", ("COAD", "READ")),
):
    for source_label in source_labels:
        TotalData.loc[TotalData['tumor_type'] == source_label, 'tumor_type'] = merged_label

# Keep only the rows whose (relabelled) tumor type is one of the selected strata.
selected_rows = TotalData['tumor_type'].isin(TUMOR_TYPE_COMBINATION)
TotalData = TotalData[selected_rows].copy()


## Variables which have small values should be removed.
# Everything from column position 4 onward is treated as a variable; the first
# four columns are left out of the statistics.
RemCheckVar = TotalData.iloc[:, 4:].var()  # only needed by the disabled alternative below
RemCheckSum = TotalData.iloc[:, 4:].sum()

# Drop each variable whose column sum lies below the 25% quantile of all column sums.
sum_cutoff = RemCheckSum.quantile(0.25)
RemList = RemCheckSum[RemCheckSum < sum_cutoff].index.to_list()
TotalData = TotalData.drop(columns=RemList)

# Disabled alternative: drop a variable when its sum OR its variance is in the lowest quarter.
#RemList = RemCheckSum[ (RemCheckSum<RemCheckSum.quantile(0.25)) | (RemCheckVar<RemCheckVar.quantile(0.25)) ].index.to_list()
#TotalData = TotalData.drop(columns=RemList)


### Log transformation and group-based normalization

In [7]:
# log2(value + 1) for every column from position 4 onward; the first four
# columns are carried over unchanged. Both parts get a fresh 0..n-1 index so
# that they line up row by row in the concat.
leading_part = TotalData.iloc[:, :4].reset_index(drop=True)
value_part = TotalData.iloc[:, 4:]
LogTotalData = pd.concat(
    [
        leading_part,
        pd.DataFrame(np.log2(value_part.values + 1), columns=value_part.columns),
    ],
    axis=1,
)

In [8]:
# Group-based normalization: min-max scale the log-transformed values
# separately for every tumor type, then shuffle the rows.
#
# Fixed seed so that the row order -- and therefore every artifact derived
# from it further down -- is identical on each Restart & Run All.
SHUFFLE_SEED = 1

NormalizedGroups = []  # one scaled sub-frame per tumor type

for Type in LogTotalData['tumor_type'].unique():
    print(Type)
    DataSub = LogTotalData[LogTotalData['tumor_type'] == Type].copy()

    # One scalar minimum / maximum over ALL value columns of the group
    # (columns from position 4 onward), not one pair per column.
    SubMin = np.min(DataSub.iloc[:, 4:].values)
    SubMax = np.max(DataSub.iloc[:, 4:].values)
    #SubMin = DataSub.iloc[:, 4:].min()
    #SubMax = DataSub.iloc[:, 4:].max()

    ## Normalization
    # NOTE(review): if every value in a group were equal, SubMax - SubMin
    # would be 0 and this division would yield NaN/inf.
    DataSub.iloc[:, 4:] = (DataSub.iloc[:, 4:] - SubMin) / (SubMax - SubMin)
    NormalizedGroups.append(DataSub)

# DataFrame.append was removed in pandas 2.0; collecting the pieces and
# concatenating once also avoids re-copying the growing frame per group.
TotalData = pd.concat(NormalizedGroups)

# Shuffle all rows reproducibly and renumber them 0..n-1.
TotalData = TotalData.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
#TotalData = TotalData.reset_index(drop=True)
gene_counts = TotalData.iloc[:, 4:]

# Fetch the un-normalized log values for the same rows by joining on the four
# leading columns, with the shuffled frame as the left side of the merge.
# NOTE(review): assumes ('patient_id', 'tumor_type', 'time', 'event') identifies
# each row uniquely; duplicated keys would multiply rows here -- TODO confirm.
LogAnalData = pd.merge(
    TotalData[['patient_id', 'tumor_type', 'time', 'event']],
    LogTotalData,
    on=['patient_id', 'tumor_type', 'time', 'event'],
)
    

GLIOMA
BRCA
COLO
KIPAN


### Feature generation

In [9]:
# TTE selection and generating the distance matrix.
# TTE is the natural log of the 'time' column, held as float32.
TTE = np.log(TotalData['time'].values.astype('float32'))
tte_col, tte_row = TTE[:, None], TTE[None]
TTEXY = tte_col @ tte_row  # pairwise products TTE[i] * TTE[j]

# a^2 + b^2 - 2ab equals (a - b)^2; the result is floored at 1e-7 before the
# square root, so no entry (including the diagonal) is exactly zero.
DisimInd = np.sqrt(np.maximum(tte_col**2 + tte_row**2 - 2 * TTEXY, 1e-7))

Event = TotalData['event'].values.astype('int32')

# Work on a re-indexed copy whose row labels start at 1 instead of 0.
GeneCount = gene_counts.reset_index(drop=True)
GeneCount.index = GeneCount.index + 1


# Generating the gene <-> int maps; integer codes start at 1 and follow column order.
GeneList = GeneCount.columns.to_list()
GeneToInt = {gene: code for code, gene in enumerate(GeneList, start=1)}
IntToGene = dict(enumerate(GeneList, start=1))

# Replace the gene names in the header with their integer codes.
GeneCount.columns = GeneToInt.values()

# Long format: one row per (row label, gene code) pair with its value.
StakedgData = Stacking(GeneCount)
    

### Save

In [10]:
# Create the output folder if it is missing, so the saves below do not fail
# with FileNotFoundError on a fresh checkout.
os.makedirs('./ProcessedData', exist_ok=True)

# np.save stores the stacked DataFrame as a plain array; column labels are not kept.
np.save('./ProcessedData/StakedgData_GroupNorm.npy',StakedgData)

# The two lookup dicts are written as pickled 0-d object arrays. Read them
# back with np.load(path, allow_pickle=True).item().
np.save('./ProcessedData/GeneToInt_GroupNorm.npy',GeneToInt)
np.save('./ProcessedData/IntToGene_GroupNorm.npy',IntToGene)

# Pairwise dissimilarity matrix, log time-to-event vector and event indicator.
np.save('./ProcessedData/DisimInd_GroupNorm.npy',DisimInd)
np.save('./ProcessedData/TTE_GroupNorm.npy',TTE)
np.save('./ProcessedData/Event_GroupNorm.npy',Event)

# Merged log-scale table, kept as a DataFrame (pickle format).
LogAnalData.to_pickle( "./ProcessedData/LogAnalData.pickle")

In [11]:
# Load the merged data back from the pickle written by the save step.
with open('./ProcessedData/LogAnalData.pickle', 'rb') as merged_file:
    LogAnalData = pickle.load(merged_file)