In [1]:
import pandas as pd
import numpy as np

## mRNA data

In [2]:
# Load the mRNA table from CSV. Later cells read a `patient` column from it.
# NOTE(review): the file name suggests merged TCGA/SNUH data after ComBat + quantile normalisation — confirm.
mrna_raw=pd.read_csv('TCGA_inter_SNUH_clinical_standardized_combat_quantile_data.csv')

In [3]:
# Discard the last 70 rows of the mRNA table.
# NOTE(review): 70 is a magic number — presumably the non-TCGA samples appended at the end; confirm.
# This cell is not idempotent: running it again removes a further 70 rows.
mrna_raw=mrna_raw.iloc[:-70,:]

## Non-coding RNA

In [5]:
# Load the non-coding RNA (lnc) table from CSV. Later cells read a `patient` column from it.
lnc_raw=pd.read_csv("TCGA_inter_SNUH_clinical_lnc_standardized_combat_quantile_data.csv")

In [6]:
# Discard the last 70 rows of the lnc table (same cut as applied to the mRNA table).
# NOTE(review): 70 is a magic number — confirm what these rows are.
# This cell is not idempotent: running it again removes a further 70 rows.
lnc_raw=lnc_raw.iloc[:-70,:]

## Survival patients

In [8]:
# Load the survival reference table. It is later filtered and merged on its `patient` column.
survival_raw=pd.read_csv("TCGA_OV_survival_reference.csv")

## Vital status

In [9]:
# Load the vital-status index file.
# NOTE(review): `vital_raw` is not referenced by any of the cells visible in this notebook section.
vital_raw=pd.read_csv('balanced_index_230.csv')

## Handle N/A

In [11]:
# Remove every row that contains at least one missing value, in all three tables.
mrna_raw,lnc_raw,survival_raw=(
    table.dropna() for table in (mrna_raw,lnc_raw,survival_raw)
)

In [12]:
# Remove rows that are exact duplicates of an earlier row (all columns equal).
mrna_raw,lnc_raw=mrna_raw.drop_duplicates(),lnc_raw.drop_duplicates()

In [13]:
# Patient identifiers of each expression table, as numpy arrays.
mrna_patient_idx=mrna_raw['patient'].values
lnc_patient_idx=lnc_raw['patient'].values
# Print how many rows (patients, duplicates included) each table holds.
for patient_ids in (mrna_patient_idx,lnc_patient_idx):
    print(len(patient_ids))

426
426


In [14]:
# (rows, columns) of the three cleaned tables, in the order mRNA, lnc, survival.
for table in (mrna_raw,lnc_raw,survival_raw):
    print(table.shape)

(426, 22205)
(426, 1886)
(485, 2)


## Check that the lnc patients exist in the mRNA data

In [15]:
# Shape of the mRNA rows whose patient id also occurs in the lnc table (shown as the cell output).
mrna_raw[mrna_raw['patient'].isin(lnc_patient_idx)].shape

(426, 22205)

## Extract the existing patient indices from the raw survival data

In [16]:
# Keep only the survival records whose patient id is present in the lnc table.
existed_survival=survival_raw[survival_raw['patient'].isin(lnc_patient_idx)]

In [17]:
# Number of survival records that have matching expression data.
existed_survival.shape[0]

408

In [18]:
# Column labels of both expression tables as numpy arrays.
mrna_col,lnc_col=mrna_raw.columns.values,lnc_raw.columns.values


In [19]:
#Remove the 'patient' key column from lnc_col so it is not treated as an overlapping feature.
# The list is rebuilt from lnc_raw (not sliced from lnc_col itself), so re-running this cell
# cannot strip additional columns, and the filter is by name rather than by position.
lnc_col=np.array([col for col in lnc_raw.columns if col!='patient'],dtype=object)

In [20]:
# Column names that occur in both tables, kept in mRNA column order.
# A set gives constant-time membership tests; the previous `in` on a numpy array
# scanned every lnc column for each of the mRNA columns.
lnc_col_lookup=set(lnc_col)
intersection_col=[col for col in mrna_col if col in lnc_col_lookup]

In [21]:
#Drop from the mRNA table every column that is also present in the lnc table
del_inter_mrna_raw=mrna_raw.drop(labels=intersection_col,axis='columns')

In [22]:
# Restrict both expression tables to patients that have a survival record.
survival_patients=existed_survival['patient'].values
test_mrna=del_inter_mrna_raw[del_inter_mrna_raw['patient'].isin(survival_patients)]
test_lnc=lnc_raw[lnc_raw['patient'].isin(survival_patients)]

In [23]:
# Row counts before (first two) and after (last two) filtering by survival availability.
for table in (del_inter_mrna_raw,lnc_raw,test_mrna,test_lnc):
    print(len(table))

426
426
413
413


In [24]:
## Delete duplicated patients id

## Delete duplicated patients id

In [25]:
# Keep a single row per patient id in every table (pandas keeps the first occurrence by default).
del_inter_mrna_raw,lnc_raw,test_lnc,test_mrna=(
    table.drop_duplicates(subset=['patient'])
    for table in (del_inter_mrna_raw,lnc_raw,test_lnc,test_mrna)
)

In [28]:
# Attach the survival columns to the lnc features (inner join on the patient id).
lnc_data=test_lnc.merge(existed_survival,on='patient')

In [29]:
# Attach the survival columns to the mRNA features (inner join on the patient id).
mrna_data=test_mrna.merge(existed_survival,on='patient')

# Model Class

In [31]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch import optim

from itertools import *
from tqdm import tqdm

## Highway net

In [32]:
class Highway(nn.Module):
    """Stack of gated layers that all operate on a fixed feature width.

    Every layer owns three ``size -> size`` linear maps and computes

        gate = sigmoid(W_gate x)
        out  = gate * f(W_nonlinear x) + (1 - gate) * (W_linear x)

    The output of one layer is fed to the next.

    Args:
        size: feature width of the input and of every layer.
        num_layers: number of stacked layers.
        f: activation applied to the non-linear branch (e.g. ``F.relu``).
    """
    def __init__(self,size,num_layers,f):
        super().__init__()
        self.num_layers=num_layers
        # Creation order (nonlinear, linear, gate) is kept so parameter initialisation
        # under a fixed seed and the module repr stay the same.
        self.nonlinear=nn.ModuleList([nn.Linear(size,size) for _ in range(num_layers)])
        self.linear=nn.ModuleList([nn.Linear(size,size) for _ in range(num_layers)])
        self.gate=nn.ModuleList([nn.Linear(size,size) for _ in range(num_layers)])
        self.f=f

    def forward(self,x):
        """Apply all layers in sequence; the result has the same shape as ``x``."""
        for gate_fc,nonlinear_fc,linear_fc in zip(self.gate,self.nonlinear,self.linear):
            # torch.sigmoid replaces the deprecated F.sigmoid alias.
            gate=torch.sigmoid(gate_fc(x))
            nonlinear=self.f(nonlinear_fc(x))
            linear=linear_fc(x)
            # The gate blends the activated branch with the plain linear branch.
            x=gate*nonlinear+(1-gate)*linear
        return x
                       

## Similarity loss

In [158]:
def Similarity_loss(modalities,margin=0.1,detach=True):
    """Margin-based cosine-similarity term between the two modality embeddings.

    For every cyclic row shift ``k = 1 .. N-1`` of the lnc batch the function computes

        max(0, margin - mean_cos(rna, shift_k(lnc)) + mean_cos(rna, lnc))

    and returns the mean of these ``N-1`` values.

    Args:
        modalities: dict with keys ``'rna'`` and ``'lnc'``; each value is a 2-D tensor
            whose rows are compared pairwise with cosine similarity along dim 1.
        margin: hinge margin (previously hard-coded as 0.1).
        detach: when True (default, original behaviour) both inputs are detached, so the
            returned value carries no gradient. Pass False to let gradients flow.

    Returns:
        A 0-dim tensor. Batches with fewer than two rows have no mismatched pairs and
        yield zero (the original raised ZeroDivisionError for them).

    NOTE(review): the hinge is written as ``margin - sim(mismatched) + sim(matched)``, as in
    the original code. Minimising it would lower matched-pair similarity; confirm the
    intended sign before enabling ``detach=False``.
    """
    mode_rna=modalities['rna']
    mode_lnc=modalities['lnc']
    if detach:
        mode_rna=mode_rna.detach()
        mode_lnc=mode_lnc.detach()
    cos=nn.CosineSimilarity(dim=1,eps=1e-6)
    N=mode_rna.shape[0]
    if N<2:
        return mode_rna.new_zeros(())
    # Mean similarity between embeddings that belong to the same row (same sample).
    avg_sim_same=torch.sum(cos(mode_rna,mode_lnc))/N
    losses=[]
    for shift in range(1,N):
        # torch.roll builds a shifted copy (row i receives row i+shift, wrapping around).
        # The original shifted in place with overlapping slices, which wrote into the
        # caller's storage; out-of-place rolling leaves the inputs untouched.
        shifted_lnc=torch.roll(mode_lnc,shifts=-shift,dims=0)
        avg_sim_diff=torch.sum(cos(mode_rna,shifted_lnc))/N
        # clamp keeps the result a tensor (Python max(0, t) could return the int 0).
        losses.append(torch.clamp(margin-avg_sim_diff+avg_sim_same,min=0))
    return torch.stack(losses).mean()
        
        
    

## Train Network

In [238]:
class MultiNet(nn.Module):
    """Two-input network that maps mRNA and lnc features to a single hazard score.

    Each modality is flattened, passed through its own input linear layer
    (``fcm`` / ``fcl``) with tanh, and then through the *same* ``bn1``, ``highway``
    and ``bn2`` modules. The two encodings are concatenated and ``fcd`` produces one
    hazard value per sample.

    ``fc2`` and ``bn3`` are created but not used by ``forward``; they are kept so the
    parameter set (and any saved ``state_dict``) is unchanged.

    Args:
        mrna_dim: number of mRNA input features (default matches the notebook data).
        lnc_dim: number of lnc input features (default matches the notebook data).
        hidden_dim: width of each modality encoding.
        highway_layers: number of layers in the shared highway stack.
    """

    def __init__(self,mrna_dim=20319,lnc_dim=1885,hidden_dim=256,highway_layers=10):
        super().__init__()
        self.fcm=nn.Linear(mrna_dim,hidden_dim)
        self.fcl=nn.Linear(lnc_dim,hidden_dim)
        self.highway=Highway(hidden_dim,highway_layers,f=F.relu)
        self.fc2=nn.Linear(2*hidden_dim,2)
        self.fcd=nn.Linear(2*hidden_dim,1)
        self.bn1=nn.BatchNorm1d(hidden_dim)
        self.bn2=nn.BatchNorm1d(hidden_dim)
        self.bn3=nn.BatchNorm1d(1)

    def _encode(self,features,input_fc):
        """Encode one modality: flatten -> dropout -> input_fc + tanh -> bn1 -> dropout -> highway -> bn2."""
        h=features.view(features.shape[0],-1)
        # training=self.training makes the input dropout inactive in eval mode;
        # without it F.dropout defaults to training=True and always drops.
        h=F.dropout(h,0.4,training=self.training)
        h=torch.tanh(input_fc(h))
        h=self.bn1(h)
        h=F.dropout(h,0.5,training=self.training)
        h=self.highway(h)
        return self.bn2(h)

    def forward(self,data):
        """Run both branches.

        Args:
            data: dict with tensors under ``'mRNA'`` and ``'lnc'``; the first dimension
                is the batch and the rest is flattened.

        Returns:
            dict with ``'sim_loss'`` (value returned by ``Similarity_loss``) and
            ``'hazard'`` (tensor of shape ``(batch, 1)``).
        """
        x=self._encode(data['mRNA'],self.fcm)
        y=self._encode(data['lnc'],self.fcl)

        # Clones are handed over so the similarity term can never write into x / y.
        modal={'rna':x.clone(),'lnc':y.clone()}
        sim_loss=Similarity_loss(modal)

        concat_x_y=torch.cat((x,y),1)
        hazard=self.fcd(concat_x_y)
        return {'sim_loss':sim_loss,'hazard':hazard}

    def loss(self,pred,target):
        """Combine the similarity term with a ranking-style survival term.

        Samples are sorted by ``target`` (ascending). A softmax over the sorted hazards
        gives per-sample probabilities whose running sum ``c_i`` is compared against the
        rank: with ``p_i = c_i`` and ``q_i = 1 - c_{i-1}`` the survival term is
        ``mean_i( -(i * log p_i + (N - i) * log q_i) / N )``.

        Args:
            pred: dict from ``forward`` with ``'hazard'`` and ``'sim_loss'``.
            target: tensor with one survival value per sample (called days_to_death
                in this code).

        Returns:
            ``pred['sim_loss'] + survival_term`` as a tensor.
        """
        days_to_death=target.reshape(-1)
        # reshape(-1) keeps a 1-D tensor even for batch size 1 (squeeze() produced a
        # 0-dim tensor there, which cannot be indexed).
        hazard=pred['hazard'].reshape(-1)

        _,idx=torch.sort(days_to_death)
        hazard_probs=F.softmax(hazard[idx],dim=0)
        N=hazard_probs.shape[0]
        # Running sum with a leading zero; new_zeros keeps dtype and device of the hazards.
        hazard_cum=torch.cat([hazard_probs.new_zeros(1),torch.cumsum(hazard_probs,dim=0)])
        weights_cum=torch.arange(1,N+1,dtype=hazard_probs.dtype,device=hazard_probs.device)
        p,q=hazard_cum[1:],1-hazard_cum[:-1]
        w1,w2=weights_cum,N-weights_cum

        # Clamp before the log: a saturated softmax makes q exactly 0 (or slightly
        # negative through rounding), which would turn the loss into inf/nan.
        tiny=torch.finfo(hazard_probs.dtype).tiny
        log_p=torch.log(p.clamp(min=tiny))
        log_q=torch.log(q.clamp(min=tiny))
        loss2=torch.mean(-(log_p*w1+log_q*w2)/N)

        loss1=pred['sim_loss']

        return loss1+loss2

        
        

## Train/Test split 98/2 ratio

In [239]:
from sklearn.model_selection import train_test_split

In [240]:
# The dataset class pairs mRNA row i with lnc row i, so both frames must list patients
# in the same order. Re-order the lnc frame to the mRNA patient order first;
# validate='one_to_one' raises if a patient id is duplicated on either side.
# NOTE(review): assumes 'patient' is the first column of lnc_data (the later iloc[:,1:] slice relies on that too).
lnc_aligned=mrna_data[['patient']].merge(lnc_data,on='patient',how='inner',validate='one_to_one')
assert len(lnc_aligned)==len(mrna_data),"some mRNA patients have no lnc row"
# Splitting both frames in one call guarantees the same rows land in train/test for both.
mrna_train,mrna_test,lnc_train,lnc_test=train_test_split(
    mrna_data,lnc_aligned,test_size=0.02,random_state=777
)

In [241]:
# Shapes of the mRNA training split and the lnc test split.
for split in (mrna_train,lnc_test):
    print(split.shape)

(399, 20321)
(9, 1887)


In [242]:
# Numpy view of both training splits without their first column.
# NOTE(review): the first column is presumably the patient id — confirm.
whole_data={
    name:split.iloc[:,1:].to_numpy()
    for name,split in (('mRNA',mrna_train),('lnc',lnc_train))
}

## Dataset generation

In [243]:
from torch.autograd import Variable
from torch.utils.data import Dataset,DataLoader

In [244]:
class GenerateData(Dataset):
    """Paired mRNA / lnc dataset built from two 2-D numpy arrays.

    ``dataset`` is a dict with keys ``'mRNA'`` and ``'lnc'``. In each array the last
    column is stored as the label and all preceding columns as the features; everything
    is converted to float32 tensors. Rows of the two arrays are paired by position.
    """
    def __init__(self,dataset):
        rna_array,lnc_array=dataset['mRNA'],dataset['lnc']
        # The mRNA array defines the number of samples.
        self.len=rna_array.shape[0]

        def as_float(block):
            return torch.from_numpy(block).float()

        self.rna_x,self.rna_y=as_float(rna_array[:,:-1]),as_float(rna_array[:,-1])
        self.lnc_x,self.lnc_y=as_float(lnc_array[:,:-1]),as_float(lnc_array[:,-1])

    def __getitem__(self,index):
        """Return features and label of both modalities for row ``index``."""
        return {
            'mRNA':self.rna_x[index],
            'lnc':self.lnc_x[index],
            'mRNA_y':self.rna_y[index],
            'lnc_y':self.lnc_y[index],
        }

    def __len__(self):
        return self.len
    

In [245]:
# Wrap the training arrays in the dataset class and iterate them in fixed order, 16 rows per batch.
# NOTE(review): shuffle=False means every epoch sees identical batches — confirm this is intended for training.
dataset=GenerateData(whole_data)
train_loader=DataLoader(
    dataset,
    batch_size=16,
    shuffle=False,
    num_workers=2,
)

## Main

In [246]:
# Detect CUDA and build the matching torch.device.
# NOTE(review): `device` is not used by the visible cells; the model is kept on the CPU.
use_cuda=torch.cuda.is_available()
device=torch.device('cuda') if use_cuda else torch.device('cpu')

In [247]:
# Build the network with its default sizes and keep it on the CPU.
# The module returned by the last line is what the cell displays.
model=MultiNet()
model.to('cpu')
    
        

MultiNet(
  (fcm): Linear(in_features=20319, out_features=256, bias=True)
  (fcl): Linear(in_features=1885, out_features=256, bias=True)
  (highway): Highway(
    (nonlinear): ModuleList(
      (0): Linear(in_features=256, out_features=256, bias=True)
      (1): Linear(in_features=256, out_features=256, bias=True)
      (2): Linear(in_features=256, out_features=256, bias=True)
      (3): Linear(in_features=256, out_features=256, bias=True)
      (4): Linear(in_features=256, out_features=256, bias=True)
      (5): Linear(in_features=256, out_features=256, bias=True)
      (6): Linear(in_features=256, out_features=256, bias=True)
      (7): Linear(in_features=256, out_features=256, bias=True)
      (8): Linear(in_features=256, out_features=256, bias=True)
      (9): Linear(in_features=256, out_features=256, bias=True)
    )
    (linear): ModuleList(
      (0): Linear(in_features=256, out_features=256, bias=True)
      (1): Linear(in_features=256, out_features=256, bias=True)
      (2): L

In [248]:
# Display whether CUDA was detected (the model above is nevertheless placed on the CPU).
use_cuda

True

In [249]:
# Adam over all model parameters with a fixed step size.
learning_rate=1e-4
optimizer=optim.Adam(params=model.parameters(),lr=learning_rate)

In [250]:
# Single pass over the training batches: forward, loss, backward, parameter update.
# NOTE(review): only one epoch is run and model.train() is never called explicitly
# (a freshly built nn.Module starts in training mode).
for i,data in enumerate(train_loader):
    # Progress marker: index of the current batch.
    print(f'# of i :{i}')
    # Anomaly detection is a debugging aid that slows down backward; consider removing
    # it once the loss is known to be stable.
    with torch.autograd.set_detect_anomaly(True):
        # Clear gradients left over from the previous batch.
        optimizer.zero_grad()
        output=model(data)
        # The mRNA label column is used as the target for the loss.
        loss=model.loss(output,data['mRNA_y'])
        loss.backward()
        optimizer.step()

# of i :0




# of i :1
# of i :2
# of i :3
# of i :4
# of i :5
# of i :6
# of i :7
# of i :8
# of i :9
# of i :10
# of i :11
# of i :12
# of i :13
# of i :14
# of i :15
# of i :16
# of i :17
# of i :18
# of i :19
# of i :20
# of i :21
# of i :22
# of i :23
# of i :24


In [None]:
#Train,Test split 99/1
#Generate Instance
#Train
#Optim_zero_grad
#Loss
#Backward
#Optimizer.step()