In [1]:
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import pandas as pd
import math
import sklearn.preprocessing as sk
import seaborn as sns
from sklearn import metrics
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from utils import AllTripletSelector,HardestNegativeTripletSelector, RandomNegativeTripletSelector, SemihardNegativeTripletSelector # Strategies for selecting triplets within a minibatch
from metrics import AverageNonzeroTripletsMetric
from torch.utils.data.sampler import WeightedRandomSampler
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score
import random
from random import randint
from sklearn.model_selection import StratifiedKFold

In [2]:
# Load the GDSC (training) and TCGA (test) Gemcitabine tables, filter and
# binarize them, restrict every table to the shared feature columns and shared
# samples, and build the 0/1 response vectors Y_train / Y_test.
torch.manual_seed(1)

max_iter = 50

# Every input table lives in the same directory; keep the prefix in one place.
DATA_DIR = "/common/statsgeneral/gayara/MOLI/Gemcitabine_TCGA/all_data/"


def _read_tsv(filename, decimal):
    """Read one tab-separated table from DATA_DIR.

    Parameters
    ----------
    filename : str
        File name relative to DATA_DIR.
    decimal : str
        Decimal separator used inside the file ("," or ".").

    Returns
    -------
    pandas.DataFrame
        The table, indexed by the file's first column.
    """
    return pd.read_csv(DATA_DIR + filename, sep="\t", index_col=0, decimal=decimal)


def _drop_duplicate_columns(frame):
    """Return `frame` keeping only the first column for each repeated column label."""
    return frame.loc[:, ~frame.columns.duplicated()]


def _binarize(frame):
    """Return a copy of `frame` with NaN replaced by 0 and every non-zero entry set to 1."""
    frame = frame.fillna(0)
    frame[frame != 0.0] = 1
    return frame


# Expression tables use "," as decimal separator; all tables are transposed
# after reading so that the file's columns become the rows.
GDSCE = _read_tsv("GDSC_exprs.Gemcitabine.eb_with.TCGA_exprs.Gemcitabine.tsv", ",").transpose()
TCGAE = _read_tsv("TCGA_exprs.Gemcitabine.eb_with.GDSC_exprs.Gemcitabine.tsv", ",").transpose()

# Mutation and CNA tables use "." as decimal separator and may contain
# repeated column labels after transposition; keep the first occurrence.
TCGAM = _drop_duplicate_columns(_read_tsv("TCGA_mutations.Gemcitabine.tsv", ".").transpose())
TCGAC = _drop_duplicate_columns(_read_tsv("TCGA_CNA.Gemcitabine.tsv", ".").transpose())
GDSCM = _drop_duplicate_columns(_read_tsv("GDSC_mutations.Gemcitabine.tsv", ".").transpose())
# NOTE(review): this cell used to call `GDSCC.drop_duplicates(keep='last')`
# without assigning the result, which has no effect. The call is dropped rather
# than "fixed" by assignment, because row-wise de-duplication would remove rows
# whose values merely coincide, and would change the data the model has been
# trained on so far. Confirm this is the intended behaviour.
GDSCC = _drop_duplicate_columns(_read_tsv("GDSC_CNA.Gemcitabine.tsv", ".").transpose())

# Keep only the GDSC expression columns whose variance exceeds 0.05.
# `fit` is enough (the transformed array is not needed), and selecting by
# position stays correct even if two columns share a label.
selector = VarianceThreshold(0.05)
selector.fit(GDSCE)
GDSCE = GDSCE.iloc[:, selector.get_support(indices=True)]

# Mutation / CNA tables are reduced to 0/1 indicators.
TCGAC = _binarize(TCGAC)
TCGAM = _binarize(TCGAM)
GDSCM = _binarize(GDSCM)
GDSCC = _binarize(GDSCC)

# ls: feature columns present in all six tables.
ls = GDSCE.columns
for _frame in (GDSCM, GDSCC, TCGAE, TCGAM, TCGAC):
    ls = ls.intersection(_frame.columns)
# ls2: samples present in all three GDSC tables; ls3: same for TCGA.
ls2 = GDSCE.index.intersection(GDSCM.index).intersection(GDSCC.index)
ls3 = TCGAE.index.intersection(TCGAM.index).intersection(TCGAC.index)
ls = pd.unique(ls)

TCGAE = TCGAE.loc[ls3, ls]
TCGAM = TCGAM.loc[ls3, ls]
TCGAC = TCGAC.loc[ls3, ls]
GDSCE = GDSCE.loc[ls2, ls]
GDSCM = GDSCM.loc[ls2, ls]
GDSCC = GDSCC.loc[ls2, ls]

GDSCR = _read_tsv("GDSC_response.Gemcitabine.tsv", ",")
TCGAR = _read_tsv("TCGA_response.Gemcitabine.tsv", ",")

# Index labels are converted to str before selecting with ls2
# (presumably the ids are parsed as integers here - TODO confirm).
GDSCR = GDSCR.rename(mapper=str, axis='index')
GDSCR = GDSCR.loc[ls2, :]

TCGAR = TCGAR.loc[ls3, :]

# Encode the response labels: "R" -> 0, "S" -> 1.
# An unexpected label raises KeyError instead of silently becoming NaN.
d = {"R": 0, "S": 1}
GDSCR["response"] = GDSCR.loc[:, "response"].apply(lambda label: d[label])
TCGAR["response"] = TCGAR.loc[:, "response"].apply(lambda label: d[label])

Y_train = GDSCR['response'].values
Y_test = TCGAR['response'].values

In [3]:
# ---- Hyper-parameters -------------------------------------------------------
mbs = 62       # mini-batch size
hdm = 256      # hidden width of each omics encoder
zdm = 16       # hidden width of the classifier
lre = 0.1      # learning rate of the expression encoder
lrm = 0.1      # learning rate of the mutation encoder
lrc = 0.05     # learning rate of the CNA encoder
lrCL = 0.005   # learning rate of the classifier
epch = 50      # number of training epochs
wd = 0.1       # weight decay of the classifier optimizer
rate = 0.3     # dropout probability used inside the classifier

# ---- Raw arrays: GDSC is the training set, TCGA the test set ----------------
X_trainE, X_testE = GDSCE.values, TCGAE.values
X_trainM, X_testM = GDSCM.values, TCGAM.values
X_trainC, X_testC = GDSCC.values, TCGAC.values
y_trainE, y_testE = Y_train, Y_test

# Standardize the expression features. The scaler is fitted on the GDSC
# training data only and then applied unchanged to the TCGA test data.
# Mutation and CNA inputs are binary, so they are not standardized.
scalerGDSC = sk.StandardScaler().fit(X_trainE)
X_trainE = scalerGDSC.transform(X_trainE)
X_testE = scalerGDSC.transform(X_testE)

# np.nan_to_num replaces NaN with zero and infinities with large finite numbers.
X_trainM, X_trainC = np.nan_to_num(X_trainM), np.nan_to_num(X_trainC)
X_testM, X_testC = np.nan_to_num(X_testM), np.nan_to_num(X_testC)

# ---- Test tensors -----------------------------------------------------------
TX_testE = torch.FloatTensor(X_testE)
TX_testM = torch.FloatTensor(X_testM)
TX_testC = torch.FloatTensor(X_testC)
ty_testE = torch.FloatTensor(y_testE.astype(int))

# ---- Class-balanced sampling of the training set ----------------------------
# Each sample is weighted by the inverse frequency of its class, and samples
# are drawn with replacement according to those weights.
class_sample_count = np.unique(y_trainE, return_counts=True)[1]
weight = 1. / class_sample_count
samples_weight = torch.from_numpy(weight[y_trainE.astype(int)])
sampler = WeightedRandomSampler(samples_weight.double(), len(samples_weight), replacement=True)

mb_size = mbs

trainDataset = torch.utils.data.TensorDataset(
    torch.FloatTensor(X_trainE),
    torch.FloatTensor(X_trainM),
    torch.FloatTensor(X_trainC),
    torch.FloatTensor(y_trainE.astype(int)),
)

# The sampler decides the order, hence shuffle=False.
trainLoader = torch.utils.data.DataLoader(
    dataset=trainDataset,
    batch_size=mb_size,
    shuffle=False,
    num_workers=1,
    sampler=sampler,
)

# ---- Dimensions and aliases read by the model / training cells --------------
n_sampE, IE_dim = X_trainE.shape
n_sampM, IM_dim = X_trainM.shape
n_sampC, IC_dim = X_trainC.shape

h_dim, Z_dim = hdm, zdm
Z_in = 3 * h_dim          # the three encoder outputs are concatenated
lrE, lrM, lrC = lre, lrm, lrc
epoch = epch

# Per-epoch training history and test results.
costtr, auctr = [], []
costts, aucts = [], []

class AEE(nn.Module):
    """Single-layer encoder for the expression input.

    Projects an input of width ``IE_dim`` onto ``h_dim`` units through
    Linear -> BatchNorm1d -> ReLU -> Dropout (default dropout probability).
    """

    def __init__(self):
        super().__init__()
        layers = [
            nn.Linear(IE_dim, h_dim),
            nn.BatchNorm1d(h_dim),
            nn.ReLU(),
            nn.Dropout(),
        ]
        self.EnE = nn.Sequential(*layers)

    def forward(self, x):
        """Return the encoded representation of ``x``."""
        return self.EnE(x)

class AEM(nn.Module):
    """Single-layer encoder for the mutation input.

    Projects an input of width ``IM_dim`` onto ``h_dim`` units through
    Linear -> BatchNorm1d -> ReLU -> Dropout (default dropout probability).
    """

    def __init__(self):
        super().__init__()
        layers = [
            nn.Linear(IM_dim, h_dim),
            nn.BatchNorm1d(h_dim),
            nn.ReLU(),
            nn.Dropout(),
        ]
        self.EnM = nn.Sequential(*layers)

    def forward(self, x):
        """Return the encoded representation of ``x``."""
        return self.EnM(x)


class AEC(nn.Module):
    """Single-layer encoder for the copy-number (CNA) input.

    Projects an input of width ``IC_dim`` onto ``h_dim`` units through
    Linear -> BatchNorm1d -> ReLU -> Dropout (default dropout probability).
    """

    def __init__(self):
        super(AEC, self).__init__()
        self.EnC = torch.nn.Sequential(
            # The input width must be the CNA feature count (IC_dim), not the
            # mutation feature count (IM_dim); the two only coincide when both
            # matrices share the same columns.
            nn.Linear(IC_dim, h_dim),
            nn.BatchNorm1d(h_dim),
            nn.ReLU(),
            nn.Dropout())

    def forward(self, x):
        """Return the encoded representation of ``x``."""
        output = self.EnC(x)
        return output
class Classifier(nn.Module):
    """Binary classification head applied to the concatenated encoder outputs.

    Linear(Z_in -> Z_dim) -> ReLU -> Dropout(rate) -> Linear(Z_dim -> 1)
    -> Dropout(rate) -> Sigmoid, producing one value in (0, 1) per row.
    """

    def __init__(self):
        super().__init__()
        # NOTE(review): the second Dropout acts on the single output logit, so
        # in training mode the logit is zeroed with probability `rate` - confirm
        # this is intended.
        stack = (
            nn.Linear(Z_in, Z_dim),
            nn.ReLU(),
            nn.Dropout(rate),
            nn.Linear(Z_dim, 1),
            nn.Dropout(rate),
            nn.Sigmoid(),
        )
        self.FC = nn.Sequential(*stack)

    def forward(self, x):
        """Return the sigmoid output for each row of ``x``."""
        scores = self.FC(x)
        return scores
        
# Build the three encoders and the classifier, then train them jointly with a
# binary cross-entropy loss, one Adagrad optimizer per network.
torch.cuda.manual_seed_all(42)

AutoencoderE = AEE()
AutoencoderM = AEM()
AutoencoderC = AEC()

solverE = optim.Adagrad(AutoencoderE.parameters(), lr=lrE)
solverM = optim.Adagrad(AutoencoderM.parameters(), lr=lrM)
solverC = optim.Adagrad(AutoencoderC.parameters(), lr=lrC)

Clas = Classifier()
SolverClass = optim.Adagrad(Clas.parameters(), lr=lrCL, weight_decay = wd)
C_loss = torch.nn.BCELoss()

# Number of full mini-batches per epoch, used to scale the running loss.
# Floored at 1 so a training set smaller than one batch does not divide by 0.
num_minibatches = max(1, int(n_sampE / mb_size))

for it in range(epoch):

    epoch_cost4 = 0      # running (scaled) loss of this epoch
    epoch_cost3 = []     # per-batch training AUCs of this epoch
    # Set to 1 as soon as any batch of this epoch has been trained on. It is
    # reset once per epoch, not once per batch, so an epoch is still recorded
    # when its last batch happens to be skipped.
    flag = 0

    # Nothing inside the batch loop switches the networks to eval mode, so
    # enabling training mode once per epoch is sufficient.
    AutoencoderE.train()
    AutoencoderM.train()
    AutoencoderC.train()
    Clas.train()

    for i, (dataE, dataM, dataC, target) in enumerate(trainLoader):

        # Batches containing a single class are skipped: roc_auc_score needs
        # both classes to be present.
        batch_mean = torch.mean(target)
        if batch_mean == 0. or batch_mean == 1.:
            continue

        ZEX = AutoencoderE(dataE)
        ZMX = AutoencoderM(dataM)
        ZCX = AutoencoderC(dataC)

        ZT = torch.cat((ZEX, ZMX, ZCX), 1)
        # NOTE(review): dim=0 L2-normalizes each feature across the samples of
        # the batch, not each sample across its features - confirm intended.
        ZT = F.normalize(ZT, p=2, dim=0)

        Pred = Clas(ZT)
        loss = C_loss(Pred,target.view(-1,1))

        y_true = target.view(-1,1)
        y_pred = Pred
        AUC = roc_auc_score(y_true.detach().numpy(),y_pred.detach().numpy())

        solverE.zero_grad()
        solverM.zero_grad()
        solverC.zero_grad()
        SolverClass.zero_grad()

        loss.backward()

        solverE.step()
        solverM.step()
        solverC.step()
        SolverClass.step()

        # Accumulate a detached copy so the history does not keep every
        # batch's autograd graph alive.
        epoch_cost4 = epoch_cost4 + (loss.detach() / num_minibatches)
        epoch_cost3.append(AUC)
        flag = 1

    if flag == 1:
        costtr.append(torch.mean(epoch_cost4))
        auctr.append(np.mean(epoch_cost3))
        # `loss` is the loss of the last batch that was trained on this epoch.
        print('Iter-{}; Total loss: {:.4}'.format(it, loss))

# Score the TCGA test tensors once with the trained networks; gradients are
# not needed here.
with torch.no_grad():

    # Switch every network to inference mode.
    for net in (AutoencoderE, AutoencoderM, AutoencoderC, Clas):
        net.eval()

    # Encode each omics input separately.
    ZET = AutoencoderE(TX_testE)
    ZMT = AutoencoderM(TX_testM)
    ZCT = AutoencoderC(TX_testC)

    # Concatenate along the feature axis and normalize exactly as in training
    # (L2 norm taken along dim=0).
    ZTT = F.normalize(torch.cat((ZET, ZMT, ZCT), 1), p=2, dim=0)

    PredT = Clas(ZTT)

    y_truet = ty_testE.view(-1, 1)
    y_predt = PredT

    lossT = C_loss(PredT, y_truet)
    AUCt = roc_auc_score(y_truet.detach().numpy(), y_predt.detach().numpy())

    # Record the test loss and test AUC.
    costts.append(lossT)
    aucts.append(AUCt)

Iter-0; Total loss: 0.6392
Iter-1; Total loss: 0.598
Iter-2; Total loss: 0.5438
Iter-3; Total loss: 0.5512
Iter-4; Total loss: 0.437
Iter-5; Total loss: 0.4123
Iter-6; Total loss: 0.3254
Iter-7; Total loss: 0.3604
Iter-8; Total loss: 0.3648
Iter-9; Total loss: 0.376
Iter-10; Total loss: 0.3156
Iter-11; Total loss: 0.3144
Iter-12; Total loss: 0.3077
Iter-13; Total loss: 0.2435
Iter-14; Total loss: 0.3523
Iter-15; Total loss: 0.3047
Iter-16; Total loss: 0.2688
Iter-17; Total loss: 0.276
Iter-18; Total loss: 0.3155
Iter-19; Total loss: 0.3372
Iter-20; Total loss: 0.3076
Iter-21; Total loss: 0.3182
Iter-22; Total loss: 0.3175
Iter-23; Total loss: 0.3406
Iter-24; Total loss: 0.3316
Iter-25; Total loss: 0.2669
Iter-26; Total loss: 0.2853
Iter-27; Total loss: 0.3152
Iter-28; Total loss: 0.3329
Iter-29; Total loss: 0.3599
Iter-30; Total loss: 0.2809
Iter-31; Total loss: 0.2732
Iter-32; Total loss: 0.3056
Iter-33; Total loss: 0.4015
Iter-34; Total loss: 0.2161
Iter-35; Total loss: 0.3919
Iter-3

In [4]:
# Display the ROC AUC measured on the TCGA test set.
AUCt

0.6018518518518519