In [1]:
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import pandas as pd
import math
import sklearn.preprocessing as sk
import seaborn as sns
from sklearn import metrics
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from torch.utils.data.sampler import WeightedRandomSampler
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score
import random
from sklearn.model_selection import StratifiedKFold

In [2]:
save_results_to = '/common/statsgeneral/gayara/MOLI/Cetuximab/results/classifier_only/'
torch.manual_seed(42)

<torch._C.Generator at 0x14c8dc028b10>

In [3]:
max_iter = 100

In [4]:
GDSCE = pd.read_csv("/common/statsgeneral/gayara/MOLI/Cetuximab/all_data/GDSC_exprs.Cetuximab.eb_with.PDX_exprs.Cetuximab.tsv", 
                    sep = "\t", index_col=0, decimal = ",")
GDSCE = pd.DataFrame.transpose(GDSCE)

In [5]:
GDSCE = GDSCE.drop_duplicates()

In [6]:
GDSCE.head()

ENTREZID,1,2,9,10,12,13,14,15,16,18,...,100507206,100507254,100507436,100507472,100526773,100527978,100532746,100820829,102724473,105375355
683665,3.567759,3.44439,7.410196,2.901726,2.981935,2.807028,7.649455,2.829688,7.895537,3.102168,...,2.711407,2.788156,5.850589,2.987299,2.895857,2.686677,2.586782,2.949731,5.572678,2.960097
684052,4.031647,3.119876,5.348844,3.039942,2.826096,2.767429,8.136493,2.738326,9.482195,5.646555,...,2.530302,2.880834,7.653155,3.062286,2.923641,2.318554,2.840805,3.177785,3.865416,2.374161
684055,5.422951,3.289388,6.26694,2.99011,2.78832,2.614472,8.263573,2.336854,10.224379,5.981012,...,2.596334,2.857835,5.78061,2.6148,2.939208,2.99516,2.063917,2.724124,3.812784,2.655778
684057,4.706813,6.557659,5.506414,3.092901,3.016461,2.970122,8.513847,2.744526,8.400158,3.482531,...,2.601068,3.211715,8.236083,2.799498,2.933778,2.320755,2.259224,3.03471,3.892887,2.542773
684059,4.504268,4.843109,5.670954,2.978294,2.952328,3.317617,7.819232,2.642423,9.457529,3.797871,...,2.599445,3.187923,7.290323,3.002865,2.940937,2.837265,2.964874,2.668278,3.869515,2.719021


In [7]:
GDSCE.shape

(861, 18232)

In [8]:
GDSCR = pd.read_csv("/common/statsgeneral/gayara/MOLI/Cetuximab/all_data/GDSC_response.Cetuximab.tsv", 
                    sep = "\t", index_col=0, decimal = ",")

In [9]:
GDSCR.head()

Unnamed: 0_level_0,response,logIC50,drug,exprs,CNA,mutations
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
683665,R,6.29444657935625,Cetuximab,1,1,1
684052,R,6.3873236983719,Cetuximab,1,1,1
684055,S,4.9521245559495,Cetuximab,1,1,1
684057,R,6.3935613853243,Cetuximab,1,1,1
684059,R,5.91890283815806,Cetuximab,1,1,1


In [10]:
# note that the index in then two dataframes are the unique rows 

In [11]:
GDSCR_index = list(GDSCR.index.unique())
GDSCE_index = list(GDSCE.index.unique())

In [12]:
# GDSCR_index

In [13]:
print(len(GDSCE_index))
print(len(GDSCR_index))

861
856


In [14]:
# note that common elements are shared but one list is trings and the other is integers, therefore a conversion to a single type is necessary before trying to get the
# common elements - make the index of GDSCR a character.

In [15]:
GDSCR.index = GDSCR.index.map(str)

In [16]:
GDSCR.index

Index(['683665', '684052', '684055', '684057', '684059', '684062', '684072',
       '684681', '687452', '687455',
       ...
       '1524416', '1524417', '1524418', '1524419', '1659817', '1659823',
       '1660034', '1660035', '1660036', '1674021'],
      dtype='object', name='sample_name', length=856)

In [17]:
PDXE = pd.read_csv("/common/statsgeneral/gayara/MOLI/Cetuximab/all_data/PDX_exprs.Cetuximab.eb_with.GDSC_exprs.Cetuximab.tsv", 
                   sep = "\t", index_col=0, decimal = ",")
PDXE = pd.DataFrame.transpose(PDXE)

In [18]:
PDXM = pd.read_csv("/common/statsgeneral/gayara/MOLI/Cetuximab/all_data/PDX_mutations.Cetuximab.tsv", 
                   sep = "\t", index_col=0, decimal = ".")
PDXM = pd.DataFrame.transpose(PDXM)

PDXC = pd.read_csv("/common/statsgeneral/gayara/MOLI/Cetuximab/all_data/PDX_CNA.Cetuximab.tsv", 
                   sep = "\t", index_col=0, decimal = ".")
PDXC.drop_duplicates(keep='last')
PDXC = pd.DataFrame.transpose(PDXC)

GDSCM = pd.read_csv("/common/statsgeneral/gayara/MOLI/Cetuximab/all_data/GDSC_mutations.Cetuximab.tsv", 
                    sep = "\t", index_col=0, decimal = ".")
GDSCM = pd.DataFrame.transpose(GDSCM)


GDSCC = pd.read_csv("/common/statsgeneral/gayara/MOLI/Cetuximab/all_data/GDSC_CNA.Cetuximab.tsv", 
                    sep = "\t", index_col=0, decimal = ".")
GDSCC.drop_duplicates(keep='last')
GDSCC = pd.DataFrame.transpose(GDSCC)

In [19]:
GDSCM.shape

(856, 18421)

In [20]:
GDSCM.head()

Unnamed: 0,1,2,9,10,12,13,14,15,16,18,...,101060321,101927546,101927722,101928638,102724473,102724928,105375355,105378803,107403068,109731405
683665,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
684052,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
684055,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
684057,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
684059,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
GDSCC.shape

(856, 24452)

In [22]:
GDSCC.head()

gene_id,1,2,9,10,12,13,14,15,16,18,...,107133486,107133502,107133524,107161145,107985535,107986809,107987337,107987341,109731405,112441434
683665,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
684052,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,-1,-1,0,0
684055,0,1,1,1,0,0,0,0,0,0,...,0,1,0,1,0,1,-1,-1,0,0
684057,-1,0,1,1,-1,0,0,0,-1,-1,...,-1,1,-1,0,-1,0,-1,-1,-1,-1
684059,0,0,0,0,0,1,0,0,0,1,...,1,0,-1,0,0,0,-1,-1,0,0


In [23]:
selector = VarianceThreshold(0.05)
selector.fit_transform(GDSCE)
GDSCE = GDSCE[GDSCE.columns[selector.get_support(indices=True)]]

In [24]:
GDSCE.shape

(861, 16244)

In [25]:
GDSCE.head()

ENTREZID,1,2,9,10,12,13,14,15,16,18,...,100506548,100507117,100507254,100507436,100507472,100527978,100532746,100820829,102724473,105375355
683665,3.567759,3.44439,7.410196,2.901726,2.981935,2.807028,7.649455,2.829688,7.895537,3.102168,...,6.123125,3.383855,2.788156,5.850589,2.987299,2.686677,2.586782,2.949731,5.572678,2.960097
684052,4.031647,3.119876,5.348844,3.039942,2.826096,2.767429,8.136493,2.738326,9.482195,5.646555,...,6.426264,4.941407,2.880834,7.653155,3.062286,2.318554,2.840805,3.177785,3.865416,2.374161
684055,5.422951,3.289388,6.26694,2.99011,2.78832,2.614472,8.263573,2.336854,10.224379,5.981012,...,7.96842,3.090929,2.857835,5.78061,2.6148,2.99516,2.063917,2.724124,3.812784,2.655778
684057,4.706813,6.557659,5.506414,3.092901,3.016461,2.970122,8.513847,2.744526,8.400158,3.482531,...,8.037666,4.957676,3.211715,8.236083,2.799498,2.320755,2.259224,3.03471,3.892887,2.542773
684059,4.504268,4.843109,5.670954,2.978294,2.952328,3.317617,7.819232,2.642423,9.457529,3.797871,...,8.096685,4.448424,3.187923,7.290323,3.002865,2.837265,2.964874,2.668278,3.869515,2.719021


In [26]:
PDXC = PDXC.fillna(0)
PDXC[PDXC != 0.0] = 1
PDXM = PDXM.fillna(0)
PDXM[PDXM != 0.0] = 1
GDSCM = GDSCM.fillna(0)
GDSCM[GDSCM != 0.0] = 1
GDSCC = GDSCC.fillna(0)
GDSCC[GDSCC != 0.0] = 1

In [27]:
ls = GDSCE.columns.intersection(GDSCM.columns)
ls = ls.intersection(GDSCC.columns)
ls = ls.intersection(PDXE.columns)
ls = ls.intersection(PDXM.columns)
ls = ls.intersection(PDXC.columns)
ls2 = GDSCE.index.intersection(GDSCM.index)
ls2 = ls2.intersection(GDSCC.index)
ls3 = PDXE.index.intersection(PDXM.index)
ls3 = ls3.intersection(PDXC.index)
ls = pd.unique(ls)

In [28]:
PDXE = PDXE.loc[ls3,ls]
PDXM = PDXM.loc[ls3,ls]
PDXC = PDXC.loc[ls3,ls]
GDSCE = GDSCE.loc[ls2,ls]
GDSCM = GDSCM.loc[ls2,ls]
GDSCC = GDSCC.loc[ls2,ls]

In [29]:
# GDSCR.loc[GDSCR.iloc[:,0] == 'R'] = 0
# GDSCR.loc[GDSCR.iloc[:,0] == 'S'] = 1
# GDSCR.columns = ['targets']
# GDSCR = GDSCR.loc[ls2,:]

In [30]:
GDSCR.loc[GDSCR.iloc[:,0] == 'R'] = 0
GDSCR.loc[GDSCR.iloc[:,0] == 'S'] = 1
GDSCR.columns = ['targets', 'target', 'target', 'target', 'target', 'target']
GDSCR = GDSCR.loc[ls2,:]

In [31]:
GDSCR.head()

Unnamed: 0,targets,target,target.1,target.2,target.3,target.4
683665,0,0,0,0,0,0
684052,0,0,0,0,0,0
684055,1,1,1,1,1,1
684057,0,0,0,0,0,0
684059,0,0,0,0,0,0


In [32]:
ls_mb_size = [16, 32]
ls_h_dim = [1024, 512, 256]
ls_z_dim = [128, 64]
ls_marg = [0.5, 1, 1.5, 2, 2.5]
ls_lr = [0.001, 0.005, 0.0005, 0.0001,0.00005, 0.00001]
ls_epoch = [20, 50, 10, 15, 30, 90, 100]
ls_rate = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
ls_wd = [0.01, 0.001, 0.1, 0.0001]


In [33]:
Y = GDSCR['targets'].values

In [34]:
skf = StratifiedKFold(n_splits=7, random_state=42, shuffle = True)

In [35]:
GDSCE.values.shape

(856, 13348)

In [36]:
Y = Y.astype('int64')

In [37]:
type(Y[0])

numpy.int64

In [38]:
for iters in range(91,100):
    k = 0
    mbs = random.choice(ls_mb_size)
    hdm = random.choice(ls_h_dim)
    zdm = random.choice(ls_z_dim)
    lre = random.choice(ls_lr)
    lrm = random.choice(ls_lr)
    lrc = random.choice(ls_lr)
    lrCL = random.choice(ls_lr)
    epch = random.choice(ls_epoch)
    wd = random.choice(ls_wd)
    rate = random.choice(ls_rate)
    
    for train_index, test_index in skf.split(GDSCE.values, Y):
        k = k + 1
        X_trainE = GDSCE.values[train_index,:]
        X_testE =  GDSCE.values[test_index,:]
        X_trainM = GDSCM.values[train_index,:]
        X_testM = GDSCM.values[test_index,:]
        X_trainC = GDSCC.values[train_index,:]
        X_testC = GDSCM.values[test_index,:]
        y_trainE = Y[train_index]
        y_testE = Y[test_index]
        
        scalerGDSC = sk.StandardScaler()
        scalerGDSC.fit(X_trainE)
        X_trainE = scalerGDSC.transform(X_trainE)
        X_testE = scalerGDSC.transform(X_testE)

        X_trainM = np.nan_to_num(X_trainM)
        X_trainC = np.nan_to_num(X_trainC)
        X_testM = np.nan_to_num(X_testM)
        X_testC = np.nan_to_num(X_testC)
        
        TX_testE = torch.FloatTensor(X_testE)
        TX_testM = torch.FloatTensor(X_testM)
        TX_testC = torch.FloatTensor(X_testC)
        ty_testE = torch.FloatTensor(y_testE.astype(int))
        
        #Train
        class_sample_count = np.array([len(np.where(y_trainE==t)[0]) for t in np.unique(y_trainE)])
        weight = 1. / class_sample_count
        samples_weight = np.array([weight[t] for t in y_trainE])

        samples_weight = torch.from_numpy(samples_weight)
        sampler = WeightedRandomSampler(samples_weight.type('torch.DoubleTensor'), len(samples_weight), replacement=True)

        mb_size = mbs

        trainDataset = torch.utils.data.TensorDataset(torch.FloatTensor(X_trainE), torch.FloatTensor(X_trainM), 
                                                      torch.FloatTensor(X_trainC), torch.FloatTensor(y_trainE.astype(int)))

        trainLoader = torch.utils.data.DataLoader(dataset = trainDataset, batch_size=mb_size, shuffle=False, num_workers=1, sampler = sampler)

        n_sampE, IE_dim = X_trainE.shape
        n_sampM, IM_dim = X_trainM.shape
        n_sampC, IC_dim = X_trainC.shape

        h_dim = hdm
        Z_dim = zdm
        Z_in = h_dim + h_dim + h_dim
        lrE = lre
        lrM = lrm
        lrC = lrc
        epoch = epch

        costtr = []
        auctr = []
        costts = []
        aucts = []

        class AEE(nn.Module):
            def __init__(self):
                super(AEE, self).__init__()
                self.EnE = torch.nn.Sequential(
                    nn.Linear(IE_dim, h_dim),
                    nn.BatchNorm1d(h_dim),
                    nn.ReLU(),
                    nn.Dropout())
            def forward(self, x):
                output = self.EnE(x)
                return output

        class AEM(nn.Module):
            def __init__(self):
                super(AEM, self).__init__()
                self.EnM = torch.nn.Sequential(
                    nn.Linear(IM_dim, h_dim),
                    nn.BatchNorm1d(h_dim),
                    nn.ReLU(),
                    nn.Dropout())
            def forward(self, x):
                output = self.EnM(x)
                return output    


        class AEC(nn.Module):
            def __init__(self):
                super(AEC, self).__init__()
                self.EnC = torch.nn.Sequential(
                    nn.Linear(IM_dim, h_dim),
                    nn.BatchNorm1d(h_dim),
                    nn.ReLU(),
                    nn.Dropout())
            def forward(self, x):
                output = self.EnC(x)
                return output      
        class Classifier(nn.Module):
            def __init__(self):
                super(Classifier, self).__init__()
                self.FC = torch.nn.Sequential(
                    nn.Linear(Z_in, Z_dim),
                    nn.ReLU(),
                    nn.Dropout(rate),
                    nn.Linear(Z_dim, 1),
                    nn.Dropout(rate),
                    nn.Sigmoid())
            def forward(self, x):
                return self.FC(x)
        
        torch.cuda.manual_seed_all(42)

        AutoencoderE = AEE()
        AutoencoderM = AEM()
        AutoencoderC = AEC()

        solverE = optim.Adagrad(AutoencoderE.parameters(), lr=lrE)
        solverM = optim.Adagrad(AutoencoderM.parameters(), lr=lrM)
        solverC = optim.Adagrad(AutoencoderC.parameters(), lr=lrC)

        Clas = Classifier()
        SolverClass = optim.Adagrad(Clas.parameters(), lr=lrCL, weight_decay = wd)
        C_loss = torch.nn.BCELoss()

        for it in range(epoch):

            epoch_cost4 = 0
            epoch_cost3 = []
            num_minibatches = int(n_sampE / mb_size) 

            for i, (dataE, dataM, dataC, target) in enumerate(trainLoader):
                flag = 0
                AutoencoderE.train()
                AutoencoderM.train()
                AutoencoderC.train()
                Clas.train()
                if torch.mean(target)!=0. and torch.mean(target)!=1.:                      

                    ZEX = AutoencoderE(dataE)
                    ZMX = AutoencoderM(dataM)
                    ZCX = AutoencoderC(dataC)

                    ZT = torch.cat((ZEX, ZMX, ZCX), 1)
                    ZT = F.normalize(ZT, p=2, dim=0)

                    Pred = Clas(ZT)
                    loss = C_loss(Pred,target.view(-1,1))   

                    y_true = target.view(-1,1)
                    y_pred = Pred
                    AUC = roc_auc_score(y_true.detach().numpy(),y_pred.detach().numpy()) 

                    solverE.zero_grad()
                    solverM.zero_grad()
                    solverC.zero_grad()
                    SolverClass.zero_grad()

                    loss.backward()

                    solverE.step()
                    solverM.step()
                    solverC.step()
                    SolverClass.step()
                    
                    epoch_cost4 = epoch_cost4 + (loss / num_minibatches)
                    epoch_cost3.append(AUC)
                    flag = 1

            if flag == 1:
                costtr.append(torch.mean(epoch_cost4))
                auctr.append(np.mean(epoch_cost3))
                print('Iter-{}; Total loss: {:.4}'.format(it, loss))

            with torch.no_grad():

                AutoencoderE.eval()
                AutoencoderM.eval()
                AutoencoderC.eval()
                Clas.eval()

                ZET = AutoencoderE(TX_testE)
                ZMT = AutoencoderM(TX_testM)
                ZCT = AutoencoderC(TX_testC)

                ZTT = torch.cat((ZET, ZMT, ZCT), 1)
                ZTT = F.normalize(ZTT, p=2, dim=0)

                PredT = Clas(ZTT)
                lossT = C_loss(PredT,ty_testE.view(-1,1))         

                y_truet = ty_testE.view(-1,1)
                y_predt = PredT
                AUCt = roc_auc_score(y_truet.detach().numpy(),y_predt.detach().numpy())

                costts.append(lossT)
                aucts.append(AUCt)

        costtr_vals = []
        for item in costtr:
            costtr_vals.append(item.item())
        
        plt.plot(np.squeeze(costtr_vals), '-r',np.squeeze(costts), '-b')
        plt.ylabel('Total cost')
        plt.xlabel('iterations (per tens)')

        title = 'Cost Cetuximab iter = {}, fold = {}, mb_size = {},  hz_dim[1,2] = ({},{}), lr[E,M,C] = ({}, {}, {}), epoch = {}, wd = {}, lrCL = {}, rate4 = {}'.\
                      format(iters, k, mbs, hdm, zdm , lre, lrm, lrc, epch, wd, lrCL, rate)

        plt.suptitle(title)
        plt.savefig(save_results_to + title + '.png', dpi = 150)
        plt.close()

        plt.plot(np.squeeze(auctr), '-r',np.squeeze(aucts), '-b')
        plt.ylabel('AUC')
        plt.xlabel('iterations (per tens)')

        title = 'AUC Cetuximab iter = {}, fold = {}, mb_size = {},  hz_dim[1,2] = ({},{}), lr[E,M,C] = ({}, {}, {}), epoch = {}, wd = {}, lrCL = {}, rate4 = {}'.\
                      format(iters, k, mbs, hdm, zdm , lre, lrm, lrc, epch, wd, lrCL, rate)        

        plt.suptitle(title)
        plt.savefig(save_results_to + title + '.png', dpi = 150)
        plt.close()

Iter-0; Total loss: 0.6022
Iter-1; Total loss: 0.6023
Iter-2; Total loss: 0.5692
Iter-3; Total loss: 0.5309
Iter-4; Total loss: 0.5658
Iter-5; Total loss: 0.5861
Iter-6; Total loss: 0.5058
Iter-7; Total loss: 0.5867
Iter-8; Total loss: 0.5512
Iter-9; Total loss: 0.5505
Iter-10; Total loss: 0.5434
Iter-11; Total loss: 0.5452
Iter-12; Total loss: 0.4537
Iter-13; Total loss: 0.5061
Iter-14; Total loss: 0.4947
Iter-15; Total loss: 0.4752
Iter-16; Total loss: 0.5091
Iter-17; Total loss: 0.4896
Iter-18; Total loss: 0.4908
Iter-19; Total loss: 0.5225
Iter-20; Total loss: 0.4943
Iter-21; Total loss: 0.476
Iter-22; Total loss: 0.5435
Iter-23; Total loss: 0.4989
Iter-24; Total loss: 0.4387
Iter-25; Total loss: 0.4788
Iter-26; Total loss: 0.4951
Iter-27; Total loss: 0.5182
Iter-28; Total loss: 0.4915
Iter-29; Total loss: 0.4468
Iter-30; Total loss: 0.4384
Iter-31; Total loss: 0.4379
Iter-32; Total loss: 0.3981
Iter-33; Total loss: 0.4678
Iter-34; Total loss: 0.4296
Iter-35; Total loss: 0.4549
Ite