In [1]:
import sys
sys.executable

'C:\\ProgramData\\Anaconda3\\envs\\btc2\\python.exe'

In [2]:
import os
import argparse
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from utils import *
from base import ModelBase
import statistics as sta
from types import SimpleNamespace

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
class GIT(ModelBase):

    def __init__(self, args, **kwargs):

        super(GIT, self).__init__(args, **kwargs)
        self.build()

    def build(self):

        self.layer_sga_emb = nn.Embedding(
            num_embeddings=self.sga_size+1,
            embedding_dim=self.embedding_size,
            padding_idx=0)

        self.layer_can_emb = nn.Embedding(
            num_embeddings=self.can_size+1,
            embedding_dim=self.embedding_size,
            padding_idx=0)

        self.layer_w_0 = nn.Linear(
            in_features=self.embedding_size,
            out_features=self.attention_size,
            bias=True)

        self.layer_beta = nn.Linear(
            in_features=self.attention_size,
            out_features=self.attention_head,
            bias=True)

        self.layer_dropout_1 = nn.Dropout(p=self.dropout_rate)

        self.layer_w_1 = nn.Linear(
            in_features=self.embedding_size,
            out_features=self.hidden_size,
            bias=True)

        self.layer_dropout_2 = nn.Dropout(p=self.dropout_rate)

        self.layer_w_2 = nn.Linear(
            in_features=self.hidden_size,
            out_features=self.deg_size,
            bias=True)

        if self.initializtion:
            gene_emb_pretrain = np.load(os.path.join(self.input_dir, "gene_emb_pretrain.npy"))
            self.layer_sga_emb.weight.data.copy_(torch.from_numpy(gene_emb_pretrain))

        self.optimizer = optim.Adam(
            self.parameters(),
            lr=self.learning_rate,
            weight_decay=self.weight_decay)


    def forward(self, sga_index, can_index):
   
        # cancer type embedding
        emb_can = self.layer_can_emb(can_index)
        emb_can = emb_can.view(-1,self.embedding_size)

        # gene embedings
        E_t = self.layer_sga_emb(sga_index)

        # squeeze and tanh-curve the gene embeddings
        E_t_flatten = E_t.view(-1, self.embedding_size)
        E_t1_flatten = torch.tanh( self.layer_w_0(E_t_flatten) )

        # multiplied by attention heads
        E_t2_flatten = self.layer_beta(E_t1_flatten)
        E_t2 = E_t2_flatten.view(-1, self.num_max_sga, self.attention_head)

        # normalize by softmax
        E_t2 = E_t2.permute(1,0,2)
        A = F.softmax(E_t2)
        A = A.permute(1,0,2)

        if self.attention:
          # multi-head attention weighted sga embedding:
            emb_sga = torch.sum( torch.bmm( A.permute(0,2,1), E_t ), dim=1)
            emb_sga = emb_sga.view(-1,self.embedding_size)
        else:
          # if not using attention, simply sum up SGA embeddings
            emb_sga = torch.sum(E_t, dim=1)
            emb_sga = emb_sga.view(-1, self.embedding_size)

        # if use cancer type input, add cancer type embedding
        if self.cancer_type:
            emb_tmr = emb_can+emb_sga
        else:
            emb_tmr = emb_sga

        # MLP decoder
        emb_tmr_relu = self.layer_dropout_1(emb_tmr)
        hid_tmr = self.layer_w_1(emb_tmr_relu)
        hid_tmr_relu = self.layer_dropout_2(hid_tmr)
        
        preds = F.tanh(self.layer_w_2(hid_tmr_relu))

        # attention weights
        attn_wt = torch.sum(A, dim=2)
        attn_wt = attn_wt.view(-1, self.num_max_sga)

        return preds, hid_tmr, emb_tmr, emb_sga, attn_wt


    def train(self, train_set, test_set,
            batch_size=None, test_batch_size=None,
            max_iter=None, max_fscore=None,
            test_inc_size=None, **kwargs):

        for iter_train in range(0, max_iter+1, batch_size):
            batch_set = get_minibatch(train_set, iter_train, batch_size,batch_type="train")
            preds, _, _, _, _ = self.forward(batch_set["sga"].to(device), batch_set["can"].to(device))
            labels = batch_set["deg"].to(device)

            self.optimizer.zero_grad()
            loss = -torch.log( self.epsilon + 1 - torch.abs(preds - labels) / 2 ).mean()
            loss.backward()
            self.optimizer.step()

            if test_inc_size and (iter_train % test_inc_size == 0):
                labels, preds, _, _, _, _, _ = self.test(test_set, test_batch_size)
                precision, recall, f1score, accuracy = evaluate(
                    labels, preds, epsilon=self.epsilon)
                print("[%d,%d], precision: %.3f, acc: %.3f"% (iter_train//len(train_set["can"]),
                      iter_train%len(train_set["can"]), precision, accuracy))

                if f1score >= max_fscore:
                    break

        #self.save_model(os.path.join(self.output_dir, "trained_model.pth"))


    def test(self, test_set, test_batch_size, **kwargs):

        labels, preds, hid_tmr, emb_tmr, emb_sga, attn_wt, tmr = [], [], [], [], [], [], []

        for iter_test in range(0, len(test_set["can"]), test_batch_size):
            batch_set = get_minibatch(test_set, iter_test, test_batch_size, batch_type="test")
            batch_preds, batch_hid_tmr, batch_emb_tmr, batch_emb_sga, batch_attn_wt = self.forward(
                batch_set["sga"].to(device), batch_set["can"].to(device))
            batch_labels = batch_set["deg"].to(device)

            labels.append(batch_labels.data.to(torch.device("cpu")).numpy())
            preds.append(batch_preds.data.to(torch.device("cpu")).numpy())
            hid_tmr.append(batch_hid_tmr.data.to(torch.device("cpu")).numpy())
            emb_tmr.append(batch_emb_tmr.data.to(torch.device("cpu")).numpy())
            emb_sga.append(batch_emb_sga.data.to(torch.device("cpu")).numpy())
            attn_wt.append(batch_attn_wt.data.to(torch.device("cpu")).numpy())
            tmr = tmr + batch_set["tmr"]

        labels = np.concatenate(labels,axis=0)
        preds = np.concatenate(preds,axis=0)
        hid_tmr = np.concatenate(hid_tmr,axis=0)
        emb_tmr = np.concatenate(emb_tmr,axis=0)
        emb_sga = np.concatenate(emb_sga,axis=0)
        attn_wt = np.concatenate(attn_wt,axis=0)

        return labels, preds, hid_tmr, emb_tmr, emb_sga, attn_wt, tmr


### Non Binary Target

In [7]:
# Parse arguments
args_nb = SimpleNamespace()

args_nb.train_model=True

args_nb.input_dir="data_noBin"
args_nb.output_dir="data_noBin"

args_nb.embedding_size=512
args_nb.hidden_size=1024
args_nb.attention_size=400
args_nb.attention_head=128

args_nb.max_fscore=0.7
args_nb.batch_size=16
args_nb.test_batch_size=512
args_nb.test_inc_size=256
args_nb.dropout_rate=0.5
args_nb.weight_decay=1e-5

args_nb.deg_shuffle=False
args_nb.nonbinary=True

# Load data
dataset_nb = load_dataset(input_dir=args_nb.input_dir, deg_shuffle=args_nb.deg_shuffle)
train_set_nb, test_set_nb = split_dataset(dataset_nb, ratio=0.66)

args_nb.can_size = dataset_nb["can"].max()        # cancer type dimension
args_nb.sga_size = max(dataset_nb["sga"].max(), 19781)        # SGA dimension
args_nb.deg_size = dataset_nb["deg"].shape[1]     # DEG output dimension
args_nb.num_max_sga = dataset_nb["sga"].shape[1]  # maximum number of SGAs in a tumor

In [None]:
precision_nb_GIT, recall_nb_GIT, f1score_nb_GIT, accuracy_nb_GIT = [], [], [], []

# GIT variants:
# args.initializtion=False -> GIT-init
args_nb.initializtion=True
# args.attention=False -> GIT-attn
args_nb.attention=True
# args.cancer_type=False -> GIT-can
args_nb.cancer_type=True

args_nb.max_iter=3072*20
args_nb.learning_rate=1e-4

if args_nb.cancer_type == False:
    args_nb.max_iter = 3072*40
elif args_nb.attention == False:
    args_nb.max_iter = 3072*40
    args_nb.learning_rate = 0.0003

for i in range(5):
    
    # Init model with single hidden layer
    model = GIT(args_nb).to(device)

    # Train MLP model
    model.train(train_set_nb, test_set_nb,
          batch_size=args_nb.batch_size,
          test_batch_size=args_nb.test_batch_size,
          max_iter=args_nb.max_iter,
          max_fscore=args_nb.max_fscore,
          test_inc_size=args_nb.test_inc_size)

    print("Evaluating...")
    labels, preds, _, _, _, _, _ = model.test(test_set_nb, test_batch_size=512)
    precision, recall, f1score, accuracy = evaluate(labels, preds, epsilon=1e-4)
    print("prec=%.3f, recall=%.3f, F1=%.3f, acc=%.3f"%(precision, recall, f1score, accuracy))
    
    # Release memory
    del model
    torch.cuda.empty_cache()
    
    precision_nb_GIT.append(precision)
    recall_nb_GIT.append(recall)
    f1score_nb_GIT.append(f1score)
    accuracy_nb_GIT.append(accuracy)



[0,0], precision: 0.168, acc: 0.347
[0,256], precision: 0.236, acc: 0.393
[0,512], precision: 0.282, acc: 0.411
[0,768], precision: 0.306, acc: 0.417
[0,1024], precision: 0.315, acc: 0.420
[0,1280], precision: 0.319, acc: 0.428
[0,1536], precision: 0.322, acc: 0.443
[0,1792], precision: 0.325, acc: 0.460
[0,2048], precision: 0.328, acc: 0.480
[0,2304], precision: 0.333, acc: 0.498
[0,2560], precision: 0.339, acc: 0.515
[0,2816], precision: 0.346, acc: 0.532
[1,124], precision: 0.353, acc: 0.547
[1,380], precision: 0.362, acc: 0.558
[1,636], precision: 0.370, acc: 0.567
[1,892], precision: 0.377, acc: 0.574
[1,1148], precision: 0.384, acc: 0.578
[1,1404], precision: 0.389, acc: 0.585
[1,1660], precision: 0.395, acc: 0.593
[1,1916], precision: 0.400, acc: 0.599
[1,2172], precision: 0.405, acc: 0.602
[1,2428], precision: 0.408, acc: 0.605
[1,2684], precision: 0.414, acc: 0.609
[1,2940], precision: 0.419, acc: 0.617
[2,248], precision: 0.421, acc: 0.619
[2,504], precision: 0.425, acc: 0.62

[18,696], precision: 0.697, acc: 0.781
[18,952], precision: 0.691, acc: 0.780
[18,1208], precision: 0.692, acc: 0.780
[18,1464], precision: 0.696, acc: 0.780
[18,1720], precision: 0.699, acc: 0.781
[18,1976], precision: 0.698, acc: 0.781
[18,2232], precision: 0.696, acc: 0.781
[18,2488], precision: 0.690, acc: 0.780
[18,2744], precision: 0.696, acc: 0.781
[19,52], precision: 0.698, acc: 0.781
[19,308], precision: 0.690, acc: 0.780
[19,564], precision: 0.699, acc: 0.782
[19,820], precision: 0.699, acc: 0.781
[19,1076], precision: 0.695, acc: 0.781


In [None]:
print("prec=%.3f, recall=%.3f, F1=%.3f, acc=%.3f"%
      (sta.mean(precision_nb_GIT), sta.mean(recall_nb_GIT), sta.mean(f1score_nb_GIT), sta.mean(accuracy_nb_GIT)))

In [None]:
#prec=0.696, recall=0.537, F1=0.606, acc=0.781


In [16]:
np.around(preds)

array([[-0., -0., -0., ..., -0., -0.,  1.],
       [ 1., -0., -0., ..., -0., -1., -0.],
       [-0., -0.,  1., ..., -0., -1.,  1.],
       ...,
       [-1., -0., -1., ..., -0., -0., -0.],
       [-0.,  0.,  0., ...,  1.,  0., -0.],
       [ 0.,  1.,  0., ...,  0., -0.,  1.]], dtype=float32)