In [2]:
import pandas as pd
import numpy as np
import torch
import os
# os.chdir('/home/jiageng/Documents/fhr/pipeline/')
os.chdir('/home/jiageng/Documents/fhr/pygcn/pygcn')
import snf

In [3]:
def stdNormalize(df):
    """Standardise every column of ``df`` to zero mean and unit variance.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric feature table (rows = samples, columns = features).

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``df`` holding ``(x - mean) / std``
        column by column. Columns whose standard deviation is undefined
        (NaN) or zero are only centred, so they come out as zeros instead
        of NaN/inf.
    """
    centered = df - df.mean()
    # A NaN std (fewer than two rows) or a zero std (constant column) would
    # turn the division into NaN; dividing by 1 keeps those columns finite.
    std = df.std().fillna(1).replace(0, 1)
    return np.array(centered / std)

In [4]:
def rowNormalize(mx):
    """Row-normalize a dense matrix so that every row sums to 1.

    Parameters
    ----------
    mx : array-like, 2-D
        Dense matrix. Integer input is accepted and converted to float.

    Returns
    -------
    numpy.ndarray
        New float array of the same shape in which each row has been
        divided by its sum. Rows whose sum is zero are returned as all
        zeros. The input is not modified.
    """
    # Work in float: numpy refuses negative powers / true reciprocals on
    # integer arrays, which made integer matrices crash.
    mx = np.asarray(mx, dtype=float)
    rowsum = mx.sum(axis=1)
    # Reciprocal of each row sum, leaving 0 where the row sums to zero
    # (avoids the divide-by-zero warning and the inf clean-up step).
    r_inv = np.zeros_like(rowsum)
    np.divide(1.0, rowsum, out=r_inv, where=rowsum != 0)
    # Broadcasting scales each row directly; no n-by-n diagonal matrix needed.
    return mx * r_inv[:, np.newaxis]

In [5]:
from scipy.sparse import coo_matrix

def sparse_tensor_from_arr(arr):
    """Convert a dense 2-D numpy array into a torch sparse COO tensor.

    Parameters
    ----------
    arr : numpy.ndarray
        Dense 2-D array; entries equal to zero are not stored.

    Returns
    -------
    torch.Tensor
        Sparse COO tensor of dtype ``float32`` with the same shape as ``arr``.
    """
    # scipy extracts the coordinates and values of the non-zero entries.
    sparse_sp = coo_matrix(arr)

    # Stack row/col into one (2, nnz) int64 array before handing it to torch:
    # building a tensor from a Python list of ndarrays is slow and makes
    # torch emit a UserWarning.
    indices = torch.from_numpy(
        np.vstack((sparse_sp.row, sparse_sp.col)).astype(np.int64)
    )
    values = torch.as_tensor(sparse_sp.data, dtype=torch.float32)

    return torch.sparse_coo_tensor(indices, values, torch.Size(sparse_sp.shape))

Prepare labels

In [6]:
# Load the annotation table keyed by PUBLIC_ID, discard rows whose risk is -1,
# and shift the remaining risk codes up by one.
df_labels = (
    pd.read_csv('/home/jiageng/Documents/fhr/annotations/fhr-annotations.tsv', sep='\t')
    .set_index('PUBLIC_ID')
    .query('risk != -1')
    .assign(risk=lambda labelled: labelled['risk'] + 1)
)

Prepare features

In [7]:
# Load the full gene-mutation matrix keyed by PUBLIC_ID and drop the SAMPLE
# column when the file carries one.
data_full = (
    pd.read_csv('/home/jiageng/Documents/fhr/matrices/gene_mut_matrix_gt1.tsv.gz', sep='\t')
    .set_index('PUBLIC_ID')
    .drop(columns=['SAMPLE'], errors='ignore')
)

In [347]:
# Feature matrix actually used below (one row per PUBLIC_ID).
data = (
    pd.read_csv('/home/jiageng/Documents/fhr/matrices/gene_mut_matrix_fdr5e2.tsv', sep='\t')
    .set_index('PUBLIC_ID')
)
print(data.shape)

(974, 10)


Use all samples with features, handle the missing labels

In [168]:
# Keep every sample that has features, whether or not it has a label.
public_ids = data.index
print(public_ids.size)

974


In [356]:
# Feature scaling to apply: 'stdnorm' (per-column standardisation),
# 'rownorm' (each row sums to 1); any other value keeps the raw values.
feature_norm_mtd = 'stdnorm'

features = torch.tensor(
    stdNormalize(data.loc[public_ids]) if feature_norm_mtd == 'stdnorm'
    else rowNormalize(data.loc[public_ids].astype(float).values) if feature_norm_mtd == 'rownorm'
    else data.loc[public_ids].values,
    dtype=torch.float32,
)
print(features.shape)

torch.Size([974, 10])


In [350]:
# Align the risk labels to the feature rows: one entry per id in public_ids,
# in the same order, with -1 marking samples that have no annotation.
# reindex alone already does this; the extra .loc[public_ids] lookup that used
# to follow was a no-op for unique ids and would have multiplied rows (and so
# misaligned labels and features) if an id ever appeared more than once.
risk = df_labels['risk'].reindex(public_ids).fillna(-1).astype(int)
labels = torch.tensor(risk.values, dtype=torch.long)

In [351]:
# Class distribution of the aligned labels; samples without an annotation
# carry the -1 placeholder assigned when `risk` was built.
risk.value_counts()

risk
 1    517
 2    259
-1    107
 3     91
Name: count, dtype: int64

In [352]:
# Positional indices (into risk / features / labels) of the samples with and
# without a label.
idx_labeled = np.where(risk != -1)[0]
idx_unlabeled = np.where(risk == -1)[0]

# Every third labeled sample is held out for validation; the rest are used
# for training. Explicit dtype keeps the index tensors integer-typed even if
# a split turns out empty.
idx_val = torch.tensor(idx_labeled[::3], dtype=torch.long)
# NOTE(review): the test split is the very same set as the validation split,
# so the reported "test" metrics are not an independent estimate.
idx_test = idx_val
# Set difference on the numpy side instead of testing membership against the
# tensor element by element (which rescans the whole tensor for every index).
idx_train = torch.tensor(np.setdiff1d(idx_labeled, idx_labeled[::3]), dtype=torch.long)
print('Un-labeled',len(idx_unlabeled))
print('Labeled train',len(idx_train))
print('Labeled val',len(idx_val))

Un-labeled 107
Labeled train 578
Labeled val 289


In [13]:
# Class balance of the training split.
# 1 = SR, 2 = GHR, 3 = FHR
risk.iloc[idx_train].value_counts()

risk
1    333
2    181
3     64
Name: count, dtype: int64

In [14]:
# Class balance of the validation split.
# 1 = SR, 2 = GHR, 3 = FHR
# idx_val and idx_test are the same for now
risk.iloc[idx_val].value_counts()

risk
1    184
2     78
3     27
Name: count, dtype: int64

Prepare adjacency matrix

In [353]:
import snf

# Sample-by-sample affinity matrix computed from the standardised features;
# report its shape and value range.
aff = snf.make_affinity(stdNormalize(data), metric='euclidean', K=1000, mu=0.5)
print(aff.shape, aff.max(), aff.min(), sep='\n')

(974, 974)
1.7855952252508438
0.0004711639034988262




Increase the sparsity of the matrix by setting small values to zero

The sparser the matrix, the higher the accuracy of the GCN

In [399]:
# Method 2 - sparsify after row-normalization
# (this is the method that works for mutations)
pctile = 0.95

# Row-normalize first, then zero every entry below the chosen quantile of the
# normalized matrix and convert what is left to a torch sparse tensor.
adj = rowNormalize(aff)
threshold = np.quantile(adj, pctile)
print(pctile, threshold)
adj = sparse_tensor_from_arr(np.where(adj < threshold, 0, adj))

0.95 0.0011096332200402708


# Train Model

In [404]:
import time
import argparse
import torch
import torch.nn.functional as F
import torch.optim as optim

from pygcn.utils import load_data, accuracy
from pygcn.models import GCN
# Command-line style hyper-parameter definitions; inside the notebook they
# are filled from the explicit argument list passed to parse_args below.
parser = argparse.ArgumentParser()

# Boolean switches
parser.add_argument('--no-cuda', default=False, action='store_true', help='Disables CUDA training.')
parser.add_argument('--fastmode', default=False, action='store_true', help='Validate during training pass.')

# Integer settings
parser.add_argument('--seed', default=42, type=int, help='Random seed.')
parser.add_argument('--epochs', default=200, type=int, help='Number of epochs to train.')
parser.add_argument('--hidden', default=16, type=int, help='Number of hidden units.')

# Float settings
parser.add_argument('--lr', default=0.01, type=float, help='Initial learning rate.')
parser.add_argument('--weight_decay', default=5e-4, type=float, help='Weight decay (L2 loss on parameters).')
parser.add_argument('--dropout', default=0.5, type=float, help='Dropout rate (1 - keep probability).')

args = parser.parse_args([
    '--epochs=200',
    '--dropout=0.',
    '--hidden=16',
    '--weight_decay=5e-4',
    '--seed=1023420948521123',
])

# CUDA is flagged only when it was not disabled and a device is available.
if args.no_cuda:
    args.cuda = False
else:
    args.cuda = torch.cuda.is_available()
# Apply the parsed seed before the model is built so that any random
# initialisation inside GCN (and dropout, when enabled) is repeatable.
# --seed was parsed but never used before.
torch.manual_seed(args.seed)

# nclass is max label + 1, so the output layer covers label values
# 0 .. labels.max(); entries equal to -1 (unlabeled) do not add a class.
model = GCN(nfeat=features.shape[1],
            nhid=args.hidden,
            nclass=labels.max().item() + 1,
            dropout=args.dropout)
optimizer = optim.Adam(model.parameters(),
                       lr=args.lr, weight_decay=args.weight_decay)
def train(epoch):
    """Run one full-batch optimisation step and print train/validation metrics.

    Parameters
    ----------
    epoch : int
        Zero-based epoch counter; printed as ``epoch + 1``.

    Uses the notebook-level ``model``, ``optimizer``, ``features``, ``adj``,
    ``labels``, ``idx_train``, ``idx_val`` and ``args``.
    """
    started = time.time()

    # Forward pass over the whole graph; the loss only looks at training nodes.
    model.train()
    optimizer.zero_grad()
    output = model(features, adj)
    train_out, train_lbl = output[idx_train], labels[idx_train]
    loss_train = F.nll_loss(train_out, train_lbl)
    acc_train = accuracy(train_out, train_lbl)
    loss_train.backward()
    optimizer.step()

    # Unless fastmode is set, recompute the output in eval mode for the
    # validation metrics; in fastmode the training-pass output is reused.
    if not args.fastmode:
        model.eval()
        output = model(features, adj)

    val_out, val_lbl = output[idx_val], labels[idx_val]
    loss_val = F.nll_loss(val_out, val_lbl)
    acc_val = accuracy(val_out, val_lbl)

    print(f'Epoch: {epoch + 1:04d}',
          f'loss_train: {loss_train.item():.4f}',
          f'acc_train: {acc_train.item():.4f}',
          f'loss_val: {loss_val.item():.4f}',
          f'acc_val: {acc_val.item():.4f}',
          f'time: {time.time() - started:.4f}s')


def test():
    """Evaluate the model in eval mode on ``idx_test`` and print loss/accuracy.

    Uses the notebook-level ``model``, ``features``, ``adj``, ``labels`` and
    ``idx_test``.
    """
    model.eval()
    test_out = model(features, adj)[idx_test]
    test_lbl = labels[idx_test]
    loss_test = F.nll_loss(test_out, test_lbl)
    acc_test = accuracy(test_out, test_lbl)
    print("Test set results:",
          f"loss= {loss_test.item():.4f}",
          f"accuracy= {acc_test.item():.4f}")

# Train model: one full-batch step per epoch, timing the whole run.
t_total = time.time()
for epoch in range(args.epochs):
    train(epoch)
print("Optimization Finished!")
print(f"Total time elapsed: {time.time() - t_total:.4f}s")

# Testing: evaluate once on idx_test after training has finished.
test()

Epoch: 0001 loss_train: 1.2472 acc_train: 0.3080 loss_val: 1.2084 acc_val: 0.2837 time: 0.0124s
Epoch: 0002 loss_train: 1.2141 acc_train: 0.3149 loss_val: 1.1752 acc_val: 0.2837 time: 0.0123s
Epoch: 0003 loss_train: 1.1847 acc_train: 0.3149 loss_val: 1.1445 acc_val: 0.2837 time: 0.0073s
Epoch: 0004 loss_train: 1.1577 acc_train: 0.3149 loss_val: 1.1156 acc_val: 0.2837 time: 0.0054s
Epoch: 0005 loss_train: 1.1326 acc_train: 0.3149 loss_val: 1.0884 acc_val: 0.6955 time: 0.0037s
Epoch: 0006 loss_train: 1.1089 acc_train: 0.6315 loss_val: 1.0631 acc_val: 0.6955 time: 0.0051s
Epoch: 0007 loss_train: 1.0871 acc_train: 0.6315 loss_val: 1.0397 acc_val: 0.6955 time: 0.0070s
Epoch: 0008 loss_train: 1.0670 acc_train: 0.6315 loss_val: 1.0169 acc_val: 0.7059 time: 0.0037s
Epoch: 0009 loss_train: 1.0477 acc_train: 0.6436 loss_val: 0.9949 acc_val: 0.7059 time: 0.0039s
Epoch: 0010 loss_train: 1.0293 acc_train: 0.6453 loss_val: 0.9740 acc_val: 0.7059 time: 0.0043s
Epoch: 0011 loss_train: 1.0121 acc_train

In [401]:
# Negative control: replace the adjacency with a sparse identity matrix of
# the same size, so every node only sees its own features.
# This overwrites `adj`; the training cell has to be (re-)run afterwards.
adj = torch.eye(adj.shape[0]).to_sparse()

## For 85 genes (chi2 p < 0.05 and freq >= 10), less TP53

Feature matrix should be row-normalized

The affinity network needs a high K (e.g. K=1000) to cope with the case where a sample's nearest neighbours are exactly identical to it

For the gene mutation matrix, a low percentile threshold for the sparsity filter (e.g. 0.5) works better.

Also, the sparsity filter should only be applied after row normalization

Accuracy is slightly above 0.7. Using an identity adjacency matrix does not reach this accuracy.

A very high dropout of 0.9-0.95 is needed; otherwise the model will overfit to the training data

## For 10 genes (FDR 0.05)

Accuracy of identity adjacency matrix is consistently above 0.7 when dropout is 0.5. 

With the affinity-based adjacency matrix, accuracy is very slightly above 0.7, which is no better than with the identity matrix

Conclusion: 10 features are too few for the benefits of the GCN to be realised