## Benchmark experiment on Deep Neural Network based PRSs
This notebook collects all the code needed to run the benchmark experiment on the algorithm named DNN-Badre in *Massi, Franco et al., Learning High-Order Interactions for Polygenic Risk Prediction (2022)*.

The original paper by Badre et al. can be accessed here: https://www.nature.com/articles/s10038-020-00832-7. The neural network model is implemented according to the specifications detailed therein.

*Remark:* aside from the hiprs package, **this notebook requires the PyTorch library** to be executed.

In [1]:
import torch # Pytorch library for building DNN models
from hiprs import snps # Auxiliary library for data simulation

In [2]:
# Simulated data set-up.
# Seeds 0 to 29 correspond to the 30 datasets analyzed in the paper; seed 5 refers to
# the dataset analyzed there for the purpose of model interpretability (cf. Fig. 5).
seed = 5

# Number of observations in the training and in the test portion
ntrain = 1000
ntest = 500

# Number of SNPs
p = 15

# A single dataset with ntrain + ntest observations is generated
dataset = snps.generate(
    n=ntrain + ntest,
    p=p,
    noise=0.01,
    seed=seed,
)
dataset

Unnamed: 0,SNP1,SNP2,SNP3,SNP4,SNP5,SNP6,SNP7,SNP8,SNP9,SNP10,SNP11,SNP12,SNP13,SNP14,SNP15,Outcome
0,1,1,0,0,1,2,2,2,1,1,0,2,1,0,0,0
1,1,1,0,2,2,2,1,1,1,1,0,0,0,0,1,1
2,0,0,0,2,0,2,0,2,2,0,1,2,0,0,0,0
3,1,1,0,2,2,2,1,0,0,0,1,0,2,2,2,1
4,2,2,0,0,2,0,1,0,2,2,0,1,1,1,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,1,1,0,0,2,2,2,0,0,2,1,2,1,2,2,0
1496,2,2,0,0,0,2,0,2,1,0,0,1,2,1,1,0
1497,1,1,0,1,1,0,0,2,2,0,2,1,1,2,0,1
1498,2,2,0,1,1,2,1,2,0,1,2,2,0,0,2,0


**Auxiliary functions and classes for handling DNNs**

In [4]:
from hiprs.scores import Classifier, Clock

class DNN(Classifier):
    """Classifier wrapping a neural network model.

    The wrapped network is stored in `dnn`, while `time` holds the fitting
    time associated with the model (zero at construction).
    """

    def __init__(self, model):
        """Stores the given network and sets the fitting time to zero."""
        self.time = 0
        self.dnn = model

    def fittingtime(self):
        """Returns the stored fitting time."""
        return self.time

    def predict(self, x):
        """Evaluates the network at x with gradient tracking disabled.

        The output is flattened, moved to the CPU and returned as a numpy array.
        """
        with torch.no_grad():
            output = self.dnn(x)
            flattened = output.view(-1)
            return flattened.cpu().numpy()

In [None]:
from random import shuffle
import numpy

# Cross-Entropy loss
def lossfunction(ypredicted, ytrue):
    """Returns the binary cross-entropy between predictions and labels, summed over all entries.

    A small constant is added inside the logarithms so that predictions equal
    to 0 or 1 do not yield infinite values.
    """
    eps = 1e-10
    positives = ytrue * torch.log(ypredicted + eps)
    negatives = (1.0 - ytrue) * torch.log(1.0 - ypredicted + eps)
    total = (positives + negatives).sum()
    return -total

# For model evaluation
def error(ypredicted, ytrue):
    """Returns the L1 error, i.e. the mean absolute difference between predictions and labels."""
    gaps = torch.abs(ypredicted - ytrue)
    return torch.mean(gaps)

 # Trains a model on the given dataset. NB: assumes the covariates are in the first columns while the target is in the last one
def train(model, train_data, test_data, lossf, optim, lr, epochs, minibatches = None, extra = ""):
    """Trains the network wrapped by `model` and stores the fitting time in `model.time`.

    Both datasets are indexed as 2D tensors, with the covariates in the first
    columns and the target in the last one.

    Args:
        model: wrapper exposing the network as `model.dnn`.
        train_data: data used for fitting the network.
        test_data: data used only for monitoring the test error.
        lossf: callable `lossf(ypredicted, ytrue)` returning the loss to be minimized.
        optim: optimizer class, instantiated as `optim(parameters, lr = lr)`.
        lr: learning rate passed to the optimizer.
        epochs: number of passes over the training data.
        minibatches: minibatch size. Defaults to the whole training set and is
            capped at its size. Observations exceeding a multiple of the batch
            size are left out of the corresponding epoch.
        extra: optional header printed above the progress table.

    Returns:
        The pair `(terrors, verrors)`: training and test L1 errors recorded
        before each epoch, plus one final entry computed after the last epoch.
    """
    # clear_output only exists within IPython/Jupyter sessions: when it cannot be
    # imported, the progress table is simply printed without clearing the output.
    try:
        from IPython.display import clear_output
    except ImportError:
        def clear_output(wait = False):
            pass

    dnn = model.dnn
    ntrain = len(train_data)

    if minibatches is None:
        minibatches = ntrain
    else:
        # A batch size larger than the dataset would otherwise yield no batches at all
        minibatches = min(minibatches, ntrain)

    optimizer = optim(dnn.parameters(), lr = lr)

    terrors = []
    verrors = []

    def feedback(epoch, mret, mrev):
        clear_output(wait=True)
        print("%s\nEpoch\tTrain Error\tTest Error" % extra)
        print("%d\t%.2e\t%.2e" % (epoch, mret, mrev))

    def monitor(epoch):
        # Computes, stores and prints the current training and test errors
        with torch.no_grad():
            mret = error(dnn(train_data[:,:-1]), train_data[:,[-1]]).item() # Training error
            mrev = error(dnn(test_data[:,:-1]) , test_data[:,[-1]] ).item() # Test error
        terrors.append(mret)
        verrors.append(mrev)
        feedback(epoch, mret, mrev)

    timer = Clock()
    indexes = list(range(ntrain))
    timer.start()

    for e in range(epochs):
        shuffle(indexes)
        batches = [indexes[(i*minibatches):((i+1)*minibatches)] for i in range(ntrain//minibatches)]

        monitor(e) # Errors before the e-th epoch, i.e. after e completed epochs

        for minibatch in batches:
            x = train_data[minibatch, :-1]
            # The target is sliced (-1:) rather than indexed with a list: pairing two
            # index lists returns a 1D tensor, which would broadcast against the
            # column-shaped network output and turn the loss into a sum over all
            # (prediction, label) pairs of the batch.
            y = train_data[minibatch, -1:]
            def closure():
                optimizer.zero_grad()
                loss = lossf(dnn(x), y)
                loss.backward()
                return loss
            optimizer.step(closure)

    timer.stop()
    # The final report is labelled with the number of completed epochs
    # (also well defined when epochs == 0, unlike the loop variable).
    monitor(epochs)
    print("Training complete. Elapsed time: %s." % (timer.elapsedTime()))
    model.time = timer.elapsed()
    return terrors, verrors

**Model fitting and results**

In [None]:
import pandas as pd # Needed for the dummy encoding of the SNPs

# Training and test splitting
ntrain, ntest = 1000, 500
tdata, vdata = dataset.iloc[:ntrain,:], dataset.iloc[ntrain:,:]

# SNPs data are dummified to allow a proper DNN elaboration of the input.
# The encoding is computed once on the whole dataset and split afterwards, so that
# training and test set always share the same columns (a genotype missing from
# one of the two portions would otherwise produce matrices of different widths).
encoded = pd.get_dummies(dataset.astype('category')).values

# The outcome is dummified as well: assuming a binary outcome, it fills the last two
# columns. The first indicator is dropped, leaving a single target as last column.
mask = [True]*encoded.shape[1]
mask[-2] = False
encoded = encoded[:,mask]
train_data, test_data = encoded[:ntrain], encoded[ntrain:]

# Transferring data over GPU for faster computations (CPU is used when no GPU is available)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
train_data = torch.tensor(train_data, dtype = torch.float, device = device)
test_data  = torch.tensor(test_data,  dtype = torch.float, device = device)

# Deep Neural Network construction
dnn = torch.nn.Sequential(torch.nn.Linear(train_data.shape[1]-1, 100),
                          torch.nn.LeakyReLU(0.2),
                          torch.nn.Linear(100, 25),
                          torch.nn.LeakyReLU(0.2),
                          torch.nn.Linear(25, 5),
                          torch.nn.LeakyReLU(0.2),
                          torch.nn.Linear(5, 1),
                          torch.nn.Sigmoid()) # Module class: torch.nn has no lowercase 'sigmoid'

# He initialization of the network model
for layer in dnn:
    if(isinstance(layer, torch.nn.Linear)):
        with torch.no_grad():
            torch.nn.init.kaiming_normal_(layer.weight, mode='fan_out', nonlinearity='leaky_relu', a = 0.2)

# Transferring the DNN model over the same device as the data
dnn.to(device)

badre = DNN(dnn)

# Optimization
train(badre, train_data, test_data, lossfunction, torch.optim.Adam, lr = 1e-3, minibatches = 10, epochs = 200)

auc  = badre.auc(test_data[:,:-1], test_data[:,-1].cpu().numpy())
time = badre.fittingtime()

print("Model trained. Elapsed time: %.2f seconds." % time)
print("AUC (test set): %.2f." % auc)