# Results


In [20]:
import os

import numpy as np
import scipy
import torch
from torch.utils.data import DataLoader
from torchinfo import summary
from codes.pt_data import ProteinLigand_3DDataset
from codes.raw_data import RawDataset

## Setup

In [21]:
# model
model_path= f"{os.getenv('luh_ALL_CCFRSCRATCH')}/proli/models"
model_name= "rotations_2.7341_0.pth"

# data
input_dir= f"{os.path.join(os.getenv('luh_ALL_CCFRWORK'))}/deep_learning/pafnucy/data/CoG_12" 
grid_spacing= 1.0 # distance between grid points in angstrom
max_dist= 12 # max distance from complex center
batch_size= 50

# normalisation (used during training)
partialcharge= {"m": -0.1401471346616745, "std": 0.4216829240322113}

## Data

In [22]:
def get_data(name, input_dir, max_dist, partialcharge, grid_spacing, batch_size):
    raw_data = RawDataset(input_dir, name, max_dist)
    raw_data.load_data()
    raw_data.set_normalization_params(partialcharge["m"], partialcharge["std"])
    raw_data.charge_normalization()
    print(raw_data)

    dataset = ProteinLigand_3DDataset(raw_data, grid_spacing=grid_spacing, rotations=None)
    no_of_samples = len(dataset)
    batch_size = min(no_of_samples, batch_size)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True,
                                 persistent_workers=True)    
    return dataloader, no_of_samples


train_dl, train_samples= get_data('training', input_dir, max_dist, partialcharge, grid_spacing, batch_size)
val_dl, val_samples= get_data('validation', input_dir, max_dist, partialcharge, grid_spacing, batch_size)
test_dl, test_samples= get_data('test', input_dir, max_dist, partialcharge, grid_spacing, batch_size)


training dataset with 13800 samples
	Partial charge normalization: m= -0.1401471346616745                     std= 0.4216829240322113

validation dataset with 3479 samples
	Partial charge normalization: m= -0.1401471346616745                     std= 0.4216829240322113

test dataset with 285 samples
	Partial charge normalization: m= -0.1401471346616745                     std= 0.4216829240322113



## Model

> *remove "summary()" and "import torchinfo" if you don't want to install the package

In [23]:
model_fullname = os.path.join(model_path, model_name)
model = torch.load(model_fullname)

summary(model, input_size=(10, 19, 25, 25, 25))

Layer (type:depth-idx)                   Output Shape              Param #
Pafnucy                                  --                        --
├─Sequential: 1-1                        [10, 256, 4, 4, 4]        --
│    └─Conv3d: 2-1                       [10, 64, 25, 25, 25]      152,064
│    └─ReLU: 2-2                         [10, 64, 25, 25, 25]      --
│    └─MaxPool3d: 2-3                    [10, 64, 13, 13, 13]      --
│    └─Conv3d: 2-4                       [10, 128, 13, 13, 13]     1,024,128
│    └─ReLU: 2-5                         [10, 128, 13, 13, 13]     --
│    └─MaxPool3d: 2-6                    [10, 128, 7, 7, 7]        --
│    └─Conv3d: 2-7                       [10, 256, 7, 7, 7]        4,096,256
│    └─ReLU: 2-8                         [10, 256, 7, 7, 7]        --
│    └─MaxPool3d: 2-9                    [10, 256, 4, 4, 4]        --
├─Sequential: 1-2                        [10, 1]                   --
│    └─Linear: 2-10                      [10, 1000]               

## Run model on dataset

In [24]:
def run(model, dataloader, no_of_samples):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.eval() # model already on gpu
    affinities = np.empty(0, dtype=np.float32)
    predictions = []

    for (inputs, labels) in dataloader:
        inputs = inputs.to(device)

        with torch.set_grad_enabled(False):
            preds = model(inputs)

        affinities = np.append(affinities, labels.numpy())
        predictions = np.append(predictions, preds.cpu().detach().numpy())

    print(f"Computed preds on {len(predictions)}/{len(affinities)} samples! (expected: {no_of_samples})")
    return affinities, predictions

In [26]:
affinities, predictions = {}, {}

affinities["test"], predictions["test"] = run(model, test_dl, test_samples)
affinities["val"], predictions["val"] = run(model, val_dl, val_samples)
affinities["train"], predictions["train"] = run(model, train_dl, train_samples)

Computed preds on 285/285 samples! (expected: 285)
Computed preds on 3479/3479 samples! (expected: 3479)
Computed preds on 13800/13800 samples! (expected: 13800)


## Analysis


In [34]:
def analyse(affinities, predictions, name):
    rmse = ((predictions - affinities) ** 2).mean() ** 0.5
    mae = (np.abs(predictions - affinities)).mean()
    #corr = scipy.stats.pearsonr(predictions, affinities)
    # lr = LinearRegression()
    # lr.fit(predictions, affinities)
    # y_ = lr.predict(predictions)
    # sd = (((affinities - y_) ** 2).sum() / (len(affinities) - 1)) ** 0.5
    
    print(f"""
    Analysis of {name} data:
        rmse= {rmse}
        mae= {mae}
        corr= {corr}
    """)

In [35]:
for name in ["train", "val", "test"]:
    analyse(affinities[name], predictions[name], name)


    Analysis of train data:
        rmse= 1.5931353063688671
        mae= 1.2697432543855647
        corr= corr
    

    Analysis of val data:
        rmse= 1.6535175507352018
        mae= 1.3303013406426236
        corr= corr
    

    Analysis of test data:
        rmse= 1.789591643521702
        mae= 1.4467452777059455
        corr= corr
    


In [31]:
print(scipy.__version__)

1.7.1
