In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
from lib import DihedralAdherence
from lib import PDBMineQuery
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from pathlib import Path
PDBMINE_URL = os.getenv("PDBMINE_URL")
PROJECT_DIR = 'tests'

In [5]:
proteins = ['T1024', 'T1096', 'T1027', 'T1082', 'T1091', 'T1058', 'T1049', 'T1030', 'T1056', 'T1038', 'T1025', 'T1028']

da = DihedralAdherence(proteins[6], [4,5,6,7], PDBMINE_URL, PROJECT_DIR)
da.load_results()



Initializing T1049 ...
Results already exist
Casp ID: T1049 	PDB: 6y4f
Structure exists: 'pdb/pdb6y4f.ent' 


In [15]:
import torch
from torch.utils.data import DataLoader, Dataset, ConcatDataset
from sklearn.model_selection import train_test_split

In [16]:
class ProteinDataset(Dataset):
    def __init__(self, id, path):
        self.id = id
        self.path = path

        self.X, self.y, self.xres, self.af = torch.load(self.path / f'{id}.pt')
    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, i):
        return self.X[i], self.xres[i], self.af[i], self.y[i]

In [17]:
lengths = [4096, 512, 256, 256]
path = Path('ml_samples/'+'-'.join([str(l) for l in lengths]))
samples = [f.stem for f in path.iterdir()]

from lib.retrieve_data import retrieve_target_list
ids = ['T1024', 'T1096', 'T1027', 'T1082', 'T1091', 'T1058', 'T1049', 'T1030', 'T1056', 'T1038', 'T1025', 'T1028']
targetlist = retrieve_target_list()
skip = [targetlist.loc[id, 'pdb_code'].upper() for id in ids]
samples = sorted(list(set(samples) - set(skip)))

train, test = train_test_split(samples, test_size=0.35, random_state=42)
torch.save((train, test), 'ml_data/split.pt')
# train, test = to ch.load('ml_data/split.pt')
train_dataset = ConcatDataset([ProteinDataset(s, path) for s in train])
test_dataset = ConcatDataset([ProteinDataset(s, path) for s in test])
trainloader = DataLoader(train_dataset, batch_size=512, shuffle=True)
testloader = DataLoader(test_dataset, batch_size=512, shuffle=False)
len(train_dataset), len(test_dataset), len(train_dataset) + len(test_dataset)

(47572, 25179, 72751)

In [104]:
torch.inverse(torch.eye(2))

tensor([[1., 0.],
        [0., 1.]])

In [143]:
x = torch.tensor([1.0,2.0])
# x2 = torch.tensor([1,2]).unsqueeze(0).T
x @ torch.eye(2) @ x.T

# torch.tensor([1,2]).pow(2)

tensor(5.)

In [140]:
x @ (torch.eye(2) * 0.1) @ x

tensor(0.5000)

In [142]:
x.pow(2).sum() * 0.1

tensor(0.5000)

In [166]:
weights.shape

(5120,)

In [190]:
X,xres,af,y = train_dataset[0]

from scipy.stats import gaussian_kde
kdews = [1,1,1,1]
weights = np.concatenate([np.array([w]*l) for w,l in zip(kdews, lengths)])
# weights = weights / weights.sum()
# X = X.numpy()
# X
# plt.plot(weights)

# Kernel Density Estimation
h = 0.5
h_det = torch.det(torch.eye(4) * h)
def K(x):
    # 2 dimensional standard normal distribution
    return torch.exp(-0.5 * x.pow(2).sum() * h) / (2 * np.pi * h_det)
def kde(xi):
    if xi.ndim == 1:
        print(xi.shape)
        xi = xi.unsqueeze(1)
        print(xi.shape)
    likelihood = 0
    for w,x in zip(weights,X.T):
        likelihood += w * K(x - xi)
    return likelihood / weights.sum()
kde(torch.tensor([-1.81818182,  1.81818182]))

# Find most likely dihedral angles
phi_grid, psi_grid = np.meshgrid(np.linspace(-180, 180, 100), np.linspace(-180, 180, 100))
grid = np.vstack([phi_grid.ravel(), psi_grid.ravel()])
# probs = kde(grid).reshape(phi_grid.shape)
# kdepeak = grid[:,probs.argmax()]
# kdepeak, probs.max()

torch.Size([2])
torch.Size([2, 1])
(2, 10000)


In [184]:
kde = gaussian_kde(X, weights=weights)
phi_grid, psi_grid = np.meshgrid(np.linspace(-180, 180, 100), np.linspace(-180, 180, 100))
grid = np.vstack([phi_grid.ravel(), psi_grid.ravel()])
probs = kde(grid).reshape(phi_grid.shape)
kdepeak = grid[:,probs.argmax()]
kdepeak, probs.max()

(array([-1.81818182,  1.81818182]), 9.060125326719408e-05)