In [None]:
import os
import numpy as np
import torch as pt
import pandas as pd
import blosum as bl
import matplotlib.pyplot as plt
from glob import glob
from tqdm import tqdm
from matplotlib import rcParams

import src as sp
from theme import colors

# font parameters, applied to every figure drawn below:
# Arial as the sans-serif face, 12 pt base size
rcParams.update({
    'font.family': 'sans-serif',
    'font.sans-serif': ['Arial'],
    'font.size': 12,
})

In [None]:
def max_pred_to_seq(p):
    """Turn a score tensor into a sequence string via a per-row argmax.

    The index of the largest entry along ``dim=1`` of ``p`` is looked up in
    the first 20 entries of ``sp.std_resnames``; every selected name is then
    translated with ``sp.res3to1`` and the results are concatenated.

    Args:
        p: torch tensor indexed as (position, class).
           Presumably shaped (n_residues, 20) -- TODO confirm against the
           files loaded from ``results/``.

    Returns:
        str: the concatenated ``sp.res3to1`` codes, one per row of ``p``.
    """
    # argmax is moved to host memory because it indexes a numpy array next
    best_class = pt.argmax(p, dim=1).cpu().numpy()
    picked_names = sp.std_resnames[:20][best_class]
    return ''.join(sp.res3to1[resname] for resname in picked_names)

def sequence_identity(seq_ref, seq):
    """Fraction of positions at which two equal-length sequences agree.

    Args:
        seq_ref: reference sequence (any sized iterable of symbols, e.g. str).
        seq: sequence compared position-by-position against ``seq_ref``.

    Returns:
        numpy float in [0, 1]: mean of the element-wise equality.
        Two empty sequences yield ``nan`` (mean of an empty array).

    Raises:
        ValueError: if the two sequences differ in length. Without this check
            a length-1 sequence would silently broadcast against the other
            one, and other mismatches give a version-dependent NumPy result
            (scalar ``False`` -> 0.0 on older releases, an error on newer).
    """
    if len(seq_ref) != len(seq):
        raise ValueError(
            "sequences must have the same length, got {} and {}".format(
                len(seq_ref), len(seq)
            )
        )
    return np.mean(np.array(list(seq_ref)) == np.array(list(seq)))

_BLOSUM62 = None  # BLOSUM62 matrix, built on first use and shared by all calls


def sequence_similarity(seq_ref, seq):
    """Fraction of positions whose residue pair has a positive BLOSUM62 score.

    Args:
        seq_ref: reference sequence of one-letter residue codes.
        seq: sequence compared position-by-position against ``seq_ref``.

    Returns:
        numpy float in [0, 1]: share of aligned pairs ``(seq_ref[i], seq[i])``
        for which ``BLOSUM62[seq_ref[i]][seq[i]] > 0``.
        Two empty sequences yield ``nan`` (mean of an empty array).

    Raises:
        ValueError: if the two sequences differ in length; ``zip`` would
            otherwise truncate silently to the shorter one.
    """
    global _BLOSUM62

    if len(seq_ref) != len(seq):
        raise ValueError(
            "sequences must have the same length, got {} and {}".format(
                len(seq_ref), len(seq)
            )
        )

    # the matrix does not depend on the arguments: build it once instead of
    # on every call (this function runs once per compared sequence)
    if _BLOSUM62 is None:
        _BLOSUM62 = bl.BLOSUM(62)
    blm = _BLOSUM62

    return np.mean(np.array([blm[si][sj] for si, sj in zip(seq_ref, seq)]) > 0)

def max_seq_recovery(p, y):
    """Recovery rate and option count when every class with p >= 0.5 is kept.

    Args:
        p: torch tensor indexed as (position, class) holding predicted scores.
        y: torch tensor with one row per position; the argmax along ``dim=1``
           gives the reference class of that position.

    Returns:
        tuple ``(msr, mpr)`` of 0-dim float tensors:
        ``msr`` is the fraction of positions whose reference class scores at
        least 0.5 in ``p``; ``mpr`` is the mean number of classes per
        position scoring at least 0.5.
    """
    # 1.0 wherever a class passes the 0.5 threshold, 0.0 elsewhere
    accepted = (p >= 0.5).float()

    positions = pt.arange(y.shape[0])
    ref_class = pt.argmax(y, dim=1)

    msr = pt.mean(accepted[positions, ref_class])
    mpr = pt.mean(pt.sum(accepted, dim=1))
    return msr, mpr

In [None]:
# reload precomputed results
# NOTE(review): pt.load unpickles the file content -- only point this at
# result files produced by a trusted run.
data = {}
# glob yields paths in arbitrary (filesystem-dependent) order; sorting makes
# the row order of every table derived from `data` reproducible
for fp in sorted(glob("results/*.pt")):
    # key = file name up to its first '.'
    key = os.path.basename(fp).split('.')[0]
    Z = pt.load(fp)
    # stored as (all entries after the first, first entry); the analysis
    # below treats the first entry as the reference and the rest as the
    # stack of predictions
    data[key] = (Z[1:], Z[0])

In [None]:
# analyse all predictions
results = []
for key, (P, y) in data.items():
    # reference sequence and one predicted sequence per entry of P
    seq_ref = max_pred_to_seq(y)
    seqs = [max_pred_to_seq(p) for p in P]

    # identity / similarity of every predicted sequence to the reference
    si = np.array([sequence_identity(seq_ref, s) for s in seqs])
    ss = np.array([sequence_similarity(seq_ref, s) for s in seqs])

    # different ways of collapsing the stack P (first axis) into a single
    # prediction; each one is built once and reused for both score families
    pooled = {
        'mean': pt.mean(P, dim=0),
        'median': pt.median(P, dim=0)[0],
        'max': pt.max(P, dim=0)[0],
        'min': pt.min(P, dim=0)[0],
        # the single entry of P whose per-position maxima are highest on average
        'argmax': P[pt.argmax(pt.mean(pt.max(P, dim=2)[0], dim=1))],
        # geometric mean over the first axis
        'gmean': pt.exp(pt.mean(pt.log(P), dim=0)),
    }

    # sequence sampling prediction
    seq_sampling = {name: max_pred_to_seq(pred) for name, pred in pooled.items()}

    # maximum sequence recovery, per entry of P and per pooling method
    # ('max' and 'min' are not evaluated here)
    msr = pt.tensor([max_seq_recovery(p, y) for p in P]).numpy()
    msr_sampling = {
        name: pt.tensor(max_seq_recovery(pooled[name], y)).numpy()
        for name in ('mean', 'median', 'argmax', 'gmean')
    }
    msr_avg = np.mean(msr, axis=0)

    # key is expected to look like "<pdbid>:<mdid>"
    key_parts = key.split(':')
    row = {
        'pdbid': key_parts[0],
        'mdid': key_parts[1],
        'ref_si': si[0],
        'ref_ss': ss[0],
        'mean_si': np.mean(si),
        'mean_ss': np.mean(ss),
        'max_si': np.max(si),
        'max_ss': np.max(ss),
        'ref_msr': msr[0, 0],
        'ref_sm': msr[0, 1],
        'mean_msr': msr_avg[0],
        'mean_sm': msr_avg[1],
    }

    for name, pooled_seq in seq_sampling.items():
        row['{}_p_si'.format(name)] = sequence_identity(seq_ref, pooled_seq)
        row['{}_p_ss'.format(name)] = sequence_similarity(seq_ref, pooled_seq)

    for name, scores in msr_sampling.items():
        row['{}_p_msr'.format(name)] = scores[0]
        row['{}_p_sm'.format(name)] = scores[1]

    # store results
    results.append(row)

# pack results
df = pd.DataFrame(results)

In [None]:
# per-entry results, rounded to three decimals for display
df.round(decimals=3)

In [None]:
# column means over all entries; `pdbid` and `mdid` hold strings, so restrict
# the reduction to numeric columns (pandas >= 2.0 raises a TypeError otherwise)
df.mean(numeric_only=True).round(3)

In [None]:
# sequence identity in percent: 'ref_si' column against the identity of the
# sequence decoded from the mean over all predictions ('mean_p_si')
x = 1e2*df['ref_si'].values
y = 1e2*df['mean_p_si'].values

fig, ax = plt.subplots(figsize=(3, 3))
ax.plot(x, y, '.', ms=4.0, color=colors[0])
# diagonal: points above it have a higher 'mean_p_si' than 'ref_si'
ax.plot([0, 100], [0, 100], 'k-', alpha=0.5)
ax.set_xlim(10, 90)
ax.set_ylim(10, 90)
ax.set_xlabel('reference recovery rate [%]')
ax.set_ylabel('MD recovery rate [%]')
fig.tight_layout()
fig.savefig("graphs/ref_v_md_recovery_rate.svg")
plt.show()

In [None]:
# columns: [average number of options, maximum sequence recovery]
V0 = df[['ref_sm', 'ref_msr']].values
V1 = df[['mean_p_sm', 'mean_p_msr']].values

fig, ax = plt.subplots(figsize=(4, 3))
# recovery is stored as a fraction; scale to percent for the y axis
ax.plot(V0[:, 0], 1e2*V0[:, 1], '.', ms=6.0, color=colors[1], label="reference")
ax.plot(V1[:, 0], 1e2*V1[:, 1], '.', ms=6.0, color=colors[0], label="average prediction")
ax.set_xlim(1.5, 6.5)
ax.set_ylim(65, 100)
ax.legend(loc='upper right', prop={'size': 10}, labelspacing=0.1)
ax.set_xlabel('average number of options')
ax.set_ylabel('maximum sequence recovery [%]')
fig.tight_layout()
fig.savefig("graphs/max_recovery_md.svg")
plt.show()

In [None]:
# columns: [average number of options, sequence identity]
V0 = df[['ref_sm', 'ref_si']].values
V1 = df[['mean_p_sm', 'mean_p_si']].values

fig, ax = plt.subplots(figsize=(4, 2.8))
# identity is stored as a fraction; scale to percent for the y axis
ax.plot(V0[:, 0], 1e2*V0[:, 1], '.', ms=6.0, color=colors[1], label="initial conformation")
ax.plot(V1[:, 0], 1e2*V1[:, 1], '.', ms=6.0, color=colors[0], label="average prediction")
# no explicit x-limits here: the x range is left to matplotlib's autoscaling
ax.set_ylim(20, 85)
ax.legend(loc='upper right', prop={'size': 10}, labelspacing=0.1)
ax.set_xlabel('average number of options')
ax.set_ylabel('sequence recovery [%]')
fig.tight_layout()
fig.savefig("graphs/recovery_md.svg")
plt.show()

In [None]:
def _bound_unbound_pairs(table, bound_id, unbound_id, column='ref_si'):
    """Pair the `column` values of two `mdid` groups by `pdbid`, in percent.

    Only `pdbid`s present in BOTH groups are returned (inner join), so the two
    arrays always have the same length and stay aligned entry by entry even
    when some runs are missing from `table`.

    Args:
        table: DataFrame with 'pdbid', 'mdid' and `column` columns.
        bound_id: `mdid` value whose entries go on the x axis.
        unbound_id: `mdid` value whose entries go on the y axis.
        column: name of the column to extract (stored as a fraction).

    Returns:
        tuple of two 1-D numpy arrays (bound, unbound), scaled by 100.
    """
    bound = table.loc[table['mdid'] == bound_id, ['pdbid', column]]
    unbound = table.loc[table['mdid'] == unbound_id, ['pdbid', column]]
    paired = bound.merge(unbound, on='pdbid', how='inner', suffixes=('_bound', '_unbound'))
    return (
        1e2*paired[column + '_bound'].to_numpy(),
        1e2*paired[column + '_unbound'].to_numpy(),
    )


# 'bR'/'uR' and 'bL'/'uL' runs, matched per pdbid
# (labelled bound / unbound on the axes below)
xR, yR = _bound_unbound_pairs(df, 'bR', 'uR')
xL, yL = _bound_unbound_pairs(df, 'bL', 'uL')
x = np.concatenate([xR, xL])
y = np.concatenate([yR, yL])

plt.figure(figsize=(3,3))
plt.plot(x,y,'.')
# diagonal: equal recovery for the bound and the unbound run
plt.plot([0,100],[0,100], 'k-', alpha=0.5)
plt.xlim(10, 90)
plt.ylim(10, 90)
plt.xlabel('bound sequence recovery [%]')
plt.ylabel('unbound sequence recovery [%]')
plt.show()