In [1]:
import array, struct, sys, os, tqdm
import numpy as np

def read_binaries(path_binfiles, max_files=13081):
    """Parse the per-complex binary feature files found in a directory.

    Each file is read as: a native int with the number of decoys, a native int
    with the feature dimensionality, then for every decoy a double label
    followed by `dimension` doubles of features.

    Parameters
    ----------
    path_binfiles : str
        Directory whose entries are all treated as binary feature files.
    max_files : int or None, optional
        Upper bound on the number of files parsed. The default (13081) keeps
        the cut-off that used to be hard-coded in the loop; pass None to read
        every file in the directory.

    Returns
    -------
    dict
        Maps the part of each file name before the first '.' (the pdbcode) to
        a list of ``[label, features]`` pairs, where ``features`` is an
        ``array.array('d')``.

    Raises
    ------
    EOFError
        If a file holds fewer feature values than its header announces
        (raised by ``array.fromfile``).
    struct.error
        If a header or label field is truncated.
    """
    result = {}
    for cnt, binfile in enumerate(tqdm.tqdm(os.listdir(path_binfiles))):
        if max_files is not None and cnt >= max_files:
            break
        pdbcode = binfile.split('.')[0]                     # name of file (pdbcode)
        # `with` guarantees the handle is released even when parsing fails.
        with open(os.path.join(path_binfiles, binfile), 'rb') as F:
            n_decoys = struct.unpack('i', F.read(4))[0]     # number of decoys (=19 for this dataset)
            dimension = struct.unpack('i', F.read(4))[0]    # data dimensionality (23 protein types x 40 ligand types x 7 bins for this dataset)
            res = []
            for _ in range(n_decoys):
                label = struct.unpack('d', F.read(8))[0]    # label (1 for native, -1 for non-native)
                data = array.array('d')
                data.fromfile(F, dimension)                 # feature vector (histograms, can be represented as a 23x40x7 matrix)
                res.append([label, data])
        result[pdbcode] = res
    return result

In [2]:
# Parse the binary feature files of the dataset directory into `result`
# (a dict keyed by pdbcode, as returned by read_binaries).
result = read_binaries(
    '../../../../../basic_experiment/general-no2013_t14_t3_l7.0_g1.0_r1.0'
)

100%|█████████▉| 13076/13089 [01:49<00:00, 119.90it/s]

In [3]:
# Load the affinity table as a list of raw CSV lines without the header row.
with open('../../../../../basic_experiment/affinity_data_refined.csv', 'r') as f:
    next(f, None)  # skip the header line (no error on an empty file)
    # Blank lines are dropped instead of slicing off "the last element", so the
    # final record is kept whether or not the file ends with a newline.
    data = [line.rstrip('\n') for line in f if line.strip()]

In [4]:
# One record per CSV line: column 0 is the complex name, column 1 the measured
# value (kept as a string here), column 3 the measurement type.
datasets = [
    {'name': fields[0], 'value': fields[1], 'type': fields[3]}
    for fields in (line.split(',') for line in data)
]

In [5]:
# Partition the records by measurement type: 'Kd' records go to Kd_values,
# records of any other type go to Ki_values.
Kd_values, Ki_values = [], []
for d in datasets:
    (Kd_values if d['type'] == 'Kd' else Ki_values).append(d)

In [6]:
# For every Kd record build [value, pose_0, pose_1, ...], where each pose is
# the [label, features] pair stored in `result` for that complex.
Kd_data = [[item['value']] + result[item['name']] for item in Kd_values]

In [7]:
# Same layout as Kd_data: [value, pose_0, pose_1, ...] per complex.
# The complex '966c' is skipped explicitly
# (presumably it has no entry in `result` — TODO confirm).
Ki_data = [
    [item['value']] + result[item['name']]
    for item in Ki_values
    if item['name'] != '966c'
]

# Кросс-валидация по Cr

In [8]:
import time
import numpy as np
from math import log, exp
from scipy.linalg import sqrtm, inv, norm
from scipy.optimize import minimize

In [9]:
# Ordered 60/40 split of the Ki complexes: the first 60% become the training
# set, the remainder the test set (no shuffling is applied).
data = Ki_data
split_at = int(len(data) * 0.6)
train, test = data[:split_at], data[split_at:]

100%|█████████▉| 13076/13089 [02:00<00:00, 108.94it/s]

In [10]:
# Feature matrix for the complexes whose affinities are known.
# Each column is one training complex: the feature vector of its first stored
# pose, followed by len(train) extra entries that are all zero except for a -1
# at the complex's own index.
n_train = len(train)
X_nat_train = np.matrix([
    np.append(cx[1][1], np.where(np.arange(n_train) == k, -1.0, 0.0))
    for k, cx in enumerate(train)
]).T

# Column vector of free-energy values, one row per training complex.
s_train = np.matrix([float(cx[0]) for cx in train]).T

In [11]:
# Training matrix over ALL poses: one column per (complex, pose) pair. Every
# pose of complex k gets the same extra block appended — zeros with a -1 at
# position k.
n_train = len(train)
X_train = np.matrix([
    np.append(pose[1], np.where(np.arange(n_train) == k, -1.0, 0.0))
    for k, cx in enumerate(train)
    for pose in cx[1:]
]).T
print(X_train.shape)

# Pose labels in the same (complex, pose) order as the columns of X_train.
y_train = np.matrix([pose[0] for cx in train for pose in cx[1:]]).T

(7685, 23655)


In [12]:
# Test matrix over all poses of the held-out complexes: one column per pose,
# raw features only (no extra per-complex entries are appended here).
X_test = np.matrix([pose[1] for cx in test for pose in cx[1:]]).T

# Pose labels in the same order as the columns of X_test.
y_test = np.matrix([pose[0] for cx in test for pose in cx[1:]]).T

In [None]:
# The Gram matrix and the identity do not depend on Cr, so they are built once
# instead of on every iteration.
XXT = X_nat_train @ X_nat_train.T
I = np.identity(XXT.shape[0])

# Every output file below is written into this directory; create it if needed.
os.makedirs("cross_validation", exist_ok=True)

for Cr in [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]:
    # Change of variables: A = (0.5 * I + Cr * X X^T)^(1/2); the training
    # vectors are mapped through A^{-1}. The offset B = Cr * A^{-1} X s is not
    # computed here because nothing in this cell uses it; the evaluation cells
    # rebuild it from the saved A_inv.
    A = np.real(sqrtm(0.5 * I + Cr * XXT))
    A_inv = inv(A)
    np.savetxt("cross_validation/A_inv_cr_" + str(Cr), A_inv)
    newX_train = (A_inv.T @ X_train).T

    # Write the training set to a file: one line per pose, "+1 "/"-1 " label
    # followed by 1-based "index:value " pairs for every feature.
    # (Presumably consumed by an external LIBLINEAR/LIBSVM-style trainer that
    # produces the ".model" files read later — TODO confirm.)
    rows = np.asarray(newX_train)
    with open("cross_validation/ki_train_cr_" + str(Cr), "w") as f:
        for i in range(rows.shape[0]):
            y_i = "+1 " if y_train[i, 0] == 1 else "-1 "
            features = "".join(
                str(j + 1) + ":" + str(value) + " "
                for j, value in enumerate(rows[i])
            )
            f.write(y_i + features + "\n")

    print(Cr, "done")

# Accuracy

In [93]:
from scipy.stats import spearmanr, pearsonr
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

In [95]:
complexes_number = 831
poses_number = 19

# Feature matrix for the test complexes (those with known affinities):
# one column per complex, built from its first stored pose.
X_test = np.matrix([
    t[1][1]
    for t in test
]).T

# Column vector of free-energy values for the test complexes.
s_test = np.matrix([
    float(t[0])
    for t in test
]).T

# Only the leading rows of the recovered weight vector pair with the feature
# rows of X_test; the training vectors carry extra per-complex entries after
# them. Deriving the count from X_test avoids hard-coding the dimensions.
n_features = X_test.shape[0]
s_true = np.array(s_test.T)[0]

for Cr in [1000]:
    print("Cr =", Cr)

    with open("cross_validation/ki_train_cr_" + str(Cr) + ".model", "r") as f:
        data = f.read().split("\n")

    # Tests (affinities)
    # Lines 6..-1 of the model file are read as one weight per line
    # (assumes a 6-line header and a trailing newline — TODO confirm format).
    newW = np.array(data[6:-1], dtype=float).reshape((-1, 1))
    A_inv = np.loadtxt("cross_validation/A_inv_cr_" + str(Cr))
    # Undo the change of variables applied when the training file was written.
    B = Cr * A_inv @ X_nat_train @ s_train
    w = A_inv @ (newW + B)
    w = w[:n_features]
    np.savetxt("cross_validation/ki_w_cr_" + str(Cr) + ".txt", w.T)
    prediction = w.T @ X_test
    s_pred = np.array(prediction)[0]
    print("Spearman: ", spearmanr(s_true, s_pred))
    print("Pearson: ", pearsonr(s_true, s_pred))
    print("R2: ", r2_score(s_true, s_pred))
    print("MSE: ", mean_squared_error(s_true, s_pred))
    print()

Cr = 1000
Spearman:  SpearmanrResult(correlation=0.61146242811614826, pvalue=2.333700456950958e-86)
Pearson:  (0.59015842434259658, 3.9534057926477655e-79)
R2:  -0.0166404880756
MSE:  4.55814511258



In [26]:
complexes_number = 831
poses_number = 19

# Feature matrix for the test complexes (those with known affinities):
# one column per complex, built from its first stored pose.
X_test = np.matrix([
    t[1][1]
    for t in test
]).T

# Column vector of free-energy values for the test complexes.
s_test = np.matrix([
    float(t[0])
    for t in test
]).T

# NOTE(review): a "<...>.model" file must already exist for every value
# listed here; this list differs from the Cr grid used when the training
# files are written — confirm which models are available.
Crs = [0.001, 0.5, 1, 10, 50, 100, 500, 1000, 10000]
pearsons = []
Rs = []
MSEs = []

# Only the leading rows of the recovered weight vector pair with the feature
# rows of X_test. Deriving the count from X_test (instead of hard-coding 6440)
# works both with and without the extra per-complex entries that the training
# vectors carry.
n_features = X_test.shape[0]
s_true = np.array(s_test.T)[0]

# TODO: the per-pose accuracy check that used to sit here (commented out) was
# dropped; it relied on `newX_test`, which is not defined in the visible cells.

for Cr in Crs:
    print("Cr =", Cr)

    with open("cross_validation/ki_train_cr_" + str(Cr) + ".model", "r") as f:
        data = f.read().split("\n")

    # Tests (affinities)
    # Lines 6..-1 of the model file are read as one weight per line
    # (assumes a 6-line header and a trailing newline — TODO confirm format).
    newW = np.array(data[6:-1], dtype=float).reshape((-1, 1))
    A_inv = np.loadtxt("cross_validation/A_inv_cr_" + str(Cr))
    # Undo the change of variables applied when the training file was written.
    B = Cr * A_inv @ X_nat_train @ s_train
    w = A_inv @ (newW + B)
    w = w[:n_features]
    np.savetxt("cross_validation/ki_w_cr_" + str(Cr) + ".txt", w.T)
    prediction = w.T @ X_test
    s_pred = np.array(prediction)[0]

    # Each metric is computed once and reused for both the report and the
    # per-Cr series plotted afterwards.
    pearson = pearsonr(s_true, s_pred)
    r2 = r2_score(s_true, s_pred)
    mse = mean_squared_error(s_true, s_pred)

    print("Spearman: ", spearmanr(s_true, s_pred))
    print("Pearson: ", pearson)
    pearsons.append(pearson[0])
    print("R2: ", r2)
    Rs.append(r2)
    print("MSE: ", mse)
    MSEs.append(mse)
    print()

Cr = 0.001
Spearman:  SpearmanrResult(correlation=0.30471902323731476, pvalue=2.5758836373209146e-19)
Pearson:  (0.35866623651020552, 1.2473272021408322e-26)
R2:  -11.0861992504
MSE:  54.1889199662

Cr = 0.5
Spearman:  SpearmanrResult(correlation=0.6076840180397568, pvalue=4.90090190471438e-85)
Pearson:  (0.59180224546894356, 1.1436594187101856e-79)
R2:  -0.552718184241
MSE:  6.96166922892

Cr = 1
Spearman:  SpearmanrResult(correlation=0.63102411423000182, pvalue=1.6872998537515098e-93)
Pearson:  (0.61666435557832411, 3.29722571976743e-88)
R2:  -0.32235431452
MSE:  5.92882432534

Cr = 10
Spearman:  SpearmanrResult(correlation=0.67918124836364113, pvalue=1.8094526009438593e-113)
Pearson:  (0.67283646653283347, 1.274999890647994e-110)
R2:  0.157731149343
MSE:  3.776343447

Cr = 50
Spearman:  SpearmanrResult(correlation=0.69020876714633739, pvalue=1.3564550163312283e-118)
Pearson:  (0.6833987058058153, 2.1099808037095246e-115)
R2:  0.330855081718
MSE:  3.00013591299

Cr = 100
Spearman:  S

In [43]:
import matplotlib.pyplot as plt
%matplotlib auto

Using matplotlib backend: MacOSX


In [55]:
# Pearson correlation on the test set as a function of ln(Cr).
# An explicit figure is created so that, with an interactive backend, the
# curve is not drawn onto a figure that is still current from another cell.
fig, ax = plt.subplots()
ax.plot(np.log(np.array(Crs)), pearsons, label="Pearson")
ax.set_xlabel(r"$\ln({C_r})$")
ax.set_ylabel(r"Pearson's correlation")
plt.show()

In [56]:
# R^2 on the test set as a function of ln(Cr).
# An explicit figure is created so that, with an interactive backend, the
# curve is not drawn onto a figure that is still current from another cell.
fig, ax = plt.subplots()
ax.plot(np.log(np.array(Crs)), Rs, label="R^2")
ax.set_xlabel(r"$\ln({C_r})$")
ax.set_ylabel(r"$R^2$")
plt.show()

In [58]:
# Mean squared error on the test set as a function of ln(Cr).
# An explicit figure is created so that, with an interactive backend, the
# curve is not drawn onto a figure that is still current from another cell.
fig, ax = plt.subplots()
ax.plot(np.log(np.array(Crs)), MSEs, label="MSE")
ax.set_xlabel(r"$\ln({C_r})$")
ax.set_ylabel(r"$MSE$")
plt.show()