### Считываем данные (refined dataset)

In [1]:
import array, struct, sys, os, tqdm
import numpy as np

def read_binaries(path_binfiles, max_files=13081):
    """Read the binary decoy files of a directory into a dict keyed by PDB code.

    Each file starts with two native-format ``int`` values (number of decoys,
    feature dimensionality) and then holds, for every decoy, one ``double``
    label followed by ``dimension`` ``double`` feature values.

    Parameters
    ----------
    path_binfiles : str
        Directory that contains the binary files.
    max_files : int or None, optional
        Maximum number of directory entries to read; ``None`` reads all of
        them. The default (13081) reproduces the cap that used to be
        hard-coded in the loop.

    Returns
    -------
    dict
        Maps the file name up to its first ``.`` to a list of
        ``[label, features]`` pairs, where ``features`` is an
        ``array.array('d')`` of length ``dimension``.

    Raises
    ------
    struct.error, EOFError
        If a file is shorter than its header announces.
    """
    entries = os.listdir(path_binfiles)
    if max_files is not None:
        entries = entries[:max_files]

    result = {}
    for binfile in tqdm.tqdm(entries):
        pdbcode = binfile.split('.')[0]                     # name of file (pdbcode)
        # The context manager closes the file even when parsing fails midway.
        with open(os.path.join(path_binfiles, binfile), 'rb') as F:
            n_decoys = struct.unpack('i', F.read(4))[0]     # number of decoys (=19 for this dataset)
            dimension = struct.unpack('i', F.read(4))[0]    # data dimensionality (23 protein types x 40 ligand types x 7 bins for this dataset)
            res = []
            for _ in range(n_decoys):
                label = struct.unpack('d', F.read(8))[0]    # label (1 for native, -1 for non-native)
                data = array.array('d')
                data.fromfile(F, dimension)                 # feature vector (histograms, can be represented as a 23x40x7 matrix)
                res.append([label, data])
        result[pdbcode] = res
    return result

In [4]:
# Load the decoy histograms of every complex; the returned dict is keyed by PDB code.
result = read_binaries(
    path_binfiles='../../../../../basic_experiment/general-no2013_t14_t3_l7.0_g1.0_r1.0',
)

100%|█████████▉| 13067/13089 [01:46<00:00, 122.67it/s]

In [5]:
# Read the affinity table as raw CSV rows, skipping the header line.
# splitlines() plus the blank-line filter keeps the last record even when the
# file does not end with a newline (slicing with [1:-1] silently dropped it).
with open('../../../../../basic_experiment/affinity_data_refined.csv', 'r') as f:
    data = [line for line in f.read().splitlines()[1:] if line.strip()]

In [6]:
# One record per CSV row: column 0 -> 'name', column 1 -> 'value', column 3 -> 'type'.
# Each row is split once and the three fields are picked from the result.
datasets = [
    {'name': fields[0], 'value': fields[1], 'type': fields[3]}
    for fields in (row.split(',') for row in data)
]

In [7]:
# Partition the records by measurement type: rows typed 'Kd' go to Kd_values,
# rows of every other type go to Ki_values.
Kd_values, Ki_values = [], []
for record in datasets:
    bucket = Kd_values if record['type'] == 'Kd' else Ki_values
    bucket.append(record)

In [8]:
# Each entry is [affinity value, pose_1, pose_2, ...] for one Kd-annotated complex,
# where every pose is the [label, features] pair produced by read_binaries.
Kd_data = [[entry['value']] + result[entry['name']] for entry in Kd_values]

In [9]:
# Same layout as Kd_data, built from the Ki records.
# NOTE(review): complex '966c' is skipped — presumably it has no entry in `result`; confirm.
Ki_data = [
    [entry['value']] + result[entry['name']]
    for entry in Ki_values
    if entry['name'] != '966c'
]

### 1 этап. Предсказание свободной энергии.
При обучении для каждого комплекса рассматриваем только его нативную позу (так как значения свободной энергии известны только для нативных поз).

Берем все нативные позы со значениями Ki (Ki_data) из refined dataset.

Предсказываем значение Ki.

### Работа с данными:

    1) Разделение данных на test и train и выделение аффинных данных (X_nat_train)
    
    2) Замена переменных
    
    3) Запись в файл новой матрицы признаков newX (для работы в liblinear)

In [10]:
import time
import numpy as np
from math import log, exp
from scipy.linalg import sqrtm, inv, norm
from scipy.optimize import minimize

In [11]:
start_time = time.time()
data = Ki_data

# Ordered 60/40 split: the first 60% of the complexes train, the rest test.
n_train = int(len(data) * 0.6)
train, test = data[:n_train], data[n_train:]

# Feature matrix of the poses whose affinities are known: one column per
# complex, taken from its first stored pose.
# NOTE(review): assumes the first pose of every complex is the native one — confirm.
X_nat_train = np.matrix([entry[1][1] for entry in train]).T

# Column of free-energy values (element 0 of every entry, converted to float).
s_train = np.matrix([float(entry[0]) for entry in train]).T
print("--- %s seconds ---" % (time.time() - start_time))

--- 1.617832899093628 seconds ---


In [12]:
# Sanity check: features x complexes, then complexes x 1.
print(X_nat_train.shape, s_train.shape, sep="\n")

(6440, 1245)
(1245, 1)


In [13]:
start_time = time.time()

# Every pose of every training complex (entry[1:] skips the leading affinity
# value): features become the columns of X_train, labels the rows of y_train.
X_train = np.matrix([pose[1] for entry in train for pose in entry[1:]]).T
y_train = np.matrix([pose[0] for entry in train for pose in entry[1:]]).T

print("--- %s seconds ---" % (time.time() - start_time))

100%|█████████▉| 13067/13089 [02:19<00:00, 93.49it/s] 

--- 19.805788040161133 seconds ---


In [14]:
# Sanity check: features x poses, then poses x 1.
print(X_train.shape, y_train.shape, sep="\n")

(6440, 23655)
(23655, 1)


In [15]:
# Change of variables
start_time = time.time()
Cr = 100  # regularisation coefficient
XXT = X_nat_train @ X_nat_train.T
I = np.eye(XXT.shape[0])
# A is the real part of the matrix square root of (0.5 * I + Cr * X X^T).
A = np.real(sqrtm(I * 0.5 + XXT * Cr))
print("--- %s seconds ---" % (time.time() - start_time))

--- 214.7095491886139 seconds ---


In [16]:
start_time = time.time()
A_inv = inv(A)
# B = Cr * A^{-1} X s. The product is evaluated right to left so that only
# matrix-vector products are formed; the unparenthesised chain evaluated
# (Cr * A_inv) @ X_nat_train first, a full matrix-matrix product.
B = Cr * (A_inv @ (X_nat_train @ s_train))
print("--- %s seconds ---" % (time.time() - start_time))

--- 13.45524001121521 seconds ---


In [17]:
# Sanity check of the change-of-variables operands.
print(A.shape, B.shape, sep="\n")

(6440, 6440)
(6440, 1)


In [18]:
# Features in the new variables, (A^{-1})^T X, transposed so that every row is one pose.
newX = np.transpose(A_inv.T @ X_train)
print(newX.shape)

(23655, 6440)


In [19]:
# Dump the transformed training set as text, one pose per line:
#   "<+1|-1> 1:<v1> 2:<v2> ... <n>:<vn> \n"
# Every line is assembled in memory and written with a single call; the
# previous version issued one write() and one matrix element lookup per
# feature. The file content is byte-for-byte the same.
with open("ki_train", "w") as f:
    for i in tqdm.tqdm(range(newX.shape[0])):
        label = "+1 " if y_train[i, 0] == 1 else "-1 "
        # Flatten the 1 x n matrix row into a plain 1-D array for fast iteration.
        row = np.asarray(newX[i]).ravel()
        features = "".join(
            str(j) + ":" + str(value) + " "
            for j, value in enumerate(row, start=1)
        )
        f.write(label + features + "\n")


  0%|          | 0/23655 [00:00<?, ?it/s][A
  0%|          | 1/23655 [00:00<44:18,  8.90it/s][A
  0%|          | 2/23655 [00:00<1:00:15,  6.54it/s][A
  0%|          | 3/23655 [00:00<57:45,  6.83it/s]  [A
  0%|          | 6/23655 [00:00<38:00, 10.37it/s][A
  0%|          | 9/23655 [00:00<31:28, 12.52it/s][A
  0%|          | 11/23655 [00:00<29:37, 13.30it/s][A
  0%|          | 14/23655 [00:00<26:38, 14.79it/s][A
  0%|          | 16/23655 [00:01<25:55, 15.20it/s][A
  0%|          | 18/23655 [00:01<27:19, 14.42it/s][A
  0%|          | 20/23655 [00:01<26:38, 14.79it/s][A
  0%|          | 23/23655 [00:01<25:12, 15.63it/s][A
  0%|          | 26/23655 [00:01<24:06, 16.33it/s][A
  0%|          | 29/23655 [00:01<22:59, 17.13it/s][A
  0%|          | 32/23655 [00:01<22:04, 17.84it/s][A
  0%|          | 35/23655 [00:01<21:27, 18.35it/s][A
  0%|          | 39/23655 [00:02<20:34, 19.13it/s][A
  0%|          | 43/23655 [00:02<19:55, 19.76it/s][A
  0%|          | 46/23655 [00:02<19:3

In [21]:
# Per-pose constant term: element-wise product of the label y_i with (A^{-1} B)^T x_i.
constant = np.multiply(y_train, np.transpose((A_inv @ B).T @ X_train))
print(constant.shape)

(23655, 1)


In [23]:
# Index the scalar explicitly: constant[0] is a 1 x 1 matrix, and converting
# an array with ndim > 0 to a Python float is deprecated since NumPy 1.25.
float(constant[0, 0])

8.619652833637915

In [24]:
# One constant per line, in the same pose order as the rows of "ki_train".
# constant[i, 0] picks the scalar directly; float(constant[i]) converted a
# 1 x 1 matrix, which is deprecated since NumPy 1.25.
with open("ki_train_constant", "w") as f:
    for i in tqdm.tqdm(range(constant.shape[0])):
        f.write(str(float(constant[i, 0])) + "\n")

100%|██████████| 23655/23655 [00:00<00:00, 69543.85it/s]


### Строим модель в liblinear, достаем вектор $w$ из файла .model и тестируем модель.

In [25]:
from scipy.stats import spearmanr, pearsonr
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

In [26]:
# Raw lines of the model file; split("\n") keeps the trailing empty string
# that follows the final newline.
with open("../ki_train.model", "r") as model_file:
    data = model_file.read().split("\n")

In [27]:
# The weights are the lines that follow the "w" marker of the model file.
# Locating the marker and reshaping with -1 avoids hard-coding the header
# length (6) and the feature count (6440); blank lines are skipped.
w_start = next(i for i, line in enumerate(data) if line.strip() == "w") + 1
newW = np.array(
    [line for line in data[w_start:] if line.strip()],
    dtype=float,
).reshape((-1, 1))

In [28]:
# Map the weights learned in the new variables back to the original feature space.
w = A_inv @ (B + newW)

In [29]:
# Feature matrix of the poses whose affinities are known: one column per test
# complex, taken from its first stored pose.
X_test = np.matrix([entry[1][1] for entry in test]).T

# Column of free-energy values for the test complexes.
s_test = np.matrix([float(entry[0]) for entry in test]).T

In [30]:
# L2_LR with the added Constant term
prediction = w.T @ X_test
# Both operands are flattened to 1-D arrays before every metric call.
for label, metric in (
    ("Spearman: ", spearmanr),
    ("Pearson: ", pearsonr),
    ("R2: ", r2_score),
    ("MSE: ", mean_squared_error),
):
    print(label, metric(np.array(s_test.T)[0], np.array(prediction)[0]))

Spearman:  SpearmanrResult(correlation=0.69015191946932408, pvalue=1.4434995449354607e-118)
Pearson:  (0.68219302714172125, 7.5909374048635784e-115)
R2:  0.343886262877
MSE:  2.9417101318


In [40]:
# L2_LR with the added Constant term
prediction = w.T @ X_test
# Spearman, Pearson, R2 and MSE, in that order, on the flattened 1-D arrays.
for metric in (spearmanr, pearsonr, r2_score, mean_squared_error):
    print(metric(np.array(s_test.T)[0], np.array(prediction)[0]))

SpearmanrResult(correlation=0.69015191946932408, pvalue=1.4434995449354607e-118)
(0.68219302714172125, 7.5909374048635784e-115)
0.343886262877
2.9417101318


In [34]:
# L2_LOSS_SVC_DUAL
prediction = w.T @ X_test
# Spearman, Pearson, R2 and MSE, in that order, on the flattened 1-D arrays.
for metric in (spearmanr, pearsonr, r2_score, mean_squared_error):
    print(metric(np.array(s_test.T)[0], np.array(prediction)[0]))

SpearmanrResult(correlation=0.69322426613187826, pvalue=4.9040465422768512e-120)
(0.68575165540911154, 1.7036531159905999e-116)
0.350186214271
2.91346406744


In [26]:
# Training affinity subset, Cr = 100 (instead of 10000), C = 1024
prediction = w.T @ X_test
# Spearman, Pearson, R2 and MSE, in that order, on the flattened 1-D arrays.
for metric in (spearmanr, pearsonr, r2_score, mean_squared_error):
    print(metric(np.array(s_test.T)[0], np.array(prediction)[0]))

SpearmanrResult(correlation=0.68356234628229917, pvalue=1.7725783590839373e-115)
(0.67049209677989263, 1.3800783330340288e-109)
-0.146462513968
5.14020695231


In [23]:
# Training affinity subset, Cr = 10000 (instead of 100), C = 1024
# NOTE(review): X_test_reg is not defined in the visible part of the notebook —
# confirm where it comes from before re-running this cell.
prediction = w.T @ X_test_reg
# Spearman, Pearson, R2 and MSE, in that order, on the flattened 1-D arrays.
for metric in (spearmanr, pearsonr, r2_score, mean_squared_error):
    print(metric(np.array(s_test.T)[0], np.array(prediction)[0]))

SpearmanrResult(correlation=0.62014223685389513, pvalue=2.6774230060625381e-133)
(0.60320970317061684, 2.4116978508802566e-124)
-0.209361431428
5.41602792232


In [76]:
# Added the full affinity set, Cr = 100 (instead of 10), C = 1024
# NOTE(review): X_test_reg is not defined in the visible part of the notebook —
# confirm where it comes from before re-running this cell.
prediction = w.T @ X_test_reg
# Spearman, Pearson, R2 and MSE, in that order, on the flattened 1-D arrays.
for metric in (spearmanr, pearsonr, r2_score, mean_squared_error):
    print(metric(np.array(s_test.T)[0], np.array(prediction)[0]))

SpearmanrResult(correlation=0.78110546631570565, pvalue=1.0389205936255559e-256)
(0.77570934211099696, 6.0031501273757858e-251)
0.327181243734
3.01316469659


In [65]:
# Added the full affinity set, Cr = 5 (instead of 0.5), C = 1024
# NOTE(review): X_test_reg is not defined in the visible part of the notebook —
# confirm where it comes from before re-running this cell.
prediction = w.T @ X_test_reg
# Spearman, Pearson, R2 and MSE, in that order, on the flattened 1-D arrays.
for metric in (spearmanr, pearsonr, r2_score, mean_squared_error):
    print(metric(np.array(s_test.T)[0], np.array(prediction)[0]))

SpearmanrResult(correlation=0.68905982191288717, pvalue=3.597549765965038e-176)
(0.67739176428319892, 4.3851607724117059e-168)
-0.233452411741
5.52391744043


In [47]:
# Affinity set taken from the training split only, Cr = 0.5, C = 1024
# NOTE(review): X_test_reg is not defined in the visible part of the notebook —
# confirm where it comes from before re-running this cell.
prediction = w.T @ X_test_reg
# Spearman, Pearson, R2 and MSE, in that order, on the flattened 1-D arrays.
for metric in (spearmanr, pearsonr, r2_score, mean_squared_error):
    print(metric(np.array(s_test.T)[0], np.array(prediction)[0]))

SpearmanrResult(correlation=0.59782292604876919, pvalue=1.3171390817460266e-121)
(0.5853307218911773, 1.8685088214099835e-115)
-1.06153063991
9.23239919705
