## TODO
1. Изменить константы в модели
2. Сделать модель для произвольного числа атомов
3. Добавить больше признаков, в том числе задействующих 3 и более вершины
4. Разобраться с тем, почему не получается считать все файлы

In [1]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
import os
%matplotlib inline

Приготовим данные

In [2]:
import re

def line_processing(line):
    """Extract the numeric tokens from one textual matrix row.

    The text is split on every run of characters that are not digits,
    dots or commas, so brackets and whitespace act as separators.  Note
    that sign and exponent characters (``-``, ``+``, ``e``) are separators
    as well.

    Args:
        line: raw text of a matrix row.

    Returns:
        list of non-empty string tokens, in their order of appearance.
    """
    tokens = re.split('[^0-9,.,]+', line)
    # A separator at either end of the text produces empty strings; drop them.
    return [token for token in tokens if token != '']

def read_matrix_row(file):
    """Read one bracketed matrix row, which may span several physical lines.

    Lines are accumulated until the collected text ends with ``]``
    (trailing whitespace and a missing final newline are tolerated), then
    the numeric tokens are extracted with ``line_processing``.

    Args:
        file: an open text file positioned at the start of a row.

    Returns:
        list of string tokens making up the row.

    Raises:
        ValueError: if the file ends before a closing ``]`` is found.
    """
    line = file.readline()
    while not line.rstrip().endswith(']'):
        tmp = file.readline()
        if not tmp:
            # readline() returns '' only at end of file; without this check
            # a truncated file would make the loop spin forever.
            raise ValueError(
                "unexpected end of file while reading a matrix row: %r" % line)
        line += tmp
    return line_processing(line)

def read_matrix(file):
    """Read a square matrix written as consecutive bracketed rows.

    The number of tokens in the first row fixes the matrix size ``n``
    (the number of atoms); ``n`` rows are consumed from ``file`` in total.

    Args:
        file: an open text file positioned at the first row of the matrix.

    Returns:
        np.ndarray of floats with shape ``(n, n)`` (an empty array when the
        first row holds no numbers).

    Raises:
        ValueError: if a row holds fewer than ``n`` values, or a token is
            not a valid float.
    """
    line = read_matrix_row(file)
    # number of atoms
    n = len(line)

    result = []
    for i in range(n):
        if len(line) < n:
            # Fail with the offending row instead of a bare IndexError.
            raise ValueError(
                "matrix row %d has %d values, expected %d: %r"
                % (i, len(line), n, line))
        # Only the first n tokens are used; any extra ones are ignored.
        result.append([float(value) for value in line[:n]])
        # The first row was read before the loop, so fetch the next one
        # only while rows remain.
        if i < n - 1:
            line = read_matrix_row(file)
    return np.array(result)

def read_atoms_features_and_labels(file, n):
    """Read ``n`` per-atom lines and split them into features and labels.

    Each line is split on whitespace.  According to the original author's
    notes, columns 3, 7 and 8 hold the element electronegativity, the
    "is in ring" flag and the triple product, and column 4 ("type2") is
    the label.  A triple product written as ``None`` is treated as 0.

    Args:
        file: an open text file positioned at the first atom line.
        n: number of atom lines to read.

    Returns:
        tuple ``(features, labels)`` of np.ndarray; ``features`` has one
        3-element row per atom, ``labels`` one float per atom.
    """
    feature_rows = []
    label_values = []
    for _ in range(n):
        fields = file.readline().split()
        electronegativity = float(fields[3])
        in_ring = float(fields[7])
        raw_triple = fields[8]
        if raw_triple == 'None':
            triple_product = 0.0
        else:
            triple_product = float(raw_triple)
        feature_rows.append([electronegativity, in_ring, triple_product])
        # type2 is the label
        label_values.append(float(fields[4]))
    return np.array(feature_rows), np.array(label_values)

In [3]:
def read_molecule(path):
    """Load one molecule descriptor file and build the network inputs.

    The file is read sequentially: one skipped line, the adjacency matrix,
    one skipped line, the distance matrix, three skipped lines, one line
    per atom (features and label), one skipped line, the bond-length
    matrix.  The skipped lines are presumably section headers.

    Args:
        path: path of the descriptor file.

    Returns:
        list ``[inputs_a, inputs_p, adjacency_matrix, labels]`` where, for a
        molecule of ``n`` atoms,
        * ``inputs_a`` has shape ``(n, 1, n, atom_features)`` and holds the
          atom feature table repeated ``n`` times,
        * ``inputs_p`` has shape ``(n, 1, n, n, 2)`` and holds the
          (distance, bond length) pair table repeated ``n`` times,
        * ``adjacency_matrix`` is the ``(n, n)`` numpy array read from file,
        * ``labels`` are the per-atom labels as integers shifted down by 1.

    Raises:
        ValueError: if the three matrices are not square with a common size.
    """
    # ``with`` releases the file handle even when parsing fails midway.
    with open(path, 'r') as file:
        file.readline()
        adjacency_matrix = read_matrix(file)
        file.readline()
        distance_matrix = torch.from_numpy(read_matrix(file)).float()

        for _ in range(3):
            file.readline()
        atoms_features, atoms_labels = read_atoms_features_and_labels(
            file, distance_matrix.shape[0])

        file.readline()
        bond_length = torch.from_numpy(read_matrix(file)).float()

    # Check that everything was read correctly *before* combining the
    # matrices, so a malformed file yields a clear error message.
    atoms_count = adjacency_matrix.shape[0]
    for matrix_name, matrix in (("adjacency", adjacency_matrix),
                                ("distance", distance_matrix),
                                ("bond length", bond_length)):
        if tuple(matrix.shape) != (atoms_count, atoms_count):
            raise ValueError(
                "%s: %s matrix has shape %s, expected (%d, %d)"
                % (path, matrix_name, tuple(matrix.shape),
                   atoms_count, atoms_count))

    atoms_features = Variable(torch.from_numpy(atoms_features).float())
    atoms_labels = Variable(torch.from_numpy(atoms_labels).float())
    pairs_features = Variable(torch.cat(
        [distance_matrix[..., np.newaxis], bond_length[..., np.newaxis]],
        dim=2))

    # Now prepare the network inputs: every slice along the first axis
    # receives a full copy of the molecule's atom / pair tables.
    inputs_a = Variable(torch.Tensor(atoms_features.shape[0], 1,
                                     atoms_features.shape[0],
                                     atoms_features.shape[1]))
    inputs_p = Variable(torch.Tensor(atoms_features.shape[0], 1,
                                     pairs_features.shape[0],
                                     pairs_features.shape[1],
                                     pairs_features.shape[2]))
    for i in range(atoms_features.shape[0]):
        inputs_a[i] = atoms_features
        inputs_p[i] = pairs_features

    # Labels in the file start at 1; shift them to start at 0.
    labels = atoms_labels.long() - 1

    return [inputs_a, inputs_p, adjacency_matrix, labels]

In [4]:
train = []
test = []
# path to the directory that holds the input .dat files
data_path = "mol-descs/descs/"
# os.listdir returns entries in arbitrary order; sorting makes the
# slice-based train/test split below reproducible between runs and machines.
pathes = sorted(
    path for path in os.listdir(path=data_path) if path.endswith('.dat'))

Следующие несколько ячеек не объединены в одну только потому, что мой компьютер выключается, если загружать все файлы в одной ячейке

In [5]:
# First training chunk: descriptor files 0..799 of the listing.
for file_name in pathes[0:800]:
    molecule = read_molecule(data_path + file_name)
    train.append(molecule)

In [6]:
# Second training chunk: descriptor files 800..1599 of the listing.
for file_name in pathes[800:1600]:
    molecule = read_molecule(data_path + file_name)
    train.append(molecule)

In [7]:
# Test set: descriptor files 1600..2399 of the listing.
for file_name in pathes[1600:2400]:
    molecule = read_molecule(data_path + file_name)
    test.append(molecule)

In [8]:
# Number of atoms of every training molecule (first axis of its atom input).
a = [molecule[0].shape[0] for molecule in train]
# bc[k] is how many training molecules have exactly k atoms.
bc = np.bincount(a)
# The most frequent molecule size.
bc.argmax()

27

Чаще всего в обучающей выборке встречаются молекулы из 27 атомов, поэтому пока настроим сеть только для них

In [9]:
class Net(nn.Module):
    """Small "wave" network over per-atom and per-pair feature tables.

    One wave module mixes four kinds of messages: atom -> atom, pair -> pair,
    pair -> atom (summed over bonded neighbours) and atom -> pair.  After
    the wave modules the atom table of every batch entry is flattened and
    mapped to 4 scores by a linear layer.

    Args:
        wave_modules_count: how many times the wave module is applied.
            NOTE: with the 3 atom features produced by this notebook the
            atom width after one module is 2, which is smaller than the
            kernel of ``conv1``, so values above 1 are not expected to work.
        atoms_count: number of atoms per molecule the final linear layer is
            sized for (default 27, the value previously hard-coded).
    """

    def __init__(self, wave_modules_count=1, atoms_count=27):
        super(Net, self).__init__()
        self.wave_modules_count = wave_modules_count
        self.atoms_count = atoms_count
        self.conv1 = nn.Conv1d(1, 1, 3)
        self.conv2 = nn.Conv1d(1, 1, 2)
        # 2 = atom feature width after one wave module (1 from A->A plus
        # 1 from P->A for the inputs used in this notebook).
        self.fc1 = nn.Linear(atoms_count * 2, 4)

    def _atoms_to_pair(self, x, i, j):
        """Return the symmetric conv1 message built from atoms ``i`` and ``j``."""
        i_then_j = torch.cat([x[:, :, i], x[:, :, j]], dim=2)
        j_then_i = torch.cat([x[:, :, j], x[:, :, i]], dim=2)
        return torch.add(F.relu(self.conv1(i_then_j)),
                         F.relu(self.conv1(j_then_i)))

    def wave_module(self, x, y, adjacency_matrix):
        """Apply one wave module.

        Args:
            x: atom table, indexed as ``x[batch, channel, atom, feature]``.
            y: pair table, indexed as
                ``y[batch, channel, atom_i, atom_j, feature]``.
            adjacency_matrix: square array-like; entry ``[i][j] == 1`` marks
                atoms ``i`` and ``j`` as bonded.

        Returns:
            list ``[x_ret, y_ret]`` with the new atom and pair tables.
        """
        atoms_count = x.shape[2]

        # (A -> A): conv1 over every atom's own features.  Stacking the
        # results lets the output width follow from the convolution
        # instead of being hard-coded.
        x1 = torch.stack(
            [F.relu(self.conv1(x[:, :, i])) for i in range(atoms_count)],
            dim=2)

        # (P -> P): conv2 over every pair's own features.
        y1 = torch.stack(
            [torch.stack([F.relu(self.conv2(y[:, :, i, j]))
                          for j in range(y.shape[3])], dim=2)
             for i in range(y.shape[2])],
            dim=2)

        # (P -> A): for every atom, sum the new pair features over the
        # atoms it is bonded to.
        adjacency = np.asarray(adjacency_matrix)
        aggregated = []
        for i in range(atoms_count):
            neighbours = np.flatnonzero(adjacency[i] == 1).tolist()
            if neighbours:
                total = y1[:, :, i, neighbours[0]]
                for j in neighbours[1:]:
                    total = total + y1[:, :, i, j]
            else:
                # An atom without bonds receives no pair message.
                total = torch.zeros_like(y1[:, :, i, 0])
            aggregated.append(total)
        x2 = torch.stack(aggregated, dim=2)

        # (A -> P): symmetric message from the two atoms of every pair.
        y2 = torch.stack(
            [torch.stack([self._atoms_to_pair(x, i, j)
                          for j in range(y.shape[3])], dim=2)
             for i in range(y.shape[2])],
            dim=2)

        # Concatenate the messages into the new atom and pair layers.
        x_ret = torch.cat([x1, x2], dim=3)
        y_ret = torch.cat([y1, y2], dim=4)

        return [x_ret, y_ret]

    def forward(self, x, y, adjacency_matrix):
        """Run the wave modules and map the flattened atom table to 4 scores.

        Returns:
            tensor of shape ``(x.shape[0], 4)``.
        """
        for _ in range(self.wave_modules_count):
            x, y = self.wave_module(x, y, adjacency_matrix)

        x = x.view(x.shape[0], -1)
        x = self.fc1(x)
        return x
net = Net()

Параметры оптимизатора взяты из модели

In [10]:
# Cross-entropy over the per-row class scores produced by the network.
criterion = nn.CrossEntropyLoss()
# Adagrad over all trainable parameters of the network.
optimizer = optim.Adagrad(params=net.parameters(), lr=0.003)

In [11]:
for epoch in range(2):  # loop over the dataset multiple times

    appopr_data_pos = 0  # number of suitable (27-atom) molecules seen so far
    running_loss = 0.0
    for data in train:
        # get the inputs
        inputs_a, inputs_p, adjacency_matrix, labels = data
        # the network is currently sized for 27-atom molecules only
        if inputs_a.shape[0] != 27:
            continue

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize; call the module itself rather than
        # .forward() so that nn.Module hooks are honoured
        outputs = net(inputs_a, inputs_p, adjacency_matrix)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics; .item() replaces loss.data[0], which raises on
        # 0-dim tensors in current PyTorch releases
        running_loss += loss.item()
        if appopr_data_pos % 10 == 9:    # print every 10 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, appopr_data_pos + 1, running_loss / 10))
            running_loss = 0.0
        appopr_data_pos += 1

[1,    10] loss: 0.922
[1,    20] loss: 0.865
[1,    30] loss: 0.671
[1,    40] loss: 0.927
[1,    50] loss: 0.833
[1,    60] loss: 0.706
[2,    10] loss: 0.709
[2,    20] loss: 0.768
[2,    30] loss: 0.642
[2,    40] loss: 0.893
[2,    50] loss: 0.813
[2,    60] loss: 0.682


In [12]:
correct = 0              # correctly classified atoms
total = 0                # atoms evaluated
evaluated_molecules = 0  # molecules that passed the size filter
for data in test:
    inputs_a, inputs_p, adjacency_matrix, labels = data
    # the network is currently sized for 27-atom molecules only
    if inputs_a.shape[0] != 27:
        continue
    outputs = net(inputs_a, inputs_p, adjacency_matrix)
    _, predicted = torch.max(outputs.data, 1)
    total += labels.size(0)
    # int() keeps ``correct`` a plain Python integer whether .sum() yields a
    # number or a 0-dim tensor
    correct += int((predicted == labels.data).sum())
    evaluated_molecules += 1

if total:
    print('Accuracy of the network on the %d test molecules: %d %%' % (
        evaluated_molecules, 100 * correct / total))
else:
    # avoid ZeroDivisionError when no test molecule has the supported size
    print('No 27-atom molecules in the test set; accuracy is undefined')

Accuracy of the network on the 60 test molecules: 55 %
