In [1]:
import math
import random
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset,DataLoader,TensorDataset
from torch.autograd import Variable

import numpy as np

from sklearn import preprocessing
import time

In [2]:
# Select the compute device: use the GPU when CUDA is available, otherwise the CPU.
USE_CUDA = torch.cuda.is_available()
cuda = True if USE_CUDA else False          # plain bool flag consulted by later cells
device = torch.device("cuda" if USE_CUDA else "cpu")

print("Device =", device)
gpus = [0]  # list of GPU indices; not referenced by any other cell visible in this notebook

Device = cuda


In [3]:
def time_elapsed(start_time):
    """Return the time elapsed since ``start_time`` split into hours, minutes and seconds.

    Args:
        start_time: Reference timestamp in seconds, as produced by ``time.time()``.

    Returns:
        Tuple ``(hours, minutes, seconds)`` of ints. Minutes and seconds are the
        remainders modulo 60; hours are not wrapped at 24.
    """
    delta = time.time() - start_time     # elapsed wall-clock time in (float) seconds
    whole_minutes = int(delta / 60)      # total minutes, truncated

    return int(delta / 3600), int(whole_minutes % 60), int(delta % 60)

In [4]:
def protein2int(protein, voc):
    """Encode a protein sequence as the vocabulary index of each of its characters.

    Args:
        protein: Iterable of single characters (typically a ``str``).
        voc: Sequence of vocabulary characters. A character's code is the index
            of its first occurrence in ``voc``.

    Returns:
        1-D ``numpy.ndarray`` of dtype float with one index per character of
        ``protein`` (an empty float array for an empty sequence).

    Raises:
        IndexError: If ``protein`` contains a character that is not in ``voc``.
    """
    # Build the lookup table once instead of rescanning the vocabulary for every
    # character. setdefault keeps the FIRST index if ``voc`` contains duplicates.
    index_of = {}
    for idx, symbol in enumerate(voc):
        index_of.setdefault(symbol, idx)

    protein_int = []
    for char in protein:
        if char not in index_of:
            # Same exception type as the former ``[...][0]`` lookup, but with a
            # message that names the offending character.
            raise IndexError("character %r is not in the vocabulary" % (char,))
        protein_int.append(float(index_of[char]))

    return np.array(protein_int).astype(float)

In [5]:
class TDDTransformerEncoder(nn.Module):
    """Embed an integer-encoded protein, run a Transformer encoder and mean-pool the result.

    Args:
        nembedding: Number of rows in the embedding table (size of the protein alphabet).
        ninp: Embedding width, also used as the Transformer model width.
        nhead: Number of attention heads per encoder layer.
        nhid: Width of the feed-forward sub-layer inside each encoder layer.
        nlayers: Number of stacked encoder layers.
        dropout: Dropout probability used inside the encoder layers.
    """

    def __init__(self, nembedding=22, ninp=1024, nhead=8, nhid=1024, nlayers=3, dropout=0.2):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=nembedding, embedding_dim=ninp)
        # batch_first=True: the encoder expects (batch, seq_len, dim) inputs.
        # The prototype layer is stored as an attribute, so its parameters are
        # registered under "encoder_layers.*" in addition to the stacked layers.
        self.encoder_layers = nn.TransformerEncoderLayer(
            d_model=ninp,
            nhead=nhead,
            dim_feedforward=nhid,
            dropout=dropout,
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layers, num_layers=nlayers)

    def forward(self, protein):
        """Encode one protein into a single pooled vector.

        Args:
            protein: Integer tensor of token indices. The shape notes below
                assume a 1-D tensor of length seq_len, which is what the
                training cell passes - TODO confirm for other callers.

        Returns:
            Tensor of shape (1, 1, ninp) for a 1-D input: the mean over the
            sequence axis with a leading singleton axis added.
        """
        token_vectors = self.embedding(protein)          # (seq_len, ninp)
        batched = token_vectors.unsqueeze(0)             # (1, seq_len, ninp): add the batch axis
        encoded = self.transformer_encoder(batched)      # (1, seq_len, ninp)
        pooled = encoded.mean(dim=1)                     # (1, ninp): average over the sequence
        return pooled.unsqueeze(0)                       # (1, 1, ninp)

In [6]:
class TDDTransformerDecoder (nn.Module):
    """LSTM decoder whose initial hidden state is offset by a protein vector.

    Args:
        nembedding: Accepted for signature compatibility; not used by this class.
        ninp: Accepted for signature compatibility; not used by this class.
        nhead: Accepted for signature compatibility; not used by this class.
        nhid: LSTM hidden size. The protein vector passed to ``forward`` must
            flatten to this many features.
        nlayers: Number of stacked LSTM layers.
        dropout: Dropout probability between LSTM layers.
        nvocab: Width of each input step and number of output logits per step.
            The default 35 is the value that used to be hard-coded.
    """

    def __init__(self, nembedding=21, ninp=1024, nhead=8, nhid=1024, nlayers=3, dropout=0.2, nvocab=35):
        super(TDDTransformerDecoder, self).__init__()
        # Legacy attributes, kept unchanged for compatibility; nothing in this
        # class reads them (note that hidden_size is a float, nhid / 2).
        self.hidden_size = nhid/2
        self.laryer = nlayers - 1
        # The sizes now follow the constructor arguments instead of the literals
        # 35 / 1024 / 3 / 0.2; with the default arguments the layers are identical.
        self.Lstm = nn.LSTM(input_size = nvocab, hidden_size = nhid, num_layers = nlayers, batch_first = True, dropout = dropout)
        self.linear = nn.Linear(nhid, nvocab, bias = True)
        self.Relu = nn.ReLU()  # not applied in forward(); kept so the module layout is unchanged

    def forward(self,protein_vec, smiles, hidden):
        """Run the LSTM over one sequence, conditioned on the protein vector.

        Args:
            protein_vec: Tensor that flattens to ``nhid`` features (a single protein).
            smiles: Tensor of shape (seq_len, nvocab); a batch axis of size 1 is
                added here. Assumed to be one-hot rows - TODO confirm.
            hidden: Pair ``(h, c)`` of LSTM states, each (nlayers, 1, nhid).

        Returns:
            ``(output, hidden)``: ``output`` has shape (1, seq_len, nvocab) and
            ``hidden`` is the ``(h, c)`` pair returned by the LSTM.
        """
        # Flatten to (1, 1, nhid), then stack one copy per LSTM layer so the
        # vector can be added to every layer's initial hidden state.
        protein_vec = protein_vec.view(1,1,-1)
        protein_vec = torch.cat([protein_vec] * self.Lstm.num_layers, 0)  # (nlayers, 1, nhid)

        # Only the hidden state h is offset; the cell state c is passed through.
        hidden = (hidden[0] + protein_vec, hidden[1])
        smiles = smiles.unsqueeze(0)  # (1, seq_len, nvocab)
        output1, hidden = self.Lstm(smiles, hidden)
        output = self.linear(output1)  # per-step logits; no activation is applied
        return output, hidden

In [7]:
class TDDTransformer (nn.Module):
    """Protein-conditioned sequence model: Transformer encoder followed by an LSTM decoder.

    The constructor arguments are accepted but not forwarded; both sub-modules
    are built with their own default sizes.
    """

    def __init__(self, nembedding=21, ninp=256, nhead=8, nhid=256, nlayers=3, dropout=0.2):
        super(TDDTransformer, self).__init__()
        self.Encoder = TDDTransformerEncoder()
        self.Decoder = TDDTransformerDecoder()

    def init_states(self, batch_size = 1, device = None):
        """Create zero-filled initial LSTM states for the decoder.

        Args:
            batch_size: Size of the batch axis of the states.
            device: Optional device to allocate on. ``None`` (the default)
                gives CPU tensors, as before.

        Returns:
            List ``[h0, c0]`` of zero tensors, each of shape
            (num_layers, batch_size, hidden_size) taken from the decoder LSTM.
        """
        lstm = self.Decoder.Lstm
        # Read the shape from the LSTM itself so the states always match it.
        state_shape = (lstm.num_layers, batch_size, lstm.hidden_size)

        hidden = [torch.zeros(state_shape, device=device),
                  torch.zeros(state_shape, device=device)]

        return hidden

    def forward(self, protein, smiles, hidden):
        """Encode ``protein`` and decode ``smiles`` conditioned on it.

        Args:
            protein: Integer-encoded protein; always passed through the encoder.
            smiles: Decoder input sequence.
            hidden: Initial ``(h, c)`` decoder states, e.g. from ``init_states``.

        Returns:
            The decoder's ``(output, hidden)`` pair.
        """
        Encoder_output = self.Encoder(protein)  # pooled protein vector
        Decoder_output = self.Decoder(Encoder_output, smiles, hidden)
        return Decoder_output

In [8]:
def train(ep,No_data):
    """Train the global model for one pass over the global ``train_loader``.

    Uses these notebook globals: ``train_loader``, ``protein_int_data``,
    ``TDDTnet``, ``criterion``, ``optimizer``, ``losses``, ``seq_length`` and
    ``device``.

    Args:
        ep: Epoch number; only used in the progress message.
        No_data: Index of the current data split; only used in the progress message.

    Returns:
        None. Side effects: updates the model parameters, appends to ``losses``,
        prints one progress line per batch and writes "TDDTv4_temp.pth" every
        10000 batches (including batch 0).
    """
    start_time = time.time()

    for i, data in enumerate(train_loader):
        oupt, inpt = data  # the dataset yields (target, input) pairs

        # Batch i is paired with protein i, so the loader must not shuffle.
        protein_inpt = torch.from_numpy(protein_int_data[i]).long().to(device)
        input_data = inpt.float().to(device)
        target_data = oupt.long().to(device)

        # Fresh zero states for every batch, moved to the selected device
        # (the former unconditional .cuda() failed on CPU-only machines).
        hidden = TDDTnet.init_states()
        hidden = (hidden[0].to(device), hidden[1].to(device))

        # Reset the gradients accumulated by the previous batch.
        TDDTnet.zero_grad()

        # Forward pass and loss over every position of the sequence.
        output, hidden = TDDTnet(protein_inpt, input_data, hidden)
        loss = criterion(output.view(-1, output.size(-1)), target_data.view(-1))

        # Backpropagate, clip the gradient norm to 3.0, then update the parameters.
        loss.backward()
        nn.utils.clip_grad_norm_(TDDTnet.parameters(), 3.0)
        optimizer.step()

        # Only the Python float is kept, so no loss tensor is retained between batches.
        scaled_loss = loss.item() / seq_length

        # losses[0] is refreshed every 50 batches; every batch's value is appended.
        if (i % 50 == 0):
            losses[0] = scaled_loss
        losses.append(scaled_loss)

        # Intermediate checkpoint.
        if (i % 10000 == 0):
            torch.save(TDDTnet.state_dict(), "TDDTv4_temp.pth")

        # Progress line; the delta is measured against losses[1], the first value appended.
        hours, minutes, seconds = time_elapsed(start_time)
        print("epoch ",ep," dataset ",No_data,
              " training " + "Loss: {:0.6f}".format(scaled_loss)
              + " | ΔLossTotal: {:+0.4f}".format(losses[-1] - losses[1])
              + " | Iteration: {0:04d}".format(i + 1)
              + " | Time elapsed: {0:02d}".format(hours)
              + "h {0:02d}".format(minutes)
              + " m {0:02d}".format(seconds) + " s")
        torch.cuda.empty_cache()


In [9]:
  
# Build the protein character vocabulary from every character in the training file.
with open('../data/train_protein3.txt', "r") as protein_file:
    protein_all = protein_file.read()

# sorted(): the iteration order of a set of strings can differ between interpreter
# runs (string hash randomisation). Sorting keeps each character's index, and so
# the meaning of every embedding row, stable across sessions and checkpoints.
protein_voc = sorted(set(protein_all))
protein_voc_num = len(protein_voc)
protein_all = []  # drop the reference to the raw text


In [10]:
# Bookkeeping shared with train(): the divisor used when reporting the loss,
# and the running list of reported loss values (slot 0 is a placeholder).
seq_length = 76
losses = [0]

# Model, optimiser and loss function (cross-entropy).
TDDTnet = TDDTransformer()
optimizer = torch.optim.Adam(TDDTnet.parameters(), lr = 0.0001)
criterion = nn.CrossEntropyLoss()

# Move the model and the criterion to the GPU when one was detected.
if cuda:
    TDDTnet.cuda()
    criterion.cuda()

# Count every parameter element registered on the model.
total_params = sum(map(lambda weight: weight.numel(), TDDTnet.parameters()))

print("Total number of parameters in TTDT: " + str(total_params))

Total number of parameters in TTDT: 46404643


In [12]:
# Training driver: for every epoch, stream the 10 data splits through train()
# and save a checkpoint at the end of the epoch.
epoch = 50
for ep in range(epoch):
    print("start the epoch %d training!"%ep)

    for No_data in range(10):

        print("prepare protein data")

        with open('../data/train_protein_split%d.txt'%No_data, 'r') as protein_file:
            protein_data = protein_file.readlines()
        # Kept as a plain list: the encoded proteins have different lengths, and
        # np.array() over ragged sequences is rejected by recent NumPy releases.
        protein_int_data = [protein2int(i, protein_voc) for i in tqdm(protein_data)]

        # Model inputs for this split (the recorded run shows shape (rows, 35)).
        with np.load("../data/OneHot_input_Data%d.npz"%No_data) as input_archive:
            input_data = input_archive["arr_0"]
        input_data_tensor = torch.from_numpy(input_data)

        # Integer targets for this split (the recorded run shows shape (rows,)).
        with np.load("../data/Int_output_Data%d.npz"%No_data) as output_archive:
            output_data = output_archive["arr_0"]
        output_data_tensor = torch.from_numpy(output_data)

        # shuffle=False is required: train() pairs batch i with protein i.
        dataset = TensorDataset(output_data_tensor, input_data_tensor)
        train_loader = DataLoader(dataset = dataset, batch_size = 75, drop_last = True, shuffle = False)

        # Fail before training starts rather than with an IndexError part-way through.
        if len(protein_int_data) < len(train_loader):
            raise ValueError(
                "split %d has %d proteins but %d batches"
                % (No_data, len(protein_int_data), len(train_loader))
            )

        print("input_data size: " + str(input_data_tensor.size()))
        print("output_data size: " + str(output_data_tensor.size()))

        train(ep,No_data)

    torch.save(TDDTnet.state_dict(), "TDDTnetv4%d.pth"%int(ep))

start the epoch 0 training!
prepare protein data


100%|██████████████████████████████████████████████████████████████████████████| 46015/46015 [00:32<00:00, 1417.45it/s]
  protein_int_data = np.array([protein2int(i, protein_voc) for i in tqdm(protein_data)])


input_data size: torch.Size([3451125, 35])
output_data size: torch.Size([3451125])
epoch  0  dataset  0  training Loss: 0.046647 | ΔLossTotal: +0.0000 | Iteration: 0001 | Time elapsed: 00h 00 m 00 s
epoch  0  dataset  0  training Loss: 0.046421 | ΔLossTotal: -0.0002 | Iteration: 0002 | Time elapsed: 00h 00 m 01 s
epoch  0  dataset  0  training Loss: 0.046168 | ΔLossTotal: -0.0005 | Iteration: 0003 | Time elapsed: 00h 00 m 01 s
epoch  0  dataset  0  training Loss: 0.046218 | ΔLossTotal: -0.0004 | Iteration: 0004 | Time elapsed: 00h 00 m 01 s
epoch  0  dataset  0  training Loss: 0.045929 | ΔLossTotal: -0.0007 | Iteration: 0005 | Time elapsed: 00h 00 m 01 s
epoch  0  dataset  0  training Loss: 0.045611 | ΔLossTotal: -0.0010 | Iteration: 0006 | Time elapsed: 00h 00 m 01 s
epoch  0  dataset  0  training Loss: 0.045162 | ΔLossTotal: -0.0015 | Iteration: 0007 | Time elapsed: 00h 00 m 01 s
epoch  0  dataset  0  training Loss: 0.045140 | ΔLossTotal: -0.0015 | Iteration: 0008 | Time elapsed: 00h

epoch  0  dataset  0  training Loss: 0.032044 | ΔLossTotal: -0.0146 | Iteration: 0071 | Time elapsed: 00h 00 m 09 s
epoch  0  dataset  0  training Loss: 0.031532 | ΔLossTotal: -0.0151 | Iteration: 0072 | Time elapsed: 00h 00 m 09 s
epoch  0  dataset  0  training Loss: 0.033112 | ΔLossTotal: -0.0135 | Iteration: 0073 | Time elapsed: 00h 00 m 09 s
epoch  0  dataset  0  training Loss: 0.031460 | ΔLossTotal: -0.0152 | Iteration: 0074 | Time elapsed: 00h 00 m 09 s
epoch  0  dataset  0  training Loss: 0.031534 | ΔLossTotal: -0.0151 | Iteration: 0075 | Time elapsed: 00h 00 m 09 s
epoch  0  dataset  0  training Loss: 0.029118 | ΔLossTotal: -0.0175 | Iteration: 0076 | Time elapsed: 00h 00 m 09 s
epoch  0  dataset  0  training Loss: 0.032167 | ΔLossTotal: -0.0145 | Iteration: 0077 | Time elapsed: 00h 00 m 10 s
epoch  0  dataset  0  training Loss: 0.030514 | ΔLossTotal: -0.0161 | Iteration: 0078 | Time elapsed: 00h 00 m 10 s
epoch  0  dataset  0  training Loss: 0.032345 | ΔLossTotal: -0.0143 | It

KeyboardInterrupt: 

In [None]:
# Save the model's current weights to a separate checkpoint file.
# NOTE(review): this file name says "v3F" while the training cell writes
# "TDDTnetv4%d.pth" - confirm which name is intended.
torch.save(TDDTnet.state_dict(), "TDDTnetv3F.pth")

In [None]:
# class PositionalEncoding(nn.Module):
 
#     def __init__(self, d_model, dropout=0.1, max_len=5000):
#         super(PositionalEncoding, self).__init__()
#         self.dropout = nn.Dropout(p=dropout)
 
#         pe = torch.zeros(max_len, d_model)
#         position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
#         div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
#         pe[:, 0::2] = torch.sin(position * div_term)
#         pe[:, 1::2] = torch.cos(position * div_term)
#         pe = pe.unsqueeze(0).transpose(0, 1)
#         self.register_buffer('pe', pe)
 
#     def forward(self, x):
#         x = x + self.pe[:x.size(0), :]
#         return self.dropout(x)

In [None]:
# protein_max_line = 0
# try:
#     file = open('../data/protein.txt', 'r')
# except FileNotFoundError:
#     print('Protein file is not found')
# else:
#     lines = file.readlines()
#     for line in lines:
#         if len(line)> protein_max_line:
#             protein_max_line = len(line)

# print("protein max line:", protein_max_line)

# protein_all = open('../data/protein.txt', "r").read()
# protein_voc = list(set(protein_all))
# protein_voc_num = len(protein_voc)

# print("protein voc num:", protein_voc_num)
# print("protein voc", protein_voc)

In [None]:
# le = preprocessing.LabelEncoder()
# protein_seq1 = "PVMVLENIEPEIVYAGYDSSKPDTAENLLSTLNRLAGKQMIQVVKWAKVLPGFKNLPLEDQITLIQYSWMCLLSFALSWRSYKHTNSQFLYFAPDLVFNEEKMHQSAMYELCQGMHQISLQFVRLQLTFEEYTIMKVLLLLSTIPKDGLKSQAAFEEMRTNYIKELRKMVQRFYQLTKLLDSMHDLVSDLLEFCFYTFRESHALKVEFPAMLVEIISDQLPKVESGNAKPLYFH"
# protein_seq1_int = protein2int(protein_seq1, protein_voc)
# print(protein_seq1_int)#

# embedding = nn.Embedding(21, 32)
# protein_seq1_array = np.asarray([eval(i) for i in protein_seq1_int])
# print("array:",protein_seq1_array)

# protein_seq1_tensor = torch.from_numpy(protein_seq1_array).unsqueeze(0)
# print("tensor size:",protein_seq1_tensor.size())
