In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
import torch.utils.data as Data
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [2]:
import time
import hashlib

import numpy as np
import pandas as pd



In [3]:
data = pd.read_csv("data/data.csv").to_numpy()

In [4]:
vocab = np.array(['T', 'C', 'G', 'A'])
vocab = np.insert(vocab, 0, '')
chunk_tags = np.unique(data[:, 0])
print(vocab, chunk_tags)

['' 'T' 'C' 'G' 'A'] ['Cytoplasm' 'Exosome' 'Nucleus' 'Ribosome']


In [5]:
def k_mers(data, n):
    arrays = []
    for s in data:
        array = []
        for i in range(len(s) - n):
            array.append(s[i:i + n].lower())
        arrays.append(array)
    return arrays

In [6]:
data[:,1] = k_mers(data[:,1], 5)

In [7]:
from collections import Counter

def build_vocab(data):
    word_counts = Counter(row.lower() for sample in data for row in sample)
    vocab = [w for w, f in iter(word_counts.items()) if f >= 2]
    return vocab

In [8]:
vocab = build_vocab(data[:, 1])

In [9]:
def pad_sequences(arrays, maxlen, value=0):
    mask = []
    for i in range(len(arrays)):
        arrays[i] = arrays[i][:maxlen]
        while(len(arrays[i]) < maxlen):
            arrays[i].append(value)
        mask.append(np.where(arrays[i] == value))
    return arrays, mask

def process_data(data, vocab, chunk_tags, maxlen=None, onehot=False):

    word2idx = dict((w, i) for i, w in enumerate(vocab)) # 製作 word 2 id array
    x = [[word2idx.get(w, 1) for w in s[1]] for s in data]  # set to <unk> (index 1) if not in vocab
    
    y_chunk = [list(chunk_tags).index(s[0]) for s in data]

    x, mask = pad_sequences(x, maxlen)  # left padding
    # y_chunk = pad_sequences(y_chunk, maxlen, value=-1)

    if onehot:
        y_chunk = np.eye(len(chunk_tags), dtype='float32')[y_chunk]
    
    return x, mask, y_chunk

x, mask, y_chunk = process_data(data, vocab, chunk_tags, maxlen=10000, onehot=False)


In [10]:
import torch
import torch.nn as nn

class Bilstm(torch.nn.Module):
    """BiLSTM Model 的初始化

    Args:
        nvocab (int): 詞庫 size
        ntoken (int): 輸出的類別 size
        ninp (int): 詞向量的 size
        nhid (int): 隱藏層 size
        nlayers (int): LSTM的層數
        dropout (float, optional): 如果不為零，則在除最後一層以外的每個LSTM層的輸出上引入一個Dropout層，其丟棄概率等於 dropout . Defaults to 0.0.
        batch_first (bool, optional): 如果為True，則將輸入和輸出張量提供為（批次，序列）. Defaults to True.
    """
    def __init__(self, nvocab: int, ntoken: int, ninp: int, nhid: int, nlayers: int, dropout: float=0.0, batch_first: bool=True):
        
        super(Bilstm, self).__init__()
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(nvocab, ninp, padding_idx=0)
        self.rnn = nn.LSTM(ninp, nhid, nlayers, dropout=dropout, bidirectional=True, batch_first=batch_first)
        self.decoder = nn.Linear(nhid, ntoken)

        self.init_weights()

        self.nhid = nhid
        self.nlayers = nlayers

    def init_weights(self):
        """初始化 embedding, liner 兩層的 weight
        """
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, x: torch.tensor):
        """運算一次

        Args:
            x (torch.tensor): 詞 tensor。如果batch_first=True，x為（批次，序列），否則（序列，批次）。

        Returns:
            [torch.tensor]: 運算完的3D向量，多了一維類別機率分數。
            [torch.tensor]: LSTM層的Hidden weight。
        """
        batch_size = x.size()[0]

        emb = self.drop(self.encoder(x))
        
        hid = self.init_hidden(batch_size)
        output, (final_hidden_state, final_cell_state) = self.rnn(emb, hid)
        final_hidden_state = self.drop(final_hidden_state)
        decoded = self.decoder(final_hidden_state[-1])
        return decoded

    def init_hidden(self, bsz: int):
        """取得LSTM shape size 的初始化  weight

        Args:
            bsz (int): batch_size

        Returns:
            [tuple]: LSTM shape size 的初始化  weight
        """
        weight = next(self.parameters())
        return (weight.new_zeros(self.nlayers * 2, bsz, self.nhid), weight.new_zeros(self.nlayers * 2, bsz, self.nhid))
    
    def predict(self, x: torch.tensor):
        """預測並輸出機率大的類別

        Args:
            x (torch.tensor): 詞 tensor。如果batch_first=True，input shape為（批次，序列），否則（序列，批次）。

        Returns:
            [torch.tensor]: shape 與 x 一樣，但是序列為類別序列。
        """
        decoded, _  = self.forward(x)
        _, output = decoded.max(dim=2)

        return output

In [11]:
nvocab = len(vocab)
ntoken = len(chunk_tags)

ninp = 32
nhid = 32
nlayers = 2

model = Bilstm(nvocab, ntoken, ninp, nhid, nlayers)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-4)

batch_size = 32
epochs = 50

tensor_x = torch.tensor(x).to(torch.int64)
tensor_mask = torch.tensor(mask).to(torch.int64)
tensor_y = torch.tensor(y_chunk).to(torch.int64)


dataset = Data.TensorDataset(tensor_x, tensor_mask, tensor_y)

train_set, valid_set = Data.random_split(dataset, [int(len(dataset) * 0.8), len(dataset) - int(len(dataset) * 0.8)], generator=torch.Generator().manual_seed(42))

loader = Data.DataLoader(
    dataset = train_set,
    batch_size = batch_size,
)

start_time = time.time()
step_size = len(loader)
for epoch in range(epochs):
    for step, (batch_x, batch_mask, batch_y) in enumerate(loader):
        step_time = time.time()
        model.zero_grad()
        pred = model(batch_x)
        
        loss = loss_fn(pred.softmax(dim=1), batch_y)
        print('Epoch: %i | Step: %i/%i | Loss: %.2f | time: %.2f s' % (epoch, step, step_size, loss, time.time() - step_time))
        loss.backward()
        optimizer.step()

print('all time : ', time.time() - start_time,'s')

Epoch: 0 | Step: 0/15 | Loss: 1.39 | time: 1.96 s
Epoch: 0 | Step: 1/15 | Loss: 1.39 | time: 1.91 s
Epoch: 0 | Step: 2/15 | Loss: 1.39 | time: 2.05 s
Epoch: 0 | Step: 3/15 | Loss: 1.39 | time: 1.98 s
Epoch: 0 | Step: 4/15 | Loss: 1.39 | time: 1.97 s
Epoch: 0 | Step: 5/15 | Loss: 1.39 | time: 1.97 s
Epoch: 0 | Step: 6/15 | Loss: 1.39 | time: 2.13 s
Epoch: 0 | Step: 7/15 | Loss: 1.39 | time: 2.25 s
Epoch: 0 | Step: 8/15 | Loss: 1.39 | time: 2.04 s
Epoch: 0 | Step: 9/15 | Loss: 1.39 | time: 2.32 s
Epoch: 0 | Step: 10/15 | Loss: 1.39 | time: 2.13 s
Epoch: 0 | Step: 11/15 | Loss: 1.38 | time: 2.10 s
Epoch: 0 | Step: 12/15 | Loss: 1.38 | time: 2.08 s
Epoch: 0 | Step: 13/15 | Loss: 1.38 | time: 2.09 s
Epoch: 0 | Step: 14/15 | Loss: 1.39 | time: 1.06 s
Epoch: 1 | Step: 0/15 | Loss: 1.38 | time: 2.11 s
Epoch: 1 | Step: 1/15 | Loss: 1.38 | time: 2.08 s
Epoch: 1 | Step: 2/15 | Loss: 1.39 | time: 2.08 s
Epoch: 1 | Step: 3/15 | Loss: 1.38 | time: 2.10 s
Epoch: 1 | Step: 4/15 | Loss: 1.38 | time: 2.

KeyboardInterrupt: 

In [None]:
seq_list = list(df['Seq'])
loc_list = list(df['sublocation'])
for i in range(len(df.index)):
    df['Seq'][i] = hashlib.md5(df['Seq'][i].encode('utf-8')).hexdigest()

In [None]:
df.to_csv("data/data_MD5.csv", index=False)