In [1]:
import re
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch import Tensor, nn

device = (
    "cuda:0"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cuda:0 device


## Load Dataset

Embed each nt with a 0/1 vector [x,x,x,x]

In [2]:
DEG_dict = {   ## degenerative dict
    'A':'A',
    'U':'U',
    'C':'C',
    'G':'G',
    'R':'AG',
    'Y':'CT',
    'M':'AC',
    'K':'GT',
    'S':'GC',
    'W':'AT',
    'H':'ATC',
    'B':'GTC',
    'V':'GAC',
    'D':'GAT',
    'N':'ATCG'
    }

ORDER = list('AUCG')

def embed(nt):
    emb = [0 for i in ORDER]
    try:
        for x in list(DEG_dict[nt]):
            emb[ORDER.index(x)] = 1
    except:
        return [1 for i in ORDER]
    return emb


def tokenize_embed(seq):
    return [embed(nt) for nt in seq]

In [3]:
def read_fasta(file):
    data_header = []
    data_seq = []
    with open(file,'r') as f:
        while True:
            lineH = f.readline().strip()      
            lineS = f.readline().strip()   
            if not lineS:
                break
            data_header.append(lineH)
            data_seq.append(lineS)
    return list(zip(data_header,data_seq)) 

def header_parser_encoder(header):
    if 'Archaea' in header.split(';')[0]:
        return 0
    else:
        assert 'Bacteria' in header.split(';')[0]
        return 1


def collate_batch(data_batch, dtype=torch.float32):
    header_batch, seq_batch = [], []
    for header,seq in data_batch:
        header_batch.append(header_parser_encoder(header))
        seq_batch.append(torch.tensor(tokenize_embed(seq), dtype=dtype))
    header_batch = torch.tensor(header_batch, dtype=dtype)
    seq_batch = torch.nn.utils.rnn.pad_sequence(seq_batch, padding_value=float(0), batch_first=True) ## padded to equal
    return header_batch.to(device),seq_batch.permute(0,2,1).to(device)    ## [batch, embd(channal), seqlen]    

In [4]:
KMER = 10   
BATCH_SIZE = 512
EMBED_SIZE = 120
CLASS_NUM = 2

train_dl = torch.utils.data.DataLoader(read_fasta('/kaggle/input/a000000/train.fa'), batch_size=BATCH_SIZE, shuffle=True, 
                                       collate_fn = lambda x: collate_batch(x, dtype=torch.float32) )

In [5]:
# for tt in train_dl:
#     break

# tt[1].size()
# tt[0].size(), tt[1].size(), tt[0][0], tt[1][0]

## Model

input: 16S seq

output: Archaea/Bacteria


CNN --- LSTM --- Classifier

Todo: Optimize the model, the current one seems not functioning

Todo: Do not input strings, input kmer_abduance_list instead? (for small kmers, or the list could be too huge)

In [6]:
def train_epoch(dataloader, model, loss_fn, optimizer):
    lossSum = 0
    correctSum = 0
    model.train()                                    ### set training mode
    for (header_batch, seq_batch) in dataloader:
        pred = model(seq_batch)
        # Compute prediction error
        loss = loss_fn(pred.squeeze(-1),header_batch)
        lossSum += loss.item()
        correctSum += (pred.squeeze(-1).ge(1/2) == header_batch).type(torch.float).sum().item()
        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        print('>',end='')
    avgTrainingLoss = lossSum/len(dataloader)
    avgTrainingAcc  = correctSum/len(dataloader.dataset)
    return avgTrainingLoss,avgTrainingAcc


In [7]:
class ClassifierBinary(nn.Module):
    def __init__(self, sizeA):  ## out_class=2 is a must
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(sizeA, sizeA),
            nn.Dropout(),
            nn.ReLU(),
            
            nn.Linear(sizeA, sizeA),
            nn.Dropout(),
            nn.ReLU(),
            nn.Linear(sizeA, 1),
            nn.Sigmoid()
        )
    def forward(self, x):
        return self.mlp(x)


class CNN_RNN_Net(nn.Module):
    def __init__(self, kmer, out_class):
        super().__init__()
        self.cnn = nn.Sequential(
            nn.Conv1d(in_channels = 4, 
                      out_channels = 10,
                      kernel_size = kmer,
                      stride = 2),
            nn.MaxPool1d(kernel_size = kmer, stride = 2),
            nn.ReLU(),
        )
        self.lstm = nn.LSTM(
            input_size=10, hidden_size=20, num_layers=1, batch_first=True, bidirectional=False
        )
        self.linearC = ClassifierBinary(20)
        
    def forward(self, input_batch):
        x = self.cnn(input_batch)
        x = x.permute(0,2,1)
        _, (x, _) = self.lstm(x)
        x = x.squeeze(0)
        x = self.linearC(x)
        return x



# for (h,s) in train_dl:
#     break

# s.size(),CNN_RNN_Net(7,2 )(s).size()


In [None]:
Model =  CNN_RNN_Net(KMER, 2).to(device)
loss_fn = nn.BCELoss()
optimizer = torch.optim.SGD(Model.parameters(), lr=1e-5)

epochs = 30
for t in range(epochs):
    avgTrainingLoss,avgTrainingAcc = train_epoch(train_dl, Model, loss_fn, optimizer)
    print(f'Epoch {t+1}--Training loss:: {avgTrainingLoss:>7f}--Training Acc:: {avgTrainingAcc:>7f}') 

>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Epoch 1--Training loss:: 0.697205--Training Acc:: 0.499950
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Epoch 2--Training loss:: 0.697040--Training Acc:: 0.500000
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>Epoch 3--Training loss:: 0.697684--Training Acc:: 0.500000
>>>>>>>>>>>>>>>>>>>>