In [1]:
import pickle

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

In [39]:
class BaseModule(nn.Module):
    """Small 1-D convolutional regressor producing one value per sample.

    Pipeline: ``Conv1d -> LayerNorm -> Tanh`` (stored as ``self.conv1``),
    followed by an average pool over the whole length axis and a final
    ``Linear(hidden, 1)`` (stored as ``self.linear``).

    The ``LayerNorm`` is built with the shape ``(hidden, conv_output_length)``,
    where the length is derived from ``lin`` at construction time. Inputs must
    therefore have exactly ``lin`` positions along their last axis.

    Args:
        cin: Number of input channels of the convolution.
        hidden: Number of output channels of the convolution, which is also
            the input width of the final linear layer.
        lin: Length of the input along its last axis.
        k_size: Convolution kernel size.
        stride: Convolution stride.
        pad: Zero padding added to both ends of the input.
    """

    def __init__(self,
                 cin: int,
                 hidden: int,
                 lin: int,
                 k_size=3,
                 stride=2,
                 pad=0):
        super().__init__()

        # Output length of a Conv1d without dilation.
        conv_out_len = (lin + 2 * pad - k_size) // stride + 1

        self.conv1 = nn.Sequential(
            nn.Conv1d(cin, hidden,
                      kernel_size=k_size, stride=stride, padding=pad),
            nn.LayerNorm((hidden, conv_out_len)),
            nn.Tanh(),
        )
        self.linear = nn.Linear(hidden, 1)

    def forward(self, x: torch.Tensor):
        """Run the network; the trailing axis of the result has size 1."""
        features = self.conv1(x)
        # Collapse the length axis to a single averaged position, then drop it
        # so the channel axis becomes the last one for the linear layer.
        pooled = F.adaptive_avg_pool1d(features, 1)
        return self.linear(pooled.squeeze(-1))

## Feature preparation for the embedding model

In [40]:
def labeled_protein(df_protein_sequence: pd.Series, strings: str) -> tuple:
    """Encode protein sequences as tensors of integer residue labels.

    Args:
        df_protein_sequence: Iterable of sequence strings. The notebook passes
            a single DataFrame column; a whole DataFrame would iterate over
            its column labels instead, which is presumably not intended.
        strings: Alphabet of residue characters. A character's position in
            this string is its label. If a character occurs more than once,
            the last position is used.

    Returns:
        A ``(labeled, lengths)`` tuple: ``labeled`` is a list with one 1-D
        ``int64`` tensor per sequence, and ``lengths`` is the list of the
        corresponding sequence lengths.

    Raises:
        KeyError: If a sequence contains a character absent from ``strings``.
    """
    index_of = {residue: idx for idx, residue in enumerate(strings)}

    encoded = []
    lengths = []
    for position, protein in enumerate(df_protein_sequence):
        try:
            indices = [index_of[residue] for residue in protein]
        except KeyError as err:
            # Same exception type as a plain dict lookup, with enough context
            # to find the offending sequence.
            raise KeyError(
                f"sequence #{position} contains residue {err.args[0]!r} "
                f"which is not in the alphabet {strings!r}"
            ) from None
        encoded.append(torch.tensor(indices, dtype=torch.long))
        lengths.append(len(indices))

    return encoded, lengths

In [41]:
def emb_with_pad(labeled_df: list,
                 strings: str,
                 emb_dim: int,
                 length: list,
                 pad = False) -> list:
    """Embed label tensors and optionally zero-pad them to a common length.

    A new ``nn.Embedding(len(strings), emb_dim)`` with freshly initialised
    weights is created on every call and is not returned, so results differ
    between calls unless the torch RNG is seeded beforehand.

    The input list is left untouched; a new list is returned. This keeps the
    calling notebook cell re-runnable.

    Args:
        labeled_df: List of 1-D integer tensors of residue labels.
        strings: Alphabet; only its length is used (number of embeddings).
        emb_dim: Size of each embedding vector.
        length: Sequence lengths; with ``pad=True`` every output is padded to
            ``max(length)``.
        pad: When true, append zero vectors so all outputs share one length.

    Returns:
        List of tensors of shape ``(emb_dim, L)``, where ``L`` is
        ``max(length)`` when ``pad`` is true and the sequence's own length
        otherwise. An empty ``labeled_df`` yields an empty list.
    """
    if len(labeled_df) == 0:
        return []

    em = nn.Embedding(len(strings), emb_dim)
    # The target length only matters when padding was requested.
    max_len = max(length) if pad else None

    embedded = []
    for protein in labeled_df:
        vectors = em(protein)                      # (L, emb_dim)
        if pad:
            # new_zeros keeps dtype/device in line with the embedding output.
            filler = vectors.new_zeros((max_len - len(vectors), emb_dim))
            vectors = torch.cat((vectors, filler))
        embedded.append(vectors.transpose(1, 0))   # (emb_dim, L)

    return embedded

In [67]:
from mymodule import except_outlier

# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load pickles produced by this project or another trusted source.
with open('dataset/featured/base.pkl', 'rb') as f:
    data = pickle.load(f)
# except_outlier is defined in mymodule and not visible here; presumably it
# filters rows based on the 'sequence_len' column — verify against mymodule.
data = except_outlier(data, 'sequence_len')

# Residue alphabet: a letter's position is its embedding index, so every
# letter must appear exactly once. 'X' has to be present, otherwise any
# sequence containing it fails the label lookup.
strings = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
assert len(set(strings)) == len(strings), 'alphabet contains duplicate letters'

proteins_sequence = data['protein_sequence']
labeled_df, length = labeled_protein(proteins_sequence, strings)

emb_dim = 4
embedded_df = emb_with_pad(labeled_df, strings, emb_dim, length, pad=True)

# Padding makes every per-sequence tensor the same shape, so they can be
# stacked along a new leading axis.
embedded_df = torch.stack(embedded_df, dim=0)
embedded_df.shape

torch.Size([29373, 4, 1011])

## Test: sanity-check the BaseModule forward pass

In [None]:
# cin = 4
# hidden = 7
# lin = 10
# test = torch.randn(3, cin, lin)

# def conv_len(cin, hidden, lin,\
#                     k_size=3, stride=2, pad=0):
#     return (lin + 2*pad - k_size)//stride + 1

# x = nn.Conv1d(cin, hidden, kernel_size=3, stride=2, padding=0)(test)
# #print('---'*20)
# #print(x.shape)
# #print(x)
# x = nn.LayerNorm((hidden, conv_len(cin, hidden, lin, 3, 2, 0)))(x)
# #print('---'*20)
# #print(x)
# tanh = nn.Tanh()
# x = tanh(x)
# #print('---'*20)
# print(x.shape)
# x = F.adaptive_avg_pool1d(x, (1))
# print('---'*20)
# print(x.shape)
# linear = nn.Linear(hidden, 1)
# x_reshape = x.shape[:-1]
# x = linear(x.reshape(x_reshape))
# print('---'*20)
# print(x)

In [None]:
# batch, cin, lin = (3, 3, 10)
# hidden = 7
# test = torch.randn(batch, cin, lin)

# model = BaseModule(cin, hidden, lin)
# model(test)