In [34]:
import sklearn
from pathlib import Path
import torch
import pandas as pd
import pickle
import torch.nn as nn

In [36]:
if torch.cuda.is_available():  
    dev = "cuda:0" 
else:  
    dev = "cpu" 

In [19]:
data_file = Path('data/processed/protein_data_2021-04-04.csv')
data_file = file_paths / data_file
df = pd.read_csv(data_file, sep=';')
df.head()

Unnamed: 0,Sequence,Subcellular location [CC],Location
0,MTDTVFSNSSNRWMYPSDRPLQSNDKEQLQAGWSVHPGGQPDRQRK...,"SUBCELLULAR LOCATION: Cytoplasmic vesicle, sec...",Cytoplasm
1,MDTDSQRSHLSSFTMKLMDKFHSPKIKRTPSKKGKPAEVSVKIPEK...,SUBCELLULAR LOCATION: Early endosome {ECO:0000...,Endosome
2,MEDSTSPKQEKENQEELGETRRPWEGKTAASPQYSEPESSEPLEAK...,"SUBCELLULAR LOCATION: Cytoplasm, cytoskeleton,...",Cytoplasm
3,MALPGARARGWAAAARAAQRRRRVENAGGSPSPEPAGRRAALYVHW...,SUBCELLULAR LOCATION: Mitochondrion {ECO:00003...,Mitochondrion
4,MALLVDRVRGHWRIAAGLLFNLLVSICIVFLNKWIYVYHGFPNMSL...,SUBCELLULAR LOCATION: Membrane {ECO:0000305}; ...,Cell membrane


In [20]:
df.drop(columns=['Subcellular location [CC]'], inplace=True)
df.head()

Unnamed: 0,Sequence,Location
0,MTDTVFSNSSNRWMYPSDRPLQSNDKEQLQAGWSVHPGGQPDRQRK...,Cytoplasm
1,MDTDSQRSHLSSFTMKLMDKFHSPKIKRTPSKKGKPAEVSVKIPEK...,Endosome
2,MEDSTSPKQEKENQEELGETRRPWEGKTAASPQYSEPESSEPLEAK...,Cytoplasm
3,MALPGARARGWAAAARAAQRRRRVENAGGSPSPEPAGRRAALYVHW...,Mitochondrion
4,MALLVDRVRGHWRIAAGLLFNLLVSICIVFLNKWIYVYHGFPNMSL...,Cell membrane


In [21]:
def create_vocab(df, protein_seqs_column, kmer_sz, stride=1, eos_token=True):
    kmers = set()
        
    # Map kmers for one-hot encoding
    kmer_to_id = dict()
    id_to_kmer = dict()

    # Loop over the protein sequences
    for protein_seq in df[protein_seqs_column]:
        # Loop over the sequence and add the amino acid if it is not in kmers set.
        seq_len = len(protein_seq)


        for i in range(0, seq_len - (kmer_sz - 1), stride):

            kmer = protein_seq[i: i + kmer_sz]

            if kmer not in list(kmers):
                ind = len(kmers)
                kmers.add(kmer)

                # Also create the dictionary
                kmer_to_id[kmer] = ind
                id_to_kmer[ind] = kmer

    if eos_token:
        token = '<EOS>'
        ind = len(kmers)
        
        kmers.add(token)

        # Also create the dictionary
        kmer_to_id[token] = ind
        id_to_kmer[ind] = token

    vocab_sz = len(kmers)

    assert vocab_sz == len(kmer_to_id.keys())
    
    return kmer_to_id, id_to_kmer, vocab_sz

In [22]:
def tokenize(df, protein_seqs_column, kmer_sz, stride=1, eos_token=True, premade_vocab=False):
    
    
    # Create the vocabulary
    if not premade_vocab:
        
        kmer_to_id, id_to_kmer, vocab_sz = create_vocab(df, protein_seqs_column, kmer_sz, stride, eos_token)
                
    else:
        kmer_to_id, id_to_kmer = premade_vocab
        vocab_sz = len(kmer_to_id.keys())
            
    # Tokenize the sequences in the DF

    tokenized = []
    for i, protein_seq in enumerate(df[protein_seqs_column], 0):
        sequence = []
        
        # If the kmer can't be found these indexes should be deleted
        remove_idxs = []
        
        # Loop over the protein sequence
        for i in  range(len(protein_seq) - (kmer_sz -1)):
            # Convert kmer to integer
            kmer = protein_seq[i: i + kmer_sz]
            
            # For some reason, some kmers miss. Thus these sequences have to be removed
            try:
                sequence.append(kmer_to_id[kmer])
            except:
                remove_idxs.append(i)
                
        if eos_token:
            sequence.append(kmer_to_id['<EOS>'])
            
        tokenized.append(sequence)
            
    df['tokenized_seqs'] = tokenized
    
    df.drop(remove_idxs, inplace=True)
    
    return df, vocab_sz, kmer_to_id, id_to_kmer

In [23]:
# Load the vocabolary from the Language Model
vocab_save_file = Path('data/interim/AA_vocab.pkl')
vocab_save_file = file_paths / vocab_save_file
vocab = pickle.load(open(vocab_save_file, 'rb'))

In [24]:
KMER_SIZE = 1 # Single Amino Acids

In [25]:
# Tokenize the protein sequence
df, vocab_sz, kmer_to_id, id_to_kmer = tokenize(df, 'Sequence', KMER_SIZE, premade_vocab=vocab)

In [26]:
df.head()

Unnamed: 0,Sequence,Location,tokenized_seqs
0,MTDTVFSNSSNRWMYPSDRPLQSNDKEQLQAGWSVHPGGQPDRQRK...,Cytoplasm,"[0, 3, 17, 3, 5, 6, 14, 1, 14, 14, 1, 18, 8, 0..."
1,MDTDSQRSHLSSFTMKLMDKFHSPKIKRTPSKKGKPAEVSVKIPEK...,Endosome,"[0, 17, 3, 17, 14, 10, 18, 14, 19, 2, 14, 14, ..."
2,MEDSTSPKQEKENQEELGETRRPWEGKTAASPQYSEPESSEPLEAK...,Cytoplasm,"[0, 12, 17, 14, 3, 14, 9, 13, 10, 12, 13, 12, ..."
3,MALPGARARGWAAAARAAQRRRRVENAGGSPSPEPAGRRAALYVHW...,Mitochondrion,"[0, 4, 2, 9, 16, 4, 18, 4, 18, 16, 8, 4, 4, 4,..."
4,MALLVDRVRGHWRIAAGLLFNLLVSICIVFLNKWIYVYHGFPNMSL...,Cell membrane,"[0, 4, 2, 2, 5, 17, 18, 5, 18, 16, 19, 8, 18, ..."


In [27]:
class EmbeddingDropout(torch.nn.Module):
    "Apply dropout to an Embedding with probability emp_p"

    def __init__(self, emb_p=0):
        super(EmbeddingDropout, self).__init__()
        
        self.emb_p = emb_p

    def forward(self, inp):

        bs, sl = inp.shape[:2]
        
        drop = torch.nn.Dropout(self.emb_p)
        placeholder = torch.ones((bs, sl, 1)).to(dev)
        mask = drop(placeholder)      
        out = inp * mask

        return out

In [42]:
class AWD_LSTM(torch.nn.Module):
    def __init__(self, num_layers, vocab_sz, emb_dim, hid_sz, hidden_p, embed_p, input_p, weight_p, batch_sz = 1, pad_token=False):
        super(AWD_LSTM, self).__init__()
        
        # Embedding with dropout
        if pad_token:
            self.encoder = nn.Embedding(vocab_sz, emb_dim, padding_idx=pad_token)
        else:
            self.encoder = nn.Embedding(vocab_sz, emb_dim, padding_idx=pad_token)
            
        self.emb_drop = EmbeddingDropout(emb_p=embed_p)

        
        # Dropouts on the inputs and the hidden layers
        self.hid_dp = torch.nn.Dropout(p=hidden_p)
        
        self.lstms = nn.LSTM(emb_dim, hid_sz, num_layers, batch_first = True, dropout=input_p)

        
        # Save all variables        
        self.num_layers = num_layers
        self.vocab_sz = vocab_sz
        self.emb_dim = emb_dim
        self.hid_sz = hid_sz
        self.hidden_p = hidden_p
        self.embed_p = embed_p
        self.input_p = input_p
        self.weight_p = weight_p
        self.batch_sz = batch_sz

        # Initialize hidden layers        
        self.reset_hidden()
        self.last_hiddens = (self.hidden_state, self.cell_state)
                
    def forward(self, xs):
        """Forward pass AWD-LSTM""" 
        
        bs, sl = xs.shape
        
        # Because sequences consisting of only padding are removed from the mini-batch, the mini-batch alters
        # Therefore we have to adjust the hidden state for that
        if bs != self.last_hiddens[0].shape[1]:
            self._change_bs_hidden(bs)
        
        hidden_states = []
        
        hiddens = self.last_hiddens

        embed = self.encoder(xs)
        embed_dp = self.emb_drop(embed)

        
        inp = embed_dp.view(bs, sl, -1)
        
        # Dropout on hidden layers
        hiddens_dp = []
        for hidden_state in hiddens:
            hiddens_dp.append(self.hid_dp(hidden_state))
            
        hiddens_dp = tuple(hiddens_dp)
        
        output, (h, c) = self.lstms(embed_dp.view(bs, sl, -1), hiddens_dp)
        
        self.last_hiddens = (h.detach(), c.detach())
        
        return output, self.last_hiddens
    
    def reset_hidden(self):
        self.hidden_state = torch.zeros((self.num_layers, self.batch_sz, self.hid_sz)).to(dev)
        self.cell_state = torch.zeros((self.num_layers, self.batch_sz, self.hid_sz)).to(dev)
        self.last_hiddens = (self.hidden_state, self.cell_state)
    
    def _change_bs_hidden(self, bs):
        hidden_state = self.last_hiddens[0]
        cell_state = self.last_hiddens[1]
        
        if bs > hidden_state.shape[1]:
            self.batch_sz = bs
            self.reset_hidden()
        else:
            corr_hidden_state = hidden_state[:,:bs,:]
            corr_cell_state = cell_state[:,:bs,:]
        
            self.last_hiddens = (corr_hidden_state, corr_cell_state)
    
    def freeze_to(self , n):
        
        params_to_freeze = n * 4 + 1 # Since each LSTM layer has 4 parameters plus 1 to also freeze the encoder
        
        total_params = len(list(self.parameters()))
        
        for i, parameter in enumerate(self.parameters()):
            parameter.requires_grad = True
            
            if i < params_to_freeze:
                parameter.requires_grad = False
            
            
        for name, parameter in self.named_parameters():
            print(name)
            print(parameter.requires_grad)

In [43]:
# Hyperparameters
emb_dim = 10 # Embeddding dimension
hid_sz = 400 # Hidden size
num_layers = 20 # Number of LSTM layers stacked together
seq_len = num_layers
bs = 8

# Dropout parameters

embed_p = 0.1 # Dropout probability on the embedding
hidden_p = 0.3 # Dropout probability on hidden-to-hidden weight matrices
# Dropout tussen de inputs van de LSTMs moet ik er nog in bouwen
input_p = 0.3 # Dropout probablity on the LSTM input between LSTMS
weight_p = 0.5 # Dropout probability on LSTM-to-LSTM weight matrices

In [44]:
file_paths = Path('/home/mees/Desktop/Machine_Learning/subcellular_location')

In [45]:
pretrained_model = file_paths / Path('models/3_percent_single_AA_v1.pt')

In [46]:
pretrained_model

PosixPath('/home/mees/Desktop/Machine_Learning/subcellular_location/models/3_percent_single_AA_v1.pt')

In [47]:
pad_token = vocab_sz - 1

In [48]:
AWD_LSTM = AWD_LSTM(num_layers, vocab_sz, emb_dim, hid_sz, hidden_p, 
                                embed_p, input_p, weight_p, batch_sz=bs, pad_token=pad_token)

In [50]:
AWD_LSTM = torch.load(pretrained_model, map_location=torch.device(dev))

UnpicklingError: NEWOBJ class argument isn't a type object