# AWD-LSTM in Pytorch

Creating a AWD-LSTM neural network using PyTorch. The network is based on https://arxiv.org/pdf/1708.02182.pdf

## First Create a Language Model to Predict Amino Acid Sequences

In [2]:
import torch
from pathlib import Path
import pandas as pd
import numpy as np

In the PyTorch docs: https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html

We can see that the hidden-to-gidden weight are saved in this manner:
>~LSTM.weight_hh_l[k] – the learnable hidden-hidden weights of the kth\text{k}^{th}kth layer (W_hi|W_hf|W_hg|W_ho), of shape (4*hidden_size, hidden_size)

We can use this to apply WeightDropout or DropConnect to the LSTM layers.
Also using https://pytorchnlp.readthedocs.io/en/latest/_modules/torchnlp/nn/weight_drop.html and FastAI docs https://github.com/fastai/fastai/blob/45376f13df04ddf72749be25ae8a6dff35859f68/fastai/text/models/awdlstm.py as inspiration.

## Load the Data

In [3]:
data_file = Path('/home/mees/Desktop/Machine_Learning/subcellular_location/data/processed/protein_data_2021-02-16.csv')
df = pd.read_csv(data_file, sep=';')
df.head()

Unnamed: 0,Sequence,Subcellular location [CC]
0,MTDTVFSNSSNRWMYPSDRPLQSNDKEQLQAGWSVHPGGQPDRQRK...,Membrane
1,MDTDSQRSHLSSFTMKLMDKFHSPKIKRTPSKKGKPAEVSVKIPEK...,Endosome
2,MEDSTSPKQEKENQEELGETRRPWEGKTAASPQYSEPESSEPLEAK...,Cytoplasm
3,MALPGARARGWAAAARAAQRRRRVENAGGSPSPEPAGRRAALYVHW...,Mitochondrion
4,MALLVDRVRGHWRIAAGLLFNLLVSICIVFLNKWIYVYHGFPNMSL...,pass membrane protein


The first step is creating the language model, therefore, the location is not used.

In [4]:
del df['Subcellular location [CC]']
df.head()

Unnamed: 0,Sequence
0,MTDTVFSNSSNRWMYPSDRPLQSNDKEQLQAGWSVHPGGQPDRQRK...
1,MDTDSQRSHLSSFTMKLMDKFHSPKIKRTPSKKGKPAEVSVKIPEK...
2,MEDSTSPKQEKENQEELGETRRPWEGKTAASPQYSEPESSEPLEAK...
3,MALPGARARGWAAAARAAQRRRRVENAGGSPSPEPAGRRAALYVHW...
4,MALLVDRVRGHWRIAAGLLFNLLVSICIVFLNKWIYVYHGFPNMSL...


## Tokenize the Data

In [5]:
# Set-up numpy generator for random numbers
random_number_generator = np.random.default_rng()

In [6]:
# Tokenize the protein sequence (or any sequence) in kmers.
def tokenize(protein_seqs, kmer_sz):
    kmers = set()
    # Loop over protein sequences
    for protein_seq in protein_seqs:
        # Loop over the whole sequence
        for i in range(len(protein_seq) - (kmer_sz - 1)):
            # Add kmers to the set, thus only unique kmers will remain
            kmers.add(protein_seq[i: i + kmer_sz])
            
    # Map kmers for one hot-encoding
    kmer_to_id = dict()
    id_to_kmer = dict()
    
    for ind, kmer in enumerate(kmers):
        kmer_to_id[kmer] = ind
        id_to_kmer[ind] = kmer
        
    vocab_sz = len(kmers)
    
    assert vocab_sz == len(kmer_to_id.keys())
    
    # Tokenize the protein sequence to integers
    tokenized = []
    for protein_seq in protein_seqs:
        sequence = []
        for i in  range(len(protein_seq) - (kmer_sz -1)):
            # Convert kmer to integer
            kmer = protein_seq[i: i + kmer_sz]
            sequence.append(kmer_to_id[kmer])
            
        tokenized.append(sequence)
            
    
    return tokenized, vocab_sz, kmer_to_id, id_to_kmer

In [7]:
KMER_SIZE = 3

In [8]:
# Tokenize the protein sequence
tokenized_seqs, vocab_sz, kmer_to_id, id_to_kmer = tokenize(df['Sequence'], KMER_SIZE)

In [9]:
vocab_sz

8071

## Build a DataSet

In [19]:
class AminoSequenceDataset(torch.utils.data.Dataset):
    r""" A custom dataset for amino acid sequences.
    """
    def __init__(self, amino_sequences):
        self.amino_sequences = amino_sequences
        
    def __len__(self):
        return len(self.amino_sequences)
    
    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
            
        sample = self.amino_sequences[idx]
        
        return torch.tensor(sample)

In [20]:
data = AminoSequenceDataset(tokenized_seqs)

In [21]:
data[1]

tensor([ 840, 1363, 3132,  ...,  172,  154, 6794])

## Build the Language Model

In [13]:
class WeightDropout(torch.nn.Module):
    "Apply dropout to LSTM's hidden-hidden weights"
    
    def __init__(self, module, weight_p):
        super(WeightDropout, self).__init__()
        self.module = module
        self.weight_p = weight_p
        
        # Save the name of the layer weights in a list
        num_layers = module.num_layers
        layer_base_name = 'weight_hh_l'      
        self.layer_weights = [layer_base_name + str(i) for i in range(num_layers)]
        
        # Make a copy of the weights in weightname_raw
        for weight in self.layer_weights:
            w = getattr(self.module, weight)
            del module._parameters[weight]
            self.module.register_parameter(f'{weight}_raw', torch.nn.Parameter(w))
            
        def _setweights(self):
            "Apply dropout to the raw weights"
            for weight in self.layer_weights:
                raw_w = getattr(self, f'{weight}_raw')
                if self.training:
                    w = torch.nn.F(raw_w, p=self.weight_p)
                else:
                    w = raw_w.clone()
                setattr(self.module, weight, w)
                
        def forward(self, *args):
            self._setweights()
            return self.module(*args)   
            

In [14]:
emb_dim = 400 # Embeddding dimension
hid_sz = 1150 # Hidden size
num_layers = 3 # Number of LSTM layers stacked together

In [15]:
embed_p = 0.1 # Dropout probability on the embedding
hidden_p = 0.3 # Dropout probability on hidden-to-hidden weight matrices
input_p = 0.3 # Dropout probablity on the LSTM input between LSTMS

# This one still has to be implemented
#weight_p = 0.5 # Dropout probability on LSTM-to-LSTM weight matrices

In [16]:
model = torch.nn.Sequential(
    torch.nn.Embedding(vocab_sz, emb_dim),
    torch.nn.Dropout(p=0.1),
    WeightDropout(
        torch.nn.LSTM(input_size = emb_dim, hidden_size = hid_sz, num_layers = num_layers, dropout=input_p),
        hidden_p
    ),
    torch.nn.Linear(emb_dim, vocab_sz)
)

In [17]:
model

Sequential(
  (0): Embedding(8071, 400)
  (1): Dropout(p=0.1, inplace=False)
  (2): WeightDropout(
    (module): LSTM(400, 1150, num_layers=3, dropout=0.3)
  )
  (3): Linear(in_features=400, out_features=8071, bias=True)
)

## Train the Language Model

NotImplementedError: 