# AWD-LSTM in Pytorch

Creating a AWD-LSTM neural network using PyTorch. The network is based on https://arxiv.org/pdf/1708.02182.pdf

In [2]:
import torch
from pathlib import Path
import pandas as pd

In the PyTorch docs: https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html

We can see that the hidden-to-gidden weight are saved in this manner:
>~LSTM.weight_hh_l[k] – the learnable hidden-hidden weights of the kth\text{k}^{th}kth layer (W_hi|W_hf|W_hg|W_ho), of shape (4*hidden_size, hidden_size)

We can use this to apply WeightDropout or DropConnect to the LSTM layers.
Also using https://pytorchnlp.readthedocs.io/en/latest/_modules/torchnlp/nn/weight_drop.html and FastAI docs https://github.com/fastai/fastai/blob/45376f13df04ddf72749be25ae8a6dff35859f68/fastai/text/models/awdlstm.py as inspiration.

In [3]:
class WeightDropout(torch.nn.Module):
    "Apply dropout to LSTM's hidden-hidden weights"
    
    def __init__(self, module, weight_p):
        super(WeightDropout, self).__init__()
        self.module = module
        self.weight_p = weight_p
        
        # Save the name of the layer weights in a list
        num_layers = module.num_layers
        layer_base_name = 'weight_hh_l'      
        self.layer_weights = [layer_base_name + str(i) for i in range(num_layers)]
        
        # Make a copy of the weights in weightname_raw
        for weight in self.layer_weights:
            w = getattr(self.module, weight)
            del module._parameters[weight]
            self.module.register_parameter(f'{weight}_raw', torch.nn.Parameter(w))
            
        def _setweights(self):
            "Apply dropout to the raw weights"
            for weight in self.layer_weights:
                raw_w = getattr(self, f'{weight}_raw')
                if self.training:
                    w = torch.nn.F(raw_w, p=self.weight_p)
                else:
                    w = raw_w.clone()
                setattr(self.module, weight, w)
                
        def forward(self, *args):
            self._setweights()
            return self.module(*args)   
            

In [4]:
vocab_sz = 10
emb_dim = 400 # Embeddding dimension
hid_sz = 1150 # Hidden size
num_layers = 3 # Number of LSTM layers stacked together

In [5]:
embed_p = 0.1 # Dropout probability on the embedding
hidden_p = 0.3 # Dropout probability on hidden-to-hidden weight matrices
input_p = 0.3 # Dropout probablity on the LSTM input between LSTMS

# This one still has to be implemented
#weight_p = 0.5 # Dropout probability on LSTM-to-LSTM weight matrices

In [6]:
model = torch.nn.Sequential(
    torch.nn.Embedding(vocab_sz, emb_dim),
    torch.nn.Dropout(p=0.1),
    WeightDropout(
        torch.nn.LSTM(input_size = emb_dim, hidden_size = hid_sz, num_layers = num_layers, dropout=input_p),
        hidden_p
    ),
    torch.nn.Linear(emb_dim, vocab_sz)
)

In [7]:
model

Sequential(
  (0): Embedding(10, 400)
  (1): Dropout(p=0.1, inplace=False)
  (2): WeightDropout(
    (module): LSTM(400, 1150, num_layers=3, dropout=0.3)
  )
  (3): Linear(in_features=400, out_features=10, bias=True)
)

Open data in pandas dataframe

In [8]:
data_file = Path('/home/mees/Desktop/Machine_Learning/subcellular_location/data/processed/protein_data_2021-02-16.csv')
df = pd.read_csv(data_file, sep=';')
df.head()

Unnamed: 0,Sequence,Subcellular location [CC]
0,MTDTVFSNSSNRWMYPSDRPLQSNDKEQLQAGWSVHPGGQPDRQRK...,Cytoplasmic vesicle
1,MDTDSQRSHLSSFTMKLMDKFHSPKIKRTPSKKGKPAEVSVKIPEK...,Early endosome
2,MEDSTSPKQEKENQEELGETRRPWEGKTAASPQYSEPESSEPLEAK...,Cytoplasm
3,MALPGARARGWAAAARAAQRRRRVENAGGSPSPEPAGRRAALYVHW...,Mitochondrion
4,MALLVDRVRGHWRIAAGLLFNLLVSICIVFLNKWIYVYHGFPNMSL...,Membrane


In [9]:
classes = set(df['Subcellular location [CC]'])
classes

{'Apical cell membrane',
 'Basolateral cell membrane',
 'Cell junction',
 'Cell membrane',
 'Cell projection',
 'Cell surface',
 'Chromosome',
 'Cytoplasm',
 'Cytoplasmic granule',
 'Cytoplasmic granule lumen',
 'Cytoplasmic granule membrane',
 'Cytoplasmic vesicle',
 'Cytoplasmic vesicle membrane',
 'Early endosome',
 'Early endosome membrane',
 'Endomembrane system',
 'Endoplasmic reticulum',
 'Endoplasmic reticulum lumen',
 'Endoplasmic reticulum membrane',
 'Endosome',
 'Endosome membrane',
 'Extracellular vesicle membrane',
 'Golgi apparatus',
 'Golgi apparatus lumen',
 'Golgi apparatus membrane',
 'Isoform 1',
 'Isoform 2',
 'Late endosome',
 'Late endosome membrane',
 'Lateral cell membrane',
 'Lipid droplet',
 'Lysosome',
 'Lysosome lumen',
 'Lysosome membrane',
 'Melanosome',
 'Melanosome membrane',
 'Membrane',
 'Membrane raft',
 'Microsome',
 'Microsome membrane',
 'Midbody',
 'Mitochondrion',
 'Mitochondrion inner membrane',
 'Mitochondrion intermembrane space',
 'Mitochond

In [10]:
pretty_classes = set()
for _class in list(classes):
    if 'cell membrane' in _class:
        pretty_classes.add('Cell membrane')
    elif 'cytoplasmic granule' in _class:
        pretty_classes.add('cytoplasmic granule')
    elif 'endosome' in _class.lower():
        pretty_classes.add('Endosome')
    elif 'Golgi apparatus' in _class:
        pretty_classes.add('Golgi apparatus')
    elif 'Late endosome' in _class:
        pretty_classes.add('Late endosome')
    elif 'Lysosome' in _class:
        pretty_classes.add('Lysosome')
    elif 'Melanosome' in _class:
        pretty_classes.add('Melanosome')
    elif 'Mitochondrien' in _class:
        pretty_classes.add('Mitochondrien')
    elif 'Nucleus' in _class:
        pretty_classes.add('Nucleus')
    elif 'Peroxisome' in _class:
        pretty_classes.add('Peroxisome')
    elif 'Photoreceptor' in _class:
        pretty_classes.add('Photoreceptor')
    elif 'Preautophagosomal' in _class:
        pretty_classes.add('Preautophagosomal')
    elif 'Rough endoplasmic reticulum' in _class:
        pretty_classes.add('Rough endoplasmic reticulum')
    elif 'Sarcoplasmic reticulum' in _class:
        pretty_classes.add('Sarcoplasmic reticulum')
    else:
        pretty_classes.add(_class)

In [12]:
len(pretty_classes)

50