# Aminoacid Language Model

First make a language model, then make it into a classifier.

In [1]:
import torch
from pathlib import Path
import pandas as pd
import numpy as np

In the PyTorch docs: https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html

We can see that the hidden-to-gidden weight are saved in this manner:
>~LSTM.weight_hh_l[k] – the learnable hidden-hidden weights of the kth\text{k}^{th}kth layer (W_hi|W_hf|W_hg|W_ho), of shape (4*hidden_size, hidden_size)

We can use this to apply WeightDropout or DropConnect to the LSTM layers.
Also using https://pytorchnlp.readthedocs.io/en/latest/_modules/torchnlp/nn/weight_drop.html and FastAI docs https://github.com/fastai/fastai/blob/45376f13df04ddf72749be25ae8a6dff35859f68/fastai/text/models/awdlstm.py as inspiration.

Also look at github.com/salesforce/awd-lstm-lm/blob/master/ for inspiration

## Load the Data

In [2]:
data_file = Path('/home/mees/Desktop/Machine_Learning/subcellular_location/data/raw/LM_data_2021-03-11.csv')
df = pd.read_csv(data_file, sep=';')
df.head()

Unnamed: 0,Entry,Entry name,Sequence
0,P68307,NU3M_BALMU,MNLLLTLLTNTTLALLLVFIAFWLPQLNVYAEKTSPYECGFDPMGS...
1,P0CY61,O162_CONBU,MKLTCVLIIAVLFLTAITADDSRDKQVYRAVGLIDKMRRIRASEGC...
2,Q0VIL3,OTOMP_DANRE,MDLPGGHLAVVLFLFVLVSMSTENNIIRWCTVSDAEDQKCLDLAGN...
3,A1W9I4,NUSB_ACISJ,MTDSTHPTPSARPPRQPRTGTTGTGARKAGSKSGRSRAREFALQAL...
4,Q8DBX0,OMPU_VIBVU,MKKTLIALSVSAAAVATGVNAAELYNQDGTSLDMGGRAEARLSMKD...


We create a language model, the location is thus not used.

In [3]:
df.drop(['Entry', 'Entry name'], axis = 1, inplace=True)
df.head()

Unnamed: 0,Sequence
0,MNLLLTLLTNTTLALLLVFIAFWLPQLNVYAEKTSPYECGFDPMGS...
1,MKLTCVLIIAVLFLTAITADDSRDKQVYRAVGLIDKMRRIRASEGC...
2,MDLPGGHLAVVLFLFVLVSMSTENNIIRWCTVSDAEDQKCLDLAGN...
3,MTDSTHPTPSARPPRQPRTGTTGTGARKAGSKSGRSRAREFALQAL...
4,MKKTLIALSVSAAAVATGVNAAELYNQDGTSLDMGGRAEARLSMKD...


## Tokenize the Data

In [4]:
# Set-up numpy generator for random numbers
random_number_generator = np.random.default_rng(seed=42)
KMER_SIZE = 3

In [5]:
# Tokenize the protein sequence (or any sequence) in kmers.
def tokenize(protein_seqs, kmer_sz):
    kmers = set()
    # Loop over protein sequences
    for protein_seq in protein_seqs:
        # Loop over the whole sequence
        for i in range(len(protein_seq) - (kmer_sz - 1)):
            # Add kmers to the set, thus only unique kmers will remain
            kmers.add(protein_seq[i: i + kmer_sz])
            
    # Map kmers for one hot-encoding
    kmer_to_id = dict()
    id_to_kmer = dict()
    
    for ind, kmer in enumerate(kmers):
        kmer_to_id[kmer] = ind
        id_to_kmer[ind] = kmer
        
    vocab_sz = len(kmers)
    
    assert vocab_sz == len(kmer_to_id.keys())
    
    # Tokenize the protein sequence to integers
    tokenized = []
    for protein_seq in protein_seqs:
        sequence = []
        for i in  range(len(protein_seq) - (kmer_sz -1)):
            # Convert kmer to integer
            kmer = protein_seq[i: i + kmer_sz]
            sequence.append(kmer_to_id[kmer])
            
        tokenized.append(sequence)
            
    
    return tokenized, vocab_sz, kmer_to_id, id_to_kmer

In [6]:
# Tokenize the protein sequence
tokenized_seqs, vocab_sz, kmer_to_id, id_to_kmer = tokenize(df['Sequence'], KMER_SIZE)

In [7]:
vocab_sz

9317

In [8]:
tokenized_seqs[0][:10]

[5066, 7805, 3604, 7672, 271, 4409, 7672, 726, 4136, 3819]

For the language model, I create one corpus of the aminoacids of each sequences after each other.

In [9]:
data = []
for seq in tokenized_seqs:
    for kmer in seq:
        data.append(kmer)

## Dataset

In [10]:
class AminoLMDataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.data = torch.Tensor(data)
        self.vocab_sz = vocab_sz
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
            
        x = data[idx]
        y = data[idx + 1]
    
        return x, y

## Hyperparameters

In [11]:
emb_dim = 400 # Embeddding dimension
hid_sz = 1150 # Hidden size
num_layers = 3 # Number of LSTM layers stacked together
seq_len = num_layers

In [12]:
embed_p = 0.1 # Dropout probability on the embedding
hidden_p = 0.3 # Dropout probability on hidden-to-hidden weight matrices
input_p = 0.3 # Dropout probablity on the LSTM input between LSTMS

# This one still has to be implemented
#weight_p = 0.5 # Dropout probability on LSTM-to-LSTM weight matrices

## Weight Dropout

In [13]:
class WeightDropout(torch.nn.Module):
    "Apply dropout to LSTM's hidden-hidden weights"
    
    def __init__(self, module, weight_p):
        super(WeightDropout, self).__init__()
        self.module = module
        self.weight_p = weight_p
        
        # Save the name of the layer weights in a list
        num_layers = module.num_layers
        layer_base_name = 'weight_hh_l'      
        self.layer_weights = [layer_base_name + str(i) for i in range(num_layers)]
        
        # Make a copy of the weights in weightname_raw
        for weight in self.layer_weights:
            w = getattr(self.module, weight)
            del module._parameters[weight]
            self.module.register_parameter(f'{weight}_raw', torch.nn.Parameter(w))
            
        def _setweights(self):
            "Apply dropout to the raw weights"
            for weight in self.layer_weights:
                raw_w = getattr(self, f'{weight}_raw')
                if self.training:
                    w = torch.nn.F(raw_w, p=self.weight_p)
                else:
                    w = raw_w.clone()
                setattr(self.module, weight, w)
                
        def forward(self, *args):
            self._setweights()
            return self.module(*args)   

## LSTM layer

Voor initialisatie kan ik hier nog naar kijken
https://discuss.pytorch.org/t/how-are-layer-weights-and-biases-initialized-by-default/13073/4

In [33]:
class AWD_LSTM(torch.nn.Module):
    def __init__(self, num_layers, vocab_sz, emb_dim, hid_sz, hidden_p, embed_p, input_p):
        super(AWD_LSTM, self).__init__()
        
        self.encoder = torch.nn.Embedding(vocab_sz, emb_dim)
        self.emb_drop = torch.nn.Dropout(p=embed_p)
        
        self.lstm = torch.nn.LSTM(input_size = emb_dim, hidden_size = hid_sz, num_layers = num_layers, dropout = input_p)
        
        self.decoder = torch.nn.Linear(hid_sz, vocab_sz)
        
        self.num_layers = num_layers
        self.vocab_sz = vocab_sz
        self.emb_dim = emb_dim
        self.hid_sz = hid_sz
        self.hidden_p = hidden_p
        self.embed_p = embed_p
        self.input_p = input_p
        self.batch_sz = 1
        
        self.reset_hidden()
                
    def forward(self, x):
        """Forward pass AWD-LSTM""" 

        embed = self.encoder(x)
            
        output, (self.hidden_state, self.cell_state) = self.lstm(embed.view(len(x), 1, -1),
                                                                (self.hidden_state, self.cell_state))    
            
        y = self.decoder(output)
        
        self.hidden_state.detach_()
        self.cell_state.detach_()
        
        return y
    
    def reset_hidden(self):
        self.hidden_state = torch.zeros((self.num_layers, self.batch_sz, self.hid_sz))
        self.cell_state = torch.zeros((self.num_layers, self.batch_sz, self.hid_sz))

## Build the Language Model

In [34]:
model = AWD_LSTM(num_layers, vocab_sz, emb_dim, hid_sz, hidden_p, embed_p, input_p)

In [35]:
model

AWD_LSTM(
  (encoder): Embedding(9317, 400)
  (emb_drop): Dropout(p=0.1, inplace=False)
  (lstm): LSTM(400, 1150, num_layers=3, dropout=0.3)
  (decoder): Linear(in_features=1150, out_features=9317, bias=True)
)

## Train the Language Model

In [36]:
training_set = AminoLMDataset(data)

In [37]:
training_loader = torch.utils.data.DataLoader(training_set, batch_size=1, shuffle=False)

In [38]:
total_train_len = len(training_loader)

In [39]:
for i, entry in enumerate(training_loader, 0):
    x, y = entry
    
    output = model.forward(x)
    
    break

In [40]:
# Hyperparameters
learning_rate = 0.01
epochs = 1

In [41]:
# Costfunction and optimize algorithm
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr= learning_rate)

In [42]:
loss_history = []

In [43]:
for epoch in range(epochs):
    
    model.reset_hidden()
    
    # Initialize loss at 0
    epoch_loss = 0.0
    
    print(f'Epoch: {str(epoch + 1)}')
    
    for i, entry in enumerate(training_loader, 0):
        
        
        model.zero_grad()
        
        x, y = entry
        
        output = model(x)
        loss = criterion(output[0], y)
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        
        if i % 1e4 == 0:
            perc = i / total_train_len * 100
            print(str(perc))
    
    loss_history.append(epoch_loss)
    
    print(f'Epoch {str(epoch + 1)} Train loss: {str(epoch_loss)}.')
          
print('Finished training')

Epoch: 1
0.0


KeyboardInterrupt: 

## Additional (old stuff)

In [44]:
model = torch.nn.Sequential(
    torch.nn.Embedding(vocab_sz, emb_dim),
    torch.nn.Dropout(p=0.1),
    LSTM(num_layers, emb_dim, hid_sz, hidden_p),
    torch.nn.Linear(emb_dim, vocab_sz)
)

In [45]:
model

Sequential(
  (0): Embedding(9317, 400)
  (1): Dropout(p=0.1, inplace=False)
  (2): LSTM()
  (3): Linear(in_features=400, out_features=9317, bias=True)
)