# Pytorch Amin Acid Language Model

In [1]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from pathlib import Path
import time
import pickle
from IPython.display import HTML, display

In [2]:
if torch.cuda.is_available():  
  dev = "cuda:0" 
else:  
  dev = "cpu" 

In [3]:
dev

'cuda:0'

In [4]:
torch.manual_seed(42)

<torch._C.Generator at 0x7effcbacb8f0>

Add nice css for my table

In [5]:
HTML("""
<style>
table, th, td {
  border: 1px solid black;
}
</style>
""")

## Load the data

In [6]:
from google.colab import drive
from pathlib import Path


drive.mount('content/', force_remount=True)
base = Path('/content/content/My Drive/')

Mounted at content/


In [15]:
data_file = Path('/content/content/MyDrive/subcellular-location/v2/LM_data_2021-03-11.csv')
df = pd.read_csv(data_file, sep=';')
df.head()

Unnamed: 0,Entry,Entry name,Sequence
0,P68307,NU3M_BALMU,MNLLLTLLTNTTLALLLVFIAFWLPQLNVYAEKTSPYECGFDPMGS...
1,P0CY61,O162_CONBU,MKLTCVLIIAVLFLTAITADDSRDKQVYRAVGLIDKMRRIRASEGC...
2,Q0VIL3,OTOMP_DANRE,MDLPGGHLAVVLFLFVLVSMSTENNIIRWCTVSDAEDQKCLDLAGN...
3,A1W9I4,NUSB_ACISJ,MTDSTHPTPSARPPRQPRTGTTGTGARKAGSKSGRSRAREFALQAL...
4,Q8DBX0,OMPU_VIBVU,MKKTLIALSVSAAAVATGVNAAELYNQDGTSLDMGGRAEARLSMKD...


In [16]:
df.drop(['Entry', 'Entry name'], axis = 1, inplace=True)
df.head()

Unnamed: 0,Sequence
0,MNLLLTLLTNTTLALLLVFIAFWLPQLNVYAEKTSPYECGFDPMGS...
1,MKLTCVLIIAVLFLTAITADDSRDKQVYRAVGLIDKMRRIRASEGC...
2,MDLPGGHLAVVLFLFVLVSMSTENNIIRWCTVSDAEDQKCLDLAGN...
3,MTDSTHPTPSARPPRQPRTGTTGTGARKAGSKSGRSRAREFALQAL...
4,MKKTLIALSVSAAAVATGVNAAELYNQDGTSLDMGGRAEARLSMKD...


## Tokenize the data

In [17]:
# Set-up numpy generator for random numbers
random_number_generator = np.random.default_rng(seed=42)
KMER_SIZE = 3

In [18]:
# Tokenize the protein sequence (or any sequence) in kmers.
def tokenize(protein_seqs, kmer_sz):
    kmers = set()
    # Loop over protein sequences
    for protein_seq in protein_seqs:
        # Loop over the whole sequence
        for i in range(len(protein_seq) - (kmer_sz - 1)):
            # Add kmers to the set, thus only unique kmers will remain
            kmers.add(protein_seq[i: i + kmer_sz])
            
    # Map kmers for one hot-encoding
    kmer_to_id = dict()
    id_to_kmer = dict()
    
    for ind, kmer in enumerate(kmers):
        kmer_to_id[kmer] = ind
        id_to_kmer[ind] = kmer
        
    vocab_sz = len(kmers)
    
    assert vocab_sz == len(kmer_to_id.keys())
    
    # Tokenize the protein sequence to integers
    tokenized = []
    for protein_seq in protein_seqs:
        sequence = []
        for i in  range(len(protein_seq) - (kmer_sz -1)):
            # Convert kmer to integer
            kmer = protein_seq[i: i + kmer_sz]
            sequence.append(kmer_to_id[kmer])
            
        tokenized.append(sequence)
            
    
    return tokenized, vocab_sz, kmer_to_id, id_to_kmer

In [19]:
# Tokenize the protein sequence
tokenized_seqs, vocab_sz, kmer_to_id, id_to_kmer = tokenize(df['Sequence'], KMER_SIZE)

In [20]:
vocab_sz

9317

In [21]:
tokenized_seqs[0][:10]

[1807, 4997, 1236, 1271, 292, 4018, 1271, 652, 9129, 7560]

In [22]:
data = []
for seq in tokenized_seqs:
    for kmer in seq:
        data.append(kmer)

## Dataset

In [8]:
class AminoLMDataset(torch.utils.data.Dataset):
    def __init__(self, data, seq_len):
        self.data = torch.Tensor(data)
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
            
        xs = torch.LongTensor(data[idx: idx + seq_len])
        targets = data[idx + 1: idx + seq_len + 1]

        ys = []

        for target in targets:
          y = torch.tensor(target)
          ys.append(y)

        ys = torch.stack(ys)

        ys = ys.to(dev)
        xs = xs.to(dev) 
    
        return xs, ys

## Building the LM Model

In [9]:
# Hyperparameters
emb_dim = 400 # Embeddding dimension
hid_sz = 1150 # Hidden size
num_layers = 3 # Number of LSTM layers stacked together
seq_len = num_layers
bs = 8

# Dropout parameters

embed_p = 0.1 # Dropout probability on the embedding
hidden_p = 0.3 # Dropout probability on hidden-to-hidden weight matrices
# Dropout tussen de inputs van de LSTMs moet ik er nog in bouwen
input_p = 0.3 # Dropout probablity on the LSTM input between LSTMS
weight_p = 0.5 # Dropout probability on LSTM-to-LSTM weight matrices

In [10]:
class EmbeddingDropout(torch.nn.Module):
    "Apply dropout to an Embedding with probability emp_p"

    def __init__(self, emb_p=0):
        super(EmbeddingDropout, self).__init__()
        
        self.emb_p = emb_p

    def forward(self, inp):
       
        drop = torch.nn.Dropout(self.emb_p)
        placeholder = torch.ones((inp.size(0), 1)).to(dev)
        mask = drop(placeholder)      
        out = inp * mask
        
        return out

In [11]:
class WeightDropout(torch.nn.Module):
  "Apply dropout to LSTM's hidden-hidden weights"
    
  def __init__(self, module, weight_p):
    super(WeightDropout, self).__init__()
    self.module = module
    self.weight_p = weight_p

    # Save the name of the layer weights in a list
    num_layers = module.num_layers
    layer_base_name = 'weight_hh_l'      
    self.layer_weights = [layer_base_name + str(i) for i in range(num_layers)]

    # Make a copy of the weights in weightname_raw
    for weight in self.layer_weights:

      w = getattr(self.module, weight)
      del module._parameters[weight]
      self.module.register_parameter(f'{weight}_raw', torch.nn.Parameter(w))

  def _setweights(self):
    "Apply dropout to the raw weights"
    for weight in self.layer_weights:
      raw_w = getattr(self.module, f'{weight}_raw')
      if self.training:
          w = torch.nn.functional.dropout(raw_w, p=self.weight_p)
      else:
          w = raw_w.clone()
      setattr(self.module, weight, w)
    
  def forward(self, *args):
    self._setweights()
    return self.module(*args)

In [38]:
class AWD_LSTM(torch.nn.Module):
    def __init__(self, num_layers, vocab_sz, emb_dim, hid_sz, hidden_p, embed_p, input_p, weight_p, batch_sz = 1):
        super(AWD_LSTM, self).__init__()
        
        # Embedding with droput
        self.encoder = torch.nn.Embedding(vocab_sz, emb_dim)
        self.emb_drop = EmbeddingDropout(emb_p=embed_p)

        
        # Dropouts on the inputs and the hidden layers
        self.input_dp = torch.nn.Dropout(p=input_p)
        self.hid_dp = torch.nn.Dropout(p=hidden_p)

        # Create a list of lstm layers with wieghtdropout
        self.lstms = []
        for i in range(num_layers):
            self.lstms.append(
                WeightDropout(nn.LSTM(input_size=emb_dim, hidden_size=hid_sz, num_layers=1), weight_p))
        self.lstms = nn.ModuleList(self.lstms)

        # Save all variables        
        self.num_layers = num_layers
        self.vocab_sz = vocab_sz
        self.emb_dim = emb_dim
        self.hid_sz = hid_sz
        self.hidden_p = hidden_p
        self.embed_p = embed_p
        self.input_p = input_p
        self.weight_p = weight_p
        self.batch_sz = batch_sz

        # Initialize hidden layers        
        self.reset_hidden()
        self.last_hiddens = (self.hidden_state, self.cell_state)
                
    def forward(self, xs):
        """Forward pass AWD-LSTM""" 
        
        bs, sl = xs.shape

        ys = []
        
        hiddens = self.last_hiddens

        hidden_states = [hiddens]

        for i, lstm in enumerate(self.lstms):
            
          # Embed the input and add dropout to it  
          x = xs[:, i]
          embed = self.encoder(x)
          embed_dp = self.emb_drop(embed)
            
          # Again add dropout, this feels like doing dropout on dropout, I dont know if it is worth
          
          input_dp = self.input_dp(embed_dp)

          hiddens_dp = []

          for hidden_state in hidden_states[i]:
            hiddens_dp.append(self.hid_dp(hidden_state))

          hiddens_dp = tuple(hiddens_dp)
        
          output, hiddens = lstm(input_dp.view(1, bs, -1), hiddens_dp) 

          det_hiddens = []

          for hidden in hiddens:
            det_hiddens.append(hidden.detach())

          det_hiddens = tuple(det_hiddens)

          hidden_states.append(det_hiddens)
                 
          y = output.view(bs, 1, -1)

          ys.append(y)
        

        y = torch.stack(ys, dim=0)
        
        y = y.view(bs, sl, -1)
        
        self.last_hiddens = hidden_states[-1]
        
        return y
    
    def reset_hidden(self):
        self.hidden_state = torch.zeros((1, self.batch_sz, self.hid_sz)).to(dev)
        self.cell_state = torch.zeros((1, self.batch_sz, self.hid_sz)).to(dev)
        self.last_hiddens = (self.hidden_state, self.cell_state)
    
    def freeze_to(self , n):
        
        params_to_freeze = n * 4 + 1 # Since each LSTM layer has 4 parameters plus 1 to also freeze the encoder
        
        total_params = len(list(self.parameters()))
        
        for i, parameter in enumerate(self.parameters()):
            parameter.requires_grad = True
            
            if i < params_to_freeze:
                parameter.requires_grad = False
            
            
        for name, parameter in self.named_parameters():
            print(name)
            print(parameter.requires_grad)

In [39]:
class ProteinLM(torch.nn.Module):
    def __init__(self, num_layers, vocab_sz, emb_dim, hid_sz, hidden_p, embed_p, input_p, weight_p, batch_sz = 1):
        super(ProteinLM, self).__init__()
        
        self.encoder = AWD_LSTM(num_layers, vocab_sz, emb_dim, hid_sz, hidden_p, 
                                embed_p, input_p, weight_p, batch_sz=batch_sz)
        self.decoder = torch.nn.Linear(hid_sz, vocab_sz)
        
    def forward(self, inp):
        
        encoded = self.encoder(inp)
        
        y = self.decoder(encoded)
        
        return y 
    
    def freeze_to(self, n):
        self.encoder.freeze_to(n)
        
    def reset_hidden(self):
        self.encoder.reset_hidden()

## Create AWD_LSTM model

In [40]:
model = ProteinLM(num_layers, vocab_sz, emb_dim, hid_sz, hidden_p, embed_p, input_p, weight_p, batch_sz=bs)
model = model.to(dev)
model

ProteinLM(
  (encoder): AWD_LSTM(
    (encoder): Embedding(9317, 400)
    (emb_drop): EmbeddingDropout()
    (input_dp): Dropout(p=0.3, inplace=False)
    (hid_dp): Dropout(p=0.3, inplace=False)
    (lstms): ModuleList(
      (0): WeightDropout(
        (module): LSTM(400, 1150)
      )
      (1): WeightDropout(
        (module): LSTM(400, 1150)
      )
      (2): WeightDropout(
        (module): LSTM(400, 1150)
      )
    )
  )
  (decoder): Linear(in_features=1150, out_features=9317, bias=True)
)

## Training the model

In [None]:
training_set = AminoLMDataset(data, seq_len)

In [None]:
training_loader = torch.utils.data.DataLoader(training_set, batch_size=bs, shuffle=True)

In [None]:
total_train_len = len(training_loader)
total_train_len

7307669

In [None]:
# Hyperparameters
learning_rate = 0.01
epochs = 10

In [None]:
# Costfunction and optimize algorithm
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr= learning_rate)

In [None]:
# Test for the real work
for i, entry in enumerate(training_loader, 0):
    xs, ys = entry[0], entry[1]

    print('Input shape:')
    print(xs.shape)
    
    print(xs)

    outputs = model(xs)
    
    bs, sl = outputs.shape[:2]
    
    # Flatten the output
    outputs = outputs.view(bs * sl, -1)
    
    print(ys)
    
    # Flatten the label
    ys = ys.view(-1)

    print(outputs.shape)
    print(ys.shape)
    
    print(ys)
    
    loss = criterion(outputs, ys)
    print(loss)
    
    break

Input shape:
torch.Size([8, 3])
tensor([[9177, 1821, 1754],
        [3252, 3514,  224],
        [5261,  219, 8915],
        [2685, 1015, 2529],
        [7030, 3768, 8896],
        [3638, 3098, 3232],
        [4479, 9233, 3546],
        [3705, 6753, 6496]], device='cuda:0')
tensor([[1821, 1754, 6118],
        [3514,  224, 7205],
        [ 219, 8915, 4586],
        [1015, 2529, 4387],
        [3768, 8896, 2401],
        [3098, 3232, 1238],
        [9233, 3546, 4743],
        [6753, 6496, 6892]], device='cuda:0')
torch.Size([24, 9317])
torch.Size([24])
tensor([1821, 1754, 6118, 3514,  224, 7205,  219, 8915, 4586, 1015, 2529, 4387,
        3768, 8896, 2401, 3098, 3232, 1238, 9233, 3546, 4743, 6753, 6496, 6892],
       device='cuda:0')
tensor(9.1292, device='cuda:0', grad_fn=<NllLossBackward>)


  self.dropout, self.training, self.bidirectional, self.batch_first)


In [None]:
display(HTML(
    """<table>
        <thead>
          <tr>
          <th>Epoch</th>
          <th>Percentage</th>
          <th>Loss</th>
          <th>Time</th>
          </tr>
        </thead>
        <tbody>
        """
))

for epoch in range(epochs):
    
    start_time = time.time()

    model.reset_hidden()
    
    # Initialize loss at 0
    epoch_loss = 0.0
    
    # Iterations (in between epoch) loss
    iteration_loss = 0.0

    for i, entry in enumerate(training_loader, 0):
        
        
        model.zero_grad()
        
        xs, ys = entry[0], entry[1]
        
        outputs = model(xs)
        
        bs, sl = outputs.shape[:2]
    
        # Flatten the output
        outputs = outputs.view(bs * sl, -1)

        # Flatten the label
        ys = ys.view(-1)

        loss = criterion(outputs, ys)
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += outputs.shape[0] * loss.item()
        iteration_loss += outputs.shape[0] * loss.item()
        
        if i % 1e4 == 0:

            round_time = time.time()
            duration = round(((round_time - start_time) / 60), 0) # To convert to minutes
            
            perc = round((i / total_train_len * 100), 2)

            iteration_loss = round((iteration_loss / 1e4), 2)

            display(HTML(
            """<tr>
              <td>{}</td>
              <td>{}</td>
              <td>{}</td>
              <td>{}</td>
              </tr>""".format(str(epoch + 1), str(perc), str(iteration_loss), str(duration))
            ))

            iteration_loss = 0.0
    
    epoch_loss /= total_train_len
    loss_history.append(epoch_loss)
    
    print(f'Epoch {str(epoch + 1)} Train loss: {str(epoch_loss)}.')

display(HTML('</tbody></table>'))

print('Finished training')

Epoch,Percentage,Loss,Time


  self.dropout, self.training, self.bidirectional, self.batch_first)


KeyboardInterrupt: ignored

## Save Model for Training Later

In [35]:
filename = '1_percent_AA_LM_v3.pt'
file_dir = Path('/content/content/MyDrive/subcellular-location/v2/' + filename)
file_dir

PosixPath('/content/content/MyDrive/subcellular-location/v2/1_percent_AA_LM_v3.pt')

In [None]:
model.encoder

AWD_LSTM(
  (encoder): Embedding(9317, 400)
  (emb_drop): EmbeddingDropout()
  (input_dp): Dropout(p=0.3, inplace=False)
  (hid_dp): Dropout(p=0.3, inplace=False)
  (lstms): ModuleList(
    (0): WeightDropout(
      (module): LSTM(400, 1150)
    )
    (1): WeightDropout(
      (module): LSTM(400, 1150)
    )
    (2): WeightDropout(
      (module): LSTM(400, 1150)
    )
  )
)

In [None]:
torch.save(model.encoder, file_dir)

## Load Model for Further Training

In [41]:
model_path = Path('/content/content/MyDrive/subcellular-location/v2/1_percent_AA_LM_v3.pt')
model.encoder = torch.load(model_path)
model

ProteinLM(
  (encoder): AWD_LSTM(
    (encoder): Embedding(9317, 400)
    (emb_drop): EmbeddingDropout()
    (input_dp): Dropout(p=0.3, inplace=False)
    (hid_dp): Dropout(p=0.3, inplace=False)
    (lstms): ModuleList(
      (0): WeightDropout(
        (module): LSTM(400, 1150)
      )
      (1): WeightDropout(
        (module): LSTM(400, 1150)
      )
      (2): WeightDropout(
        (module): LSTM(400, 1150)
      )
    )
  )
  (decoder): Linear(in_features=1150, out_features=9317, bias=True)
)

In [42]:
# Freeze the first LSTM layer and encoder
model.freeze_to(1)

encoder.weight
False
lstms.0.module.weight_ih_l0
False
lstms.0.module.bias_ih_l0
False
lstms.0.module.bias_hh_l0
False
lstms.0.module.weight_hh_l0_raw
False
lstms.1.module.weight_ih_l0
True
lstms.1.module.bias_ih_l0
True
lstms.1.module.bias_hh_l0
True
lstms.1.module.weight_hh_l0_raw
True
lstms.2.module.weight_ih_l0
True
lstms.2.module.bias_ih_l0
True
lstms.2.module.bias_hh_l0
True
lstms.2.module.weight_hh_l0_raw
True


### Train Further with Data of which the location is known

In [43]:
# Tokenize the protein sequence (or any sequence) in kmers.
def tokenize(df, protein_seqs_column, kmer_sz, premade_vocab=False):
    
    if not premade_vocab:
        kmers = set()
        # Loop over protein sequences
        for protein_seq in df[protein_seqs_column]:
            # Loop over the whole sequence
            for i in range(len(protein_seq) - (kmer_sz - 1)):
                # Add kmers to the set, thus only unique kmers will remain
                kmers.add(protein_seq[i: i + kmer_sz])

        # Map kmers for one hot-encoding
        kmer_to_id = dict()
        id_to_kmer = dict()

        for ind, kmer in enumerate(kmers):
            kmer_to_id[kmer] = ind
            id_to_kmer[ind] = kmer

        vocab_sz = len(kmers)

        assert vocab_sz == len(kmer_to_id.keys())
    
    else:
        kmer_to_id, id_to_kmer = premade_vocab
        vocab_sz = len(kmer_to_id)
    
    # Tokenize the protein sequence to integers
    tokenized = []
    for i, protein_seq in enumerate(df[protein_seqs_column], 0):
        sequence = []
        
        # If the kmer can't be found these indexes should be deleted
        remove_idxs = []
        
        for i in  range(len(protein_seq) - (kmer_sz -1)):
            # Convert kmer to integer
            kmer = protein_seq[i: i + kmer_sz]
            
            # For some reason, some kmers miss. Thus these sequences have to be removed
            try:
                sequence.append(kmer_to_id[kmer])
            except:
                remove_idxs.append(i)
            
        tokenized.append(sequence)
            
    df['tokenized_seqs'] = tokenized
    
    df.drop(remove_idxs, inplace=True)
    
    return df, vocab_sz, kmer_to_id, id_to_kmer

In [44]:
data_file = Path('/content/content/MyDrive/subcellular-location/v2/protein_data_2021-04-04.csv')
df = pd.read_csv(data_file, sep=';')
df.head()

Unnamed: 0,Sequence,Subcellular location [CC],Location
0,MTDTVFSNSSNRWMYPSDRPLQSNDKEQLQAGWSVHPGGQPDRQRK...,"SUBCELLULAR LOCATION: Cytoplasmic vesicle, sec...",Cytoplasm
1,MDTDSQRSHLSSFTMKLMDKFHSPKIKRTPSKKGKPAEVSVKIPEK...,SUBCELLULAR LOCATION: Early endosome {ECO:0000...,Endosome
2,MEDSTSPKQEKENQEELGETRRPWEGKTAASPQYSEPESSEPLEAK...,"SUBCELLULAR LOCATION: Cytoplasm, cytoskeleton,...",Cytoplasm
3,MALPGARARGWAAAARAAQRRRRVENAGGSPSPEPAGRRAALYVHW...,SUBCELLULAR LOCATION: Mitochondrion {ECO:00003...,Mitochondrion
4,MALLVDRVRGHWRIAAGLLFNLLVSICIVFLNKWIYVYHGFPNMSL...,SUBCELLULAR LOCATION: Membrane {ECO:0000305}; ...,Cell membrane


In [45]:
df.drop(['Subcellular location [CC]'], axis = 1, inplace=True)
df.head()

Unnamed: 0,Sequence,Location
0,MTDTVFSNSSNRWMYPSDRPLQSNDKEQLQAGWSVHPGGQPDRQRK...,Cytoplasm
1,MDTDSQRSHLSSFTMKLMDKFHSPKIKRTPSKKGKPAEVSVKIPEK...,Endosome
2,MEDSTSPKQEKENQEELGETRRPWEGKTAASPQYSEPESSEPLEAK...,Cytoplasm
3,MALPGARARGWAAAARAAQRRRRVENAGGSPSPEPAGRRAALYVHW...,Mitochondrion
4,MALLVDRVRGHWRIAAGLLFNLLVSICIVFLNKWIYVYHGFPNMSL...,Cell membrane


In [46]:
len(df)

16614

Data should be tokenized with the same vocab as for the other vocabulary.

In [47]:
# Load the vocabolary from the Language Model
vocab_save_file = '/content/content/MyDrive/subcellular-location/v2/LM_vocab.pkl'
vocab = pickle.load(open(vocab_save_file, 'rb'))

In [48]:
# Tokenize the protein sequence
df, vocab_sz, kmer_to_id, id_to_kmer = tokenize(df, 'Sequence', KMER_SIZE, vocab)

In [49]:
df.head(5)

Unnamed: 0,Sequence,Location,tokenized_seqs
0,MTDTVFSNSSNRWMYPSDRPLQSNDKEQLQAGWSVHPGGQPDRQRK...,Cytoplasm,"[3884, 8570, 3840, 6832, 2277, 2221, 1020, 904..."
1,MDTDSQRSHLSSFTMKLMDKFHSPKIKRTPSKKGKPAEVSVKIPEK...,Endosome,"[8772, 7207, 1857, 1688, 5461, 3901, 4899, 424..."
2,MEDSTSPKQEKENQEELGETRRPWEGKTAASPQYSEPESSEPLEAK...,Cytoplasm,"[1565, 3797, 2513, 516, 1428, 6558, 6568, 7337..."
3,MALPGARARGWAAAARAAQRRRRVENAGGSPSPEPAGRRAALYVHW...,Mitochondrion,"[8939, 2538, 9262, 4438, 2547, 302, 60, 3064, ..."
4,MALLVDRVRGHWRIAAGLLFNLLVSICIVFLNKWIYVYHGFPNMSL...,Cell membrane,"[8939, 6897, 6013, 1021, 3034, 2863, 8501, 697..."


In [50]:
df.dropna(inplace=True)
len(df)

16614

In [51]:
data = []
for seq in df['tokenized_seqs']:
    for kmer in seq:
        data.append(kmer)

### Train with the new data

In [52]:
training_set = AminoLMDataset(data, seq_len)

In [53]:
training_loader = torch.utils.data.DataLoader(training_set, batch_size=bs, shuffle=False)

In [54]:
total_train_len = len(training_loader)
total_train_len

1202020

In [58]:
# Hyperparameters
learning_rate = 0.01
epochs = 3

In [59]:
# Costfunction and optimize algorithm
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr= learning_rate)

In [60]:
display(HTML(
    """<table>
        <thead>
          <tr>
          <th>Epoch</th>
          <th>Percentage</th>
          <th>Loss</th>
          <th>Time</th>
          </tr>
        </thead>
        <tbody>
        """
))

for epoch in range(epochs):
    
  start_time = time.time()

  model.reset_hidden()
  
  # Initialize loss at 0
  epoch_loss = 0.0
  iteration_loss = 0.0
  
  for i, entry in enumerate(training_loader, 0):
      
     

      model.zero_grad()
        
      xs, ys = entry[0], entry[1]
      
      outputs = model(xs)
      
      bs, sl = outputs.shape[:2]
  
      # Flatten the output
      outputs = outputs.view(bs * sl, -1)

      # Flatten the label
      ys = ys.view(-1)

      loss = criterion(outputs, ys)
      
      loss.backward()
      optimizer.step()
      
      epoch_loss += outputs.shape[0] * loss.item()
      iteration_loss += outputs.shape[0] * loss.item()
      
      
      if i % 1.5e4 == 0:
          
          round_time = time.time()
          duration = round(((round_time - start_time) / 60), 0) # To convert to minutes
          start_time = time.time()
          
          perc = round((i / total_train_len * 100), 2)

          iteration_loss = round((iteration_loss / 1.5e4), 2)

          display(HTML(
          """<tr>
            <td>{}</td>
            <td>{}</td>
            <td>{}</td>
            <td>{}</td>
            </tr>""".format(str(epoch + 1), str(perc), str(iteration_loss), str(duration))
          ))

          iteration_loss = 0.0
  
  loss_history.append(epoch_loss)
  
  print(f'Epoch {str(epoch + 1)} Train loss: {str(epoch_loss)}.')

display(HTML('</tbody></table>'))        
print('Finished training')

Epoch,Percentage,Loss,Time


  self.dropout, self.training, self.bidirectional, self.batch_first)


RuntimeError: ignored

In [61]:
filename = 'AA_LM_v3_ph1.pt'
file_dir = Path('/content/content/MyDrive/subcellular-location/v2/' + filename)
file_dir

PosixPath('/content/content/MyDrive/subcellular-location/v2/AA_LM_v3_ph1.pt')

In [62]:
torch.save(model.encoder, file_dir)

Train one loop without unfreezing the first LSTM layer

In [63]:
model.freeze_to(0)

encoder.weight
False
lstms.0.module.weight_ih_l0
True
lstms.0.module.bias_ih_l0
True
lstms.0.module.bias_hh_l0
True
lstms.0.module.weight_hh_l0_raw
True
lstms.1.module.weight_ih_l0
True
lstms.1.module.bias_ih_l0
True
lstms.1.module.bias_hh_l0
True
lstms.1.module.weight_hh_l0_raw
True
lstms.2.module.weight_ih_l0
True
lstms.2.module.bias_ih_l0
True
lstms.2.module.bias_hh_l0
True
lstms.2.module.weight_hh_l0_raw
True


In [64]:
epochs = 1

In [65]:
display(HTML(
    """<table>
        <thead>
          <tr>
          <th>Epoch</th>
          <th>Percentage</th>
          <th>Loss</th>
          <th>Time</th>
          </tr>
        </thead>
        <tbody>
        """
))

for epoch in range(epochs):
    
  start_time = time.time()

  model.reset_hidden()
  
  # Initialize loss at 0
  epoch_loss = 0.0
  iteration_loss = 0.0
  
  for i, entry in enumerate(training_loader, 0):
      
      if len(entry[0]) == bs:

        model.zero_grad()
          
        xs, ys = entry[0], entry[1]
        
        outputs = model(xs)
        
        bs, sl = outputs.shape[:2]
    
        # Flatten the output
        outputs = outputs.view(bs * sl, -1)

        # Flatten the label
        ys = ys.view(-1)

        loss = criterion(outputs, ys)
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += outputs.shape[0] * loss.item()
        iteration_loss += outputs.shape[0] * loss.item()
        
        
        if i % 1.5e4 == 0:
            
            round_time = time.time()
            duration = round(((round_time - start_time) / 60), 0) # To convert to minutes
            start_time = time.time()
            
            perc = round((i / total_train_len * 100), 2)

            iteration_loss = round((iteration_loss / 1.5e4), 2)

            display(HTML(
            """<tr>
              <td>{}</td>
              <td>{}</td>
              <td>{}</td>
              <td>{}</td>
              </tr>""".format(str(epoch + 1), str(perc), str(iteration_loss), str(duration))
            ))

            iteration_loss = 0.0
  
  loss_history.append(epoch_loss)
  
  print(f'Epoch {str(epoch + 1)} Train loss: {str(epoch_loss)}.')

display(HTML('</tbody></table>'))        
print('Finished training')

Epoch,Percentage,Loss,Time


  self.dropout, self.training, self.bidirectional, self.batch_first)


RuntimeError: ignored

In [66]:
filename = 'AA_LM_v3_ph2.pt'
file_dir = Path('/content/content/MyDrive/subcellular-location/v2/' + filename)
file_dir

PosixPath('/content/content/MyDrive/subcellular-location/v2/AA_LM_v3_ph2.pt')

In [67]:
torch.save(model.encoder, file_dir)

> https://arxiv.org/pdf/1801.06146.pdf

## Testing AWD-LSTM output with FASTAI

num_layers, vocab_sz, emb_dim, hid_sz, hidden_p, embed_p, input_p, weight_p, batch_sz = 1

In [None]:
tst = AWD_LSTM(2, 100, 20, 10, 0.2, 0.02, 0.1, 0.2)

In [None]:
x = torch.randint(0, 100, (10,5)) # Die 10 en 5 zijn bs en sl
x = torch.randint(0, 100, (1,3))
r = tst(x)


print(tst.last_hiddens[0].shape)

tst.eval()
tst.reset_hidden()
tst(x);
tst(x);


print(r.shape)

torch.Size([1, 1, 10])
torch.Size([2, 10])


In [None]:
# Test for the real work
for i, entry in enumerate(training_loader, 0):
    xs, ys = entry[0], entry[1]
    
    print('Input shape:')
    print(xs.shape)
    print(xs)

    outputs = model(xs.squeeze(0))

    print(outputs.shape)
    print(ys.shape)

    loss = criterion(outputs, ys.squeeze(0))
    print(loss)
    
    break

Input shape:
torch.Size([1, 3])
tensor([[3721,  850, 8386]])


RuntimeError: input.size(-1) must be equal to input_size. Expected 400, got 160000