# Pytorch Amin Acid Language Model

In [None]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from pathlib import Path

In [None]:
if torch.cuda.is_available():  
  dev = "cuda:0" 
else:  
  dev = "cpu" 

In [None]:
dev

'cuda:0'

In [None]:
torch.manual_seed(42)

<torch._C.Generator at 0x7f3d3a315810>

## Load the data

In [None]:
data_file = Path('/content/LM_data_2021-03-11.csv')
df = pd.read_csv(data_file, sep=';')
df.head()

Unnamed: 0,Entry,Entry name,Sequence
0,P68307,NU3M_BALMU,MNLLLTLLTNTTLALLLVFIAFWLPQLNVYAEKTSPYECGFDPMGS...
1,P0CY61,O162_CONBU,MKLTCVLIIAVLFLTAITADDSRDKQVYRAVGLIDKMRRIRASEGC...
2,Q0VIL3,OTOMP_DANRE,MDLPGGHLAVVLFLFVLVSMSTENNIIRWCTVSDAEDQKCLDLAGN...
3,A1W9I4,NUSB_ACISJ,MTDSTHPTPSARPPRQPRTGTTGTGARKAGSKSGRSRAREFALQAL...
4,Q8DBX0,OMPU_VIBVU,MKKTLIALSVSAAAVATGVNAAELYNQDGTSLDMGGRAEARLSMKD...


In [None]:
df.drop(['Entry', 'Entry name'], axis = 1, inplace=True)
df.head()

Unnamed: 0,Sequence
0,MNLLLTLLTNTTLALLLVFIAFWLPQLNVYAEKTSPYECGFDPMGS...
1,MKLTCVLIIAVLFLTAITADDSRDKQVYRAVGLIDKMRRIRASEGC...
2,MDLPGGHLAVVLFLFVLVSMSTENNIIRWCTVSDAEDQKCLDLAGN...
3,MTDSTHPTPSARPPRQPRTGTTGTGARKAGSKSGRSRAREFALQAL...
4,MKKTLIALSVSAAAVATGVNAAELYNQDGTSLDMGGRAEARLSMKD...


## Tokenize the data

In [None]:
# Set-up numpy generator for random numbers
random_number_generator = np.random.default_rng(seed=42)
KMER_SIZE = 3

In [None]:
# Tokenize the protein sequence (or any sequence) in kmers.
def tokenize(protein_seqs, kmer_sz):
    kmers = set()
    # Loop over protein sequences
    for protein_seq in protein_seqs:
        # Loop over the whole sequence
        for i in range(len(protein_seq) - (kmer_sz - 1)):
            # Add kmers to the set, thus only unique kmers will remain
            kmers.add(protein_seq[i: i + kmer_sz])
            
    # Map kmers for one hot-encoding
    kmer_to_id = dict()
    id_to_kmer = dict()
    
    for ind, kmer in enumerate(kmers):
        kmer_to_id[kmer] = ind
        id_to_kmer[ind] = kmer
        
    vocab_sz = len(kmers)
    
    assert vocab_sz == len(kmer_to_id.keys())
    
    # Tokenize the protein sequence to integers
    tokenized = []
    for protein_seq in protein_seqs:
        sequence = []
        for i in  range(len(protein_seq) - (kmer_sz -1)):
            # Convert kmer to integer
            kmer = protein_seq[i: i + kmer_sz]
            sequence.append(kmer_to_id[kmer])
            
        tokenized.append(sequence)
            
    
    return tokenized, vocab_sz, kmer_to_id, id_to_kmer

In [None]:
# Tokenize the protein sequence
tokenized_seqs, vocab_sz, kmer_to_id, id_to_kmer = tokenize(df['Sequence'], KMER_SIZE)

In [None]:
vocab_sz

9317

In [None]:
tokenized_seqs[0][:10]

[6613, 1030, 1596, 2497, 7681, 6098, 2497, 2622, 614, 6238]

In [None]:
data = []
for seq in tokenized_seqs:
    for kmer in seq:
        data.append(kmer)

## Dataset

In [None]:
class AminoLMDataset(torch.utils.data.Dataset):
    def __init__(self, data, seq_len):
        self.data = torch.Tensor(data)
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
            
        xs = torch.LongTensor(data[idx: idx + seq_len])
        targets = data[idx + 1: idx + seq_len + 1]

        ys = []

        for target in targets:
          y = torch.tensor(target)
          ys.append(y)

        ys = torch.stack(ys)

        ys = ys.to(dev)
        xs = xs.to(dev) 
    
        return xs, ys

NameError: ignored

## Building the LM Model

In [None]:
# Hyperparameters
emb_dim = 400 # Embeddding dimension
hid_sz = 1150 # Hidden size
num_layers = 3 # Number of LSTM layers stacked together
seq_len = num_layers

# Dropout parameters

embed_p = 0.1 # Dropout probability on the embedding
hidden_p = 0.3 # Dropout probability on hidden-to-hidden weight matrices
input_p = 0.3 # Dropout probablity on the LSTM input between LSTMS

# This one still has to be implemented
#weight_p = 0.5 # Dropout probability on LSTM-to-LSTM weight matrices

In [None]:
class AWD_LSTM(torch.nn.Module):
    def __init__(self, num_layers, vocab_sz, emb_dim, hid_sz, hidden_p, embed_p, input_p):
        super(AWD_LSTM, self).__init__()
        
        self.encoder = torch.nn.Embedding(vocab_sz, emb_dim)
        self.emb_drop = torch.nn.Dropout(p=embed_p)

        self.lstms = []

        for i in range(num_layers):
            self.lstms.append(nn.LSTM(input_size=emb_dim, hidden_size=hid_sz, num_layers=1))

        self.lstms = nn.ModuleList(self.lstms)

        self.decoder = torch.nn.Linear(hid_sz, vocab_sz)
        
        self.num_layers = num_layers
        self.vocab_sz = vocab_sz
        self.emb_dim = emb_dim
        self.hid_sz = hid_sz
        self.hidden_p = hidden_p
        self.embed_p = embed_p
        self.input_p = input_p
        self.batch_sz = 1
        
        self.reset_hidden()
                
    def forward(self, xs):
        """Forward pass AWD-LSTM""" 

        ys = []


        self.reset_hidden()
        
        hiddens = (self.hidden_state, self.cell_state)

        for i, lstm in enumerate(self.lstms):
          embed = self.encoder(xs[i])
          output, hiddens = lstm(embed.view(1, 1, -1), hiddens)  
            
          y = self.decoder(output)

          # Outputs moeten squeezen om de loss toe te kunnen passen, maar moet nog even kijken of de juiste waardes dan wel worden mee genomen

          ys.append(y.squeeze(0).squeeze(0))

        y = torch.stack(ys, dim=0)
        
        return y
    
    def reset_hidden(self):
        self.hidden_state = torch.zeros((1, self.batch_sz, self.hid_sz)).to(dev)
        self.cell_state = torch.zeros((1, self.batch_sz, self.hid_sz)).to(dev)

In [None]:
model = AWD_LSTM(num_layers, vocab_sz, emb_dim, hid_sz, hidden_p, embed_p, input_p)
model = model.to(dev)
model

AWD_LSTM(
  (encoder): Embedding(9317, 400)
  (emb_drop): Dropout(p=0.1, inplace=False)
  (lstms): ModuleList(
    (0): LSTM(400, 1150)
    (1): LSTM(400, 1150)
    (2): LSTM(400, 1150)
  )
  (decoder): Linear(in_features=1150, out_features=9317, bias=True)
)

## Training the model

In [None]:
training_set = AminoLMDataset(data, seq_len)

In [None]:
training_loader = torch.utils.data.DataLoader(training_set, batch_size=1, shuffle=False)

In [None]:
total_train_len = len(training_loader)
total_train_len

58461351

In [None]:
# Test for the real work
for i, entry in enumerate(training_loader, 0):
    xs, ys = entry[0], entry[1]

    outputs = model(xs.squeeze(0))

    print(outputs.shape)
    print(ys.shape)

    loss = criterion(outputs, ys.squeeze(0))
    print(loss)
    
    break

torch.Size([3, 9317])
torch.Size([1, 3])
tensor(9.1061, device='cuda:0', grad_fn=<NllLossBackward>)


In [None]:
# Hyperparameters
learning_rate = 0.01
epochs = 1

In [None]:
# Costfunction and optimize algorithm
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr= learning_rate)

In [None]:
for epoch in range(epochs):
    
    model.reset_hidden()
    
    # Initialize loss at 0
    epoch_loss = 0.0
    
    print(f'Epoch: {str(epoch + 1)}')
    
    for i, entry in enumerate(training_loader, 0):
        
        
        model.zero_grad()
        
        xs, ys = entry[0], entry[1]
        
        outputs = model(xs.squeeze(0))
        loss = criterion(outputs, ys.squeeze(0))
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        
        if i % 1e4 == 0:
            perc = i / total_train_len * 100
            print(f'Percent: {str(perc)}')
            print(f'Loss: {str(loss.item())}')
    
    loss_history.append(epoch_loss)
    
    print(f'Epoch {str(epoch + 1)} Train loss: {str(epoch_loss)}.')
          
print('Finished training')

Epoch: 1
Percent: 0.0
Loss: 8.85924243927002
Percent: 0.01710531800744735
Loss: 8.926836967468262
Percent: 0.0342106360148947
Loss: 9.091572761535645
Percent: 0.05131595402234205
Loss: 8.935885429382324
Percent: 0.0684212720297894
Loss: 9.17000961303711
Percent: 0.08552659003723674
Loss: 8.62395191192627
Percent: 0.1026319080446841
Loss: 8.910038948059082
Percent: 0.11973722605213143
Loss: 8.816948890686035
Percent: 0.1368425440595788
Loss: 7.374332427978516
Percent: 0.15394786206702613
Loss: 9.096847534179688
Percent: 0.17105318007447348
Loss: 8.81326961517334
Percent: 0.1881584980819208
Loss: 7.947908401489258
Percent: 0.2052638160893682
Loss: 8.776854515075684
Percent: 0.2223691340968155
Loss: 8.7290678024292
Percent: 0.23947445210426285
Loss: 9.786087989807129
Percent: 0.25657977011171024
Loss: 8.355191230773926
Percent: 0.2736850881191576
Loss: 8.411930084228516
Percent: 0.2907904061266049
Loss: 7.8287811279296875
Percent: 0.30789572413405225
Loss: 7.402301788330078
Percent: 0.325

KeyboardInterrupt: ignored

## Save Model for Training Later

In [None]:
from google.colab import drive
from pathlib import Path


drive.mount('content/', force_remount=True)
base = Path('/content/content/My Drive/')


Mounted at content/


In [None]:
filename = '1_percent_AA_LM.pt'
file_dir = Path('/content/content/MyDrive/' + filename)
file_dir

PosixPath('/content/content/MyDrive/1_percent_AA_LM.pt')

In [None]:
torch.save(model, file_dir)

## Load Model for Further Training

In [None]:
from google.colab import drive
from pathlib import Path


drive.mount('content/', force_remount=True)
base = Path('/content/content/My Drive/')

Mounted at content/


In [None]:
model_path = Path('/content/content/MyDrive/1_percent_AA_LM.pt')
model = torch.load(model_path)
model

AWD_LSTM(
  (encoder): Embedding(9317, 400)
  (emb_drop): Dropout(p=0.1, inplace=False)
  (lstms): ModuleList(
    (0): LSTM(400, 1150)
    (1): LSTM(400, 1150)
    (2): LSTM(400, 1150)
  )
  (decoder): Linear(in_features=1150, out_features=9317, bias=True)
)

### Train Further with Data of which the location is known

In [None]:
data_file = Path('/content/protein_data_2021-02-07.csv')
df = pd.read_csv(data_file, sep=';')
df.head()

Unnamed: 0.1,Unnamed: 0,Entry,Entry name,Protein names,Gene names,Sequence,Subcellular location [CC]
0,0,O95825,QORL1_HUMAN,Quinone oxidoreductase-like protein 1 (EC 1.-....,CRYZL1 4P11,MKGLYFQQSSTDEEITFVFQEKEDLPVTEDNFVKLQVKACALSQIN...,
1,1,Q9Y2J0,RP3A_HUMAN,Rabphilin-3A (Exophilin-1),RPH3A KIAA0985,MTDTVFSNSSNRWMYPSDRPLQSNDKEQLQAGWSVHPGGQPDRQRK...,"SUBCELLULAR LOCATION: Cytoplasmic vesicle, sec..."
2,2,Q13905,RPGF1_HUMAN,Rap guanine nucleotide exchange factor 1 (CRK ...,RAPGEF1 GRF2,MDTDSQRSHLSSFTMKLMDKFHSPKIKRTPSKKGKPAEVSVKIPEK...,SUBCELLULAR LOCATION: Early endosome {ECO:0000...
3,3,Q5TD94,RSH4A_HUMAN,Radial spoke head protein 4 homolog A (Radial ...,RSPH4A RSHL3,MEDSTSPKQEKENQEELGETRRPWEGKTAASPQYSEPESSEPLEAK...,"SUBCELLULAR LOCATION: Cytoplasm, cytoskeleton,..."
4,4,Q9HA92,RSAD1_HUMAN,Radical S-adenosyl methionine domain-containin...,RSAD1,MALPGARARGWAAAARAAQRRRRVENAGGSPSPEPAGRRAALYVHW...,SUBCELLULAR LOCATION: Mitochondrion {ECO:00003...


In [None]:
df.drop(['Unnamed: 0', 'Entry', 'Entry name', 'Protein names', 'Gene names', 'Subcellular location [CC]'], axis = 1, inplace=True)
df.head()

Unnamed: 0,Sequence
0,MKGLYFQQSSTDEEITFVFQEKEDLPVTEDNFVKLQVKACALSQIN...
1,MTDTVFSNSSNRWMYPSDRPLQSNDKEQLQAGWSVHPGGQPDRQRK...
2,MDTDSQRSHLSSFTMKLMDKFHSPKIKRTPSKKGKPAEVSVKIPEK...
3,MEDSTSPKQEKENQEELGETRRPWEGKTAASPQYSEPESSEPLEAK...
4,MALPGARARGWAAAARAAQRRRRVENAGGSPSPEPAGRRAALYVHW...


In [None]:
len(df)

20394

Deze data moet hetzelfde getokeniseerd worden als de hele corpus, daarom moet ik die kmer_to_id opslaan en die gebruiken.

In [None]:
# Tokenize the protein sequence to integers
tokenized = []
for i, protein_seq in enumerate(df['Sequence']):
    sequence = []
    for i in  range(len(protein_seq) - (KMER_SIZE -1)):
        # Convert kmer to integer
        kmer = protein_seq[i: i + KMER_SIZE]
        
        try:
          sequence.append(kmer_to_id[kmer])
        except:
          # If the KMER is for some unknown reason not in the vocab then delete the row from the dataframe
          df.drop(i, inplace=True)

    tokenized.append(sequence)

In [None]:
len(df)

20386

In [None]:
tokenized[0][:10]

[157, 8770, 6108, 4111, 1747, 2328, 4276, 8642, 6792, 9194]

In [None]:
data = []
for seq in tokenized:
    for kmer in seq:
        data.append(kmer)

### Train with the new data

In [None]:
training_set = AminoLMDataset(data, seq_len)

In [None]:
training_loader = torch.utils.data.DataLoader(training_set, batch_size=1, shuffle=False)

In [None]:
total_train_len = len(training_loader)
total_train_len

11323425

In [None]:
# Hyperparameters
learning_rate = 0.01
epochs = 1

In [None]:
# Costfunction and optimize algorithm
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr= learning_rate)

In [None]:
for epoch in range(epochs):
    
    model.reset_hidden()
    
    # Initialize loss at 0
    epoch_loss = 0.0
    
    print(f'Epoch: {str(epoch + 1)}')
    
    for i, entry in enumerate(training_loader, 0):
        
        
        model.zero_grad()
        
        xs, ys = entry[0], entry[1]
        
        outputs = model(xs.squeeze(0))
        loss = criterion(outputs, ys.squeeze(0))
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        
        if i % 1.5e4 == 0:
            perc = i / total_train_len * 100
            print(f'Percent: {str(perc)}')
            print(f'Loss: {str(loss.item())}')
    
    loss_history.append(epoch_loss)
    
    print(f'Epoch {str(epoch + 1)} Train loss: {str(epoch_loss)}.')
          
print('Finished training')

Epoch: 1
Percent: 0.0
Loss: 11.375401496887207
Percent: 0.13246875393266613
Loss: 8.15804672241211
Percent: 0.26493750786533227
Loss: 9.12777328491211
Percent: 0.3974062617979984
Loss: 9.250300407409668
Percent: 0.5298750157306645
Loss: 9.036314964294434
Percent: 0.6623437696633306
Loss: 9.138776779174805
Percent: 0.7948125235959967
Loss: 5.583368301391602
Percent: 0.927281277528663
Loss: 4.445772171020508
Percent: 1.059750031461329
Loss: 8.011649131774902
Percent: 1.1922187853939952
Loss: 3.1863505840301514
Percent: 1.3246875393266613
Loss: 2.9561500549316406
Percent: 1.4571562932593274
Loss: 7.1134114265441895
Percent: 1.5896250471919935
Loss: 8.157109260559082
Percent: 1.7220938011246596
Loss: 6.495886325836182
Percent: 1.854562555057326
Loss: 2.9495487213134766
Percent: 1.9870313089899923
Loss: 7.4272003173828125
Percent: 2.119500062922658
Loss: 7.234080791473389
Percent: 2.2519688168553245
Loss: 3.666826009750366
Percent: 2.3844375707879903
Loss: 6.497460842132568
Percent: 2.51690

KeyboardInterrupt: ignored

In [None]:
filename = 'AA_LM_v1.pt'
file_dir = Path('/content/content/MyDrive/' + filename)
file_dir

PosixPath('/content/content/MyDrive/AA_LM_v1.pt')

In [None]:
torch.save(model, file_dir)

## Weightdropout (voor later)

In [None]:
class WeightDropout(torch.nn.Module):
    "Apply dropout to LSTM's hidden-hidden weights"
    
    def __init__(self, module, weight_p):
        super(WeightDropout, self).__init__()
        self.module = module
        self.weight_p = weight_p
        
        # Save the name of the layer weights in a list
        num_layers = module.num_layers
        layer_base_name = 'weight_hh_l'      
        self.layer_weights = [layer_base_name + str(i) for i in range(num_layers)]
        
        # Make a copy of the weights in weightname_raw
        for weight in self.layer_weights:
            w = getattr(self.module, weight)
            del module._parameters[weight]
            self.module.register_parameter(f'{weight}_raw', torch.nn.Parameter(w))
            
        def _setweights(self):
            "Apply dropout to the raw weights"
            for weight in self.layer_weights:
                raw_w = getattr(self, f'{weight}_raw')
                if self.training:
                    w = torch.nn.F(raw_w, p=self.weight_p)
                else:
                    w = raw_w.clone()
                setattr(self.module, weight, w)
                
        def forward(self, *args):
            self._setweights()
            return self.module(*args)