### Learning a compressed latent representation with an autoencoder

In [6]:
import os

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader, TensorDataset
from torchsummary import summary

import numpy as np
import pandas as pd

In [7]:
# Directory that holds the trained model weights.
model_dir = './Models/'

# exist_ok=True removes the check-then-create race and keeps the cell safe to
# re-run; makedirs also creates any missing parent directories.
os.makedirs(model_dir, exist_ok=True)

#### Model architecture

In [2]:
# Most of the successful architectures were based on tanh activations only.

class Autoencoder(nn.Module):
    """Fully connected autoencoder with a single 1024-unit hidden layer per side.

    Encoder: Linear(input_size, 1024) -> Tanh -> Linear(1024, latent_size) -> Tanh
    Decoder: Linear(latent_size, 1024) -> Tanh -> Linear(1024, input_size) -> Tanh

    NOTE: ``forward`` stops after the last decoder ``Linear`` layer; the
    trailing decoder ``Tanh`` is part of ``self.decoder`` but is not applied
    to the reconstruction returned by ``forward``.
    """

    def __init__(self, input_size, latent_size):
        super().__init__()
        hidden_size = 1024
        self.encoder = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, latent_size),
            nn.Tanh(),
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_size, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, input_size),
            nn.Tanh(),
        )

    def forward(self, x):
        """Run a forward pass and expose the intermediate activations.

        Returns a 4-tuple ``(decoded, activation1, activation2, activation3)``:
        the output of the last decoder Linear layer, the first encoder Tanh
        output, the latent code (second encoder Tanh output) and the first
        decoder Tanh output.
        """
        enc_in, enc_in_act, enc_out, enc_out_act = self.encoder
        # The fourth decoder module (output Tanh) is intentionally left unused here.
        dec_in, dec_in_act, dec_out, _unused_output_act = self.decoder

        activation1 = enc_in_act(enc_in(x))
        activation2 = enc_out_act(enc_out(activation1))
        activation3 = dec_in_act(dec_in(activation2))
        return dec_out(activation3), activation1, activation2, activation3


In [4]:
# Seed torch's global RNG so repeated runs are comparable.
torch.manual_seed(42)

# Read the per-domain table and stack the 'Linearized Mean Embedding' column
# into one 2-D array (one row per domain).
distinct_domains_df = pd.read_parquet('./Data/distinct_kinase_domain_embeddings.parquet')
linearized_mean_embeddings = np.array(
    distinct_domains_df['Linearized Mean Embedding'].to_list()
)
mean_padded_tensor = torch.FloatTensor(linearized_mean_embeddings)

# Autoencoder setup: every sample serves as both input and target.
dataset = TensorDataset(mean_padded_tensor, mean_padded_tensor)
dataloader = DataLoader(dataset, shuffle=True, batch_size=64)

# Length of a single linearized embedding vector.
input_size = linearized_mean_embeddings.shape[1]

#### Model training

In [None]:
# Optimization parameters and training loop.

def train_autoencoder(latent_size, num_epochs=1000):
    """Train an ``Autoencoder`` with the given bottleneck width and save it.

    Reads the notebook-level ``input_size``, ``dataloader`` and ``model_dir``.

    Parameters
    ----------
    latent_size : int
        Width of the latent (bottleneck) layer.
    num_epochs : int, optional
        Number of full passes over ``dataloader`` (default 1000).

    Returns
    -------
    Autoencoder
        The trained model. Its ``state_dict`` is also written to
        ``<model_dir>/autoencoder_<latent_size>_dims_tanh``.
    """
    model = Autoencoder(input_size, latent_size)
    model.train()
    criterion = nn.MSELoss()

    optimizer = optim.AdamW(model.parameters(), lr=0.001)
    # Halve the learning rate every 50 epochs.
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.5)

    # Optional: monitoring training process in tensorboard.
    # writer = SummaryWriter(log_dir='./logs/autoencoder')

    print(f'Training autoencoder with latent size {latent_size}')
    for epoch in range(num_epochs):
        summed_loss = 0.0
        samples_seen = 0
        for i, (inputs, _) in enumerate(dataloader):
            outputs, act1, act2, act3 = model(inputs)
            loss = criterion(outputs, inputs)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Weight each batch by its size so a smaller final batch does not
            # skew the epoch average.
            batch_size = inputs.size(0)
            summed_loss += loss.item() * batch_size
            samples_seen += batch_size

            # Log activations every 10 batches.
            # if i % 10 == 0:
            #     writer.add_histogram('Activations/encoder_layer1', act1, epoch * len(dataloader) + i)
            #     writer.add_histogram('Activations/encoder_layer2', act2, epoch * len(dataloader) + i)
            #     writer.add_histogram('Activations/decoder_layer1', act3, epoch * len(dataloader) + i)

        scheduler.step()

        # Report the mean loss over the whole epoch rather than the loss of
        # whichever mini-batch happened to come last.
        epoch_loss = summed_loss / max(samples_seen, 1)
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')

    # writer.close()
    os.makedirs(model_dir, exist_ok=True)
    torch.save(
        model.state_dict(),
        os.path.join(model_dir, f'autoencoder_{latent_size}_dims_tanh'),
    )
    return model


# Train every bottleneck width whose checkpoint is loaded later in the
# notebook, so a fresh "Restart & Run All" produces both weight files.
trained_models = {size: train_autoencoder(size) for size in (512, 768)}
model = trained_models[512]

In [5]:
def _load_and_encode(latent_size, inputs):
    """Load a saved autoencoder and encode ``inputs`` with its encoder.

    Reads the notebook-level ``input_size`` and ``model_dir``.

    Parameters
    ----------
    latent_size : int
        Bottleneck width of the checkpoint to load; selects the file
        ``<model_dir>/autoencoder_<latent_size>_dims_tanh``.
    inputs : torch.Tensor
        Batch passed through ``model.encoder``.

    Returns
    -------
    tuple
        ``(model, latent)``: the model in eval mode and the encoder output,
        computed without gradient tracking.
    """
    weights_path = os.path.join(model_dir, f'autoencoder_{latent_size}_dims_tanh')
    autoencoder = Autoencoder(input_size, latent_size)
    # map_location='cpu' lets a checkpoint saved on a GPU load on a CPU-only machine.
    autoencoder.load_state_dict(torch.load(weights_path, map_location='cpu'))
    autoencoder.eval()

    with torch.no_grad():
        latent = autoencoder.encoder(inputs)
    return autoencoder, latent


# Load the trained model weights and compute the latent codes.
small_model, latent_small = _load_and_encode(512, mean_padded_tensor)
large_model, latent_large = _load_and_encode(768, mean_padded_tensor)

# Convert latent tensors to NumPy arrays for further use. The DataFrame columns
# are filled in by the storage cell, so this cell does not mutate the table.
latent_small_np = latent_small.cpu().numpy()
latent_large_np = latent_large.cpu().numpy()

print(f"Latent representation from small model shape: {latent_small_np.shape}")
print(f"Latent representation from large model shape: {latent_large_np.shape}")

Latent representation from small model shape: (586, 512)
Latent representation from large model shape: (586, 768)


In [14]:
# Attach both latent embeddings as list-valued columns, then persist the
# augmented table next to the input data.
for column_name, latent_matrix in (
    ('Latent Small Embedding', latent_small_np),
    ('Latent Large Embedding', latent_large_np),
):
    distinct_domains_df[column_name] = latent_matrix.tolist()

distinct_domains_df.to_parquet('./Data/distinct_kinase_domain_embeddings_with_latents.parquet')