In [62]:
import torch.nn.functional as F
import torch.nn as nn
import torch
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random

from sklearn.preprocessing import StandardScaler


from utils.file_utils import open_json


MIN_NUM_MUTATIONS = 4

#2 Importing the dataset
df = pd.read_csv("./dataset/main_dataset.csv")
features = open_json("./dataset/features.json")

# create X,y:
use_features = {"identification": False, "basics": True, "blosum": True,
                "demask": True, "protein_analysis": True, 
                "deltas_protein_analysis": True, 
                "3D_structure_analysis": True, "dssp_3D_analysis": True,
                "target": False}

feature_columns = [features[key] for key in features if use_features[key]]
feature_columns = sum(feature_columns, []) # flatten the list of columns

X = df[feature_columns]
y = df[["ddG"]].values.astype(float)

print(f"there are {X.isna().sum().sum()} na occurences in the X df")
print(f"there are {np.isnan(y).sum()} na occurences in the y df")

# y
X.iloc[1,:3]

there are 0 na occurences in the X df
there are 0 na occurences in the y df


wild_aa_int          11.0
mutated_aa_int        3.0
mutated_chain_int    65.0
Name: 1, dtype: float64

In [63]:
# we add an idx based on the alphafold path (which is unique) to the X df
# this is in order to be able to choose a mutation from a random protein each time
X_ids = pd.DataFrame()
alphafold_paths = df["alphafold_path"].unique()
alphafold_to_id = {_path: i for i, _path in enumerate(alphafold_paths)}
index_by_alphafold_id = {i: [] for i in range(len(alphafold_paths))}

for idx, row in enumerate(df["alphafold_path"]):
    index_by_alphafold_id[alphafold_to_id[row]].append(idx)

alphafold_ids = [k for k in index_by_alphafold_id.keys()
                 if len(index_by_alphafold_id[k])>MIN_NUM_MUTATIONS]
print(len(alphafold_ids))
print(alphafold_ids)


133
[0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 26, 27, 28, 29, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 49, 50, 53, 54, 55, 56, 58, 59, 60, 61, 63, 64, 65, 66, 67, 69, 70, 71, 72, 78, 79, 80, 83, 84, 85, 86, 87, 89, 90, 91, 92, 93, 97, 99, 100, 101, 104, 105, 108, 110, 111, 115, 122, 127, 128, 134, 135, 137, 138, 139, 140, 141, 146, 148, 149, 156, 157, 162, 165, 166, 167, 169, 170, 173, 176, 187, 190, 192, 193, 194, 195, 197, 205, 206, 207, 209, 211, 212, 213, 218, 219, 221, 222, 224, 228, 242, 243, 244, 245]


In [64]:
class NovozymesDataset(torch.utils.data.Dataset):
    '''
    Prepare the Novozymes dataset for regression
    '''

    def __init__(self, X, y, index_by_alphafold_id, alphafold_ids, scale_data=True):
        self.scaler = StandardScaler()
        if not torch.is_tensor(X) and not torch.is_tensor(y):
            # Apply scaling if necessary
            if scale_data:
                X = self.scaler.fit_transform(X)
            self.X = torch.from_numpy(X)
            self.y = torch.from_numpy(y)
        
        self.index_by_alphafold_id = index_by_alphafold_id
        self.alphafold_ids = alphafold_ids
    
    def __len__(self):
        return len(self.X)

    def __getitem__(self, i):
        # we make sure to use all available protein 
        k = random.choice(self.alphafold_ids)
        v = random.choice(self.index_by_alphafold_id[k])
        return self.X[v], self.y[v]


dataset = NovozymesDataset(X, y, index_by_alphafold_id, alphafold_ids)
trainloader = torch.utils.data.DataLoader(
    dataset, batch_size=10, shuffle=False, num_workers=4)


In [65]:
num_features = len(feature_columns)

class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(num_features, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def forward(self, x):
        '''
        Forward pass
        '''
        return self.layers(x)


In [66]:
# Initialize the MLP
net = Model()

# Define the loss function and optimizer
loss_function = nn.L1Loss()
optimizer = torch.optim.Adam(net.parameters(), lr=1e-4)


In [67]:
# Run the training loop
for epoch in range(0, 50): # 5 epochs at maximum
  
  # Print epoch
  print(f'Starting epoch {epoch+1}')
  
  # Set current loss value
  current_loss = 0.0
  
  # Iterate over the DataLoader for training data
  for i, data in enumerate(trainloader, 0):
    
    # Get and prepare inputs
    inputs, targets = data
    inputs, targets = inputs.float(), targets.float()
    targets = targets.reshape((targets.shape[0], 1))
    
    # Zero the gradients
    optimizer.zero_grad()
    
    # Perform forward pass
    outputs = net(inputs)
    
    # Compute loss
    loss = loss_function(outputs, targets)
    
    # Perform backward pass
    loss.backward()
    
    # Perform optimization
    optimizer.step()
    
    # Print statistics
    current_loss += loss.item()
    if i % 10 == 0:
        print('Loss after mini-batch %5d: %.3f' %
              (i + 1, current_loss / 500))
        current_loss = 0.0

# Process is complete.
print('Training process has finished.')

Starting epoch 1
Loss after mini-batch     1: 0.006
Loss after mini-batch    11: 0.036
Loss after mini-batch    21: 0.027
Loss after mini-batch    31: 0.033
Loss after mini-batch    41: 0.036
Loss after mini-batch    51: 0.035
Loss after mini-batch    61: 0.030
Loss after mini-batch    71: 0.038
Loss after mini-batch    81: 0.036
Loss after mini-batch    91: 0.032
Loss after mini-batch   101: 0.036
Loss after mini-batch   111: 0.029
Loss after mini-batch   121: 0.032
Loss after mini-batch   131: 0.036
Loss after mini-batch   141: 0.035
Loss after mini-batch   151: 0.032
Loss after mini-batch   161: 0.035
Loss after mini-batch   171: 0.032
Loss after mini-batch   181: 0.028
Loss after mini-batch   191: 0.034
Loss after mini-batch   201: 0.032
Loss after mini-batch   211: 0.027
Loss after mini-batch   221: 0.033
Loss after mini-batch   231: 0.032
Loss after mini-batch   241: 0.032
Loss after mini-batch   251: 0.031
Loss after mini-batch   261: 0.032
Loss after mini-batch   271: 0.031
Los

# Predicting on submission

In [73]:
# loading processed_test, ie. test.csv with added infos
df = pd.read_csv("./dataset/processed_test.csv")

# clean dataset:
df = df.drop(columns=features["identification"], axis=1)

# using the same use_features as in training:
feature_columns = [features[key] for key in features if use_features[key]]
feature_columns = sum(feature_columns, []) # flatten the list of columns

X_test = df[feature_columns]

print(f"there are {X_test.isna().sum().sum()} na occurences in the X df")

print(X_test.iloc[:5,:3])

there are 0 na occurences in the X df
   wild_aa_int  mutated_aa_int  mutated_chain_int
0           10               4                 65
1           10               9                 65
2           10               0                 65
3            9               2                 65
4            9               5                 65


In [74]:
X_test = dataset.scaler.transform(X_test)
X_test = torch.from_numpy(X_test).float()
X_test[1, :3]

tensor([-0.0267,  0.1624, -0.1279])

In [70]:
results = net(X_test)
results[:10]

tensor([[ 1.2491],
        [ 1.0880],
        [-3.0677],
        [ 1.2054],
        [ 0.7446],
        [ 1.7095],
        [ 1.9236],
        [ 0.7040],
        [ 0.3120],
        [ 0.4015]], grad_fn=<SliceBackward0>)

In [75]:
submission = pd.DataFrame(columns=["seq_id", "tm"])
submission["seq_id"] = df["seq_id"]
submission["tm"] = results.detach().numpy() * -1
submission.head()

Unnamed: 0,seq_id,tm
0,31390,-1.249115
1,31391,-1.088004
2,31392,3.067706
3,31393,-1.205403
4,31394,-0.744568


In [76]:
from datetime import datetime
date_time = datetime.now()
timestamp = date_time.strftime("%Y-%m-%d_%H-%M-%S")
submission.to_csv(f"./submission/submission_{timestamp}.csv", index=False)
