In [1]:
import pandas as pd

data = pd.read_csv('../data/CycPeptMPDB_Peptide_All.csv', low_memory=False)
print(f"Initial number of rows: {len(data)}")

data = data.drop_duplicates(subset='Structurally_Unique_ID')
print(f"Number of rows after dropping duplicate molecules: {len(data)}")

Initial number of rows: 8466
Number of rows after dropping duplicate molecules: 7991


In [2]:
# Remove columns with any missing values
data = data.dropna(axis=1)
print(f"Number of columns after dropping those with missing values: {data.shape[1]}")
print(f"Columns remaining: {data.columns.tolist()}")

# Remove Permeabiltiy = -10
data = data[data['Permeability'] != -10]
print(f"Number of rows after removing Permeability = -10: {len(data)}")


Number of columns after dropping those with missing values: 226
Columns remaining: ['ID', 'Source', 'Year', 'Version', 'Original_Name_in_Source_Literature', 'Structurally_Unique_ID', 'SMILES', 'HELM', 'HELM_URL', 'Sequence', 'Sequence_LogP', 'Sequence_TPSA', 'Monomer_Length', 'Monomer_Length_in_Main_Chain', 'Molecule_Shape', 'Permeability', 'MaxEStateIndex', 'MinEStateIndex', 'MaxAbsEStateIndex', 'MinAbsEStateIndex', 'qed', 'MolWt', 'HeavyAtomMolWt', 'ExactMolWt', 'NumValenceElectrons', 'NumRadicalElectrons', 'MaxPartialCharge', 'MinPartialCharge', 'MaxAbsPartialCharge', 'MinAbsPartialCharge', 'FpDensityMorgan1', 'FpDensityMorgan2', 'FpDensityMorgan3', 'BCUT2D_MWHI', 'BCUT2D_MWLOW', 'BCUT2D_CHGHI', 'BCUT2D_CHGLO', 'BCUT2D_LOGPHI', 'BCUT2D_LOGPLOW', 'BCUT2D_MRHI', 'BCUT2D_MRLOW', 'BalabanJ', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v', 'HallKierAlpha', 'Ipc', 'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA', 'PEOE_VSA1', '

In [3]:
target = data['Permeability']

# Remove the target column from the features
features = data.drop(columns=['Permeability'])

# Remove Identifier columns
identifier_columns = ['ID', 'Source', 'Year', 'Version', 'Original_Name_in_Source_Literature', 'Structurally_Unique_ID']
features = features.drop(columns=identifier_columns)

# Various peptide identifiers
identifier_columns = ['SMILES', 'HELM', 'HELM_URL', 'Sequence']
features = features.drop(columns=identifier_columns)

# TPSA and Sequence_LogP_Avg already exist in the dataset so these are not needed
already_existing_columns = ['Sequence_TPSA', 'Sequence_LogP']
features = features.drop(columns=already_existing_columns)

# No documentaion for these columns, so dropping them
undocumented_columns = ['PC1', 'PC2']
features = features.drop(columns=undocumented_columns)

print(f"Final number of features: {features.shape[1]}")
print(f'Remaining features: {features.columns.tolist()}')


Final number of features: 211
Remaining features: ['Monomer_Length', 'Monomer_Length_in_Main_Chain', 'Molecule_Shape', 'MaxEStateIndex', 'MinEStateIndex', 'MaxAbsEStateIndex', 'MinAbsEStateIndex', 'qed', 'MolWt', 'HeavyAtomMolWt', 'ExactMolWt', 'NumValenceElectrons', 'NumRadicalElectrons', 'MaxPartialCharge', 'MinPartialCharge', 'MaxAbsPartialCharge', 'MinAbsPartialCharge', 'FpDensityMorgan1', 'FpDensityMorgan2', 'FpDensityMorgan3', 'BCUT2D_MWHI', 'BCUT2D_MWLOW', 'BCUT2D_CHGHI', 'BCUT2D_CHGLO', 'BCUT2D_LOGPHI', 'BCUT2D_LOGPLOW', 'BCUT2D_MRHI', 'BCUT2D_MRLOW', 'BalabanJ', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v', 'HallKierAlpha', 'Ipc', 'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA', 'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA14', 'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9', 'SMR_VSA1', 'SMR_VSA10', 'SMR_VSA2', 'SMR_VSA3', 'S

In [4]:
# One hot encoding for 'Molecule_Shape'
if 'Molecule_Shape' in features.columns:
    features = pd.get_dummies(features, columns=['Molecule_Shape'])
print(f"Number of features after one-hot encoding: {features.shape[1]}")
print(f"Remaining features after encoding: {features.columns.tolist()}")

Number of features after one-hot encoding: 212
Remaining features after encoding: ['Monomer_Length', 'Monomer_Length_in_Main_Chain', 'MaxEStateIndex', 'MinEStateIndex', 'MaxAbsEStateIndex', 'MinAbsEStateIndex', 'qed', 'MolWt', 'HeavyAtomMolWt', 'ExactMolWt', 'NumValenceElectrons', 'NumRadicalElectrons', 'MaxPartialCharge', 'MinPartialCharge', 'MaxAbsPartialCharge', 'MinAbsPartialCharge', 'FpDensityMorgan1', 'FpDensityMorgan2', 'FpDensityMorgan3', 'BCUT2D_MWHI', 'BCUT2D_MWLOW', 'BCUT2D_CHGHI', 'BCUT2D_CHGLO', 'BCUT2D_LOGPHI', 'BCUT2D_LOGPLOW', 'BCUT2D_MRHI', 'BCUT2D_MRLOW', 'BalabanJ', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v', 'HallKierAlpha', 'Ipc', 'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA', 'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA14', 'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9', 'SMR_VSA1', 'SMR_VSA10', 'SMR_VSA2', 

In [5]:
from sklearn.preprocessing import StandardScaler

# Scale numerical features
scaler = StandardScaler()
features_scaled = pd.DataFrame(scaler.fit_transform(features), columns=features.columns, index=features.index)
target_scaled = scaler.fit_transform(target.values.reshape(-1, 1)).flatten()

In [6]:
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader

# Convert features and target to tensors
X = torch.tensor(features_scaled.values, dtype=torch.float32)
y = torch.tensor(target_scaled, dtype=torch.float32).view(-1, 1)

# Move to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
X, y = X.to(device), y.to(device)

# Simple neural network regressor as a substitute
class Regressor(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )
    def forward(self, x):
        return self.net(x)

In [7]:
# Initialize the model, loss function, and optimizer
input_dim = X.shape[1]
model = Regressor(input_dim).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Create DataLoader for batching
dataset = TensorDataset(X, y)
loader = DataLoader(dataset, batch_size=64, shuffle=True)

# Training loop with early stopping
epochs = 1000
patience = 10
best_loss = float('inf')
epochs_no_improve = 0

for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for batch_X, batch_y in loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * batch_X.size(0)
    epoch_loss = running_loss / len(dataset)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}")

    # Early stopping check
    if epoch_loss < best_loss - 1e-5:
        best_loss = epoch_loss
        epochs_no_improve = 0
        best_model_state = model.state_dict()
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print("Early stopping triggered.")
            break

# Restore best model
model.load_state_dict(best_model_state)

Epoch 1/1000, Loss: 0.7153
Epoch 2/1000, Loss: 0.5594
Epoch 3/1000, Loss: 0.4918
Epoch 4/1000, Loss: 0.4666
Epoch 5/1000, Loss: 0.4415
Epoch 6/1000, Loss: 0.4221
Epoch 7/1000, Loss: 0.4076
Epoch 8/1000, Loss: 0.3976
Epoch 9/1000, Loss: 0.3898
Epoch 10/1000, Loss: 0.3722
Epoch 11/1000, Loss: 0.3734
Epoch 12/1000, Loss: 0.3728
Epoch 13/1000, Loss: 0.3623
Epoch 14/1000, Loss: 0.3530
Epoch 15/1000, Loss: 0.3562
Epoch 16/1000, Loss: 0.3543
Epoch 17/1000, Loss: 0.3458
Epoch 18/1000, Loss: 0.3413
Epoch 19/1000, Loss: 0.3412
Epoch 20/1000, Loss: 0.3358
Epoch 21/1000, Loss: 0.3350
Epoch 22/1000, Loss: 0.3268
Epoch 23/1000, Loss: 0.3266
Epoch 24/1000, Loss: 0.3255
Epoch 25/1000, Loss: 0.3264
Epoch 26/1000, Loss: 0.3202
Epoch 27/1000, Loss: 0.3304
Epoch 28/1000, Loss: 0.3149
Epoch 29/1000, Loss: 0.3126
Epoch 30/1000, Loss: 0.3108
Epoch 31/1000, Loss: 0.3095
Epoch 32/1000, Loss: 0.3140
Epoch 33/1000, Loss: 0.3179
Epoch 34/1000, Loss: 0.3062
Epoch 35/1000, Loss: 0.3060
Epoch 36/1000, Loss: 0.3046
E

<All keys matched successfully>

In [8]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Move model to eval mode and get predictions
model.eval()
with torch.no_grad():
    y_pred = model(X).cpu().numpy()
    y_true = y.cpu().numpy()

# Inverse transform to original scale
y_true_orig = scaler.inverse_transform(y_true)
y_pred_orig = scaler.inverse_transform(y_pred)

# RMSE
rmse = mean_squared_error(y_true_orig, y_pred_orig) ** 0.5
# MAE
mae = mean_absolute_error(y_true_orig, y_pred_orig)
# MAPE
mape = (np.abs((y_true_orig - y_pred_orig) / y_true_orig)).mean() * 100
# R2
r2 = r2_score(y_true_orig, y_pred_orig)

print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")
print(f"MAPE: {mape:.2f}%")
print(f"R2: {r2:.4f}")

RMSE: 0.3728
MAE: 0.2638
MAPE: 4.59%
R2: 0.7751
