In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler

In [2]:
# Load the compounds data
compounds = pd.read_csv("./data/compounds.csv")
num_compounds = compounds.shape[0]

# Count the descriptor columns with the same label slice the normalization
# step uses to extract them (every column from 'monomer_mw' to the end),
# instead of subtracting a hard-coded 3 from the total column count. This keeps
# the width used to assemble the feature matrices in agreement with the width
# of the normalized descriptors if the CSV layout ever changes.
# NOTE(review): assumes 'ID', 'NAME' and 'CLASS' are the only columns that
# precede 'monomer_mw' -- confirm against the CSV.
num_descriptors = compounds.loc[:, "monomer_mw":].shape[1]

print(f"Number of compounds >> {num_compounds}")
print(f"Number of descriptors >> {num_descriptors}")  # Excludes 'ID', 'NAME', 'CLASS'

Number of compounds >> 68
Number of descriptors >> 7


In [3]:
# Preview the first five rows of the compounds table
compounds.head(n=5)

Unnamed: 0,ID,NAME,CLASS,monomer_mw,XlogP3,h-bond_donors,h-bond_acceptors,complexity,concentration,polymer_mw
0,1,CHAPS,Surfactant,614.9,2.9,4.0,7.0,1030.0,20.0,614.0
1,2,Triton X-100,Surfactant,527.7,-2.0,1190.0,1704.0,731.0,25.0,80000.0
2,3,Alg,Polymer,448.5,-3.6,2676.0,4348.0,511.0,4.0,150000.0
3,4,AP,Polymer,828.7,-10.6,6516.0,12690.0,1210.0,10.0,213700.0
4,5,BSA,Protein,331.8,-2.3,1190.0,1704.0,391.0,40.0,66000.0


In [4]:
# Read the solution-mixture pair matrix from disk; with the default dtype,
# genfromtxt parses every comma-separated field as a float.
pairs = np.genfromtxt(fname="./data/pairs.csv", delimiter=",")
print(f"Shape of pairs >> {pairs.shape}")

Shape of pairs >> (68, 68)


In [5]:
# Standardize the compound descriptors

# Descriptor columns: every column from 'monomer_mw' through the last one
descriptors = compounds.loc[:, "monomer_mw":]

# Fit the scaler on the descriptors, then apply it to those same rows
scaler = StandardScaler().fit(descriptors)
norm_descriptors = scaler.transform(descriptors)

In [11]:
# Strictly lower-triangular entries of the pairs matrix (k=-1 leaves out the
# diagonal): `pairs_tril_indices` is a (row_indices, col_indices) tuple and
# `pairs_tril` holds the matching values as a flat array.
# NOTE(review): presumably `pairs` is symmetric, so the lower triangle covers
# each unordered compound pair exactly once -- confirm.
pairs_tril_indices = np.tril_indices_from(pairs, k=-1)
pairs_tril = pairs[pairs_tril_indices]
num_pairs = pairs_tril.size

# 70 / 30 train / validation split over the flat pair positions.
# The global NumPy RNG is seeded first so the same split is drawn on every run.
np.random.seed(42)
num_training_examples = int(0.7 * num_pairs)
training_indices_flat = np.random.choice(
    num_pairs, size=num_training_examples, replace=False
)

# Validation positions are whatever the training draw did not pick
# (setdiff1d returns them sorted).
val_indices_flat = np.setdiff1d(np.arange(num_pairs), training_indices_flat)

# Translate flat positions back into (row, col) coordinates of `pairs`
training_indices_rows, training_indices_cols = (
    axis[training_indices_flat] for axis in pairs_tril_indices
)
val_indices_rows, val_indices_cols = (
    axis[val_indices_flat] for axis in pairs_tril_indices
)

# One (row, col) tuple per example
training_indices = tuple(zip(training_indices_rows, training_indices_cols))
val_indices = tuple(zip(val_indices_rows, val_indices_cols))

In [12]:
# Make actual training data & map labels
#
# Each training example is the normalized descriptor row of the first compound
# followed by that of the second compound; its label is the matching entry of
# the pairs matrix.
X_train_shape = (len(training_indices), 2 * num_descriptors)
X_train = np.zeros(X_train_shape)
Y_train = np.zeros(len(training_indices))

# (n_examples, 2) integer array of (compound1, compound2) indices. The reshape
# keeps it two-dimensional even when there are no training pairs.
training_pairs = np.asarray(training_indices, dtype=np.intp).reshape(-1, 2)

# Integer-array indexing gathers all descriptor rows and labels at once,
# replacing the per-pair Python loop. Assigning into the preallocated arrays
# keeps their float dtype and still raises if the descriptor width does not
# match num_descriptors.
X_train[:, :num_descriptors] = norm_descriptors[training_pairs[:, 0]]
X_train[:, num_descriptors:] = norm_descriptors[training_pairs[:, 1]]
Y_train[:] = pairs[training_pairs[:, 0], training_pairs[:, 1]]

In [13]:
# Make validation data
#
# Each validation example is the normalized descriptor row of the first
# compound followed by that of the second compound; its label is the matching
# entry of the pairs matrix.
X_val_shape = (len(val_indices), 2 * num_descriptors)
X_val = np.zeros(X_val_shape)
Y_val = np.zeros(len(val_indices))

# (n_examples, 2) integer array of (compound1, compound2) indices. The reshape
# keeps it two-dimensional even when there are no validation pairs.
val_pairs = np.asarray(val_indices, dtype=np.intp).reshape(-1, 2)

# Integer-array indexing gathers all descriptor rows and labels at once,
# replacing the per-pair Python loop. Assigning into the preallocated arrays
# keeps their float dtype and still raises if the descriptor width does not
# match num_descriptors.
X_val[:, :num_descriptors] = norm_descriptors[val_pairs[:, 0]]
X_val[:, num_descriptors:] = norm_descriptors[val_pairs[:, 1]]
Y_val[:] = pairs[val_pairs[:, 0], val_pairs[:, 1]]