In [14]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler

In [15]:
# Load the compounds table: one row per compound, with the metadata columns
# ('ID', 'NAME', 'CLASS') first and the numeric descriptors after them.
compounds = pd.read_csv("./data/compounds.csv")
num_compounds = compounds.shape[0]

# Count the descriptors from the same column slice that is used to extract
# them for scaling (everything from 'monomer_mw' to the last column) instead
# of subtracting a hard-coded 3 for 'ID', 'NAME', 'CLASS'. The count then
# always matches the width of the descriptor matrix, and a missing
# 'monomer_mw' column fails loudly with a KeyError rather than silently
# producing a wrong number.
num_descriptors = compounds.loc[:, "monomer_mw":].shape[1]

print(f"Number of compounds >> {num_compounds}")
print(f"Number of descriptors >> {num_descriptors}")

Number of compounds >> 68
Number of descriptors >> 7


In [16]:
# Preview the first five rows (head() defaults to n=5)
compounds.head()

Unnamed: 0,ID,NAME,CLASS,monomer_mw,XlogP3,h-bond_donors,h-bond_acceptors,complexity,concentration,polymer_mw
0,1,CHAPS,Surfactant,614.9,2.9,4.0,7.0,1030.0,20.0,614.0
1,2,Triton X-100,Surfactant,527.7,-2.0,1190.0,1704.0,731.0,25.0,80000.0
2,3,Alg,Polymer,448.5,-3.6,2676.0,4348.0,511.0,4.0,150000.0
3,4,AP,Polymer,828.7,-10.6,6516.0,12690.0,1210.0,10.0,213700.0
4,5,BSA,Protein,331.8,-2.3,1190.0,1704.0,391.0,40.0,66000.0


In [17]:
# Read the solution mixture pairs from CSV into a 2-D float array.
# Note: genfromtxt stores NaN for any field it cannot parse as a number
# instead of raising, so missing entries (if any) show up as NaN here.
pairs = np.genfromtxt(fname="./data/pairs.csv", delimiter=",")
print(f"Shape of pairs >> {pairs.shape}")

Shape of pairs >> (68, 68)


In [18]:
# Standardize the descriptor columns of 'compounds'
# (zero mean, unit variance per column)

# The descriptors are every column from 'monomer_mw' through the last one
descriptors = compounds.loc[:, "monomer_mw":]

# Fit the scaler on the descriptors, then apply it to the same data;
# 'scaler' keeps the fitted statistics for later reuse
scaler = StandardScaler().fit(descriptors)
norm_descriptors = scaler.transform(descriptors)

In [19]:
# Split the entries of 'pairs' into training (70%) and validation (30%) sets
np.random.seed(42)  # seed the global NumPy RNG so the draw below is repeatable

num_pairs = pairs.size
num_training_examples = int(0.7 * num_pairs)

# Draw the training entries without replacement as flat indices into 'pairs',
# then convert them to (row, col) positions
training_indices_flat = np.random.choice(
    num_pairs, num_training_examples, replace=False
)
training_indices_rows, training_indices_cols = np.unravel_index(
    training_indices_flat, pairs.shape
)
training_indices = tuple(zip(training_indices_rows, training_indices_cols))

# Every position not drawn for training goes to the validation set.
# The set difference is sorted so that validation examples come out in a
# well-defined row-major order; iterating the raw set would yield an order
# that depends on hashing / the Python implementation, not on the seed.
all_indices = set(np.ndindex(pairs.shape))
val_indices = tuple(sorted(all_indices - set(training_indices)))

# Sanity check: the two sets partition the matrix (nothing lost or duplicated)
assert len(training_indices) + len(val_indices) == num_pairs

In [None]:
# Make actual training data & map labels
# Pre-allocate the training feature matrix: one row per training index pair
# and 2 * num_descriptors columns, zero-filled.
# NOTE(review): the doubled width presumably holds the descriptor vectors of
# the two compounds in a pair side by side — confirm against the fill code.
X_train_shape = (len(training_indices), 2 * num_descriptors)
X_train = np.zeros(X_train_shape)