# Importing packages

In [1]:
import pandas as pd
import numpy as np

from chemprop import data, utils

# Change data inputs here

In [2]:
# Regression dataset: CSV location, the column holding SMILES strings,
# and the column(s) to use as prediction targets.
test_path = "../tests/data/regression.csv"
smiles_column = "smiles"
target_columns = ["logSolubility"]

# Periodic-table CSV and the columns of it that are used as extra features.
period_table_path = "../tests/data/periodic_table_of_elements.csv"
feature_headings = [
    "AtomicRadius",
    "Electronegativity",
    "Metal",
    "Nonmetal",
    "Metalloid",
]

## Load data

In [3]:
# Read the dataset CSV into a DataFrame; the bare name on the last line
# makes the notebook render it as the cell output.
df_test = pd.read_csv(filepath_or_buffer=test_path)
df_test

Unnamed: 0,smiles,logSolubility
0,OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)...,-0.770
1,Cc1occc1C(=O)Nc2ccccc2,-3.300
2,CC(C)=CCCC(C)=CC(=O),-2.060
3,c1ccc2c(c1)ccc3c2ccc4c5ccccc5ccc43,-7.870
4,c1ccsc1,-1.330
...,...,...
495,Nc1cc(nc(N)n1=O)N2CCCCC2,-1.989
496,Nc2cccc3nc1ccccc1cc23,-4.220
497,c1ccc2cc3c4cccc5cccc(c3cc2c1)c45,-8.490
498,OC(c1ccc(Cl)cc1)(c2ccc(Cl)cc2)C(Cl)(Cl)Cl,-5.666


In [4]:
# Read the periodic-table CSV into a DataFrame and display it.
pt_fs = pd.read_csv(filepath_or_buffer=period_table_path)
pt_fs

FileNotFoundError: [Errno 2] No such file or directory: 'tests/data/periodic_table_of_elements.csv'

In [None]:
# Select the SMILES strings via the configurable column name so that changing
# `smiles_column` in the inputs cell actually takes effect here.
smis = df_test[smiles_column]

# Load atom features

In [None]:
# Keep only the periodic-table columns listed in `feature_headings`, then display them.
features = pt_fs[feature_headings]
features

# Generating features
Define a function that, given a molecule's SMILES string, generates an array of atom (vertex) features `vf` and an array of bond (edge) features `ef`.

For each atom, we include:
- Atomic Radius
- Electronegativity
- A one-hot encoding for whether it is a metal, nonmetal or metalloid.

For each bond, we include the polarity of the bond, computed as the absolute difference in electronegativity between its two atoms. Each bond's value is stored in two consecutive rows of `ef`.

In [None]:
def calculate_extra_features_from_smiles(smi, keep_h=False, add_h=False):
    """Build extra atom (vertex) and bond (edge) features for one SMILES string.

    Reads the notebook-level tables ``features`` and ``pt_fs``. Both are indexed
    positionally with ``atomic number - 1``, which assumes the periodic-table
    rows are ordered by atomic number starting at hydrogen (TODO confirm
    against the CSV).

    Parameters
    ----------
    smi : str
        SMILES string, passed to ``utils.make_mol``.
    keep_h, add_h : bool, optional
        Forwarded unchanged to ``utils.make_mol``.

    Returns
    -------
    tuple
        ``(vf, ef)``. ``vf`` is an array with one row per atom containing that
        atom's row of ``features``, with missing values replaced by 0. ``ef``
        has shape ``(2 * n_bonds, 1)``; each bond's absolute electronegativity
        difference is written to two consecutive rows.
    """
    mol = utils.make_mol(smi, keep_h, add_h)
    atoms = mol.GetAtoms()
    n_atoms = len(atoms)

    # Positional row of each atom's element. The same offset is used for the
    # atom features and the bond features so both refer to the same element.
    rows = [atom.GetAtomicNum() - 1 for atom in atoms]

    # Atom features denoted by V for vertex.
    # NaN never compares equal to anything (including the string "nan"), so
    # missing entries are detected with pd.isna before being replaced by 0.
    vf = [
        [0 if pd.isna(f) else f for f in features.iloc[row].values.tolist()]
        for row in rows
    ]

    # Bond features denoted by E for edges.
    # NOTE(review): a missing electronegativity is not replaced here and will
    # propagate as NaN into the bond feature — confirm whether that is intended.
    electronegativity = pt_fs['Electronegativity']
    ef = np.empty((2 * mol.GetNumBonds(), 1))
    i = 0
    # Bonds are visited by ascending atom-index pair (u < v); presumably this
    # matches the order the downstream featurizer expects — TODO confirm.
    for u in range(n_atoms):
        for v in range(u + 1, n_atoms):
            if mol.GetBondBetweenAtoms(u, v) is None:
                continue

            u_elec = electronegativity.iloc[rows[u]]
            v_elec = electronegativity.iloc[rows[v]]

            # The same polarity value fills both rows reserved for this bond.
            ef[i : i + 2] = [abs(u_elec - v_elec)]
            i += 2

    return (np.array(vf), ef)

In [None]:
# Targets as a 2-D array: one row per molecule, one column per entry in `target_columns`.
ys = df_test[target_columns].values

# One (vf, ef) tuple of extra atom/bond features per SMILES string.
fs = list(map(calculate_extra_features_from_smiles, smis))
fs

In [None]:
# Build one datapoint per molecule, attaching its target(s) together with the
# extra atom features (V_f) and extra bond features (E_f) computed above.
all_data = [
    data.MoleculeDatapoint.from_smi(smi, y=y, V_f=atom_feats, E_f=bond_feats)
    for smi, y, (atom_feats, bond_feats) in zip(smis, ys, fs)
]
all_data