In [131]:
import pandas as pd
import numpy as np

In [132]:
# Read the training set and use its "id" column as the row index
train_df = pd.read_csv("train.csv", index_col="id")

In [133]:
# "id" is the index level set when the file was loaded; order rows by ascending id
train_df = train_df.sort_values(by="id", ascending=True)

In [134]:
# Preview the first four rows of the training frame
train_df.head(4)

Unnamed: 0_level_0,SMILES,Tm,Group 1,Group 2,Group 3,Group 4,Group 5,Group 6,Group 7,Group 8,...,Group 415,Group 416,Group 417,Group 418,Group 419,Group 420,Group 421,Group 422,Group 423,Group 424
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Clc1ccc(Cl)c(c1)C(=O)O,427.55,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ClCCN(C)CCCl,213.15,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,Oc1ccc2c(CCC3C2CCC4(C)C(=O)CCC34)c1,533.35,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,c1ccc2c(c1)ccc3cc4c(ccc5ccccc45)cc23,542.65,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Define a function that adds relevant features derived from the SMILES strings using RDKit

In [135]:
# Read the test set indexed by "id", order rows by ascending id, then preview four rows
test_df = pd.read_csv("test.csv", index_col="id")
test_df = test_df.sort_values(by="id", ascending=True)
test_df.head(4)

Unnamed: 0_level_0,SMILES,Group 1,Group 2,Group 3,Group 4,Group 5,Group 6,Group 7,Group 8,Group 9,...,Group 415,Group 416,Group 417,Group 418,Group 419,Group 420,Group 421,Group 422,Group 423,Group 424
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,COc1ccc(cc1)NC(=O)C,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,CCOC(=O)N,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,CC(C)CCCC(C)CCCC(C)CCCC1(C)CCc2cc(O)cc(C)c2O1,5,9,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18,CCCCCCCCCCCCCCCCCC(=O)O,1,16,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [136]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors, rdMolDescriptors, rdFingerprintGenerator


# Morgan fingerprint generator configured with radius 2 and a 2048-bit output.
# Created once here and reused by get_features for every molecule.
morgan_gen = rdFingerprintGenerator.GetMorganGenerator(
    radius=2,
    fpSize=2048,
)

def get_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` extended with RDKit descriptors and Morgan fingerprint bits.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing a ``"SMILES"`` column. The caller's frame is not
        modified; all work happens on a copy.

    Returns
    -------
    pd.DataFrame
        The original columns, then one column per descriptor listed in
        ``descs`` below, then ``FP_0 ... FP_{n-1}`` holding the individual bits
        of the fingerprint produced by the module-level ``morgan_gen``.

    Notes
    -----
    Rows for which no molecule object is obtained (missing SMILES, or
    ``Chem.MolFromSmiles`` returning ``None``) receive NaN for every
    descriptor and an all-zero fingerprint.
    """
    # Width used for the fingerprint block only when no molecule in the frame
    # could be built, so the output schema stays stable (also for empty input).
    fallback_fp_bits = 2048

    df = df.copy()

    # Mol objects are kept in a local Series rather than a temporary column,
    # so they never need to be dropped again and cannot clobber a caller column.
    mols = df["SMILES"].apply(
        lambda s: Chem.MolFromSmiles(s) if pd.notna(s) else None
    )

    def count_atoms(*atomic_nums):
        """Build a function counting atoms whose atomic number is in ``atomic_nums``."""
        wanted = frozenset(atomic_nums)
        return lambda m: sum(1 for a in m.GetAtoms() if a.GetAtomicNum() in wanted)

    # Descriptor name -> callable taking a Mol and returning a scalar
    descs = {
        "MolWt": Descriptors.MolWt,
        "LogP": Descriptors.MolLogP,
        "TPSA": Descriptors.TPSA,
        "HBD": Descriptors.NumHDonors,
        "HBA": Descriptors.NumHAcceptors,
        "RotB": Descriptors.NumRotatableBonds,
        "RingCount": Descriptors.RingCount,
        "FracCSP3": rdMolDescriptors.CalcFractionCSP3,
        "NumAromaticRings": rdMolDescriptors.CalcNumAromaticRings,
        "NumHeteroatoms": rdMolDescriptors.CalcNumHeteroatoms,
        "BertzCT": Descriptors.BertzCT,
        "Kappa1": Descriptors.Kappa1,
        "Kappa2": Descriptors.Kappa2,
        "Kappa3": Descriptors.Kappa3,
        "Chi1v": Descriptors.Chi1v,
        "LabuteASA": rdMolDescriptors.CalcLabuteASA,
        "HeavyAtomCount": Descriptors.HeavyAtomCount,
        "NumAromaticAtoms": lambda m: sum(a.GetIsAromatic() for a in m.GetAtoms()),
        "NumAliphaticRings": rdMolDescriptors.CalcNumAliphaticRings,
        "NumSaturatedRings": rdMolDescriptors.CalcNumSaturatedRings,
        "NumAromaticHeterocycles": rdMolDescriptors.CalcNumAromaticHeterocycles,
        "NumAromaticCarbocycles": rdMolDescriptors.CalcNumAromaticCarbocycles,
        "MolMR": Descriptors.MolMR,
        "NumN": count_atoms(7),
        "NumO": count_atoms(8),
        "NumS": count_atoms(16),
        "NumHalogen": count_atoms(9, 17, 35, 53, 85),
    }

    # One column per descriptor; rows without a molecule get NaN.
    # ``fn`` is consumed inside the same iteration, so the closure is safe.
    for name, fn in descs.items():
        df[name] = mols.apply(
            lambda m: fn(m) if m is not None else float("nan")
        )

    # Fingerprint bits go into one preallocated matrix. This avoids
    # ``.apply(pd.Series)``, which builds a Series per row and is very slow for
    # wide vectors. The matrix width is taken from the first fingerprint that
    # is actually computed, so it always matches ``morgan_gen``'s configuration.
    fp_matrix = None
    for row_pos, m in enumerate(mols):
        if m is None:
            continue  # row stays all-zero
        bits = list(morgan_gen.GetFingerprint(m))
        if fp_matrix is None:
            fp_matrix = np.zeros((len(mols), len(bits)), dtype=np.int64)
        fp_matrix[row_pos] = bits
    if fp_matrix is None:
        # Empty frame or no usable molecule at all: still emit the full column set.
        fp_matrix = np.zeros((len(mols), fallback_fp_bits), dtype=np.int64)

    fp_df = pd.DataFrame(
        fp_matrix,
        index=df.index,
        columns=[f"FP_{i}" for i in range(fp_matrix.shape[1])],
    )

    return pd.concat([df, fp_df], axis=1)

In [137]:
# Build the feature table for the training data with get_features,
# then preview its first four rows
train_df_features = get_features(train_df)
train_df_features.head(4)

Unnamed: 0_level_0,SMILES,Tm,Group 1,Group 2,Group 3,Group 4,Group 5,Group 6,Group 7,Group 8,...,FP_2038,FP_2039,FP_2040,FP_2041,FP_2042,FP_2043,FP_2044,FP_2045,FP_2046,FP_2047
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Clc1ccc(Cl)c(c1)C(=O)O,427.55,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ClCCN(C)CCCl,213.15,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,Oc1ccc2c(CCC3C2CCC4(C)C(=O)CCC34)c1,533.35,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,c1ccc2c(c1)ccc3cc4c(ccc5ccccc45)cc23,542.65,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [138]:
# (rows, columns) of the training feature table
train_df_features.shape

(2662, 2501)

In [139]:
# Build the feature table for the testing data with the same get_features call,
# then preview its first four rows
test_df_features = get_features(test_df)
test_df_features.head(4)

Unnamed: 0_level_0,SMILES,Group 1,Group 2,Group 3,Group 4,Group 5,Group 6,Group 7,Group 8,Group 9,...,FP_2038,FP_2039,FP_2040,FP_2041,FP_2042,FP_2043,FP_2044,FP_2045,FP_2046,FP_2047
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,COc1ccc(cc1)NC(=O)C,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,CCOC(=O)N,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,CC(C)CCCC(C)CCCC(C)CCCC1(C)CCc2cc(O)cc(C)c2O1,5,9,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18,CCCCCCCCCCCCCCCCCC(=O)O,1,16,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [140]:
# (rows, columns) of the testing feature table
test_df_features.shape

(666, 2500)

#### Export datasets with added features

In [141]:
# Write both feature tables to CSV; index=True keeps the frame index in each file
train_df_features.to_csv("train_features.csv", index=True)
test_df_features.to_csv("test_features.csv", index=True)