In [1]:
import pandas as pd
import numpy as np

In [2]:
#Load training data
train_df = pd.read_csv("train.csv").set_index("id")

In [3]:
cols_to_drop = train_df.columns[train_df.columns.str.startswith("Group")]
train_df = train_df.drop(columns = cols_to_drop)

In [4]:
train_df.head(4)

Unnamed: 0_level_0,SMILES,Tm
id,Unnamed: 1_level_1,Unnamed: 2_level_1
2175,FC1=C(F)C(F)(F)C1(F)F,213.15
1222,c1ccc2c(c1)ccc3Nc4ccccc4c23,407.15
2994,CCN1C(C)=Nc2ccccc12,324.15
1704,CC#CC(=O)O,351.15


In [5]:
# -------------------- LOAD & INITIAL CLEAN OF Full new dataset -------------------- #

# Load new MP data; use 'key' as index (mainly for traceability)
new_data = pd.read_csv("FullNewMps.csv").set_index("key")

# Keep only rows that are NOT flagged as 'do not use'
new_data = new_data[new_data["donotuse"].isna()]

# Drop metadata / unusable columns that won't be used for modelling
cols_to_drop = ["csid", "link", "source", "donotuse", "donotusebecause"]
new_data = new_data.drop(columns=cols_to_drop, errors="ignore")

# -------------------- ALIGN MP RANGE WITH ORIGINAL TRAINING DATA -------------------- #

# Get Mp range from the original training data
tm_min = train_df["Tm"].min()
tm_max = train_df["Tm"].max()

# Keep only new entries whose Mp lies within the original training range
new_data = new_data[new_data["mpC"].between(tm_min, tm_max)]

# -------------------- HANDLE DUPLICATES & COLUMN ALIGNMENT -------------------- #

# Drop duplicate compounds by name in the new dataset
new_data = new_data.drop_duplicates(subset="name")

# Remove 'name' column (not used by the model)
new_data = new_data.drop(columns=["name"])

# Rename columns to match the existing training dataframe
new_data = new_data.rename(columns={"mpC": "Tm", "smiles": "SMILES"})


# Regex pattern: any bracketed atom that contains + or -
charge_pattern = r"\[\S*[\+\-]\S*\]"

# Keep only neutral SMILES in the new dataset
new_data = new_data[
    ~new_data["SMILES"].str.contains(charge_pattern, regex=True, na=False)
]

#Name the index "id"
new_data.index.name = "id"

# Quick overview of the final combined dataset
new_data.head(4)

Unnamed: 0_level_0,SMILES,Tm
id,Unnamed: 1_level_1,Unnamed: 2_level_1
4,c1ccc(c(c1)N2CCNCC2)O,125.0
6,CC(C)(C)OC(=O)N1CCC(CC1)OCC(=O)NC,95.0
7,CC(C)(C)OC(=O)N1CCC(CC1)OCC(=O)NC2CC2,86.0
8,CC(C)(C)OC(=O)N1CCC(CC1)OCC(=O)N(C)C,58.0


In [6]:
# -------------------- LOAD & INITIAL CLEAN OF curated new dataset -------------------- #

# Load new MP data; use 'key' as index (mainly for traceability)
cur_data = pd.read_csv("DPGMP.csv").set_index("key")

# Drop metadata / unusable columns that won't be used for modelling
cols_to_drop = ["csid", "link", "source", "count", "min", "max", "range"]
cur_data = cur_data.drop(columns=cols_to_drop, errors="ignore")

# -------------------- ALIGN MP RANGE WITH ORIGINAL TRAINING DATA -------------------- #

# Get Mp range from the original training data
tm_min = train_df["Tm"].min()
tm_max = train_df["Tm"].max()

# Keep only new entries whose Mp lies within the original training range
cur_data = cur_data[cur_data["mpC"].between(tm_min, tm_max)]

# -------------------- HANDLE DUPLICATES & COLUMN ALIGNMENT -------------------- #

# Drop duplicate compounds by name in the new dataset
cur_data = cur_data.drop_duplicates(subset="name")

# Remove 'name' column (not used by the model)
cur_data = cur_data.drop(columns=["name"])

# Rename columns to match the existing training dataframe
cur_data = cur_data.rename(columns={"mpC": "Tm", "smiles": "SMILES"})


# -------------------- FILTER OUT CHARGED SPECIES FROM NEW DATA -------------------- #

# Regex pattern: any bracketed atom that contains + or -
charge_pattern = r"\[\S*[\+\-]\S*\]"

# Keep only neutral SMILES in the new dataset
cur_data = cur_data[
    ~cur_data["SMILES"].str.contains(charge_pattern, regex=True, na=False)
]

#Name the index "id"
cur_data.index.name = "id"

# Quick overview of the final combined dataset
cur_data.head(4)

Unnamed: 0_level_0,SMILES,Tm
id,Unnamed: 1_level_1,Unnamed: 2_level_1
17138,CC(C)N(CCC(c1ccccn1)(c2ccccc2)C(N)=O)C(C)C,94.8
16624,O=C(O)C2=CN(CC)c1c(F)c(c(F)cc1C2=O)N3CC(C)NCC3,239.75
15862,II,113.5
8930,COc5ccc(CCN1CCC(CC1)Nc3nc2ccccc2n3Cc4ccc(F)cc4...,149.0


In [7]:
#Load testing data
test_df = pd.read_csv("test.csv").set_index("id")
test_df = test_df.sort_values("id")

#Drop group columns form testing data
test_df = test_df.drop(columns = test_df.columns[test_df.columns.str.startswith("Group")])

test_df.head(4)

Unnamed: 0_level_0,SMILES
id,Unnamed: 1_level_1
2,COc1ccc(cc1)NC(=O)C
4,CCOC(=O)N
9,CC(C)CCCC(C)CCCC(C)CCCC1(C)CCc2cc(O)cc(C)c2O1
18,CCCCCCCCCCCCCCCCCC(=O)O


#### Define function to add relevant features from the SMILES using RDkit

In [38]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors, rdMolDescriptors, rdFingerprintGenerator

from rdkit import RDLogger
RDLogger.DisableLog("rdApp.*")


morgan_gen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)

def safe_mol_from_smiles(s: str):
    if pd.isna(s):
        return None
    try:
        return Chem.MolFromSmiles(s)
    except Exception:
        return None

def get_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()  

    # Get Mol objects from SMILES
    df["mol"] = df["SMILES"].apply(safe_mol_from_smiles)

    # Drop invalid molecules
    valid_mask = df["mol"].notnull()
    print(f"Dropping {(~valid_mask).sum()} rows with invalid SMILES.")
    df = df[valid_mask].copy()

    # Atom counts
    count_N   = lambda m: sum(1 for a in m.GetAtoms() if a.GetAtomicNum() == 7)
    count_O   = lambda m: sum(1 for a in m.GetAtoms() if a.GetAtomicNum() == 8)
    count_S   = lambda m: sum(1 for a in m.GetAtoms() if a.GetAtomicNum() == 16)
    count_hal = lambda m: sum(
        1 for a in m.GetAtoms() if a.GetAtomicNum() in (9, 17, 35, 53, 85)
    )

    # Descriptors to generate
    descs = {
        "MolWt": Descriptors.MolWt,
        "LogP": Descriptors.MolLogP,
        "TPSA": Descriptors.TPSA,
        "HBD": Descriptors.NumHDonors,
        "HBA": Descriptors.NumHAcceptors,
        "RotB": Descriptors.NumRotatableBonds,
        "RingCount": Descriptors.RingCount,
        "FracCSP3": rdMolDescriptors.CalcFractionCSP3,
        "NumAromaticRings": rdMolDescriptors.CalcNumAromaticRings,
        "NumHeteroatoms": rdMolDescriptors.CalcNumHeteroatoms,
        "BertzCT": Descriptors.BertzCT,
        "Kappa1": Descriptors.Kappa1,
        "Kappa2": Descriptors.Kappa2,
        "Kappa3": Descriptors.Kappa3,
        "Chi1v": Descriptors.Chi1v,
        "LabuteASA": rdMolDescriptors.CalcLabuteASA,
        "HeavyAtomCount": Descriptors.HeavyAtomCount,
        "NumAromaticAtoms": lambda m: sum(a.GetIsAromatic() for a in m.GetAtoms()),
        "NumAliphaticRings": rdMolDescriptors.CalcNumAliphaticRings,
        "NumSaturatedRings": rdMolDescriptors.CalcNumSaturatedRings,
        "NumAromaticHeterocycles": rdMolDescriptors.CalcNumAromaticHeterocycles,
        "NumAromaticCarbocycles": rdMolDescriptors.CalcNumAromaticCarbocycles,
        "MolMR": Descriptors.MolMR,
        "NumN": count_N,
        "NumO": count_O,
        "NumS": count_S,
        "NumHalogen": count_hal,
    
        # Additional connectivity indices (capture branching / shape)
        "Chi0v": Descriptors.Chi0v,
        "Chi0n": Descriptors.Chi0n,
        "Chi1n": Descriptors.Chi1n,
        "Chi2v": Descriptors.Chi2v,
        "Chi2n": Descriptors.Chi2n,

        # LogS / polarity-ish surrogate (sometimes weakly helpful)
        "SlogP_VSA1": Descriptors.SlogP_VSA1,
        "SlogP_VSA2": Descriptors.SlogP_VSA2,

        # Partial charge stats
        "MaxPartialCharge": Descriptors.MaxPartialCharge,
        "MinPartialCharge": Descriptors.MinPartialCharge,

        # More shape / 3Dâ€™ish approximations (even in 2D mode)
        "HallKierAlpha": Descriptors.HallKierAlpha,
    }

    # Compute RDKit descriptors
    for name, fn in descs.items():
        df[name] = df["mol"].apply(
            lambda m: fn(m) if m is not None else float("nan")
        )

    # Morgan fingerprints (radius=2, 4096 bits)
    def get_morgan_fp(m):
        if m is None:
            return [0] * 4096
        fp = morgan_gen.GetFingerprint(m)
        return list(fp)

    fp_df = df["mol"].apply(get_morgan_fp).apply(pd.Series)
    fp_df.columns = [f"FP_{i}" for i in range(fp_df.shape[1])]

    df = pd.concat([df, fp_df], axis=1)

    df = df.drop(columns=["mol"])

    return df

In [39]:
#Generate features for the training data
train_features = get_features(train_df)

Dropping 0 rows with invalid SMILES.


In [40]:
train_features.shape

(2662, 2087)

In [41]:
#Generate features for the curated dataset
cur_features = get_features(cur_data)

Dropping 12 rows with invalid SMILES.


In [42]:
cur_features.shape

(1478, 2087)

In [43]:
#Generate features for the full additional
full_features = get_features(new_data)

Dropping 206 rows with invalid SMILES.


In [44]:
full_features.shape

(15247, 2087)

In [45]:
#Generate features for the testing data
test_df_features = get_features(test_df)

Dropping 0 rows with invalid SMILES.


In [46]:
test_df_features.shape

(666, 2086)

#### Export datasets with added features

In [47]:
train_features.to_csv("train_features.csv", index = True)
cur_features.to_csv("cur_features.csv", index = True)
full_features.to_csv("full_features.csv", index = True)
test_df_features.to_csv("test_features.csv", index = True)