In [1]:
!pip install rdkit -q




In [3]:
import itertools
import random
import numpy as np
import pandas as pd
import datamol as dm

# Import machine learning libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

# Import libraries for molecular featurization and handling
from molfeat.trans.fp import FPVecTransformer
from imblearn.over_sampling import RandomOverSampler
from molfeat.trans.concat import FeatConcat



In [4]:
# Load the raw training data from a CSV file; put the path to your train data here.
# The raw frame keeps its own name so that `df_train` only ever means "training split".
df_raw = pd.read_csv("data/train_admet.csv", index_col=0)

# Distinct values of the "property" column; every per-property list below follows this order
properties = df_raw["property"].unique()

# Per-property training and validation frames, one entry per element of `properties`
dfs_train = []
dfs_val = []

# `prop` rather than `property`, so the builtin `property` is not shadowed
for prop in properties:
    # Rows belonging to the current property only
    df_subset = df_raw[df_raw["property"] == prop]

    # 80/20 split, stratified on the target "Y"; the fixed random_state makes it repeatable
    df_subset_train, df_subset_val = train_test_split(
        df_subset, test_size=0.2, random_state=75, stratify=df_subset["Y"]
    )

    dfs_train.append(df_subset_train)
    dfs_val.append(df_subset_val)

# All training splits stacked into a single frame
df_train = pd.concat(dfs_train, axis=0)

In [5]:
def _combination_at(index, candidates, length):
    """Return the `index`-th tuple of itertools.product(candidates, repeat=length) as a list."""
    digits = []
    for _ in range(length):
        # Peel off base-len(candidates) digits, least significant (= last position) first
        index, digit = divmod(index, len(candidates))
        digits.append(candidates[digit])
    digits.reverse()
    return digits


def create_molecule(row, old, new, max_new_mols=15):
    """
    Generates new molecules by replacing specific substructures within an existing molecule.

    Every occurrence of ``old`` in the SMILES string may independently stay as it is or be
    swapped for one of the candidates in ``new``. A random subset of these combinations is
    drawn, and each drawn combination that actually changes the SMILES string yields one
    new row. The replacement is purely textual; the resulting strings are not validated.

    Args:
        row (pandas.Series): A row from a DataFrame containing information about a molecule, including its SMILES string in the "Drug" column.
        old (str): The substructure to be replaced in the SMILES string.
        new (list): Candidate substructures to replace the old substructure. Not modified.
        max_new_mols (int): The maximum number of replacement combinations to draw for a single input molecule.

    Returns:
        list: Copies of ``row`` whose "Drug" entry holds a modified SMILES string. Empty when
        ``old`` does not occur or no drawn combination changes the string.
    """
    drug = row.Drug  # SMILES string of the input molecule

    # Start index of every occurrence of the old substructure
    positions = [i for i in range(len(drug)) if drug.startswith(old, i)]
    if not positions:
        return []

    # Private option list: the caller's `new` stays untouched, and `old` is included so
    # that individual occurrences can remain unchanged within a combination.
    candidates = [*new, old]
    total = len(candidates) ** len(positions)

    # Draw combination *indices* instead of materialising all `total` combinations, whose
    # number grows exponentially with the number of occurrences. For a given RNG state
    # this selects the same combinations as sampling from the full product list.
    try:
        picked = random.sample(range(total), k=min(max_new_mols, total))
    except OverflowError:
        # `total` is too large for len(range(...)); draw indices one by one instead.
        picked = [random.randrange(total) for _ in range(max_new_mols)]

    new_rows = []
    for index in picked:
        new_drug = list(drug)  # Mutable copy of the SMILES string
        offset = 0  # Shift of later positions caused by earlier length-changing replacements

        for pos, replacement in zip(positions, _combination_at(index, candidates, len(positions))):
            start = pos + offset
            new_drug[start:start + len(old)] = replacement
            offset += len(replacement) - len(old)

        new_smiles = "".join(new_drug)

        # The all-`old` combination reproduces the input; only keep real changes
        if new_smiles != drug:
            new_row = row.copy()
            new_row["Drug"] = new_smiles
            new_rows.append(new_row)

    return new_rows


# Textual side-chain swaps used for augmentation: (pattern to replace, alternatives).
# The swaps are applied to every training row in exactly this order.
substitutions = [
    ("(C)", ("(CC)", "(CCC)", "(C(C)C)")),
    ("(CC)", ("(CCC)", "(C)", "(C(C)C)")),
    ("(CCC)", ("(CC)", "(C)", "(C(C)C)")),
    ("(OC)", ("(OCC)", "(OCCC)", "(OC(C)C)")),
    ("(OCC)", ("(OCCC)", "(OC)", "(OC(C)C)")),
    ("(OCCC)", ("(OCC)", "(OC)", "(OC(C)C)")),
]

max_new_mols = 20  # Maximum number of replacement combinations drawn per row and pattern

# create_molecule draws its combinations with the `random` module; seeding here makes the
# generated rows identical on every run of this cell.
random.seed(75)

# All generated molecules, accumulated over every row and every substitution
new_rows = []
for _idx, row in df_train.iterrows():
    for old, alternatives in substitutions:
        # A fresh list per call, so no call can observe a list changed by an earlier one
        new_rows.extend(
            create_molecule(row, old=old, new=list(alternatives), max_new_mols=max_new_mols)
        )

In [6]:
# Append the generated molecules to the original training rows, then drop repeats.
# Duplicates are judged per (Drug, property) pair: the frame holds several properties, and
# the same SMILES string under a different property is a separate labelled example that
# deduplicating on "Drug" alone would throw away.
# keep="first" favours the original rows, which come first in the concatenation.
df_train_extended = (
    pd.concat([df_train, pd.DataFrame(new_rows)], axis=0)
    .drop_duplicates(subset=["Drug", "property"], keep="first")
    .reset_index(drop=True)
)

# Overwrite "Drug_ID" with the fresh positional index so every row has a unique ID
df_train_extended["Drug_ID"] = df_train_extended.index

# Display the extended DataFrame
df_train_extended

Unnamed: 0,Drug_ID,Drug,Y,property
0,0,CC(=O)Nc1ccc2ccc3c(O)ccc4ccc1c2c43,1,1
1,1,CC1=C(/C=C/C(C)=C/C=C/C(C)=C/C=C/C=C(C)/C=C/C=...,1,1
2,2,ClC1OC1CBr,1,1
3,3,c1ccc2[nH]c(-c3cscn3)nc2c1,1,1
4,4,CC(C)(C)Br,1,1
...,...,...,...,...
16459,16459,CC(CC)(C)CC(=O)OCC(=O)[C@@]12OC(CC)(C)O[C@@H]1...,1,3
16460,16460,CC(CC)(CCC)CC(=O)OCC(=O)[C@@]12OC(CC)(CCC)O[C@...,1,3
16461,16461,CC(C)(CC)CC(=O)OCC(=O)[C@@]12OC(CC)(CCC)O[C@@H...,1,3
16462,16462,CC(CCC)(C)CC(=O)OCC(=O)[C@@]12OC(C)(CCC)O[C@@H...,1,3


In [7]:
# Stack the per-property training splits held in `dfs_train` into one frame.
# Note: this is built from `dfs_train`, not from `df_train_extended`.
df_train_upsampled = pd.concat(dfs_train)

# Turn the SMILES strings of the "Drug" column into molecule objects
mols = dm.from_df(df_train_upsampled, smiles_column="Drug")



In [8]:
# Individual featurizers, listed in the order in which they are concatenated below
maccs = FPVecTransformer("maccs", dtype=np.float32)
ecfp4 = FPVecTransformer("ecfp:4", dtype=np.float32)
rdkit = FPVecTransformer("desc2D", n_jobs=8, dtype=np.float32, replace_nan=True)

# A single featurizer combining the three calculators above
featurizer = FeatConcat([maccs, ecfp4, rdkit], dtype=np.float32)

# Featurize the training molecules inside datamol's RDKit-log-suppressing context
with dm.without_rdkit_log():
    feats = featurizer(mols)

  from .autonotebook import tqdm as notebook_tqdm
please use MorganGenerator
[22:43:48] please use MorganGenerator

please use MorganGenerator
please use MorganGenerator
please use MorganGenerator
[22:43:49] please use MorganGenerator

  min_charge, max_charge = np.nanmin(atomic_charges), np.nanmax(atomic_charges)
please use MorganGenerator


please use MorganGenerator


please use MorganGenerator
[22:43:51] please use MorganGenerator
please use MorganGenerator[22:43:51] 


please use MorganGenerator



please use MorganGenerator
please use MorganGenerator
please use MorganGenerator[22:43:52] 
please use MorganGenerator



please use MorganGenerator

please use MorganGenerator

please use MorganGenerator


please use MorganGenerator
please use MorganGenerator
please use MorganGenerator

please use MorganGenerator

please use MorganGenerator


please use MorganGenerator


please use MorganGenerator


please use MorganGenerator
please use MorganGenerator

please use MorganGenerator


ple

In [9]:
# Give the training frame a fresh 0..n-1 index so that it lines up row-for-row with the
# feature frame below, which gets a default integer index
df_train_upsampled = df_train_upsampled.reset_index(drop=True)

# Place the feature columns to the right of the original columns
train_feature_frame = pd.DataFrame(feats)
df_train_featurized = pd.concat([df_train_upsampled, train_feature_frame], axis=1)

In [10]:
# Display the featurized training frame: the original columns followed by the feature columns
df_train_featurized

Unnamed: 0,Drug_ID,Drug,Y,property,0,1,2,3,4,5,...,2373,2374,2375,2376,2377,2378,2379,2380,2381,2382
0,1271,CC(=O)Nc1ccc2ccc3c(O)ccc4ccc1c2c43,1,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,3091,CC1=C(/C=C/C(C)=C/C=C/C(C)=C/C=C/C=C(C)/C=C/C=...,1,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,4097,ClC1OC1CBr,1,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
3,1184,c1ccc2[nH]c(-c3cscn3)nc2c1,1,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4046,CC(C)(C)Br,1,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6345,7672,COc1ccccc1N1CCN(CCc2oc(=O)[nH]c2-c2ccc(F)cc2)CC1,1,3,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6346,6981,NCCCNCCSP(=O)(O)O,0,3,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
6347,7881,c1cnc(N2CCN(Cc3ccc4c(c3)OCO4)CC2)nc1,1,3,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6348,7158,CO[C@H]1C[C@H](O[C@@H]2[C@@H](C)C(=O)O[C@H](C)...,0,3,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0


In [11]:
# Stack the per-property validation splits into one frame
df_val = pd.concat(dfs_val)

# Molecule objects for the validation SMILES strings
mols_val = dm.from_df(df_val, smiles_column="Drug")

# Featurize the validation molecules inside datamol's RDKit-log-suppressing context
with dm.without_rdkit_log():
    feats_val = featurizer(mols_val)

# Fresh 0..n-1 index so the frame lines up row-for-row with the feature frame
df_val = df_val.reset_index(drop=True)

# Place the feature columns to the right of the original validation columns
val_feature_frame = pd.DataFrame(feats_val)
df_val_featurized = pd.concat([df_val, val_feature_frame], axis=1)



[22:44:15] 

please use MorganGenerator
please use MorganGenerator

please use MorganGenerator

please use MorganGenerator

please use MorganGenerator
please use MorganGenerator
please use MorganGenerator

please use MorganGenerator

[22:44:16] please use MorganGenerator
[22:44:16] please use MorganGenerator

please use MorganGenerator
[22:44:16] please use MorganGenerator


please use MorganGenerator
please use MorganGenerator

please use MorganGenerator
please use MorganGenerator
please use MorganGenerator

please use MorganGenerator


please use MorganGenerator



please use MorganGenerator
please use MorganGenerator
please use MorganGenerator

please use MorganGenerator

please use MorganGenerator
please use MorganGenerator
please use MorganGenerator
[22:44:18] please use MorganGenerator
[22:44:18] please use MorganGenerator
please use MorganGenerator
[22:44:18] please use MorganGenerator
please use MorganGenerator
please use MorganGenerator[22:44:18] 
please use MorganGenerator


In [12]:
# Append float32 indicator columns for "property" to the training frame.
# Re-running this cell appends the indicator columns a second time.
# NOTE(review): the indicator columns are labelled with the property values; if those are
# integers they may collide with the integer labels of the feature columns — confirm.
prop_encoding = pd.get_dummies(df_train_featurized["property"]).astype(np.float32)
df_train_featurized = pd.concat([df_train_featurized, prop_encoding], axis=1)

# Same encoding, computed independently, for the validation frame
prop_encoding_val = pd.get_dummies(df_val_featurized["property"]).astype(np.float32)
df_val_featurized = pd.concat([df_val_featurized, prop_encoding_val], axis=1)

In [13]:
# Model inputs: every column after the first four of the training frame
# (the displayed frame shows those four as Drug_ID, Drug, Y and property)
feature_cols = df_train_featurized.columns[4:]

In [None]:
# Break the featurized training and validation frames back into one frame per property,
# keeping the order of `properties` in both lists
dfs_train_featurized = []
dfs_val_featurized = []
for prop in properties:
    dfs_train_featurized.append(df_train_featurized[df_train_featurized["property"] == prop])
    dfs_val_featurized.append(df_val_featurized[df_val_featurized["property"] == prop])

In [15]:
scalers = []  # Fitted StandardScaler for each property, in `properties` order
models = []  # Fitted RandomForestClassifier for each property, in `properties` order

# One scaler + model per property, trained and validated on that property's rows only
for prop, df_prop_train, df_prop_val in zip(
    properties, dfs_train_featurized, dfs_val_featurized
):
    # Fixed random_state so the fitted forest, and therefore the reported AUC, is
    # the same on every run
    model = RandomForestClassifier(
        criterion="entropy",
        n_estimators=512,
        class_weight="balanced",
        n_jobs=8,
        random_state=75,
    )

    # Features (X) and target (Y) for training and validation
    x_train, y_train = df_prop_train[feature_cols], df_prop_train.Y
    x_val, y_val = df_prop_val[feature_cols], df_prop_val.Y

    # Fit the scaler on the training features only, then apply it to both sets
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_val = scaler.transform(x_val)

    model.fit(x_train, y_train)

    # Keep the fitted pair for later prediction
    scalers.append(scaler)
    models.append(model)

    # ROC AUC of the predicted probability of the class in column 1 on the validation set
    y_pred = model.predict_proba(x_val)[:, 1]
    auc = roc_auc_score(y_val, y_pred)
    print(f"Property {prop} ROC AUC: {auc}")

Overall: 0.9117


In [33]:
# Read the test dataset. Put your path to your test dataset
df_test = pd.read_csv("data/test_data.csv", index_col=0)

# Molecule objects for the test SMILES strings
mols_test = dm.from_df(df_test, smiles_column="Drug")

# Featurize the test molecules inside datamol's RDKit-log-suppressing context
with dm.without_rdkit_log():
    feats_test = featurizer(mols_test)

# Fresh 0..n-1 index so the frame lines up row-for-row with the feature frame
df_test = df_test.reset_index(drop=True)

# Place the feature columns to the right of the original test columns
test_feature_frame = pd.DataFrame(feats_test)
df_test_featurized = pd.concat([df_test, test_feature_frame], axis=1)



please use MorganGenerator


please use MorganGenerator[19:11:44] 

[19:11:44] 


please use MorganGenerator





please use MorganGenerator
please use MorganGenerator





[19:11:44] please use MorganGenerator
please use MorganGenerator




please use MorganGenerator
please use MorganGenerator
please use MorganGenerator

please use MorganGenerator
please use MorganGenerator

please use MorganGenerator

please use MorganGenerator


please use MorganGenerator

please use MorganGenerator


please use MorganGenerator

please use MorganGenerator
please use MorganGenerator
please use MorganGenerator

please use MorganGenerator
please use MorganGenerator
please use MorganGenerator


please use MorganGenerator
please use MorganGenerator
please use MorganGenerator
please use MorganGenerator


please use MorganGenerator

please use MorganGenerator
please use MorganGenerator

please use MorganGenerator
please use MorganGenerator

please use MorganGenerator
please use MorganGenerator


please u

In [None]:
# `feature_cols` was taken from the training frame after its "property" indicator columns
# had been appended, so the test frame needs the same indicator columns before features
# can be selected. Reindexing to the training indicator columns keeps the layout
# identical; a property absent from the test set becomes an all-zero column.
test_prop_encoding = (
    pd.get_dummies(df_test_featurized["property"])
    .astype(np.float32)
    .reindex(columns=prop_encoding.columns, fill_value=0.0)
)
# Built under a new name so that re-running this cell does not stack columns repeatedly
df_test_encoded = pd.concat([df_test_featurized, test_prop_encoding], axis=1)

# One prediction slot per test row, filled at the row's own position so that `preds`
# follows the row order of the test frame whatever order the properties appear in.
# NaN marks rows whose property has no trained model.
# NOTE(review): assumes the submission file lists rows in test-file order — confirm.
preds = np.full(len(df_test_encoded), np.nan)

# Iterate over each property together with its fitted scaler and model
for prop, scaler, model in zip(properties, scalers, models):
    in_prop = (df_test_encoded["property"] == prop).to_numpy()
    if not in_prop.any():
        continue  # No test rows for this property; nothing to scale or predict

    # Select this property's rows and scale their features with the matching scaler
    df_subset_test = df_test_encoded[in_prop]
    x_test = scaler.transform(df_subset_test[feature_cols])

    # Predicted probability of the class in column 1, written to those rows' positions
    preds[in_prop] = model.predict_proba(x_test)[:, 1]

In [35]:
# Start from the sample submission and swap its placeholder 'Y' values for the predictions
submission = pd.read_csv("data/sample.csv").assign(Y=preds)

# Write the submission CSV without the index column. Put your path.
submission.to_csv("submissions/submission.csv", index=False)