In [1]:
# Step up one directory; presumably this is the project root, so that the
# relative "data/" and "submissions/" paths used in later cells resolve.
# NOTE(review): not idempotent - re-running this cell moves up another level.
%cd ..

/home/nikita/edu/competitions/admet


In [3]:
import itertools
import random
import numpy as np
import pandas as pd
import datamol as dm

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

from imblearn.over_sampling import RandomOverSampler

from molfeat.calc import RDKitDescriptors2D, FPCalculator, MordredDescriptors
from molfeat.trans import MoleculeTransformer
from molfeat.trans.concat import FeatConcat
from molfeat.trans.fp import FPVecTransformer

from catboost import CatBoostClassifier
from xgboost import XGBRFClassifier

In [4]:
# Load the labelled data and build a stratified 80/20 train/validation split
# separately for every value of the `property` column.
df_train = pd.read_csv("data/train_admet.csv", index_col=0)
properties = df_train.property.unique()

dfs_train = []
dfs_val = []
# The loop variable is called `prop` (not `property`) so that the builtin
# `property` is not shadowed for the rest of the notebook.
for prop in properties:
    df_subset = df_train[df_train.property == prop]
    df_subset_train, df_subset_val = train_test_split(
        df_subset, test_size=0.2, random_state=75, stratify=df_subset.Y
    )
    dfs_train.append(df_subset_train)
    dfs_val.append(df_subset_val)

# From here on `df_train` holds only the training part of the split.
df_train = pd.concat(dfs_train, axis=0)

In [5]:
def create_molecule(row, old, new, max_new_mols=15):
    """Create augmented copies of ``row`` by swapping a substring of its SMILES.

    Every occurrence of ``old`` in ``row.Drug`` is independently either kept
    or replaced by one of the strings in ``new``. Up to ``max_new_mols``
    distinct replacement combinations are drawn with the global ``random``
    module; each combination that actually changes the string yields a copy
    of ``row`` whose ``Drug`` field holds the modified string.

    Args:
        row: Record exposing ``Drug`` (str), ``copy()`` and item assignment,
            e.g. a pandas Series.
        old: Substring to search for in ``row.Drug``.
        new: Replacement strings. The sequence is not modified.
        max_new_mols: Upper bound on the number of combinations sampled.

    Returns:
        A list of modified row copies; empty when ``old`` does not occur.
    """
    drug = row.Drug
    positions = [i for i in range(len(drug)) if drug.startswith(old, i)]
    if not positions:
        return []

    # Build a fresh list instead of appending to `new`, which would silently
    # grow the caller's list on every call.
    candidates = [*new, old]
    base = len(candidates)
    n_slots = len(positions)
    total = base ** n_slots
    k = min(max_new_mols, total)

    # Sample combination *indices* instead of materialising all
    # base ** n_slots tuples, which exhausts memory for molecules with many
    # occurrences of `old`.
    try:
        sampled = random.sample(range(total), k)
    except OverflowError:
        # range(total) is too long for len(); with that many combinations
        # collisions are vanishingly rare, so draw with rejection instead.
        sampled = []
        seen = set()
        while len(sampled) < k:
            candidate_index = random.randrange(total)
            if candidate_index not in seen:
                seen.add(candidate_index)
                sampled.append(candidate_index)

    new_rows = []
    for combo_index in sampled:
        # Decode the index into one digit per slot, most significant digit
        # first, which is the ordering itertools.product(candidates,
        # repeat=n_slots) would produce.
        digits = []
        for _ in range(n_slots):
            combo_index, digit = divmod(combo_index, base)
            digits.append(digit)
        digits.reverse()

        new_drug = list(drug)
        offset = 0  # shift caused by earlier replacements of different length
        for pos, digit in zip(positions, digits):
            replacement = candidates[digit]
            start = pos + offset
            new_drug[start:start + len(old)] = replacement
            offset += len(replacement) - len(old)

        augmented = "".join(new_drug)
        # The all-`old` combination reproduces the input; skip it.
        if augmented != drug:
            new_row = row.copy()
            new_row["Drug"] = augmented
            new_rows.append(new_row)

    return new_rows


# Substituent swaps used for augmentation: every occurrence of the first
# element may be replaced by any of the alternatives in the second element.
SUBSTITUENT_SWAPS = (
    ("(C)", ("(CC)", "(CCC)", "(C(C)C)")),
    ("(CC)", ("(CCC)", "(C)", "(C(C)C)")),
    ("(CCC)", ("(CC)", "(C)", "(C(C)C)")),
    ("(OC)", ("(OCC)", "(OCCC)", "(OC(C)C)")),
    ("(OCC)", ("(OCCC)", "(OC)", "(OC(C)C)")),
    ("(OCCC)", ("(OCC)", "(OC)", "(OC(C)C)")),
)

# Upper bound on sampled variants per (row, swap) pair.
max_new_mols = 20

# create_molecule samples with the global `random` module; seeding here makes
# re-running this cell reproduce the same augmented rows.
random.seed(75)

# Each generated row is a copy of its parent with only `Drug` changed, so it
# inherits the parent's label and property.
new_rows = []
for _, row in df_train.iterrows():
    for old, alternatives in SUBSTITUENT_SWAPS:
        new_rows.extend(
            create_molecule(
                row,
                old=old,
                # Pass a fresh list per call so a callee that appends to its
                # `new` argument cannot alter the shared swap table.
                new=list(alternatives),
                max_new_mols=max_new_mols,
            )
        )

In [6]:
# Original training rows followed by the augmented ones; on duplicate SMILES
# the first occurrence wins, then rows are renumbered from zero.
df_train_extended = (
    pd.concat([df_train, pd.DataFrame(new_rows)], axis=0)
    .drop_duplicates(subset=["Drug"], keep="first")
    .reset_index(drop=True)
)
# Drug_ID is overwritten with the new positional index.
df_train_extended["Drug_ID"] = df_train_extended.index

df_train_extended

Unnamed: 0,Drug_ID,Drug,Y,property
0,0,CC(=O)Nc1ccc2ccc3c(O)ccc4ccc1c2c43,1,1
1,1,CC1=C(/C=C/C(C)=C/C=C/C(C)=C/C=C/C=C(C)/C=C/C=...,1,1
2,2,ClC1OC1CBr,1,1
3,3,c1ccc2[nH]c(-c3cscn3)nc2c1,1,1
4,4,CC(C)(C)Br,1,1
...,...,...,...,...
16459,16459,CC(CC)(C)CC(=O)OCC(=O)[C@@]12OC(CC)(C)O[C@@H]1...,1,3
16460,16460,CC(CC)(CCC)CC(=O)OCC(=O)[C@@]12OC(CC)(CCC)O[C@...,1,3
16461,16461,CC(C)(CC)CC(=O)OCC(=O)[C@@]12OC(CC)(CCC)O[C@@H...,1,3
16462,16462,CC(CCC)(C)CC(=O)OCC(=O)[C@@]12OC(C)(CCC)O[C@@H...,1,3


In [7]:
# Training frame that is actually featurized: the per-property train splits
# stacked row-wise.
# NOTE(review): despite the name, nothing is upsampled here and the augmented
# `df_train_extended` built above is not used - confirm this is intended.
df_train_upsampled = pd.concat(dfs_train)

# Build one molecule object per row from the SMILES in the `Drug` column.
mols = dm.from_df(df_train_upsampled, smiles_column="Drug")



In [8]:
# Three molfeat transformers, one per feature family: "desc2D" descriptors
# (computed with 8 jobs, replace_nan=True), "maccs" and "ecfp:4".
# All of them emit float32 features.
rdkit = FPVecTransformer("desc2D", n_jobs=8, dtype=np.float32, replace_nan=True)
maccs = FPVecTransformer("maccs", dtype=np.float32)
ecfp4 = FPVecTransformer("ecfp:4", dtype=np.float32)

# Combine the three transformers into a single featurizer
# (presumably concatenated in the listed order: maccs, ecfp4, rdkit - confirm).
featurizer = FeatConcat([maccs, ecfp4, rdkit], dtype=np.float32)

# Featurize all training molecules inside datamol's RDKit-log-silencing context.
with dm.without_rdkit_log():
    feats = featurizer(mols)

  from .autonotebook import tqdm as notebook_tqdm
please use MorganGenerator
[22:43:48] please use MorganGenerator

please use MorganGenerator
please use MorganGenerator
please use MorganGenerator
[22:43:49] please use MorganGenerator

  min_charge, max_charge = np.nanmin(atomic_charges), np.nanmax(atomic_charges)
please use MorganGenerator


please use MorganGenerator


please use MorganGenerator
[22:43:51] please use MorganGenerator
please use MorganGenerator[22:43:51] 


please use MorganGenerator



please use MorganGenerator
please use MorganGenerator
please use MorganGenerator[22:43:52] 
please use MorganGenerator



please use MorganGenerator

please use MorganGenerator

please use MorganGenerator


please use MorganGenerator
please use MorganGenerator
please use MorganGenerator

please use MorganGenerator

please use MorganGenerator


please use MorganGenerator


please use MorganGenerator


please use MorganGenerator
please use MorganGenerator

please use MorganGenerator


ple

In [9]:
# Renumber rows from zero so they line up with the freshly built feature
# frame, then place metadata and features side by side.
df_train_upsampled = df_train_upsampled.reset_index(drop=True)
df_train_featurized = pd.concat(
    [df_train_upsampled, pd.DataFrame(feats)],
    axis="columns",
)

In [10]:
# Display the featurized training frame (metadata columns followed by the
# numbered feature columns).
df_train_featurized

Unnamed: 0,Drug_ID,Drug,Y,property,0,1,2,3,4,5,...,2373,2374,2375,2376,2377,2378,2379,2380,2381,2382
0,1271,CC(=O)Nc1ccc2ccc3c(O)ccc4ccc1c2c43,1,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,3091,CC1=C(/C=C/C(C)=C/C=C/C(C)=C/C=C/C=C(C)/C=C/C=...,1,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,4097,ClC1OC1CBr,1,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
3,1184,c1ccc2[nH]c(-c3cscn3)nc2c1,1,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4046,CC(C)(C)Br,1,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6345,7672,COc1ccccc1N1CCN(CCc2oc(=O)[nH]c2-c2ccc(F)cc2)CC1,1,3,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6346,6981,NCCCNCCSP(=O)(O)O,0,3,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
6347,7881,c1cnc(N2CCN(Cc3ccc4c(c3)OCO4)CC2)nc1,1,3,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6348,7158,CO[C@H]1C[C@H](O[C@@H]2[C@@H](C)C(=O)O[C@H](C)...,0,3,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0


In [11]:
# Validation counterpart of the training featurization: stack the held-out
# splits, build molecules, featurize, and join features to the metadata.
df_val = pd.concat(dfs_val)
mols_val = dm.from_df(df_val, smiles_column="Drug")

with dm.without_rdkit_log():
    feats_val = featurizer(mols_val)

# Renumber rows from zero so they align with the feature frame's index.
df_val = df_val.reset_index(drop=True)
df_val_featurized = pd.concat(
    [df_val, pd.DataFrame(feats_val)],
    axis="columns",
)



[22:44:15] 

please use MorganGenerator
please use MorganGenerator

please use MorganGenerator

please use MorganGenerator

please use MorganGenerator
please use MorganGenerator
please use MorganGenerator

please use MorganGenerator

[22:44:16] please use MorganGenerator
[22:44:16] please use MorganGenerator

please use MorganGenerator
[22:44:16] please use MorganGenerator


please use MorganGenerator
please use MorganGenerator

please use MorganGenerator
please use MorganGenerator
please use MorganGenerator

please use MorganGenerator


please use MorganGenerator



please use MorganGenerator
please use MorganGenerator
please use MorganGenerator

please use MorganGenerator

please use MorganGenerator
please use MorganGenerator
please use MorganGenerator
[22:44:18] please use MorganGenerator
[22:44:18] please use MorganGenerator
please use MorganGenerator
[22:44:18] please use MorganGenerator
please use MorganGenerator
please use MorganGenerator[22:44:18] 
please use MorganGenerator


In [12]:
# One-hot encode the `property` column of each split as float32.
# NOTE(review): the encodings are built independently, so the two frames only
# get the same columns if both splits contain every property value.
prop_encoding = pd.get_dummies(df_train_featurized["property"]).astype(np.float32)
prop_encoding_val = pd.get_dummies(df_val_featurized["property"]).astype(np.float32)

# Append the encoding to the right of the existing columns.
# NOTE(review): re-running this cell appends the columns again, and the
# one-hot labels are the raw property values, which may collide with the
# integer feature column labels - confirm.
df_train_featurized = pd.concat([df_train_featurized, prop_encoding], axis=1)
df_val_featurized = pd.concat([df_val_featurized, prop_encoding_val], axis=1)

In [13]:
# Model inputs are all columns after the first four of the training frame.
feature_cols = df_train_featurized.columns[4:]

In [None]:
def _split_by_property(frame, props):
    """Return one sub-frame of ``frame`` per value in ``props``, in that order."""
    return [frame[frame.property == prop] for prop in props]


# Per-property views of the featurized splits, ordered like `properties`.
dfs_train_featurized = _split_by_property(df_train_featurized, properties)
dfs_val_featurized = _split_by_property(df_val_featurized, properties)

In [15]:
# Fit one scaler + random forest per property and report validation ROC AUC.
# `scalers[i]` / `models[i]` belong to `properties[i]`.
scalers = []
models = []
preds = []

for prop, df_prop_train, df_prop_val in zip(
    properties, dfs_train_featurized, dfs_val_featurized
):
    model = RandomForestClassifier(
        criterion="entropy",
        n_estimators=512,
        class_weight="balanced",
        n_jobs=8,
        # Fixed seed so that re-running the notebook reproduces the same
        # forests and scores.
        random_state=75,
    )
    x_train, y_train = df_prop_train[feature_cols], df_prop_train.Y
    x_val, y_val = df_prop_val[feature_cols], df_prop_val.Y

    # The scaler is fitted on the training part only and reused for validation.
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_val = scaler.transform(x_val)
    model.fit(x_train, y_train)

    scalers.append(scaler)
    models.append(model)

    # Score with the predicted probability of the second class column.
    y_pred = model.predict_proba(x_val)[:, 1]
    auc = roc_auc_score(y_val, y_pred)
    print(f"Property {prop} ROC AUC: {auc}")


Overall: 0.9117


In [33]:
# Test counterpart of the featurization: read, build molecules, featurize,
# and join the features to the metadata columns.
df_test = pd.read_csv("data/test_data.csv", index_col=0)
mols_test = dm.from_df(df_test, smiles_column="Drug")

with dm.without_rdkit_log():
    feats_test = featurizer(mols_test)

# Renumber rows from zero so they align with the feature frame's index.
df_test = df_test.reset_index(drop=True)
df_test_featurized = pd.concat(
    [df_test, pd.DataFrame(feats_test)],
    axis="columns",
)



please use MorganGenerator


please use MorganGenerator[19:11:44] 

[19:11:44] 


please use MorganGenerator





please use MorganGenerator
please use MorganGenerator





[19:11:44] please use MorganGenerator
please use MorganGenerator




please use MorganGenerator
please use MorganGenerator
please use MorganGenerator

please use MorganGenerator
please use MorganGenerator

please use MorganGenerator

please use MorganGenerator


please use MorganGenerator

please use MorganGenerator


please use MorganGenerator

please use MorganGenerator
please use MorganGenerator
please use MorganGenerator

please use MorganGenerator
please use MorganGenerator
please use MorganGenerator


please use MorganGenerator
please use MorganGenerator
please use MorganGenerator
please use MorganGenerator


please use MorganGenerator

please use MorganGenerator
please use MorganGenerator

please use MorganGenerator
please use MorganGenerator

please use MorganGenerator
please use MorganGenerator


please u

In [None]:
# The training frames carry one-hot property columns that are part of
# `feature_cols`; add the same columns (same labels, same order) to a copy of
# the test frame so the feature layout matches what the scalers were fitted on.
prop_encoding_test = (
    pd.get_dummies(df_test_featurized.property)
    .reindex(columns=prop_encoding.columns, fill_value=0)
    .astype(np.float32)
)
df_test_model_input = pd.concat([df_test_featurized, prop_encoding_test], axis=1)

# Predictions are written back at each row's own position, so `preds` follows
# the test file's row order regardless of how properties are interleaved.
preds = np.full(len(df_test_model_input), np.nan, dtype=np.float64)
for i, prop in enumerate(properties):
    mask = (df_test_model_input.property == prop).to_numpy()
    if not mask.any():
        continue
    df_subset_test = df_test_model_input[mask]
    # Use the scaler and model fitted for *this* property, not the ones left
    # over from the last training iteration.
    x_test = scalers[i].transform(df_subset_test[feature_cols])
    preds[mask] = models[i].predict_proba(x_test)[:, 1]

if np.isnan(preds).any():
    raise ValueError("Some test rows have a property with no trained model")

In [35]:
# Fill the sample submission's Y column with the predictions and save it.
# NOTE(review): assumes the sample file lists rows in the same order as
# `preds` - confirm.
submission = pd.read_csv("data/sample.csv").assign(Y=preds)
submission.to_csv("submissions/submission.csv", index=False)