In [1]:
%cd ..

/home/nikita/edu/competitions/admet


In [2]:
import itertools
import random
import numpy as np
import pandas as pd
import datamol as dm

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

from imblearn.over_sampling import RandomOverSampler

from molfeat.calc import RDKitDescriptors2D, FPCalculator, MordredDescriptors
from molfeat.trans import MoleculeTransformer
from molfeat.trans.concat import FeatConcat
from molfeat.trans.fp import FPVecTransformer

from catboost import CatBoostClassifier
from xgboost import XGBRFClassifier

In [3]:
df_train = pd.read_csv("data/train_admet.csv", index_col=0)
properties = df_train.property.unique()

dfs_train = []
for property in properties:
    df_subset = df_train[df_train.property == property]
    dfs_train.append(df_subset)

df_train = pd.concat(dfs_train, axis=0)

In [4]:
def create_molecule(row, old, new, max_new_mols=40):
    new_rows = []
    drug = row.Drug
    positions = [i for i in range(len(drug)) if drug.startswith(old, i)]

    if len(positions) == 0:
        return new_rows

    new.append(old)
    options = list(itertools.product(new, repeat=len(positions)))
    for replacement_combination in random.sample(
        options, k=min(max_new_mols, len(options))
    ):
        new_drug = list(drug)
        offset = 0
        for pos, replacement in zip(positions, replacement_combination):
            start = pos + offset
            end = start + len(old)
            new_drug[start:end] = replacement
            offset += len(replacement) - len(old)

        new_row = row.copy()
        new_row["Drug"] = "".join(new_drug)
        if new_row.Drug != row.Drug:
            new_rows.append(new_row)

    return new_rows


new_rows = []
for i, row in df_train.iterrows():
    max_new_mols = 20
    new_rows.extend(
        create_molecule(
            row,
            old="(C)",
            new=["(CC)", "(CCC)"],
            max_new_mols=max_new_mols,
        )
    )
    new_rows.extend(
        create_molecule(
            row,
            old="(CC)",
            new=["(C)", "(CCC)"],
            max_new_mols=max_new_mols,
        )
    )
    new_rows.extend(
        create_molecule(
            row,
            old="(CCC)",
            new=["(CC)", "(C)"],
            max_new_mols=max_new_mols,
        )
    )
    # new_rows.extend(
    #     create_molecule(
    #         row,
    #         old="(OC)",
    #         new=["(OC(C)C)"],
    #         max_new_mols=max_new_mols,
    #     )
    # )
    # new_rows.extend(
    #     create_molecule(
    #         row,
    #         old="(OCC)",
    #         new=["(OC(C)C)"],
    #         max_new_mols=max_new_mols,
    #     )
    # )
    # new_rows.extend(
    #     create_molecule(
    #         row,
    #         old="(OCCC)",
    #         new=["(OC(C)C)"],
    #         max_new_mols=max_new_mols,
    #     )
    # )

In [5]:
df_train_extended = pd.concat([df_train, pd.DataFrame(new_rows)], axis=0)
df_train_extended.drop_duplicates(subset=["Drug"], keep="first", inplace=True)
df_train_extended.reset_index(drop=True, inplace=True)
df_train_extended["Drug_ID"] = df_train_extended.index

df_train_extended

Unnamed: 0,Drug_ID,Drug,Y,property
0,0,O=[N+]([O-])c1c2c(c3ccc4cccc5ccc1c3c45)CCCC2,1,1
1,1,O=c1c2ccccc2c(=O)c2c1ccc1c2[nH]c2c3c(=O)c4cccc...,0,1
2,2,[N-]=[N+]=CC(=O)NCC(=O)NN,1,1
3,3,[N-]=[N+]=C1C=NC(=O)NC1=O,1,1
4,4,CCCCN(CC(O)C1=CC(=[N+]=[N-])C(=O)C=C1)N=O,1,1
...,...,...,...,...
19761,19761,CCCCCCOc1nsnc1C1=CCCN(CC)C1,1,3
19762,19762,CC1=C2C(c3cccc(Cl)c3)=NCCN=C2N(CC)N1,1,3
19763,19763,CC1=C2C(c3cccc(Cl)c3)=NCCN=C2N(CCC)N1,1,3
19764,19764,CCc1c(CC)[n+]([NH-])c(-c2ccc(OC)c(OC)c2)c2cc(O...,1,3


In [6]:
mols = dm.from_df(df_train_extended, smiles_column="Drug")



In [7]:
rdkit = FPVecTransformer("desc2D", n_jobs=8, dtype=np.float32, replace_nan=True)
maccs = FPVecTransformer("maccs", dtype=np.float32)
ecfp4 = FPVecTransformer("ecfp:4", dtype=np.float32)

# Wrap the calculator in a transformer instance
featurizer = FeatConcat([maccs, ecfp4, rdkit], dtype=np.float32)

with dm.without_rdkit_log():
    feats = featurizer(mols)

  from .autonotebook import tqdm as notebook_tqdm
[23:17:44] please use MorganGenerator
please use MorganGenerator


please use MorganGenerator
[23:17:44] please use MorganGenerator
please use MorganGenerator



please use MorganGenerator
please use MorganGenerator

please use MorganGenerator
please use MorganGenerator


please use MorganGenerator
please use MorganGenerator
please use MorganGenerator

please use MorganGenerator


please use MorganGenerator


please use MorganGenerator
[23:17:47] please use MorganGenerator

please use MorganGenerator





please use MorganGenerator

please use MorganGenerator
[23:17:48] please use MorganGenerator

please use MorganGenerator

please use MorganGenerator[23:17:48] 
please use MorganGenerator




please use MorganGenerator
please use MorganGenerator
please use MorganGenerator

please use MorganGenerator



please use MorganGenerator
please use MorganGenerator



[23:17:49] please use MorganGenerator
please use MorganGenerator
[23:17:49] ple

In [8]:
df_train_extended.reset_index(drop=True, inplace=True)
df_train_featurized = pd.concat([df_train_extended, pd.DataFrame(feats)], axis=1)

In [9]:
df_train_featurized

Unnamed: 0,Drug_ID,Drug,Y,property,0,1,2,3,4,5,...,2373,2374,2375,2376,2377,2378,2379,2380,2381,2382
0,0,O=[N+]([O-])c1c2c(c3ccc4cccc5ccc1c3c45)CCCC2,1,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
1,1,O=c1c2ccccc2c(=O)c2c1ccc1c2[nH]c2c3c(=O)c4cccc...,0,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2,[N-]=[N+]=CC(=O)NCC(=O)NN,1,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,6.0
3,3,[N-]=[N+]=C1C=NC(=O)NC1=O,1,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,0.0,2.0
4,4,CCCCN(CC(O)C1=CC(=[N+]=[N-])C(=O)C=C1)N=O,1,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19761,19761,CCCCCCOc1nsnc1C1=CCCN(CC)C1,1,3,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
19762,19762,CC1=C2C(c3cccc(Cl)c3)=NCCN=C2N(CC)N1,1,3,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19763,19763,CC1=C2C(c3cccc(Cl)c3)=NCCN=C2N(CCC)N1,1,3,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19764,19764,CCc1c(CC)[n+]([NH-])c(-c2ccc(OC)c(OC)c2)c2cc(O...,1,3,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [10]:
prop_encoding = pd.get_dummies(df_train_featurized.property).astype(np.float32)
df_train_featurized = pd.concat([df_train_featurized, prop_encoding], axis=1)

In [12]:
feature_cols = df_train_featurized.iloc[:, 4:].columns

In [13]:

model = RandomForestClassifier(
    criterion="entropy", n_estimators=512, class_weight="balanced", n_jobs=8
)

x_train, y_train = df_train_featurized[feature_cols], df_train_featurized.Y
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
model.fit(x_train, y_train)
y_pred = model.predict_proba(x_train)[:, 1]
auc = roc_auc_score(y_train, y_pred)
print(f"Overall: {auc:.4f}")

Overall: 1.0000


In [14]:
df_test = pd.read_csv("data/test_data.csv", index_col=0)

mols_test = dm.from_df(df_test, smiles_column="Drug")

with dm.without_rdkit_log():
    feats_test = featurizer(mols_test)

df_test.reset_index(drop=True, inplace=True)
df_test_featurized = pd.concat([df_test, pd.DataFrame(feats_test)], axis=1)

  min_charge, max_charge = np.nanmin(atomic_charges), np.nanmax(atomic_charges)

please use MorganGenerator
please use MorganGenerator

please use MorganGenerator


please use MorganGenerator
please use MorganGenerator
please use MorganGenerator
please use MorganGenerator
please use MorganGenerator



please use MorganGenerator

please use MorganGenerator
please use MorganGenerator
please use MorganGenerator
please use MorganGenerator
please use MorganGenerator

please use MorganGenerator
please use MorganGenerator


please use MorganGenerator
please use MorganGenerator
please use MorganGenerator
please use MorganGenerator
please use MorganGenerator

please use MorganGenerator
please use MorganGenerator
please use MorganGenerator

please use MorganGenerator
please use MorganGenerator
please use MorganGenerator
please use MorganGenerator

please use MorganGenerator
please use MorganGenerator
please use MorganGenerator




please use MorganGenerator
please use MorganGenerator
please use 

In [15]:
prop_encoding = pd.get_dummies(df_test_featurized.property).astype(np.float32)
df_test_featurized = pd.concat([df_test_featurized, prop_encoding], axis=1)

In [17]:
x = scaler.transform(df_test_featurized[feature_cols])
preds = model.predict_proba(x)[:, 1]
preds

array([0.90820312, 0.98046875, 0.953125  , ..., 0.95507812, 0.796875  ,
       0.77929688])

In [19]:
submission = pd.read_csv("data/sample.csv")
submission["Y"] = preds
submission.to_csv("submissions/gavno.csv", index=False)
submission

Unnamed: 0,id,Y
0,0,0.908203
1,1,0.980469
2,2,0.953125
3,3,0.927734
4,4,0.150391
...,...,...
1216,1216,0.039062
1217,1217,0.867188
1218,1218,0.955078
1219,1219,0.796875
