In [1]:
%cd ..

/home/nikita/edu/competitions/admet


In [2]:
import pandas as pd
import numpy as np
from rdkit import Chem, RDLogger
from rdkit.Chem import Descriptors
from tqdm import tqdm
import warnings
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore")

df_train = pd.read_csv("data/extended_train.csv", index_col=0)
df_test = pd.read_csv("data/test_data.csv", index_col=0)
sample = pd.read_csv("data/sample.csv")

In [3]:
RDLogger.DisableLog("rdApp.*")


def get_decsriptors_df(smiles_list):
    descriptors_list = []

    for smiles in tqdm(smiles_list):
        descriptors_list.append(
            Descriptors.CalcMolDescriptors(Chem.MolFromSmiles(smiles), 0)
        )
    return pd.DataFrame(descriptors_list).fillna(0)

In [4]:
x_train = get_decsriptors_df(df_train["Drug"])
x_train["property"] = df_train["property"]
y_train = df_train[["Y"]]
y_train["property"] = df_train["property"]
x_test = get_decsriptors_df(df_test["Drug"])
x_test["property"] = df_test["property"]

100%|██████████| 12439/12439 [01:57<00:00, 105.97it/s]
100%|██████████| 1221/1221 [00:09<00:00, 124.20it/s]


In [5]:
cat_cols = []
for col in x_train.select_dtypes("int64").columns:
    if x_train[col].nunique() <= 10:
        cat_cols.append(col)

cat_cols.remove("property")

In [6]:
properties = x_train.property.unique()
x_trains = []
y_trains = []
x_tests = []

for prop in properties:
    x_trains.append(x_train[x_train["property"] == prop].drop("property", axis=1))
    y_trains.append(y_train[y_train["property"] == prop].drop("property", axis=1))
    x_tests.append(x_test[x_test["property"] == prop].drop("property", axis=1))

In [7]:
models = []
for i, (sub_x_train, sub_y_train) in enumerate(zip(x_trains, y_trains)):
    print(f"### PROPERTY {i}")
    model = CatBoostClassifier(iterations=2000, use_best_model=True, eval_metric="AUC")
    sub_x_train, sub_x_val, sub_y_train, sub_y_val = train_test_split(
        sub_x_train, sub_y_train, test_size=0.2, random_state=0
    )
    model.fit(
        sub_x_train, sub_y_train, cat_features=cat_cols, eval_set=(sub_x_val, sub_y_val)
    )
    models.append(model)

### PROPERTY 0
Learning rate set to 0.036809
0:	test: 0.7065057	best: 0.7065057 (0)	total: 95.4ms	remaining: 3m 10s
1:	test: 0.7347603	best: 0.7347603 (1)	total: 141ms	remaining: 2m 21s
2:	test: 0.7396922	best: 0.7396922 (2)	total: 188ms	remaining: 2m 5s
3:	test: 0.7510290	best: 0.7510290 (3)	total: 235ms	remaining: 1m 57s
4:	test: 0.7546556	best: 0.7546556 (4)	total: 280ms	remaining: 1m 51s
5:	test: 0.7549759	best: 0.7549759 (5)	total: 328ms	remaining: 1m 48s
6:	test: 0.7565369	best: 0.7565369 (6)	total: 371ms	remaining: 1m 45s
7:	test: 0.7595916	best: 0.7595916 (7)	total: 418ms	remaining: 1m 43s
8:	test: 0.7624315	best: 0.7624315 (8)	total: 459ms	remaining: 1m 41s
9:	test: 0.7644273	best: 0.7644273 (9)	total: 504ms	remaining: 1m 40s
10:	test: 0.7633617	best: 0.7644273 (9)	total: 547ms	remaining: 1m 38s
11:	test: 0.7659526	best: 0.7659526 (11)	total: 599ms	remaining: 1m 39s
12:	test: 0.7647476	best: 0.7659526 (11)	total: 641ms	remaining: 1m 38s
13:	test: 0.7654148	best: 0.7659526 (11)

In [8]:
submission = sample.copy()
for (sub_x_test, model) in zip(x_tests, models):
    preds = model.predict_proba(sub_x_test)
    ids = sub_x_test.index
    submission["Y"][ids] = preds[:, 1]
submission.to_csv("submissions/extended.csv", index=False)